lib/x86/adler32: add back an AVX-512BW implementation
libdeflate used to (before commit 416bac3) have an AVX-512BW
implementation of Adler-32, but I removed it due to AVX-512's
downclocking issues.  Since then, newer Intel and AMD CPUs have come out
with better AVX-512 implementations.  I also recently added AVX-512
implementations of CRC-32.  An exclusion list is used to prevent 512-bit
vectors from being used on older Intel CPUs.  Therefore, add back an
AVX-512BW implementation of Adler-32.  Unlike the original
implementation, I went with the unpack-based approach, for consistency
with the current adler32_avx2().  The new code is also MSVC-compatible.
ebiggers committed Feb 21, 2024
1 parent aa4a06e commit 69e29f6
Showing 5 changed files with 124 additions and 7 deletions.
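For reference, Adler-32 keeps two running sums: s1, the sum of all input bytes, and s2, the sum of every intermediate value of s1, both reduced modulo 65521. A minimal scalar sketch of that recurrence is shown below (illustrative only: the function name is made up here, and libdeflate's real generic code defers the expensive modulo rather than applying it per byte):

#include <stddef.h>
#include <stdint.h>

static uint32_t
adler32_scalar_sketch(uint32_t adler, const uint8_t *p, size_t len)
{
	uint32_t s1 = adler & 0xFFFF;	/* sum of all bytes seen so far */
	uint32_t s2 = adler >> 16;	/* sum of all intermediate s1 values */

	while (len--) {
		s1 = (s1 + *p++) % 65521;
		s2 = (s2 + s1) % 65521;
	}
	return (s2 << 16) | s1;
}

The vectorized chunk functions in adler32_impl.h accumulate the same two sums for a bounded number of bytes at a time, using _mm512_sad_epu8 (or its narrower equivalents) for s1 and position-weighted 16-bit byte sums for s2, and leave the modulo reduction to the shared template.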
103 changes: 99 additions & 4 deletions lib/x86/adler32_impl.h
@@ -67,6 +67,19 @@
	ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \
}

#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2)	\
{								\
	__m256i /* __v8su */ s1_256bit, s2_256bit;		\
								\
	/* 512 => 256 bits */					\
	s1_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s1), 0), \
				     _mm512_extracti64x4_epi64((v_s1), 1)); \
	s2_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s2), 0), \
				     _mm512_extracti64x4_epi64((v_s2), 1)); \
								\
	ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \
}

/*
* This is a very silly partial workaround for gcc bug
* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892. The bug causes gcc to
@@ -263,14 +276,97 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX2_INTRIN */

#if defined(adler32_avx2) && HAVE_AVX2_NATIVE
#define DEFAULT_IMPL adler32_avx2
#else
/* AVX-512BW implementation */
#if HAVE_AVX512BW_INTRIN
# define adler32_avx512bw adler32_avx512bw
# define FUNCNAME adler32_avx512bw
# define FUNCNAME_CHUNK adler32_avx512bw_chunk
# define IMPL_ALIGNMENT 64
# define IMPL_SEGMENT_LEN 128
# define IMPL_MAX_CHUNK_LEN (128 * (0x7FFF / 0xFF))
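/*
 * Note: each 16-bit byte-sum counter can grow by at most 0xFF per 128-byte
 * segment and is later consumed by _mm512_madd_epi16 as a signed 16-bit
 * integer, which is what limits a chunk to (0x7FFF / 0xFF) segments.
 */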
# if HAVE_AVX512BW_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES _target_attribute("avx512bw")
# endif
# include <immintrin.h>
/*
* With clang in MSVC compatibility mode, immintrin.h incorrectly skips
* including some sub-headers.
*/
# if defined(__clang__) && defined(_MSC_VER)
# include <avxintrin.h>
# include <avx2intrin.h>
# include <avx512bwintrin.h>
# endif
static forceinline ATTRIBUTES void
adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end,
		       u32 *s1, u32 *s2)
{
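	/*
	 * The multipliers are laid out in this order because
	 * _mm512_unpacklo_epi8 and _mm512_unpackhi_epi8 interleave bytes
	 * within each 128-bit lane rather than across the whole vector, so
	 * each weight must line up with where its byte lands after unpacking.
	 */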
	static const u16 _aligned_attribute(64) mults[128] = {
		128, 127, 126, 125, 124, 123, 122, 121, 112, 111, 110, 109, 108, 107, 106, 105,
		96, 95, 94, 93, 92, 91, 90, 89, 80, 79, 78, 77, 76, 75, 74, 73,

		120, 119, 118, 117, 116, 115, 114, 113, 104, 103, 102, 101, 100, 99, 98, 97,
		88, 87, 86, 85, 84, 83, 82, 81, 72, 71, 70, 69, 68, 67, 66, 65,

		64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41,
		32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9,

		56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33,
		24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1,
	};
	const __m512i zeroes = _mm512_setzero_si512();
	const __m512i /* __v32hu */ mults_a = _mm512_loadu_epi8(&mults[0]);
	const __m512i /* __v32hu */ mults_b = _mm512_loadu_epi8(&mults[32]);
	const __m512i /* __v32hu */ mults_c = _mm512_loadu_epi8(&mults[64]);
	const __m512i /* __v32hu */ mults_d = _mm512_loadu_epi8(&mults[96]);
	__m512i /* __v16su */ v_s1 = zeroes;
	__m512i /* __v16su */ v_s2 = zeroes;
	__m512i /* __v32hu */ v_byte_sums_a = zeroes;
	__m512i /* __v32hu */ v_byte_sums_b = zeroes;
	__m512i /* __v32hu */ v_byte_sums_c = zeroes;
	__m512i /* __v32hu */ v_byte_sums_d = zeroes;

	do {
		const __m512i bytes1 = *p++;
		const __m512i bytes2 = *p++;

		v_s2 = _mm512_add_epi32(v_s2, v_s1);
		v_s1 = _mm512_add_epi32(v_s1, _mm512_sad_epu8(bytes1, zeroes));
		v_s1 = _mm512_add_epi32(v_s1, _mm512_sad_epu8(bytes2, zeroes));
		v_byte_sums_a = _mm512_add_epi16(
			v_byte_sums_a, _mm512_unpacklo_epi8(bytes1, zeroes));
		v_byte_sums_b = _mm512_add_epi16(
			v_byte_sums_b, _mm512_unpackhi_epi8(bytes1, zeroes));
		v_byte_sums_c = _mm512_add_epi16(
			v_byte_sums_c, _mm512_unpacklo_epi8(bytes2, zeroes));
		v_byte_sums_d = _mm512_add_epi16(
			v_byte_sums_d, _mm512_unpackhi_epi8(bytes2, zeroes));

		GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b,
				v_byte_sums_c, v_byte_sums_d);
	} while (p != end);

	/*
	 * v_s2 was updated with v_s1 once per 128-byte segment, so each
	 * segment's byte sum is counted once for every later segment; the
	 * shift by 7 (x128) converts that to once per later byte, and the
	 * madd steps add each byte's position weight within its own segment.
	 */
	v_s2 = _mm512_slli_epi32(v_s2, 7);
	v_s2 = _mm512_add_epi32(v_s2, _mm512_madd_epi16(v_byte_sums_a, mults_a));
	v_s2 = _mm512_add_epi32(v_s2, _mm512_madd_epi16(v_byte_sums_b, mults_b));
	v_s2 = _mm512_add_epi32(v_s2, _mm512_madd_epi16(v_byte_sums_c, mults_c));
	v_s2 = _mm512_add_epi32(v_s2, _mm512_madd_epi16(v_byte_sums_d, mults_d));
	ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX512BW_INTRIN */

static inline adler32_func_t
arch_select_adler32_func(void)
{
	const u32 features MAYBE_UNUSED = get_x86_cpu_features();

#ifdef adler32_avx512bw
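	/*
	 * X86_CPU_FEATURE_ZMM is set only for CPUs where 512-bit vectors are
	 * worthwhile; the exclusion list mentioned in the commit message keeps
	 * it clear on older Intel CPUs with AVX-512 downclocking issues.
	 */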
	if ((features & X86_CPU_FEATURE_ZMM) && HAVE_AVX512BW(features))
		return adler32_avx512bw;
#endif
#ifdef adler32_avx2
	if (HAVE_AVX2(features))
		return adler32_avx2;
@@ -282,6 +378,5 @@ arch_select_adler32_func(void)
	return NULL;
}
#define arch_select_adler32_func arch_select_adler32_func
#endif

#endif /* LIB_X86_ADLER32_IMPL_H */
3 changes: 3 additions & 0 deletions lib/x86/cpu_features.c
@@ -92,6 +92,7 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
	{X86_CPU_FEATURE_BMI2, "bmi2"},
	{X86_CPU_FEATURE_ZMM, "zmm"},
	{X86_CPU_FEATURE_AVX512F, "avx512f"},
	{X86_CPU_FEATURE_AVX512BW, "avx512bw"},
	{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
	{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
};
@@ -171,6 +172,8 @@ void libdeflate_init_x86_cpu_features(void)
		features |= X86_CPU_FEATURE_ZMM;
	if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
		features |= X86_CPU_FEATURE_AVX512F;
	if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
		features |= X86_CPU_FEATURE_AVX512BW;
	if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
		features |= X86_CPU_FEATURE_AVX512VL;
	if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
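For context, the new check reads CPUID leaf 7 (sub-leaf 0): EBX bit 16 is AVX512F, bit 30 is AVX512BW, and bit 31 is AVX512VL, while the xcr0 mask 0xe6 verifies that the OS saves the SSE, AVX, opmask, ZMM_Hi256, and Hi16_ZMM register state. A stand-alone sketch of the AVX512BW CPUID query, using GCC/clang's <cpuid.h> rather than libdeflate's own CPUID helpers:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int a, b, c, d;

	/* CPUID.(EAX=7,ECX=0):EBX bit 30 == AVX512BW */
	if (__get_cpuid_count(7, 0, &a, &b, &c, &d) && (b & (1U << 30)))
		printf("CPU reports AVX512BW\n");
	else
		printf("CPU does not report AVX512BW\n");
	return 0;
}

A complete runtime check would also confirm, via xgetbv, that the OS has enabled the ZMM and opmask state, which is what the 0xe6 test above does.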
19 changes: 17 additions & 2 deletions lib/x86/cpu_features.h
@@ -52,15 +52,17 @@
*/
#define X86_CPU_FEATURE_ZMM 0x00000020
#define X86_CPU_FEATURE_AVX512F 0x00000040
#define X86_CPU_FEATURE_AVX512VL 0x00000080
#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000100
#define X86_CPU_FEATURE_AVX512BW 0x00000080
#define X86_CPU_FEATURE_AVX512VL 0x00000100
#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000200

#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2))
#define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ))
#define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX))
#define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2))
#define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2))
#define HAVE_AVX512F(features) (HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F))
#define HAVE_AVX512BW(features) (HAVE_AVX512BW_NATIVE || ((features) & X86_CPU_FEATURE_AVX512BW))
#define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL))
#define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ))

@@ -182,6 +184,19 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_AVX512F_INTRIN 0
#endif

/* AVX-512BW */
#ifdef __AVX512BW__
# define HAVE_AVX512BW_NATIVE 1
#else
# define HAVE_AVX512BW_NATIVE 0
#endif
#if HAVE_AVX512BW_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 9, 0) || \
    defined(_MSC_VER)
# define HAVE_AVX512BW_INTRIN 1
#else
# define HAVE_AVX512BW_INTRIN 0
#endif

/* AVX-512VL */
#ifdef __AVX512VL__
# define HAVE_AVX512VL_NATIVE 1
4 changes: 4 additions & 0 deletions scripts/checksum_benchmarks.sh
@@ -157,6 +157,10 @@ echo
{
case $ARCH in
i386|x86_64)
	if have_cpu_feature avx512bw; then
		do_benchmark "AVX512BW"
		disable_cpu_feature "avx512bw" "-mno-avx512bw"
	fi
	if have_cpu_feature avx2; then
		do_benchmark "AVX2"
		disable_cpu_feature "avx2" "-mno-avx2"
2 changes: 1 addition & 1 deletion scripts/run_tests.sh
@@ -142,7 +142,7 @@ build_and_run_tests()
	if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then
		case "$ARCH" in
		i386|x86_64)
			features+=(zmm vpclmulqdq avx512vl avx512f
			features+=(zmm vpclmulqdq avx512vl avx512bw avx512f
				   avx2 avx bmi2 pclmulqdq sse2)
			;;
		arm*|aarch*)
