From 2929aea4ed833da7993d04a7abcccc95422dc276 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 23 Feb 2024 20:48:14 -0800
Subject: [PATCH 1/4] ci.yml: ensure MSVC can find zlib

---
 .github/workflows/ci.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f738eed4..e158e7db 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -126,8 +126,9 @@ jobs:
     - run: >
         cmake -B build -G "${{matrix.gen}}" -T ${{matrix.toolset}}
         -A ${{matrix.vs}} -DLIBDEFLATE_BUILD_TESTS=1
-        -DCMAKE_C_FLAGS="/W4 /WX /DLIBDEFLATE_ENABLE_ASSERTIONS /IC:\vcpkg\packages\zlib_${{matrix.vcpkg}}\include"
+        -DCMAKE_C_FLAGS="/W4 /WX /DLIBDEFLATE_ENABLE_ASSERTIONS"
         -DZLIB_LIBRARY=C:\vcpkg\packages\zlib_${{matrix.vcpkg}}\lib\zlib.lib
+        -DZLIB_INCLUDE_DIR=C:\vcpkg\packages\zlib_${{matrix.vcpkg}}\include
         -DCMAKE_INSTALL_PREFIX=build\install
     - run: cmake --build build --verbose --config Debug
     - run: cmake --install build --verbose --config Debug

From ea028f147da84918a383418ac3d80bc10748e72b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 23 Feb 2024 20:48:14 -0800
Subject: [PATCH 2/4] lib/x86: disambiguate 512-bit vector support from AVX-512F

crc32_x86_vpclmulqdq_avx512vl and crc32_x86_vpclmulqdq_avx512f_avx512vl
actually use the same CPU features, considering that vpternlog always
requires at least avx512f, and compilers consider avx512vl to imply
avx512f.

Rename them to *_avx512_vl256 and *_avx512_vl512 to reflect that they
differ only in vector length, and fix the CPU feature checking to use a
separate flag for whether 512-bit vectors are enabled.
---
 lib/x86/cpu_features.c          |  5 ++++-
 lib/x86/cpu_features.h          | 13 ++++++++---
 lib/x86/crc32_impl.h            | 39 ++++++++++++++++++---------------
 lib/x86/crc32_pclmul_template.h |  2 +-
 scripts/checksum_benchmarks.sh  | 22 +++++++++----------
 scripts/run_tests.sh            |  2 +-
 6 files changed, 47 insertions(+), 36 deletions(-)

diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
index 27564c56..72bd37c2 100644
--- a/lib/x86/cpu_features.c
+++ b/lib/x86/cpu_features.c
@@ -90,6 +90,7 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
 	{X86_CPU_FEATURE_AVX, "avx"},
 	{X86_CPU_FEATURE_AVX2, "avx2"},
 	{X86_CPU_FEATURE_BMI2, "bmi2"},
+	{X86_CPU_FEATURE_ZMM, "zmm"},
 	{X86_CPU_FEATURE_AVX512F, "avx512f"},
 	{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
 	{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
@@ -165,8 +166,10 @@ void libdeflate_init_x86_cpu_features(void)
 			features |= X86_CPU_FEATURE_AVX2;
 		if (b & (1 << 8))
 			features |= X86_CPU_FEATURE_BMI2;
-		if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6) &&
+		if (((xcr0 & 0xe6) == 0xe6) &&
 		    allow_512bit_vectors(manufacturer, family, model))
+			features |= X86_CPU_FEATURE_ZMM;
+		if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
 			features |= X86_CPU_FEATURE_AVX512F;
 		if ((b & (1U << 31)) && ((xcr0 & 0xa6) == 0xa6))
 			features |= X86_CPU_FEATURE_AVX512VL;
diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h
index b5fbf573..6f68e032 100644
--- a/lib/x86/cpu_features.h
+++ b/lib/x86/cpu_features.h
@@ -44,9 +44,16 @@
 #define X86_CPU_FEATURE_AVX		0x00000004
 #define X86_CPU_FEATURE_AVX2		0x00000008
 #define X86_CPU_FEATURE_BMI2		0x00000010
-#define X86_CPU_FEATURE_AVX512F		0x00000020
-#define X86_CPU_FEATURE_AVX512VL	0x00000040
-#define X86_CPU_FEATURE_VPCLMULQDQ	0x00000080
+/*
+ * ZMM indicates whether 512-bit vectors (zmm registers) should be used.  On
+ * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU
+ * supports it, i.e. even if AVX512F is set.
On these CPUs, we may still use + * AVX-512 instructions, but only with ymm and xmm registers. + */ +#define X86_CPU_FEATURE_ZMM 0x00000020 +#define X86_CPU_FEATURE_AVX512F 0x00000040 +#define X86_CPU_FEATURE_AVX512VL 0x00000080 +#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000100 #define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2)) #define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ)) diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h index e818d0a6..d50547bd 100644 --- a/lib/x86/crc32_impl.h +++ b/lib/x86/crc32_impl.h @@ -95,14 +95,16 @@ #endif /* - * VPCLMULQDQ/AVX512VL implementation. This takes advantage of some AVX-512 - * instructions but uses 256-bit vectors rather than 512-bit. This can be - * useful on CPUs where 512-bit vectors cause downclocking. + * VPCLMULQDQ/AVX512 implementation with 256-bit vectors. This takes advantage + * of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit. + * This can be useful on CPUs where 512-bit vectors cause downclocking. */ -#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && HAVE_AVX512VL_INTRIN -# define crc32_x86_vpclmulqdq_avx512vl crc32_x86_vpclmulqdq_avx512vl -# define SUFFIX _vpclmulqdq_avx512vl -# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && HAVE_AVX512VL_NATIVE +#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \ + HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN +# define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256 +# define SUFFIX _vpclmulqdq_avx512_vl256 +# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \ + HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE # define ATTRIBUTES # else # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") @@ -113,16 +115,16 @@ # include "crc32_pclmul_template.h" #endif -/* VPCLMULQDQ/AVX512F/AVX512VL implementation. Uses 512-bit vectors. 
*/ +/* VPCLMULQDQ/AVX512 implementation with 512-bit vectors */ #if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \ HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN -# define crc32_x86_vpclmulqdq_avx512f_avx512vl crc32_x86_vpclmulqdq_avx512f_avx512vl -# define SUFFIX _vpclmulqdq_avx512f_avx512vl -#if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \ +# define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512 +# define SUFFIX _vpclmulqdq_avx512_vl512 +# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \ HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE # define ATTRIBUTES # else -# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512f,avx512vl") +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") # endif # define VL 64 # define FOLD_LESSTHAN16BYTES 1 @@ -136,15 +138,16 @@ arch_select_crc32_func(void) { const u32 features MAYBE_UNUSED = get_x86_cpu_features(); -#ifdef crc32_x86_vpclmulqdq_avx512f_avx512vl - if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && +#ifdef crc32_x86_vpclmulqdq_avx512_vl512 + if ((features & X86_CPU_FEATURE_ZMM) && + HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && HAVE_AVX512F(features) && HAVE_AVX512VL(features)) - return crc32_x86_vpclmulqdq_avx512f_avx512vl; + return crc32_x86_vpclmulqdq_avx512_vl512; #endif -#ifdef crc32_x86_vpclmulqdq_avx512vl +#ifdef crc32_x86_vpclmulqdq_avx512_vl256 if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && - HAVE_AVX512VL(features)) - return crc32_x86_vpclmulqdq_avx512vl; + HAVE_AVX512F(features) && HAVE_AVX512VL(features)) + return crc32_x86_vpclmulqdq_avx512_vl256; #endif #ifdef crc32_x86_vpclmulqdq_avx2 if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h index 16e4ebf5..eb06262b 100644 --- a/lib/x86/crc32_pclmul_template.h +++ b/lib/x86/crc32_pclmul_template.h @@ -38,7 +38,7 @@ * VL=16 && FOLD_LESSTHAN16BYTES=1: at least pclmul,sse4.1 * VL=32 && USE_TERNARYLOGIC=0: at least vpclmulqdq,pclmul,avx2 * VL=32 && USE_TERNARYLOGIC=1: at least vpclmulqdq,pclmul,avx512vl - * VL=64: at least vpclmulqdq,pclmul,avx512f,avx512vl + * VL=64: at least vpclmulqdq,pclmul,avx512vl * VL: * Vector length in bytes. Supported values are 16, 32, and 64. 
 * FOLD_LESSTHAN16BYTES:
diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh
index b359fba3..b9c38698 100755
--- a/scripts/checksum_benchmarks.sh
+++ b/scripts/checksum_benchmarks.sh
@@ -67,12 +67,11 @@ sort_by_speed() {
 }
 
 disable_cpu_feature() {
-	local name="$1"
+	LIBDEFLATE_DISABLE_CPU_FEATURES+=",$1"
 	shift
-	local extra_cflags=("$@")
-
-	LIBDEFLATE_DISABLE_CPU_FEATURES+=",$name"
-	EXTRA_CFLAGS+=("${extra_cflags[@]}")
+	if (( $# > 0 )); then
+		EXTRA_CFLAGS+=("$@")
+	fi
 }
 
 cleanup() {
@@ -114,15 +113,14 @@ export LIBDEFLATE_DISABLE_CPU_FEATURES=""
 {
 case $ARCH in
 i386|x86_64)
-	if have_cpu_features vpclmulqdq avx512f avx512vl; then
-		do_benchmark "VPCLMULQDQ/AVX512F/AVX512VL"
-		disable_cpu_feature "avx512f" "-mno-avx512f"
-	fi
-	if have_cpu_features vpclmulqdq avx512vl; then
-		do_benchmark "VPCLMULQDQ/AVX512VL"
+	if have_cpu_features vpclmulqdq pclmulqdq avx512f avx512vl; then
+		do_benchmark "VPCLMULQDQ/AVX512/VL512"
+		disable_cpu_feature "zmm"
+		do_benchmark "VPCLMULQDQ/AVX512/VL256"
 		disable_cpu_feature "avx512vl" "-mno-avx512vl"
+		disable_cpu_feature "avx512f" "-mno-avx512f"
 	fi
-	if have_cpu_features vpclmulqdq avx2; then
+	if have_cpu_features vpclmulqdq pclmulqdq avx2; then
 		do_benchmark "VPCLMULQDQ/AVX2"
 		disable_cpu_feature "vpclmulqdq" "-mno-vpclmulqdq"
 	fi
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index ff15cc19..b07f71a7 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -142,7 +142,7 @@ build_and_run_tests()
 	if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then
 		case "$ARCH" in
 		i386|x86_64)
-			features+=(vpclmulqdq avx512vl avx512f
+			features+=(zmm vpclmulqdq avx512vl avx512f
 				   avx2 avx bmi2 pclmulqdq sse2)
 			;;
 		arm*|aarch*)

From 5b217a14d6802ffe5cd586dd8aa5cc44610eb633 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 23 Feb 2024 20:48:14 -0800
Subject: [PATCH 3/4] lib/x86: fix XCR0 check for AVX-512VL

According to the Intel manual, the ZMM_Hi256 bit needs to be checked for
all AVX-512 instructions, even if 512-bit vectors aren't being used.
---
 lib/x86/cpu_features.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
index 72bd37c2..ba7a39ff 100644
--- a/lib/x86/cpu_features.c
+++ b/lib/x86/cpu_features.c
@@ -171,7 +171,7 @@ void libdeflate_init_x86_cpu_features(void)
 			features |= X86_CPU_FEATURE_ZMM;
 		if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
 			features |= X86_CPU_FEATURE_AVX512F;
-		if ((b & (1U << 31)) && ((xcr0 & 0xa6) == 0xa6))
+		if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
 			features |= X86_CPU_FEATURE_AVX512VL;
 		if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
 			features |= X86_CPU_FEATURE_VPCLMULQDQ;

From a026a046e05f4fdd1a27f880f8344a97c2ed5571 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 23 Feb 2024 20:48:14 -0800
Subject: [PATCH 4/4] lib/x86/adler32: add an AVX-512 implementation

libdeflate used to (before commit 416bac37a0c4) have an AVX512BW
implementation of Adler-32, but I removed it due to AVX-512's
downclocking issues.  Since then, newer Intel and AMD CPUs have come out
with better AVX-512 implementations, and these CPUs tend to have
AVX512VNNI, which includes a dot product instruction that is useful for
Adler-32.  Therefore, add an AVX512VNNI/AVX512BW implementation.
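
Note (an illustrative sketch, not the patch's code): starting from state
(s1, s2), a 64-byte block b[0..63] updates the Adler-32 sums as
s1 += sum(b[i]) and s2 += 64*s1_old + sum((64 - i) * b[i]).  The new chunk
function computes the weighted sum with vpdpbusd against the descending
multipliers 64..1 and the plain byte sum with vpsadbw, deferring all
mod-65521 reductions as the SSE2 and AVX2 paths do.  A scalar model of one
64-byte step, using a hypothetical helper name:

	#include <stdint.h>

	/* Scalar model of one 64-byte step of the vectorized loop. */
	static void
	adler32_block64_model(const uint8_t b[64], uint32_t *s1, uint32_t *s2)
	{
		uint32_t byte_sum = 0;		/* what vpsadbw accumulates */
		uint32_t weighted_sum = 0;	/* what vpdpbusd accumulates */
		int i;

		for (i = 0; i < 64; i++) {
			byte_sum += b[i];
			weighted_sum += (64 - i) * b[i];
		}
		/* 64 * *s1 is the v_s1_sums term, shifted left by 6 in the patch. */
		*s2 += 64 * *s1 + weighted_sum;
		*s1 += byte_sum;
	}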
---
 lib/x86/adler32_impl.h         | 141 +++++++++++++++++++++++++++------
 lib/x86/cpu_features.c         |   6 ++
 lib/x86/cpu_features.h         |  38 ++++++++-
 scripts/checksum_benchmarks.sh |   5 ++
 scripts/run_tests.sh           |   4 +-
 5 files changed, 163 insertions(+), 31 deletions(-)

diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h
index 6285dc80..eb03083e 100644
--- a/lib/x86/adler32_impl.h
+++ b/lib/x86/adler32_impl.h
@@ -67,6 +67,19 @@
 	ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \
 }
 
+#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \
+{ \
+	__m256i /* __v8su */ s1_256bit, s2_256bit; \
+ \
+	/* 512 => 256 bits */ \
+	s1_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s1), 0), \
+				     _mm512_extracti64x4_epi64((v_s1), 1)); \
+	s2_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s2), 0), \
+				     _mm512_extracti64x4_epi64((v_s2), 1)); \
+ \
+	ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \
+}
+
 /*
  * This is a very silly partial workaround for gcc bug
  * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892.  The bug causes gcc to
@@ -105,15 +118,17 @@ static forceinline ATTRIBUTES void
 adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
 {
+	static const u16 _aligned_attribute(16) mults[4][8] = {
+		{ 32, 31, 30, 29, 28, 27, 26, 25 },
+		{ 24, 23, 22, 21, 20, 19, 18, 17 },
+		{ 16, 15, 14, 13, 12, 11, 10, 9 },
+		{ 8, 7, 6, 5, 4, 3, 2, 1 },
+	};
+	const __m128i /* __v8hu */ mults_a = _mm_load_si128((const __m128i *)mults[0]);
+	const __m128i /* __v8hu */ mults_b = _mm_load_si128((const __m128i *)mults[1]);
+	const __m128i /* __v8hu */ mults_c = _mm_load_si128((const __m128i *)mults[2]);
+	const __m128i /* __v8hu */ mults_d = _mm_load_si128((const __m128i *)mults[3]);
 	const __m128i zeroes = _mm_setzero_si128();
-	const __m128i /* __v8hu */ mults_a =
-		_mm_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25);
-	const __m128i /* __v8hu */ mults_b =
-		_mm_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17);
-	const __m128i /* __v8hu */ mults_c =
-		_mm_setr_epi16(16, 15, 14, 13, 12, 11, 10, 9);
-	const __m128i /* __v8hu */ mults_d =
-		_mm_setr_epi16(8, 7, 6, 5, 4, 3, 2, 1);
 
 	/* s1 counters: 32-bit, sum of bytes */
 	__m128i /* __v4su */ v_s1 = zeroes;
@@ -209,23 +224,21 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
 static forceinline ATTRIBUTES void
 adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
 {
-	const __m256i zeroes = _mm256_setzero_si256();
 	/*
 	 * Note, the multipliers have to be in this order because
 	 * _mm256_unpack{lo,hi}_epi8 work on each 128-bit lane separately.
 	 */
-	const __m256i /* __v16hu */ mults_a =
-		_mm256_setr_epi16(64, 63, 62, 61, 60, 59, 58, 57,
-				  48, 47, 46, 45, 44, 43, 42, 41);
-	const __m256i /* __v16hu */ mults_b =
-		_mm256_setr_epi16(56, 55, 54, 53, 52, 51, 50, 49,
-				  40, 39, 38, 37, 36, 35, 34, 33);
-	const __m256i /* __v16hu */ mults_c =
-		_mm256_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25,
-				  16, 15, 14, 13, 12, 11, 10, 9);
-	const __m256i /* __v16hu */ mults_d =
-		_mm256_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17,
-				  8, 7, 6, 5, 4, 3, 2, 1);
+	static const u16 _aligned_attribute(32) mults[4][16] = {
+		{ 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 },
+		{ 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 },
+		{ 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 },
+		{ 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 },
+	};
+	const __m256i /* __v16hu */ mults_a = _mm256_load_si256((const __m256i *)mults[0]);
+	const __m256i /* __v16hu */ mults_b = _mm256_load_si256((const __m256i *)mults[1]);
+	const __m256i /* __v16hu */ mults_c = _mm256_load_si256((const __m256i *)mults[2]);
+	const __m256i /* __v16hu */ mults_d = _mm256_load_si256((const __m256i *)mults[3]);
+	const __m256i zeroes = _mm256_setzero_si256();
 	__m256i /* __v8su */ v_s1 = zeroes;
 	__m256i /* __v8su */ v_s2 = zeroes;
 	__m256i /* __v16hu */ v_byte_sums_a = zeroes;
@@ -263,14 +276,93 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
 # include "../adler32_vec_template.h"
 #endif /* HAVE_AVX2_INTRIN */
 
-#if defined(adler32_avx2) && HAVE_AVX2_NATIVE
-#define DEFAULT_IMPL adler32_avx2
-#else
+/*
+ * AVX512VNNI/AVX512BW implementation.  Uses the dot product instruction
+ * vpdpbusd from AVX512VNNI and the vpsadbw instruction from AVX512BW.
+ * (vpdpbusd with ones could be used instead of vpsadbw, but it's slower.)
+ *
+ * We currently don't include an implementation using AVX512VNNI or AVX512BW
+ * alone, since CPUs with good AVX-512 implementations tend to have both.
+ */
+#if HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN
+# define adler32_avx512vnni_avx512bw adler32_avx512vnni_avx512bw
+# define FUNCNAME		adler32_avx512vnni_avx512bw
+# define FUNCNAME_CHUNK		adler32_avx512vnni_avx512bw_chunk
+# define IMPL_ALIGNMENT		64
+# define IMPL_SEGMENT_LEN	128
+# define IMPL_MAX_CHUNK_LEN	(128 * (0xFFFF / 0xFF))
+# if HAVE_AVX512VNNI_NATIVE && HAVE_AVX512BW_NATIVE
+# define ATTRIBUTES
+# else
+# define ATTRIBUTES _target_attribute("avx512vnni,avx512bw")
+# endif
+# include <immintrin.h>
+ /*
+  * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
+  * including some sub-headers.
+  */
+# if defined(__clang__) && defined(_MSC_VER)
+# include <tmmintrin.h>
+# include <smmintrin.h>
+# include <wmmintrin.h>
+# include <avxintrin.h>
+# include <avx2intrin.h>
+# include <avx512fintrin.h>
+# include <avx512bwintrin.h>
+# include <avx512vnniintrin.h>
+# endif
+static forceinline ATTRIBUTES void
+adler32_avx512vnni_avx512bw_chunk(const __m512i *p, const __m512i *const end,
+				  u32 *s1, u32 *s2)
+{
+	static const u8 _aligned_attribute(64) mults[64] = {
+		64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+		48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+		32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+		16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+	};
+	const __m512i /* __v64qu */ mults_a = _mm512_load_si512((const __m512i *)mults);
+	const __m512i zeroes = _mm512_setzero_si512();
+	__m512i /* __v16su */ v_s1 = zeroes;
+	__m512i /* __v16su */ v_s1_sums_a = zeroes;
+	__m512i /* __v16su */ v_s1_sums_b = zeroes;
+	__m512i /* __v16su */ v_s2_a = zeroes;
+	__m512i /* __v16su */ v_s2_b = zeroes;
+
+	do {
+		const __m512i bytes_a = *p++;
+		const __m512i bytes_b = *p++;
+		__m512i v_sad_a, v_sad_b;
+
+		v_s2_a = _mm512_dpbusd_epi32(v_s2_a, bytes_a, mults_a);
+		v_s2_b = _mm512_dpbusd_epi32(v_s2_b, bytes_b, mults_a);
+		v_sad_a = _mm512_sad_epu8(bytes_a, zeroes);
+		v_sad_b = _mm512_sad_epu8(bytes_b, zeroes);
+		v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1);
+		v_s1 = _mm512_add_epi32(v_s1, v_sad_a);
+		v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1);
+		v_s1 = _mm512_add_epi32(v_s1, v_sad_b);
+	} while (p != end);
+
+	v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_sums_b);
+	v_s1_sums_a = _mm512_slli_epi32(v_s1_sums_a, 6);
+	v_s2_a = _mm512_add_epi32(v_s2_a, v_s2_b);
+	v_s2_a = _mm512_add_epi32(v_s2_a, v_s1_sums_a);
+	ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2_a);
+}
+# include "../adler32_vec_template.h"
+#endif /* HAVE_AVX512VNNI_INTRIN && HAVE_AVX512BW_INTRIN */
+
 static inline adler32_func_t
 arch_select_adler32_func(void)
 {
 	const u32 features MAYBE_UNUSED = get_x86_cpu_features();
 
+#ifdef adler32_avx512vnni_avx512bw
+	if ((features & X86_CPU_FEATURE_ZMM) &&
+	    HAVE_AVX512VNNI(features) && HAVE_AVX512BW(features))
+		return adler32_avx512vnni_avx512bw;
+#endif
 #ifdef adler32_avx2
 	if (HAVE_AVX2(features))
 		return adler32_avx2;
@@ -282,6 +374,5 @@ arch_select_adler32_func(void)
 	return NULL;
 }
 #define arch_select_adler32_func arch_select_adler32_func
-#endif
 
 #endif /* LIB_X86_ADLER32_IMPL_H */
diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
index ba7a39ff..750bf66c 100644
--- a/lib/x86/cpu_features.c
+++ b/lib/x86/cpu_features.c
@@ -92,8 +92,10 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
 	{X86_CPU_FEATURE_BMI2, "bmi2"},
 	{X86_CPU_FEATURE_ZMM, "zmm"},
 	{X86_CPU_FEATURE_AVX512F, "avx512f"},
+	{X86_CPU_FEATURE_AVX512BW, "avx512bw"},
 	{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
 	{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
+	{X86_CPU_FEATURE_AVX512VNNI, "avx512vnni"},
 };
 
 volatile u32 libdeflate_x86_cpu_features = 0;
@@ -171,10 +173,14 @@ void libdeflate_init_x86_cpu_features(void)
 			features |= X86_CPU_FEATURE_ZMM;
 		if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
 			features |= X86_CPU_FEATURE_AVX512F;
+		if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
+			features |= X86_CPU_FEATURE_AVX512BW;
 		if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
 			features |= X86_CPU_FEATURE_AVX512VL;
 		if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
 			features |= X86_CPU_FEATURE_VPCLMULQDQ;
+		if ((c & (1 << 11)) && ((xcr0 & 0xe6) == 0xe6))
+			features |= X86_CPU_FEATURE_AVX512VNNI;
 
 out:
 	disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
diff --git
a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index 6f68e032..10afd8dd 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -52,8 +52,10 @@ */ #define X86_CPU_FEATURE_ZMM 0x00000020 #define X86_CPU_FEATURE_AVX512F 0x00000040 -#define X86_CPU_FEATURE_AVX512VL 0x00000080 -#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000100 +#define X86_CPU_FEATURE_AVX512BW 0x00000080 +#define X86_CPU_FEATURE_AVX512VL 0x00000100 +#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000200 +#define X86_CPU_FEATURE_AVX512VNNI 0x00000400 #define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2)) #define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ)) @@ -61,8 +63,10 @@ #define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2)) #define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2)) #define HAVE_AVX512F(features) (HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F)) +#define HAVE_AVX512BW(features) (HAVE_AVX512BW_NATIVE || ((features) & X86_CPU_FEATURE_AVX512BW)) #define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL)) #define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ)) +#define HAVE_AVX512VNNI(features) (HAVE_AVX512VNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VNNI)) #if HAVE_DYNAMIC_X86_CPU_FEATURES #define X86_CPU_FEATURES_KNOWN 0x80000000 @@ -182,6 +186,19 @@ static inline u32 get_x86_cpu_features(void) { return 0; } # define HAVE_AVX512F_INTRIN 0 #endif +/* AVX-512BW */ +#ifdef __AVX512BW__ +# define HAVE_AVX512BW_NATIVE 1 +#else +# define HAVE_AVX512BW_NATIVE 0 +#endif +#if HAVE_AVX512BW_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 9, 0) || \ + defined(_MSC_VER) +# define HAVE_AVX512BW_INTRIN 1 +#else +# define HAVE_AVX512BW_INTRIN 0 +#endif + /* AVX-512VL */ #ifdef __AVX512VL__ # define HAVE_AVX512VL_NATIVE 1 @@ -201,13 +218,26 @@ static inline u32 get_x86_cpu_features(void) { return 0; } #else # define HAVE_VPCLMULQDQ_NATIVE 0 #endif -#if HAVE_VPCLMULQDQ_NATIVE || (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \ - defined(_MSC_VER)) +#if HAVE_VPCLMULQDQ_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \ + defined(_MSC_VER) # define HAVE_VPCLMULQDQ_INTRIN 1 #else # define HAVE_VPCLMULQDQ_INTRIN 0 #endif +/* AVX512VNNI */ +#ifdef __AVX512VNNI__ +# define HAVE_AVX512VNNI_NATIVE 1 +#else +# define HAVE_AVX512VNNI_NATIVE 0 +#endif +#if HAVE_AVX512VNNI_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \ + defined(_MSC_VER) +# define HAVE_AVX512VNNI_INTRIN 1 +#else +# define HAVE_AVX512VNNI_INTRIN 0 +#endif + #endif /* ARCH_X86_32 || ARCH_X86_64 */ #endif /* LIB_X86_CPU_FEATURES_H */ diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh index b9c38698..6b418190 100755 --- a/scripts/checksum_benchmarks.sh +++ b/scripts/checksum_benchmarks.sh @@ -157,6 +157,11 @@ echo { case $ARCH in i386|x86_64) + if have_cpu_features avx512_vnni avx512bw; then + do_benchmark "AVX512VNNI/AVX512BW" + disable_cpu_feature "avx512vnni" "-mno-avx512vnni" + disable_cpu_feature "avx512bw" "-mno-avx512bw" + fi if have_cpu_feature avx2; then do_benchmark "AVX2" disable_cpu_feature "avx2" "-mno-avx2" diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index b07f71a7..9bca9677 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -142,8 +142,8 @@ build_and_run_tests() if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! 
$quick; then case "$ARCH" in i386|x86_64) - features+=(zmm vpclmulqdq avx512vl avx512f - avx2 avx bmi2 pclmulqdq sse2) + features+=(zmm avx512vnni vpclmulqdq avx512vl avx512bw + avx512f avx2 avx bmi2 pclmulqdq sse2) ;; arm*|aarch*) features+=(dotprod sha3 crc32 pmull neon)
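
Note (an illustrative sketch, not part of the patches): the XCR0 masks used
in libdeflate_init_x86_cpu_features() decode as follows, assuming the XSAVE
state-component bit assignments from the Intel SDM.  This is why PATCH 3/4
tightens the AVX512VL check from 0xa6 to 0xe6: 0xa6 omits the ZMM_Hi256
bit, which must be enabled before executing any AVX-512 instruction, even
one that only touches xmm or ymm registers.

	/* XCR0 state-component bits (hypothetical names, for illustration) */
	#define XCR0_SSE	(1u << 1)	/* xmm registers */
	#define XCR0_AVX	(1u << 2)	/* upper 128 bits of ymm0-15 */
	#define XCR0_OPMASK	(1u << 5)	/* k0-k7 mask registers */
	#define XCR0_ZMM_HI256	(1u << 6)	/* upper 256 bits of zmm0-15 */
	#define XCR0_HI16_ZMM	(1u << 7)	/* zmm16-31 */

	/* 0xe6 covers all state that AVX-512 instructions can touch. */
	_Static_assert((XCR0_SSE | XCR0_AVX | XCR0_OPMASK |
			XCR0_ZMM_HI256 | XCR0_HI16_ZMM) == 0xe6, "");
	/* 0xa6 was the old AVX512VL mask; it is missing ZMM_Hi256. */
	_Static_assert((XCR0_SSE | XCR0_AVX | XCR0_OPMASK |
			XCR0_HI16_ZMM) == 0xa6, "");
	/* 0x6 (SSE|AVX) suffices for the VEX-encoded VPCLMULQDQ forms. */
	_Static_assert((XCR0_SSE | XCR0_AVX) == 0x6, "");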