diff --git a/CMakeLists.txt b/CMakeLists.txt index 917fefb0..f902c675 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,6 +131,7 @@ if(LIBDEFLATE_ZLIB_SUPPORT) lib/adler32_vec_template.h lib/arm/adler32_impl.h lib/x86/adler32_impl.h + lib/x86/adler32_template.h lib/zlib_constants.h ) if(LIBDEFLATE_COMPRESSION_SUPPORT) diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index c8b5153b..f0c68ace 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -30,371 +30,67 @@ #include "cpu_features.h" -/* - * The following macros horizontally sum the s1 counters and add them to the - * real s1, and likewise for s2. They do this via a series of reductions, each - * of which halves the vector length, until just one counter remains. - * - * The s1 reductions don't depend on the s2 reductions and vice versa, so for - * efficiency they are interleaved. - * - * If used_sad=1, then the bytes were summed using one of the Sum of Absolute - * Differences instructions (psadbw or vpsadbw) before they were added to v_s1. - * In this case every other counter in v_s1 is 0, so we skip one of the s1 - * reductions when going from 128 => 32 bits. - */ - -#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, used_sad) \ -{ \ - __m128i /* __v4su */ s1_last = (v_s1), s2_last = (v_s2); \ - \ - /* 128 => 32 bits */ \ - if (!(used_sad)) \ - s1_last = _mm_add_epi32(s1_last, \ - _mm_shuffle_epi32(s1_last, 0x31)); \ - s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x31)); \ - s1_last = _mm_add_epi32(s1_last, _mm_shuffle_epi32(s1_last, 0x02)); \ - s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x02)); \ - \ - *(s1) += (u32)_mm_cvtsi128_si32(s1_last); \ - *(s2) += (u32)_mm_cvtsi128_si32(s2_last); \ -} - -#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, used_sad) \ -{ \ - __m128i /* __v4su */ s1_128bit, s2_128bit; \ - \ - /* 256 => 128 bits */ \ - s1_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s1), 0), \ - _mm256_extracti128_si256((v_s1), 1)); \ - s2_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s2), 0), \ - _mm256_extracti128_si256((v_s2), 1)); \ - \ - ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit, \ - (used_sad)); \ -} - -#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2, used_sad) \ -{ \ - __m256i /* __v8su */ s1_256bit, s2_256bit; \ - \ - /* 512 => 256 bits */ \ - s1_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s1), 0), \ - _mm512_extracti64x4_epi64((v_s1), 1)); \ - s2_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s2), 0), \ - _mm512_extracti64x4_epi64((v_s2), 1)); \ - \ - ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit, \ - (used_sad)); \ -} - -/* - * This is a very silly partial workaround for gcc bug - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892. The bug causes gcc to - * generate extra move instructions in some loops containing vector intrinsics. - * - * An alternate workaround would be to use gcc native vector operations instead - * of vector intrinsics. But that would result in MSVC needing its own code. - */ -#if GCC_PREREQ(1, 0) -# define GCC_UPDATE_VARS(a, b, c, d, e, f) \ - __asm__("" : "+x" (a), "+x" (b), "+x" (c), "+x" (d), "+x" (e), "+x" (f)) -#else -# define GCC_UPDATE_VARS(a, b, c, d, e, f) \ - (void)a, (void)b, (void)c, (void)d, (void)e, (void)f -#endif - -/* - * SSE2 and AVX2 implementations. They are very similar; the AVX2 - * implementation just uses twice the vector width as the SSE2 one. - */ +/* SSE2 and AVX2 implementations. Used on older CPUs. 
*/ #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) -# define adler32_sse2 adler32_sse2 -# define FUNCNAME adler32_sse2 -# define FUNCNAME_CHUNK adler32_sse2_chunk -# define IMPL_ALIGNMENT 16 -# define IMPL_SEGMENT_LEN 32 -/* - * The 16-bit precision byte counters must not be allowed to undergo *signed* - * overflow, otherwise the signed multiplications at the end (_mm_madd_epi16) - * would behave incorrectly. - */ -# define IMPL_MAX_CHUNK_LEN (32 * (0x7FFF / 0xFF)) -# define ATTRIBUTES _target_attribute("sse2") -static forceinline ATTRIBUTES void -adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) -{ - static const u16 _aligned_attribute(16) mults[4][8] = { - { 32, 31, 30, 29, 28, 27, 26, 25 }, - { 24, 23, 22, 21, 20, 19, 18, 17 }, - { 16, 15, 14, 13, 12, 11, 10, 9 }, - { 8, 7, 6, 5, 4, 3, 2, 1 }, - }; - const __m128i /* __v8hu */ mults_a = _mm_load_si128((const __m128i *)mults[0]); - const __m128i /* __v8hu */ mults_b = _mm_load_si128((const __m128i *)mults[1]); - const __m128i /* __v8hu */ mults_c = _mm_load_si128((const __m128i *)mults[2]); - const __m128i /* __v8hu */ mults_d = _mm_load_si128((const __m128i *)mults[3]); - const __m128i zeroes = _mm_setzero_si128(); - - /* s1 counters: 32-bit, sum of bytes */ - __m128i /* __v4su */ v_s1 = zeroes; - - /* s2 counters: 32-bit, sum of s1 values */ - __m128i /* __v4su */ v_s2 = zeroes; - - /* - * Thirty-two 16-bit counters for byte sums. Each accumulates the bytes - * that eventually need to be multiplied by a number 32...1 for addition - * into s2. - */ - __m128i /* __v8hu */ v_byte_sums_a = zeroes; - __m128i /* __v8hu */ v_byte_sums_b = zeroes; - __m128i /* __v8hu */ v_byte_sums_c = zeroes; - __m128i /* __v8hu */ v_byte_sums_d = zeroes; - - do { - /* Load the next 32 bytes. */ - const __m128i bytes1 = *p++; - const __m128i bytes2 = *p++; - - /* - * Accumulate the previous s1 counters into the s2 counters. - * Logically, this really should be v_s2 += v_s1 * 32, but we - * can do the multiplication (or left shift) later. - */ - v_s2 = _mm_add_epi32(v_s2, v_s1); - - /* - * s1 update: use "Packed Sum of Absolute Differences" to add - * the bytes horizontally with 8 bytes per sum. Then add the - * sums to the s1 counters. - */ - v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zeroes)); - v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zeroes)); - - /* - * Also accumulate the bytes into 32 separate counters that have - * 16-bit precision. - */ - v_byte_sums_a = _mm_add_epi16( - v_byte_sums_a, _mm_unpacklo_epi8(bytes1, zeroes)); - v_byte_sums_b = _mm_add_epi16( - v_byte_sums_b, _mm_unpackhi_epi8(bytes1, zeroes)); - v_byte_sums_c = _mm_add_epi16( - v_byte_sums_c, _mm_unpacklo_epi8(bytes2, zeroes)); - v_byte_sums_d = _mm_add_epi16( - v_byte_sums_d, _mm_unpackhi_epi8(bytes2, zeroes)); - - GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b, - v_byte_sums_c, v_byte_sums_d); - } while (p != end); - - /* Finish calculating the s2 counters. */ - v_s2 = _mm_slli_epi32(v_s2, 5); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_a, mults_a)); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_b, mults_b)); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_c, mults_c)); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_d, mults_d)); - - /* Add the counters to the real s1 and s2. 
*/ - ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, 1); -} -# include "../adler32_vec_template.h" - -# define adler32_avx2 adler32_avx2 -# define FUNCNAME adler32_avx2 -# define FUNCNAME_CHUNK adler32_avx2_chunk -# define IMPL_ALIGNMENT 32 -# define IMPL_SEGMENT_LEN 64 -# define IMPL_MAX_CHUNK_LEN (64 * (0x7FFF / 0xFF)) -# define ATTRIBUTES _target_attribute("avx2") -static forceinline ATTRIBUTES void -adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) -{ - /* - * Note, the multipliers have to be in this order because - * _mm256_unpack{lo,hi}_epi8 work on each 128-bit lane separately. - */ - static const u16 _aligned_attribute(32) mults[4][16] = { - { 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 }, - { 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 }, - { 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 }, - { 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 }, - }; - const __m256i /* __v16hu */ mults_a = _mm256_load_si256((const __m256i *)mults[0]); - const __m256i /* __v16hu */ mults_b = _mm256_load_si256((const __m256i *)mults[1]); - const __m256i /* __v16hu */ mults_c = _mm256_load_si256((const __m256i *)mults[2]); - const __m256i /* __v16hu */ mults_d = _mm256_load_si256((const __m256i *)mults[3]); - const __m256i zeroes = _mm256_setzero_si256(); - __m256i /* __v8su */ v_s1 = zeroes; - __m256i /* __v8su */ v_s2 = zeroes; - __m256i /* __v16hu */ v_byte_sums_a = zeroes; - __m256i /* __v16hu */ v_byte_sums_b = zeroes; - __m256i /* __v16hu */ v_byte_sums_c = zeroes; - __m256i /* __v16hu */ v_byte_sums_d = zeroes; - - do { - const __m256i bytes1 = *p++; - const __m256i bytes2 = *p++; - - v_s2 = _mm256_add_epi32(v_s2, v_s1); - v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes1, zeroes)); - v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes2, zeroes)); - v_byte_sums_a = _mm256_add_epi16( - v_byte_sums_a, _mm256_unpacklo_epi8(bytes1, zeroes)); - v_byte_sums_b = _mm256_add_epi16( - v_byte_sums_b, _mm256_unpackhi_epi8(bytes1, zeroes)); - v_byte_sums_c = _mm256_add_epi16( - v_byte_sums_c, _mm256_unpacklo_epi8(bytes2, zeroes)); - v_byte_sums_d = _mm256_add_epi16( - v_byte_sums_d, _mm256_unpackhi_epi8(bytes2, zeroes)); - - GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b, - v_byte_sums_c, v_byte_sums_d); - } while (p != end); - - v_s2 = _mm256_slli_epi32(v_s2, 6); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_a, mults_a)); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_b, mults_b)); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_c, mults_c)); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_d, mults_d)); - ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, 1); -} -# include "../adler32_vec_template.h" +# define adler32_x86_sse2 adler32_x86_sse2 +# define SUFFIX _x86_sse2 +# define ATTRIBUTES _target_attribute("sse2") +# define VL 16 +# define USE_VNNI 0 +# define USE_MASKING 0 +# include "adler32_template.h" + +# define adler32_x86_avx2 adler32_x86_avx2 +# define SUFFIX _x86_avx2 +# define ATTRIBUTES _target_attribute("avx2") +# define VL 32 +# define USE_VNNI 0 +# define USE_MASKING 0 +# include "adler32_template.h" #endif /* - * AVX2/AVX-VNNI implementation. This is similar to the AVX512BW/AVX512VNNI - * implementation, but instead of using AVX-512 it uses AVX2 plus AVX-VNNI. - * AVX-VNNI adds dot product instructions to CPUs without AVX-512. + * AVX-VNNI implementation. 
This is used on CPUs that have AVX2 and AVX-VNNI + * but don't have AVX-512, for example Intel Alder Lake. */ #if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930) -# define adler32_avx2_vnni adler32_avx2_vnni -# define FUNCNAME adler32_avx2_vnni -# define FUNCNAME_CHUNK adler32_avx2_vnni_chunk -# define IMPL_ALIGNMENT 32 -# define IMPL_SEGMENT_LEN 128 -# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN +# define adler32_x86_avx2_vnni adler32_x86_avx2_vnni +# define SUFFIX _x86_avx2_vnni # define ATTRIBUTES _target_attribute("avx2,avxvnni") -static forceinline ATTRIBUTES void -adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end, - u32 *s1, u32 *s2) -{ - static const u8 _aligned_attribute(32) mults[2][32] = { - { 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, }, - { 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, }, - }; - const __m256i /* __v32qu */ mults_a = _mm256_load_si256((const __m256i *)mults[0]); - const __m256i /* __v32qu */ mults_b = _mm256_load_si256((const __m256i *)mults[1]); - const __m256i zeroes = _mm256_setzero_si256(); - __m256i /* __v8su */ v_s1_a = zeroes; - __m256i /* __v8su */ v_s1_b = zeroes; - __m256i /* __v8su */ v_s1_sums_a = zeroes; - __m256i /* __v8su */ v_s1_sums_b = zeroes; - __m256i /* __v8su */ v_s2_a = zeroes; - __m256i /* __v8su */ v_s2_b = zeroes; - __m256i /* __v8su */ v_s2_c = zeroes; - __m256i /* __v8su */ v_s2_d = zeroes; - - do { - const __m256i bytes_a = *p++; - const __m256i bytes_b = *p++; - const __m256i bytes_c = *p++; - const __m256i bytes_d = *p++; - - v_s2_a = _mm256_dpbusd_avx_epi32(v_s2_a, bytes_a, mults_a); - v_s2_b = _mm256_dpbusd_avx_epi32(v_s2_b, bytes_b, mults_b); - v_s2_c = _mm256_dpbusd_avx_epi32(v_s2_c, bytes_c, mults_a); - v_s2_d = _mm256_dpbusd_avx_epi32(v_s2_d, bytes_d, mults_b); - v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, v_s1_a); - v_s1_sums_b = _mm256_add_epi32(v_s1_sums_b, v_s1_b); - v_s1_a = _mm256_add_epi32(v_s1_a, - _mm256_add_epi32(_mm256_sad_epu8(bytes_a, zeroes), - _mm256_sad_epu8(bytes_b, zeroes))); - v_s1_b = _mm256_add_epi32(v_s1_b, - _mm256_add_epi32(_mm256_sad_epu8(bytes_c, zeroes), - _mm256_sad_epu8(bytes_d, zeroes))); -#if GCC_PREREQ(1, 0) - __asm__("" : "+x" (v_s1_a), "+x" (v_s1_b), "+x" (v_s1_sums_a), - "+x" (v_s1_sums_b), "+x" (v_s2_a), "+x" (v_s2_b), - "+x" (v_s2_c), "+x" (v_s2_d)); +# define VL 32 +# define USE_VNNI 1 +# define USE_MASKING 0 +# include "adler32_template.h" #endif - } while (p != end); - v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, v_s1_sums_b); - v_s1_sums_a = _mm256_slli_epi32(v_s1_sums_a, 7); - v_s1_sums_a = _mm256_add_epi32(v_s1_sums_a, _mm256_slli_epi32(v_s1_a, 6)); - v_s1_a = _mm256_add_epi32(v_s1_a, v_s1_b); - v_s2_a = _mm256_add_epi32(v_s2_a, v_s2_b); - v_s2_c = _mm256_add_epi32(v_s2_c, v_s2_d); - v_s2_a = _mm256_add_epi32(v_s2_a, v_s2_c); - v_s2_a = _mm256_add_epi32(v_s2_a, v_s1_sums_a); - ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a, 1); -} -# include "../adler32_vec_template.h" -#endif +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) +/* + * AVX512VNNI implementation using 256-bit vectors. This is very similar to the + * AVX-VNNI implementation but takes advantage of masking and more registers. + * This is used on CPUs that support AVX-512 but where using 512-bit vectors + * causes downclocking. 
This should also be the optimal implementation for CPUs + * that support AVX10/256 but not AVX10/512. + */ +# define adler32_x86_avx512_vl256_vnni adler32_x86_avx512_vl256_vnni +# define SUFFIX _x86_avx512_vl256_vnni +# define ATTRIBUTES _target_attribute("avx512bw,avx512vl,avx512vnni") +# define VL 32 +# define USE_VNNI 1 +# define USE_MASKING 1 +# include "adler32_template.h" /* - * AVX512BW/AVX512VNNI implementation. Uses the vpdpbusd (dot product) - * instruction from AVX512VNNI. + * AVX512VNNI implementation using 512-bit vectors. This is used on CPUs that + * have a good AVX-512 implementation including AVX512VNNI. This should also be + * the optimal implementation for CPUs that support AVX10/512. */ -#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) -# define adler32_avx512_vnni adler32_avx512_vnni -# define FUNCNAME adler32_avx512_vnni -# define FUNCNAME_CHUNK adler32_avx512_vnni_chunk -# define IMPL_ALIGNMENT 64 -# define IMPL_SEGMENT_LEN 128 -# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN +# define adler32_x86_avx512_vl512_vnni adler32_x86_avx512_vl512_vnni +# define SUFFIX _x86_avx512_vl512_vnni # define ATTRIBUTES _target_attribute("avx512bw,avx512vnni") -static forceinline ATTRIBUTES void -adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end, - u32 *s1, u32 *s2) -{ - static const u8 _aligned_attribute(64) mults[64] = { - 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - }; - const __m512i /* __v64qu */ mults_a = _mm512_load_si512((const __m512i *)mults); - const __m512i /* __v64qu */ ones = _mm512_set1_epi8(1); - const __m512i zeroes = _mm512_setzero_si512(); - __m512i /* __v16su */ v_s1_a = zeroes; - __m512i /* __v16su */ v_s1_b = zeroes; - __m512i /* __v16su */ v_s1_sums_a = zeroes; - __m512i /* __v16su */ v_s1_sums_b = zeroes; - __m512i /* __v16su */ v_s2_a = zeroes; - __m512i /* __v16su */ v_s2_b = zeroes; - - do { - const __m512i bytes_a = *p++; - const __m512i bytes_b = *p++; - - v_s2_a = _mm512_dpbusd_epi32(v_s2_a, bytes_a, mults_a); - v_s2_b = _mm512_dpbusd_epi32(v_s2_b, bytes_b, mults_a); - v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_a); - v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1_b); - /* - * vpdpbusd with 1's seems to be faster than vpsadbw + vpaddd - * here, provided that the accumulators are independent. 
- */ - v_s1_a = _mm512_dpbusd_epi32(v_s1_a, bytes_a, ones); - v_s1_b = _mm512_dpbusd_epi32(v_s1_b, bytes_b, ones); - GCC_UPDATE_VARS(v_s1_a, v_s1_b, v_s1_sums_a, v_s1_sums_b, - v_s2_a, v_s2_b); - } while (p != end); - - v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_sums_b); - v_s1_sums_a = _mm512_slli_epi32(v_s1_sums_a, 7); - v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, _mm512_slli_epi32(v_s1_a, 6)); - v_s1_a = _mm512_add_epi32(v_s1_a, v_s1_b); - v_s2_a = _mm512_add_epi32(v_s2_a, v_s2_b); - v_s2_a = _mm512_add_epi32(v_s2_a, v_s1_sums_a); - ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a, 0); -} -# include "../adler32_vec_template.h" +# define VL 64 +# define USE_VNNI 1 +# define USE_MASKING 1 +# include "adler32_template.h" #endif static inline adler32_func_t @@ -402,22 +98,27 @@ arch_select_adler32_func(void) { const u32 features MAYBE_UNUSED = get_x86_cpu_features(); -#ifdef adler32_avx512_vnni +#ifdef adler32_x86_avx512_vl512_vnni if ((features & X86_CPU_FEATURE_ZMM) && HAVE_AVX512BW(features) && HAVE_AVX512VNNI(features)) - return adler32_avx512_vnni; + return adler32_x86_avx512_vl512_vnni; +#endif +#ifdef adler32_x86_avx512_vl256_vnni + if (HAVE_AVX512BW(features) && HAVE_AVX512VL(features) && + HAVE_AVX512VNNI(features)) + return adler32_x86_avx512_vl256_vnni; #endif -#ifdef adler32_avx2_vnni +#ifdef adler32_x86_avx2_vnni if (HAVE_AVX2(features) && HAVE_AVXVNNI(features)) - return adler32_avx2_vnni; + return adler32_x86_avx2_vnni; #endif -#ifdef adler32_avx2 +#ifdef adler32_x86_avx2 if (HAVE_AVX2(features)) - return adler32_avx2; + return adler32_x86_avx2; #endif -#ifdef adler32_sse2 +#ifdef adler32_x86_sse2 if (HAVE_SSE2(features)) - return adler32_sse2; + return adler32_x86_sse2; #endif return NULL; } diff --git a/lib/x86/adler32_template.h b/lib/x86/adler32_template.h new file mode 100644 index 00000000..9a3146b8 --- /dev/null +++ b/lib/x86/adler32_template.h @@ -0,0 +1,515 @@ +/* + * x86/adler32_template.h - template for vectorized Adler-32 implementations + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This file is a "template" for instantiating Adler-32 functions for x86. + * The "parameters" are: + * + * SUFFIX: + * Name suffix to append to all instantiated functions. + * ATTRIBUTES: + * Target function attributes to use. 
Must satisfy the dependencies of the
+ * other parameters as follows:
+ * VL=16 && USE_VNNI=0 && USE_MASKING=0: at least sse2
+ * VL=32 && USE_VNNI=0 && USE_MASKING=0: at least avx2
+ * VL=32 && USE_VNNI=1 && USE_MASKING=0: at least avx2,avxvnni
+ * VL=32 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vl,avx512vnni
+ * VL=64 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vnni
+ * (Other combinations are not useful and have not been tested.)
+ * VL:
+ * Vector length in bytes. Must be 16, 32, or 64.
+ * USE_VNNI:
+ * If 1, use the VNNI dot product based algorithm.
+ * If 0, use the legacy SSE2 and AVX2 compatible algorithm.
+ * USE_MASKING:
+ * If 1, use AVX-512 features such as masking.
+ * If 0, assume that the CPU might not support AVX-512.
+ */
+
+#if VL == 16
+# define vec_t __m128i
+# define mask_t u16
+# define LOG2_VL 4
+# define VADD8(a, b) _mm_add_epi8((a), (b))
+# define VADD16(a, b) _mm_add_epi16((a), (b))
+# define VADD32(a, b) _mm_add_epi32((a), (b))
+# if USE_MASKING
+# define VDPBUSD(a, b, c) _mm_dpbusd_epi32((a), (b), (c))
+# else
+# define VDPBUSD(a, b, c) _mm_dpbusd_avx_epi32((a), (b), (c))
+# endif
+# define VLOAD(p) _mm_load_si128((const void *)(p))
+# define VLOADU(p) _mm_loadu_si128((const void *)(p))
+# define VMADD16(a, b) _mm_madd_epi16((a), (b))
+# define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p))
+# define VMULLO32(a, b) _mm_mullo_epi32((a), (b))
+# define VSAD8(a, b) _mm_sad_epu8((a), (b))
+# define VSET1_32(a) _mm_set1_epi32(a)
+# define VSET1_8(a) _mm_set1_epi8(a)
+# define VSETZERO() _mm_setzero_si128()
+# define VSLL32(a, b) _mm_slli_epi32((a), (b))
+# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b))
+# define VUNPACKLO8(a, b) _mm_unpacklo_epi8((a), (b))
+#elif VL == 32
+# define vec_t __m256i
+# define mask_t u32
+# define LOG2_VL 5
+# define VADD8(a, b) _mm256_add_epi8((a), (b))
+# define VADD16(a, b) _mm256_add_epi16((a), (b))
+# define VADD32(a, b) _mm256_add_epi32((a), (b))
+# if USE_MASKING
+# define VDPBUSD(a, b, c) _mm256_dpbusd_epi32((a), (b), (c))
+# else
+# define VDPBUSD(a, b, c) _mm256_dpbusd_avx_epi32((a), (b), (c))
+# endif
+# define VLOAD(p) _mm256_load_si256((const void *)(p))
+# define VLOADU(p) _mm256_loadu_si256((const void *)(p))
+# define VMADD16(a, b) _mm256_madd_epi16((a), (b))
+# define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p))
+# define VMULLO32(a, b) _mm256_mullo_epi32((a), (b))
+# define VSAD8(a, b) _mm256_sad_epu8((a), (b))
+# define VSET1_32(a) _mm256_set1_epi32(a)
+# define VSET1_8(a) _mm256_set1_epi8(a)
+# define VSETZERO() _mm256_setzero_si256()
+# define VSLL32(a, b) _mm256_slli_epi32((a), (b))
+# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b))
+# define VUNPACKLO8(a, b) _mm256_unpacklo_epi8((a), (b))
+#elif VL == 64
+# define vec_t __m512i
+# define mask_t u64
+# define LOG2_VL 6
+# define VADD8(a, b) _mm512_add_epi8((a), (b))
+# define VADD32(a, b) _mm512_add_epi32((a), (b))
+# define VDPBUSD(a, b, c) _mm512_dpbusd_epi32((a), (b), (c))
+# define VLOAD(p) _mm512_load_si512((const void *)(p))
+# define VLOADU(p) _mm512_loadu_si512((const void *)(p))
+# define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p))
+# define VMULLO32(a, b) _mm512_mullo_epi32((a), (b))
+# define VSET1_32(a) _mm512_set1_epi32(a)
+# define VSET1_8(a) _mm512_set1_epi8(a)
+# define VSETZERO() _mm512_setzero_si512()
+# define VSLL32(a, b) _mm512_slli_epi32((a), (b))
+#else
+# error "unsupported vector length"
+#endif
+
+#define VADD32_3X(a, b, c) VADD32(VADD32((a), (b)), (c))
+#define
VADD32_4X(a, b, c, d) VADD32(VADD32((a), (b)), VADD32((c), (d))) +#define VADD32_5X(a, b, c, d, e) VADD32((a), VADD32_4X((b), (c), (d), (e))) +#define VADD32_7X(a, b, c, d, e, f, g) \ + VADD32(VADD32_3X((a), (b), (c)), VADD32_4X((d), (e), (f), (g))) + +/* Sum the 32-bit elements of v_s1 and add them to s1, and likewise for s2. */ +#undef reduce_to_32bits +static forceinline ATTRIBUTES void +ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p) +{ + __m128i v_s1_128, v_s2_128; +#if VL == 16 + { + v_s1_128 = v_s1; + v_s2_128 = v_s2; + } +#else + { + __m256i v_s1_256, v_s2_256; +# if VL == 32 + v_s1_256 = v_s1; + v_s2_256 = v_s2; +# else + /* Reduce 512 bits to 256 bits. */ + v_s1_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s1, 0), + _mm512_extracti64x4_epi64(v_s1, 1)); + v_s2_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s2, 0), + _mm512_extracti64x4_epi64(v_s2, 1)); +# endif + /* Reduce 256 bits to 128 bits. */ + v_s1_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s1_256, 0), + _mm256_extracti128_si256(v_s1_256, 1)); + v_s2_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s2_256, 0), + _mm256_extracti128_si256(v_s2_256, 1)); + } +#endif + + /* + * Reduce 128 bits to 32 bits. + * + * If the bytes were summed into v_s1 using psadbw + paddd, then ignore + * the odd-indexed elements of v_s1_128 since they are zero. + */ +#if USE_VNNI + v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x31)); +#endif + v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x31)); + v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x02)); + v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x02)); + + *s1_p += (u32)_mm_cvtsi128_si32(v_s1_128); + *s2_p += (u32)_mm_cvtsi128_si32(v_s2_128); +} +#define reduce_to_32bits ADD_SUFFIX(reduce_to_32bits) + +static u32 ATTRIBUTES +ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) +{ +#if USE_VNNI + /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */ + static const u8 _aligned_attribute(VL) raw_mults[VL] = { + #if VL == 64 + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, + #endif + #if VL >= 32 + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + #endif + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const vec_t ones = VSET1_8(1); +#else + /* + * This contains the 16-bit values [2*VL, 2*VL - 1, 2*VL - 2, ..., 1]. + * For VL==32 the ordering is weird because it has to match the way that + * vpunpcklbw and vpunpckhbw work on 128-bit lanes separately. + */ + static const u16 _aligned_attribute(VL) raw_mults[4][VL / 2] = { + #if VL == 16 + { 32, 31, 30, 29, 28, 27, 26, 25 }, + { 24, 23, 22, 21, 20, 19, 18, 17 }, + { 16, 15, 14, 13, 12, 11, 10, 9 }, + { 8, 7, 6, 5, 4, 3, 2, 1 }, + #elif VL == 32 + { 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 }, + { 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 }, + { 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 }, + { 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 }, + #else + # error "unsupported parameters" + #endif + }; + const vec_t mults_a = VLOAD(raw_mults[0]); + const vec_t mults_b = VLOAD(raw_mults[1]); + const vec_t mults_c = VLOAD(raw_mults[2]); + const vec_t mults_d = VLOAD(raw_mults[3]); +#endif + const vec_t zeroes = VSETZERO(); + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + + /* + * If the length is large and the pointer is misaligned, align it. 
+ * For smaller lengths, just take the unaligned load penalty. + */ + if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) { + do { + s1 += *p++; + s2 += s1; + len--; + } while ((uintptr_t)p & (VL-1)); + s1 %= DIVISOR; + s2 %= DIVISOR; + } + +#if USE_VNNI + /* + * This is Adler-32 using the vpdpbusd instruction from AVX512-VNNI or + * AVX-VNNI. vpdpbusd multiplies the unsigned bytes of one vector by + * the signed bytes of another vector and adds the sums in groups of 4 + * to the 32-bit elements of a third vector. We use it in two ways: + * multiplying the data bytes by a sequence like 64,63,62,...,1 for + * calculating part of s2, and multiplying the data bytes by an all-ones + * sequence 1,1,1,...,1 for calculating s1 and part of s2. The all-ones + * trick seems to be faster than the alternative of vpsadbw + vpaddd. + */ + while (len) { + /* + * Calculate the length of the next data chunk such that s1 and + * s2 are guaranteed to not exceed UINT32_MAX. + */ + size_t n = MIN(len, MAX_CHUNK_LEN & ~(4*VL - 1)); + vec_t mults = VLOAD(raw_mults); + vec_t v_s1 = zeroes; + vec_t v_s2 = zeroes; + + s2 += s1 * n; + len -= n; + + if (n >= 4 * VL) { + vec_t v_s1_b = zeroes; + vec_t v_s1_c = zeroes; + vec_t v_s1_d = zeroes; + vec_t v_s2_b = zeroes; + vec_t v_s2_c = zeroes; + vec_t v_s2_d = zeroes; + vec_t v_s1_sums = zeroes; + vec_t v_s1_sums_b = zeroes; + vec_t v_s1_sums_c = zeroes; + vec_t v_s1_sums_d = zeroes; + vec_t tmp0, tmp1; + + do { + vec_t data_a = VLOADU(p + 0*VL); + vec_t data_b = VLOADU(p + 1*VL); + vec_t data_c = VLOADU(p + 2*VL); + vec_t data_d = VLOADU(p + 3*VL); + + /* + * Workaround for gcc bug where it generates + * unnecessary move instructions + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892) + */ + #if GCC_PREREQ(1, 0) + __asm__("" : "+v" (data_a), "+v" (data_b), + "+v" (data_c), "+v" (data_d)); + #endif + + v_s2 = VDPBUSD(v_s2, data_a, mults); + v_s2_b = VDPBUSD(v_s2_b, data_b, mults); + v_s2_c = VDPBUSD(v_s2_c, data_c, mults); + v_s2_d = VDPBUSD(v_s2_d, data_d, mults); + + v_s1_sums = VADD32(v_s1_sums, v_s1); + v_s1_sums_b = VADD32(v_s1_sums_b, v_s1_b); + v_s1_sums_c = VADD32(v_s1_sums_c, v_s1_c); + v_s1_sums_d = VADD32(v_s1_sums_d, v_s1_d); + + v_s1 = VDPBUSD(v_s1, data_a, ones); + v_s1_b = VDPBUSD(v_s1_b, data_b, ones); + v_s1_c = VDPBUSD(v_s1_c, data_c, ones); + v_s1_d = VDPBUSD(v_s1_d, data_d, ones); + + /* Same gcc bug workaround. See above */ + #if GCC_PREREQ(1, 0) && !defined(ARCH_X86_32) + __asm__("" : "+v" (v_s2), "+v" (v_s2_b), + "+v" (v_s2_c), "+v" (v_s2_d), + "+v" (v_s1_sums), + "+v" (v_s1_sums_b), + "+v" (v_s1_sums_c), + "+v" (v_s1_sums_d), + "+v" (v_s1), "+v" (v_s1_b), + "+v" (v_s1_c), "+v" (v_s1_d)); + #endif + p += 4*VL; + n -= 4*VL; + } while (n >= 4*VL); + + /* + * Reduce into v_s1 and v_s2 as follows: + * + * v_s2 = v_s2 + v_s2_b + v_s2_c + v_s2_d + + * (4*VL)*(v_s1_sums + v_s1_sums_b + + * v_s1_sums_c + v_s1_sums_d) + + * (3*VL)*v_s1 + (2*VL)*v_s1_b + VL*v_s1_c + * v_s1 = v_s1 + v_s1_b + v_s1_c + v_s1_d + */ + tmp0 = VADD32(v_s1, v_s1_b); + tmp1 = VADD32(v_s1, v_s1_c); + v_s1_sums = VADD32_4X(v_s1_sums, v_s1_sums_b, + v_s1_sums_c, v_s1_sums_d); + v_s1 = VADD32_3X(tmp0, v_s1_c, v_s1_d); + v_s2 = VADD32_7X(VSLL32(v_s1_sums, LOG2_VL + 2), + VSLL32(tmp0, LOG2_VL + 1), + VSLL32(tmp1, LOG2_VL), + v_s2, v_s2_b, v_s2_c, v_s2_d); + } + + /* Process the last 0 <= n < 4*VL bytes of the chunk. 
*/
+ if (n >= 2*VL) {
+ const vec_t data_a = VLOADU(p + 0*VL);
+ const vec_t data_b = VLOADU(p + 1*VL);
+
+ v_s2 = VADD32(v_s2, VSLL32(v_s1, LOG2_VL + 1));
+ v_s1 = VDPBUSD(v_s1, data_a, ones);
+ v_s1 = VDPBUSD(v_s1, data_b, ones);
+ v_s2 = VDPBUSD(v_s2, data_a, VSET1_8(VL));
+ v_s2 = VDPBUSD(v_s2, data_a, mults);
+ v_s2 = VDPBUSD(v_s2, data_b, mults);
+ p += 2*VL;
+ n -= 2*VL;
+ }
+ if (n) {
+ /* Process the last 0 < n < 2*VL bytes of the chunk. */
+ vec_t data;
+
+ v_s2 = VADD32(v_s2, VMULLO32(v_s1, VSET1_32(n)));
+
+ mults = VADD8(mults, VSET1_8((int)n - VL));
+ if (n > VL) {
+ data = VLOADU(p);
+ v_s1 = VDPBUSD(v_s1, data, ones);
+ v_s2 = VDPBUSD(v_s2, data, mults);
+ p += VL;
+ n -= VL;
+ mults = VADD8(mults, VSET1_8(-VL));
+ }
+ /*
+ * Process the last 0 < n <= VL bytes of the chunk.
+ * Utilize a masked load if it's available.
+ */
+ #if USE_MASKING
+ data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p);
+ #else
+ data = zeroes;
+ memcpy(&data, p, n);
+ #endif
+ v_s1 = VDPBUSD(v_s1, data, ones);
+ v_s2 = VDPBUSD(v_s2, data, mults);
+ p += n;
+ }
+
+ reduce_to_32bits(v_s1, v_s2, &s1, &s2);
+ s1 %= DIVISOR;
+ s2 %= DIVISOR;
+ }
+#else /* USE_VNNI */
+ /*
+ * This is Adler-32 for SSE2 and AVX2.
+ *
+ * To horizontally sum bytes, use psadbw + paddd, where one of the
+ * arguments to psadbw is all-zeroes.
+ *
+ * For the s2 contribution from (2*VL - i)*data[i] for each of the 2*VL
+ * bytes of each iteration of the inner loop, use punpck{l,h}bw + paddw
+ * to sum, for each i across iterations, byte i into a corresponding
+ * 16-bit counter in v_byte_sums_*. After the inner loop, use pmaddwd to
+ * multiply each counter i by (2*VL - i), then add the products to s2.
+ *
+ * An alternative implementation would use pmaddubsw and pmaddwd in the
+ * inner loop to do (2*VL-i)*data[i] directly and add the products in
+ * groups of 4 to 32-bit counters. However, on average that approach
+ * seems to be slower than the current approach which delays the
+ * multiplications. Also, pmaddubsw requires SSSE3; the current
+ * approach keeps the implementation aligned between SSE2 and AVX2.
+ *
+ * The inner loop processes 2*VL bytes per iteration. Increasing this
+ * to 4*VL doesn't seem to be helpful here.
+ */
+ while (len) {
+ /*
+ * Calculate the length of the next data chunk such that s1 and
+ * s2 are guaranteed to not exceed UINT32_MAX, and every
+ * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX.
+ * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are
+ * used with pmaddwd which does signed multiplication. In the
+ * SSE2 case this limits chunks to 4096 bytes instead of 5536.
+ */ + size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX), + MAX_CHUNK_LEN) & ~(2*VL - 1)); + len -= n; + + if (n >= 2*VL) { + vec_t v_s1 = zeroes; + vec_t v_s1_sums = zeroes; + vec_t v_byte_sums_a = zeroes; + vec_t v_byte_sums_b = zeroes; + vec_t v_byte_sums_c = zeroes; + vec_t v_byte_sums_d = zeroes; + vec_t v_s2; + + s2 += s1 * (n & ~(2*VL - 1)); + + do { + vec_t data_a = VLOADU(p + 0*VL); + vec_t data_b = VLOADU(p + 1*VL); + + v_s1_sums = VADD32(v_s1_sums, v_s1); + v_byte_sums_a = VADD16(v_byte_sums_a, + VUNPACKLO8(data_a, zeroes)); + v_byte_sums_b = VADD16(v_byte_sums_b, + VUNPACKHI8(data_a, zeroes)); + v_byte_sums_c = VADD16(v_byte_sums_c, + VUNPACKLO8(data_b, zeroes)); + v_byte_sums_d = VADD16(v_byte_sums_d, + VUNPACKHI8(data_b, zeroes)); + v_s1 = VADD32(v_s1, + VADD32(VSAD8(data_a, zeroes), + VSAD8(data_b, zeroes))); + /* + * Workaround for gcc bug where it generates + * unnecessary move instructions + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892) + */ + #if GCC_PREREQ(1, 0) + __asm__("" : "+x" (v_s1), "+x" (v_s1_sums), + "+x" (v_byte_sums_a), + "+x" (v_byte_sums_b), + "+x" (v_byte_sums_c), + "+x" (v_byte_sums_d)); + #endif + p += 2*VL; + n -= 2*VL; + } while (n >= 2*VL); + + /* + * Calculate v_s2 as (2*VL)*v_s1_sums + + * [2*VL, 2*VL - 1, 2*VL - 2, ..., 1] * v_byte_sums. + * Then update s1 and s2 from v_s1 and v_s2. + */ + v_s2 = VADD32_5X(VSLL32(v_s1_sums, LOG2_VL + 1), + VMADD16(v_byte_sums_a, mults_a), + VMADD16(v_byte_sums_b, mults_b), + VMADD16(v_byte_sums_c, mults_c), + VMADD16(v_byte_sums_d, mults_d)); + reduce_to_32bits(v_s1, v_s2, &s1, &s2); + } + /* + * Process the last 0 <= n < 2*VL bytes of the chunk using + * scalar instructions, then reduce s1 and s2 mod DIVISOR. + */ + adler32_generic_noreduce(&s1, &s2, p, n); + p += n; + s1 %= DIVISOR; + s2 %= DIVISOR; + } +#endif /* !USE_VNNI */ + return (s2 << 16) | s1; +} + +#undef vec_t +#undef mask_t +#undef LOG2_VL +#undef VADD8 +#undef VADD16 +#undef VADD32 +#undef VDPBUSD +#undef VLOAD +#undef VLOADU +#undef VMADD16 +#undef VMASKZ_LOADU +#undef VMULLO32 +#undef VSAD8 +#undef VSET1_8 +#undef VSET1_32 +#undef VSETZERO +#undef VSLL32 +#undef VUNPACKHI8 +#undef VUNPACKLO8 + +#undef SUFFIX +#undef ATTRIBUTES +#undef VL +#undef USE_VNNI +#undef USE_MASKING diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index 5582fcd4..c6eae775 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -85,6 +85,9 @@ static inline u32 get_x86_cpu_features(void) # if __has_include() # include # endif +# if __has_include() +# include +# endif # if __has_include() # include # endif diff --git a/scripts/checksum_benchmarks.sh b/scripts/checksum_benchmarks.sh index 03707752..12fb405f 100755 --- a/scripts/checksum_benchmarks.sh +++ b/scripts/checksum_benchmarks.sh @@ -198,12 +198,16 @@ echo case $ARCH in i386|x86_64) if have_cpu_features avx512bw avx512_vnni; then - do_benchmark "AVX512BW/AVX512VNNI" + do_benchmark "AVX512VNNI/VL512" + disable_cpu_feature zmm + if have_cpu_features avx512vl; then + do_benchmark "AVX512VNNI/VL256" + fi disable_cpu_feature avx512_vnni "-mno-avx512vnni" disable_cpu_feature avx512bw "-mno-avx512bw" fi if have_cpu_features avx2 avx_vnni; then - do_benchmark "AVX2/AVX-VNNI" + do_benchmark "AVX-VNNI" disable_cpu_feature avx_vnni "-mno-avxvnni" fi if have_cpu_features avx2; then diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index 51a7f4b0..4419bc06 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -142,8 +142,7 @@ build_and_run_tests() if ! 
[[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then case "$ARCH" in i386|x86_64) - features+=(zmm avx_vnni avx512_vnni vpclmulqdq - avx512vl avx512bw avx512f + features+=(zmm avx512_vnni avx512vl avx_vnni vpclmulqdq avx2 avx bmi2 pclmulqdq sse2) ;; arm*|aarch*)