diff --git a/lib/arm/adler32_impl.h b/lib/arm/adler32_impl.h index 99a5f3f9..e411fd3b 100644 --- a/lib/arm/adler32_impl.h +++ b/lib/arm/adler32_impl.h @@ -43,7 +43,7 @@ # endif # endif # include -static u32 ATTRIBUTES MAYBE_UNUSED +static ATTRIBUTES MAYBE_UNUSED u32 adler32_arm_neon(u32 adler, const u8 *p, size_t len) { static const u16 _aligned_attribute(16) mults[64] = { @@ -225,7 +225,7 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len) # endif # endif # include -static u32 ATTRIBUTES +static ATTRIBUTES u32 adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len) { static const u8 _aligned_attribute(16) mults[64] = { diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h index d6ea30c0..d52954a6 100644 --- a/lib/arm/crc32_impl.h +++ b/lib/arm/crc32_impl.h @@ -113,7 +113,7 @@ combine_crcs_slow(u32 crc0, u32 crc1, u32 crc2, u32 crc3) } #define crc32_arm_crc crc32_arm_crc -static u32 ATTRIBUTES MAYBE_UNUSED +static ATTRIBUTES MAYBE_UNUSED u32 crc32_arm_crc(u32 crc, const u8 *p, size_t len) { if (len >= 64) { @@ -289,7 +289,7 @@ combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i) } #define crc32_arm_crc_pmullcombine crc32_arm_crc_pmullcombine -static u32 ATTRIBUTES MAYBE_UNUSED +static ATTRIBUTES MAYBE_UNUSED u32 crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) { const size_t align = -(uintptr_t)p & 7; @@ -470,7 +470,7 @@ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) # define ENABLE_EOR3 0 # include "crc32_pmull_helpers.h" -static u32 ATTRIBUTES MAYBE_UNUSED +static ATTRIBUTES MAYBE_UNUSED u32 crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) { static const u64 _aligned_attribute(16) mults[3][2] = { diff --git a/lib/arm/crc32_pmull_wide.h b/lib/arm/crc32_pmull_wide.h index c2f8af06..5a4bd0ca 100644 --- a/lib/arm/crc32_pmull_wide.h +++ b/lib/arm/crc32_pmull_wide.h @@ -52,7 +52,7 @@ #include "crc32_pmull_helpers.h" -static u32 ATTRIBUTES MAYBE_UNUSED +static ATTRIBUTES MAYBE_UNUSED u32 ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) { uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11; diff --git a/lib/decompress_template.h b/lib/decompress_template.h index 3c1da677..8c874c36 100644 --- a/lib/decompress_template.h +++ b/lib/decompress_template.h @@ -41,7 +41,7 @@ # define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count))) #endif -static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED +static ATTRIBUTES MAYBE_UNUSED enum libdeflate_result FUNCNAME(struct libdeflate_decompressor * restrict d, const void * restrict in, size_t in_nbytes, void * restrict out, size_t out_nbytes_avail, diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index a32af413..ba559e6e 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -33,19 +33,19 @@ /* SSE2 and AVX2 implementations. Used on older CPUs. 
*/ #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define adler32_x86_sse2 adler32_x86_sse2 -# define SUFFIX _x86_sse2 +# define SUFFIX _sse2 # define ATTRIBUTES _target_attribute("sse2") # define VL 16 # define USE_VNNI 0 -# define USE_MASKING 0 +# define USE_AVX512 0 # include "adler32_template.h" # define adler32_x86_avx2 adler32_x86_avx2 -# define SUFFIX _x86_avx2 +# define SUFFIX _avx2 # define ATTRIBUTES _target_attribute("avx2") # define VL 32 # define USE_VNNI 0 -# define USE_MASKING 0 +# define USE_AVX512 0 # include "adler32_template.h" #endif @@ -55,11 +55,11 @@ */ #if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930) # define adler32_x86_avx2_vnni adler32_x86_avx2_vnni -# define SUFFIX _x86_avx2_vnni +# define SUFFIX _avx2_vnni # define ATTRIBUTES _target_attribute("avx2,avxvnni") # define VL 32 # define USE_VNNI 1 -# define USE_MASKING 0 +# define USE_AVX512 0 # include "adler32_template.h" #endif @@ -72,11 +72,11 @@ * that support AVX10/256 but not AVX10/512. */ # define adler32_x86_avx512_vl256_vnni adler32_x86_avx512_vl256_vnni -# define SUFFIX _x86_avx512_vl256_vnni +# define SUFFIX _avx512_vl256_vnni # define ATTRIBUTES _target_attribute("avx512bw,avx512vl,avx512vnni") # define VL 32 # define USE_VNNI 1 -# define USE_MASKING 1 +# define USE_AVX512 1 # include "adler32_template.h" /* @@ -85,11 +85,11 @@ * the optimal implementation on CPUs that support AVX10/512. */ # define adler32_x86_avx512_vl512_vnni adler32_x86_avx512_vl512_vnni -# define SUFFIX _x86_avx512_vl512_vnni +# define SUFFIX _avx512_vl512_vnni # define ATTRIBUTES _target_attribute("avx512bw,avx512vnni") # define VL 64 # define USE_VNNI 1 -# define USE_MASKING 1 +# define USE_AVX512 1 # include "adler32_template.h" #endif diff --git a/lib/x86/adler32_template.h b/lib/x86/adler32_template.h index c788acc5..1593ee5a 100644 --- a/lib/x86/adler32_template.h +++ b/lib/x86/adler32_template.h @@ -34,20 +34,21 @@ * ATTRIBUTES: * Target function attributes to use. Must satisfy the dependencies of the * other parameters as follows: - * VL=16 && USE_VNNI=0 && USE_MASKING=0: at least sse2 - * VL=32 && USE_VNNI=0 && USE_MASKING=0: at least avx2 - * VL=32 && USE_VNNI=1 && USE_MASKING=0: at least avx2,avxvnni - * VL=32 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vl,avx512vnni - * VL=64 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vnni + * VL=16 && USE_VNNI=0 && USE_AVX512=0: at least sse2 + * VL=32 && USE_VNNI=0 && USE_AVX512=0: at least avx2 + * VL=32 && USE_VNNI=1 && USE_AVX512=0: at least avx2,avxvnni + * VL=32 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vl,avx512vnni + * VL=64 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vnni * (Other combinations are not useful and have not been tested.) * VL: - * Vector length in bytes. Must be 16, 32, and 64. + * Vector length in bytes. Must be 16, 32, or 64. * USE_VNNI: * If 1, use the VNNI dot product based algorithm. * If 0, use the legacy SSE2 and AVX2 compatible algorithm. - * USE_MASKING: - * If 1, use AVX-512 features such as masking. - * If 0, assume that the CPU might not support AVX-512. + * USE_AVX512: + * If 1, take advantage of AVX-512 features such as masking. This doesn't + * enable the use of 512-bit vectors; the vector length is controlled by + * VL. If 0, assume that the CPU might not support AVX-512. 
*/ #if VL == 16 @@ -57,7 +58,7 @@ # define VADD8(a, b) _mm_add_epi8((a), (b)) # define VADD16(a, b) _mm_add_epi16((a), (b)) # define VADD32(a, b) _mm_add_epi32((a), (b)) -# if USE_MASKING +# if USE_AVX512 # define VDPBUSD(a, b, c) _mm_dpbusd_epi32((a), (b), (c)) # else # define VDPBUSD(a, b, c) _mm_dpbusd_avx_epi32((a), (b), (c)) @@ -68,12 +69,12 @@ # define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p)) # define VMULLO32(a, b) _mm_mullo_epi32((a), (b)) # define VSAD8(a, b) _mm_sad_epu8((a), (b)) -# define VSET1_32(a) _mm_set1_epi32(a) # define VSET1_8(a) _mm_set1_epi8(a) +# define VSET1_32(a) _mm_set1_epi32(a) # define VSETZERO() _mm_setzero_si128() # define VSLL32(a, b) _mm_slli_epi32((a), (b)) -# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b)) # define VUNPACKLO8(a, b) _mm_unpacklo_epi8((a), (b)) +# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b)) #elif VL == 32 # define vec_t __m256i # define mask_t u32 @@ -81,7 +82,7 @@ # define VADD8(a, b) _mm256_add_epi8((a), (b)) # define VADD16(a, b) _mm256_add_epi16((a), (b)) # define VADD32(a, b) _mm256_add_epi32((a), (b)) -# if USE_MASKING +# if USE_AVX512 # define VDPBUSD(a, b, c) _mm256_dpbusd_epi32((a), (b), (c)) # else # define VDPBUSD(a, b, c) _mm256_dpbusd_avx_epi32((a), (b), (c)) @@ -92,27 +93,32 @@ # define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p)) # define VMULLO32(a, b) _mm256_mullo_epi32((a), (b)) # define VSAD8(a, b) _mm256_sad_epu8((a), (b)) -# define VSET1_32(a) _mm256_set1_epi32(a) # define VSET1_8(a) _mm256_set1_epi8(a) +# define VSET1_32(a) _mm256_set1_epi32(a) # define VSETZERO() _mm256_setzero_si256() # define VSLL32(a, b) _mm256_slli_epi32((a), (b)) -# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b)) # define VUNPACKLO8(a, b) _mm256_unpacklo_epi8((a), (b)) +# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b)) #elif VL == 64 # define vec_t __m512i # define mask_t u64 # define LOG2_VL 6 # define VADD8(a, b) _mm512_add_epi8((a), (b)) +# define VADD16(a, b) _mm512_add_epi16((a), (b)) # define VADD32(a, b) _mm512_add_epi32((a), (b)) # define VDPBUSD(a, b, c) _mm512_dpbusd_epi32((a), (b), (c)) # define VLOAD(p) _mm512_load_si512((const void *)(p)) # define VLOADU(p) _mm512_loadu_si512((const void *)(p)) +# define VMADD16(a, b) _mm512_madd_epi16((a), (b)) # define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p)) # define VMULLO32(a, b) _mm512_mullo_epi32((a), (b)) -# define VSET1_32(a) _mm512_set1_epi32(a) +# define VSAD8(a, b) _mm512_sad_epu8((a), (b)) # define VSET1_8(a) _mm512_set1_epi8(a) +# define VSET1_32(a) _mm512_set1_epi32(a) # define VSETZERO() _mm512_setzero_si512() # define VSLL32(a, b) _mm512_slli_epi32((a), (b)) +# define VUNPACKLO8(a, b) _mm512_unpacklo_epi8((a), (b)) +# define VUNPACKHI8(a, b) _mm512_unpackhi_epi8((a), (b)) #else # error "unsupported vector length" #endif @@ -173,8 +179,8 @@ ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p) } #define reduce_to_32bits ADD_SUFFIX(reduce_to_32bits) -static u32 ATTRIBUTES -ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) +static ATTRIBUTES u32 +ADD_SUFFIX(adler32_x86)(u32 adler, const u8 *p, size_t len) { #if USE_VNNI /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */ @@ -235,7 +241,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) #if USE_VNNI /* - * This is Adler-32 using the vpdpbusd instruction from AVX512-VNNI or + * This is Adler-32 using the vpdpbusd instruction from AVX512VNNI or * AVX-VNNI. 
vpdpbusd multiplies the unsigned bytes of one vector by * the signed bytes of another vector and adds the sums in groups of 4 * to the 32-bit elements of a third vector. We use it in two ways: @@ -369,7 +375,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) * Process the last 0 < n <= VL bytes of the chunk. * Utilize a masked load if it's available. */ - #if USE_MASKING + #if USE_AVX512 data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p); #else data = zeroes; @@ -414,7 +420,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX. * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are * used with pmaddwd which does signed multiplication. In the - * SSE2 case this limits chunks to 4096 bytes instead of 5504. + * SSE2 case this limits chunks to 4096 bytes instead of 5536. */ size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX), MAX_CHUNK_LEN) & ~(2*VL - 1)); @@ -502,11 +508,11 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) #undef VSET1_32 #undef VSETZERO #undef VSLL32 -#undef VUNPACKHI8 #undef VUNPACKLO8 +#undef VUNPACKHI8 #undef SUFFIX #undef ATTRIBUTES #undef VL #undef USE_VNNI -#undef USE_MASKING +#undef USE_AVX512 diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h index 3d8e254d..8b23b904 100644 --- a/lib/x86/crc32_impl.h +++ b/lib/x86/crc32_impl.h @@ -36,8 +36,8 @@ # define SUFFIX _pclmulqdq # define ATTRIBUTES _target_attribute("pclmul") # define VL 16 -# define FOLD_LESSTHAN16BYTES 0 -# define USE_TERNARYLOGIC 0 +# define USE_SSE4_1 0 +# define USE_AVX512 0 # include "crc32_pclmul_template.h" /* @@ -49,55 +49,62 @@ * non-destructive VEX-encoded instructions. Second, AVX support implies SSSE3 * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient * handling of partial blocks. (We *could* compile a variant with - * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we don't currently bother.) + * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we currently don't bother.) */ # define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx # define SUFFIX _pclmulqdq_avx # define ATTRIBUTES _target_attribute("pclmul,avx") # define VL 16 -# define FOLD_LESSTHAN16BYTES 1 -# define USE_TERNARYLOGIC 0 +# define USE_SSE4_1 1 +# define USE_AVX512 0 # include "crc32_pclmul_template.h" #endif /* - * VPCLMULQDQ/AVX2 implementation. Uses 256-bit vectors. + * VPCLMULQDQ/AVX2 implementation. This is used on CPUs that have AVX2 and + * VPCLMULQDQ but don't have AVX-512, for example Intel Alder Lake. * * Currently this can't be enabled with MSVC because MSVC has a bug where it * incorrectly assumes that VPCLMULQDQ implies AVX-512: - * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest + * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785 */ #if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 # define SUFFIX _vpclmulqdq_avx2 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") # define VL 32 -# define FOLD_LESSTHAN16BYTES 1 -# define USE_TERNARYLOGIC 0 +# define USE_SSE4_1 1 +# define USE_AVX512 0 # include "crc32_pclmul_template.h" #endif #if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) /* - * VPCLMULQDQ/AVX512 implementation with 256-bit vectors. This takes advantage - * of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit. 
- * This can be useful on CPUs where 512-bit vectors cause downclocking. + * VPCLMULQDQ/AVX512 implementation using 256-bit vectors. This is very similar + * to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog + * instruction and more registers. This is used on CPUs that support AVX-512 + * but where using 512-bit vectors causes downclocking. This should also be the + * optimal implementation on CPUs that support AVX10/256 but not AVX10/512. */ # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256 # define SUFFIX _vpclmulqdq_avx512_vl256 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") # define VL 32 -# define FOLD_LESSTHAN16BYTES 1 -# define USE_TERNARYLOGIC 1 +# define USE_SSE4_1 1 +# define USE_AVX512 1 # include "crc32_pclmul_template.h" -/* VPCLMULQDQ/AVX512 implementation with 512-bit vectors */ +/* + * VPCLMULQDQ/AVX512 implementation using 512-bit vectors. This is used on CPUs + * that have a good AVX-512 implementation including VPCLMULQDQ. This should + * also be the optimal implementation on CPUs that support AVX10/512. + */ # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512 # define SUFFIX _vpclmulqdq_avx512_vl512 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") # define VL 64 -# define FOLD_LESSTHAN16BYTES 1 -# define USE_TERNARYLOGIC 1 +# define USE_SSE4_1 1 +# define USE_AVX512 1 # include "crc32_pclmul_template.h" #endif diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h index 4257d449..bb892d82 100644 --- a/lib/x86/crc32_pclmul_template.h +++ b/lib/x86/crc32_pclmul_template.h @@ -34,18 +34,22 @@ * ATTRIBUTES: * Target function attributes to use. Must satisfy the dependencies of the * other parameters as follows: - * VL=16 && FOLD_LESSTHAN16BYTES=0: at least pclmul - * VL=16 && FOLD_LESSTHAN16BYTES=1: at least pclmul,sse4.1 - * VL=32 && USE_TERNARYLOGIC=0: at least vpclmulqdq,pclmul,avx2 - * VL=32 && USE_TERNARYLOGIC=1: at least vpclmulqdq,pclmul,avx512vl - * VL=64: at least vpclmulqdq,pclmul,avx512vl + * VL=16 && USE_SSE4_1=0 && USE_AVX512=0: at least pclmul + * VL=16 && USE_SSE4_1=1 && USE_AVX512=0: at least pclmul,sse4.1 + * VL=32 && USE_SSE4_1=1 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2 + * VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512vl + * VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512vl + * (Other combinations are not useful and have not been tested.) * VL: - * Vector length in bytes. Supported values are 16, 32, and 64. - * FOLD_LESSTHAN16BYTES: - * Use vector instructions to handle any partial blocks at the beginning - * and end, instead of falling back to scalar instructions for those parts. - * USE_TERNARYLOGIC: - * Use the vpternlog instruction to do three-argument XORs. + * Vector length in bytes. Must be 16, 32, or 64. + * USE_SSE4_1: + * If 1, take advantage of SSE4.1 instructions such as pblendvb. + * If 0, assume that the CPU might not support SSE4.1. + * USE_AVX512: + * If 1, take advantage of AVX-512 features such as masking and the + * vpternlog instruction. This doesn't enable the use of 512-bit vectors; + * the vector length is controlled by VL. If 0, assume that the CPU might + * not support AVX-512. * * The overall algorithm used is CRC folding with carryless multiplication * instructions. Note that the x86 crc32 instruction cannot be used, as it is @@ -62,55 +66,10 @@ * or AVX512VL, or four in combination with AVX512F. 
*/ -#undef fold_vec128 -static forceinline ATTRIBUTES __m128i -ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i multipliers) -{ - dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x00)); - dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x11)); - return dst; -} -#define fold_vec128 ADD_SUFFIX(fold_vec128) - -#if VL >= 32 -#undef fold_vec256 -static forceinline ATTRIBUTES __m256i -ADD_SUFFIX(fold_vec256)(__m256i src, __m256i dst, __m256i multipliers) -{ -#if USE_TERNARYLOGIC - return _mm256_ternarylogic_epi32( - _mm256_clmulepi64_epi128(src, multipliers, 0x00), - _mm256_clmulepi64_epi128(src, multipliers, 0x11), - dst, - 0x96); -#else - return _mm256_xor_si256( - _mm256_xor_si256(dst, - _mm256_clmulepi64_epi128(src, multipliers, 0x00)), - _mm256_clmulepi64_epi128(src, multipliers, 0x11)); -#endif -} -#define fold_vec256 ADD_SUFFIX(fold_vec256) -#endif /* VL >= 32 */ - -#if VL >= 64 -#undef fold_vec512 -static forceinline ATTRIBUTES __m512i -ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) -{ - return _mm512_ternarylogic_epi32( - _mm512_clmulepi64_epi128(src, multipliers, 0x00), - _mm512_clmulepi64_epi128(src, multipliers, 0x11), - dst, - 0x96); -} -#define fold_vec512 ADD_SUFFIX(fold_vec512) -#endif /* VL >= 64 */ - #if VL == 16 # define vec_t __m128i # define fold_vec fold_vec128 -# define VLOAD_UNALIGNED(p) _mm_loadu_si128((const void *)(p)) +# define VLOADU(p) _mm_loadu_si128((const void *)(p)) # define VXOR(a, b) _mm_xor_si128((a), (b)) # define M128I_TO_VEC(a) a # define MULTS_8V _mm_set_epi64x(CRC32_X991_MODG, CRC32_X1055_MODG) @@ -120,7 +79,7 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) #elif VL == 32 # define vec_t __m256i # define fold_vec fold_vec256 -# define VLOAD_UNALIGNED(p) _mm256_loadu_si256((const void *)(p)) +# define VLOADU(p) _mm256_loadu_si256((const void *)(p)) # define VXOR(a, b) _mm256_xor_si256((a), (b)) # define M128I_TO_VEC(a) _mm256_castsi128_si256(a) # define MULTS(a, b) _mm256_set_epi64x(a, b, a, b) @@ -131,7 +90,7 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) #elif VL == 64 # define vec_t __m512i # define fold_vec fold_vec512 -# define VLOAD_UNALIGNED(p) _mm512_loadu_si512((const void *)(p)) +# define VLOADU(p) _mm512_loadu_si512((const void *)(p)) # define VXOR(a, b) _mm512_xor_si512((a), (b)) # define M128I_TO_VEC(a) _mm512_castsi128_si512(a) # define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b) @@ -143,7 +102,54 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) # error "unsupported vector length" #endif -#if FOLD_LESSTHAN16BYTES +#undef fold_vec128 +static forceinline ATTRIBUTES __m128i +ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i /* __v2du */ mults) +{ + dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, mults, 0x00)); + dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, mults, 0x11)); + return dst; +} +#define fold_vec128 ADD_SUFFIX(fold_vec128) + +#if VL >= 32 +#undef fold_vec256 +static forceinline ATTRIBUTES __m256i +ADD_SUFFIX(fold_vec256)(__m256i src, __m256i dst, __m256i /* __v4du */ mults) +{ +#if USE_AVX512 + /* vpternlog with immediate 0x96 is a three-argument XOR. 
*/ + return _mm256_ternarylogic_epi32( + _mm256_clmulepi64_epi128(src, mults, 0x00), + _mm256_clmulepi64_epi128(src, mults, 0x11), + dst, + 0x96); +#else + return _mm256_xor_si256( + _mm256_xor_si256(dst, + _mm256_clmulepi64_epi128(src, mults, 0x00)), + _mm256_clmulepi64_epi128(src, mults, 0x11)); +#endif +} +#define fold_vec256 ADD_SUFFIX(fold_vec256) +#endif /* VL >= 32 */ + +#if VL >= 64 +#undef fold_vec512 +static forceinline ATTRIBUTES __m512i +ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults) +{ + /* vpternlog with immediate 0x96 is a three-argument XOR. */ + return _mm512_ternarylogic_epi32( + _mm512_clmulepi64_epi128(src, mults, 0x00), + _mm512_clmulepi64_epi128(src, mults, 0x11), + dst, + 0x96); +} +#define fold_vec512 ADD_SUFFIX(fold_vec512) +#endif /* VL >= 64 */ + +#if USE_SSE4_1 /* * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and @@ -154,7 +160,7 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) #undef fold_lessthan16bytes static forceinline ATTRIBUTES __m128i ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len, - __m128i /* __v2du */ multipliers_128b) + __m128i /* __v2du */ mults_128b) { /* * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes. @@ -184,26 +190,31 @@ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len, /* msb 0/1 of each byte selects byte from arg1/2 */ rshift); - return fold_vec128(x0, x1, multipliers_128b); + return fold_vec128(x0, x1, mults_128b); } #define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes) -#endif /* FOLD_LESSTHAN16BYTES */ +#endif /* USE_SSE4_1 */ -static u32 ATTRIBUTES +static ATTRIBUTES u32 ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) { - const vec_t multipliers_8v = MULTS_8V; /* 8 vecs */ - const vec_t multipliers_4v = MULTS_4V; /* 4 vecs */ - const vec_t multipliers_2v = MULTS_2V; /* 2 vecs */ - const vec_t multipliers_1v = MULTS_1V; /* 1 vecs */ - const __m128i /* __v2du */ multipliers_128b = - _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG); - const __m128i /* __v2du */ final_multiplier = - _mm_set_epi64x(0, CRC32_X63_MODG); + /* + * mults_{N}v are the vectors of multipliers for folding across N vec_t + * vectors, i.e. N*VL*8 bits. mults_128b are the two multipliers for + * folding across 128 bits. mults_128b differs from mults_1v when + * VL != 16. All multipliers are 64-bit, to match what pclmulqdq needs, + * but since this is for CRC-32 only their low 32 bits are nonzero. + * For more details, see scripts/gen_crc32_multipliers.c. 
+ */ + const vec_t mults_8v = MULTS_8V; + const vec_t mults_4v = MULTS_4V; + const vec_t mults_2v = MULTS_2V; + const vec_t mults_1v = MULTS_1V; + const __m128i mults_128b = _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG); + const __m128i final_mult = _mm_set_epi64x(0, CRC32_X63_MODG); const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF); - const __m128i /* __v2du */ barrett_reduction_constants = - _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, - CRC32_BARRETT_CONSTANT_1); + const __m128i barrett_reduction_constants = + _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, CRC32_BARRETT_CONSTANT_1); vec_t v0, v1, v2, v3, v4, v5, v6, v7; __m128i x0, x1; @@ -218,50 +229,40 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) if (len < VL) return crc32_slice1(crc, p, len); - v0 = VXOR(VLOAD_UNALIGNED(p), - M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc))); p += VL; if (len >= 4*VL) { - v1 = VLOAD_UNALIGNED(p + 0*VL); - v2 = VLOAD_UNALIGNED(p + 1*VL); - v3 = VLOAD_UNALIGNED(p + 2*VL); + v1 = VLOADU(p + 0*VL); + v2 = VLOADU(p + 1*VL); + v3 = VLOADU(p + 2*VL); p += 3*VL; while (len >= 8*VL) { - v0 = fold_vec(v0, VLOAD_UNALIGNED(p + 0*VL), - multipliers_4v); - v1 = fold_vec(v1, VLOAD_UNALIGNED(p + 1*VL), - multipliers_4v); - v2 = fold_vec(v2, VLOAD_UNALIGNED(p + 2*VL), - multipliers_4v); - v3 = fold_vec(v3, VLOAD_UNALIGNED(p + 3*VL), - multipliers_4v); + v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_4v); + v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_4v); + v2 = fold_vec(v2, VLOADU(p + 2*VL), mults_4v); + v3 = fold_vec(v3, VLOADU(p + 3*VL), mults_4v); p += 4*VL; len -= 4*VL; } - v0 = fold_vec(v0, v2, multipliers_2v); - v1 = fold_vec(v1, v3, multipliers_2v); + v0 = fold_vec(v0, v2, mults_2v); + v1 = fold_vec(v1, v3, mults_2v); if (len & (2*VL)) { - v0 = fold_vec(v0, VLOAD_UNALIGNED(p + 0*VL), - multipliers_2v); - v1 = fold_vec(v1, VLOAD_UNALIGNED(p + 1*VL), - multipliers_2v); + v0 = fold_vec(v0, VLOADU(p + 0*VL), mults_2v); + v1 = fold_vec(v1, VLOADU(p + 1*VL), mults_2v); p += 2*VL; } - v0 = fold_vec(v0, v1, multipliers_1v); + v0 = fold_vec(v0, v1, mults_1v); if (len & VL) { - v0 = fold_vec(v0, VLOAD_UNALIGNED(p), - multipliers_1v); + v0 = fold_vec(v0, VLOADU(p), mults_1v); p += VL; } } else { if (len >= 2*VL) { - v0 = fold_vec(v0, VLOAD_UNALIGNED(p), - multipliers_1v); + v0 = fold_vec(v0, VLOADU(p), mults_1v); p += VL; if (len >= 3*VL) { - v0 = fold_vec(v0, VLOAD_UNALIGNED(p), - multipliers_1v); + v0 = fold_vec(v0, VLOADU(p), mults_1v); p += VL; } } @@ -276,19 +277,19 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc))); } else { len -= align; - #if FOLD_LESSTHAN16BYTES + #if USE_SSE4_1 x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), _mm_cvtsi32_si128(crc)); p += 16; if (align & 15) { x0 = fold_lessthan16bytes(x0, p, align & 15, - multipliers_128b); + mults_128b); p += align & 15; align &= ~15; } while (align >= 16) { x0 = fold_vec128(x0, *(const __m128i *)p, - multipliers_128b); + mults_128b); p += 16; align -= 16; } @@ -318,14 +319,14 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) v6 = *vp++; v7 = *vp++; do { - v0 = fold_vec(v0, *vp++, multipliers_8v); - v1 = fold_vec(v1, *vp++, multipliers_8v); - v2 = fold_vec(v2, *vp++, multipliers_8v); - v3 = fold_vec(v3, *vp++, multipliers_8v); - v4 = fold_vec(v4, *vp++, multipliers_8v); - v5 = fold_vec(v5, *vp++, multipliers_8v); - v6 = fold_vec(v6, *vp++, multipliers_8v); - v7 = fold_vec(v7, *vp++, multipliers_8v); + v0 = fold_vec(v0, *vp++, 
mults_8v); + v1 = fold_vec(v1, *vp++, mults_8v); + v2 = fold_vec(v2, *vp++, mults_8v); + v3 = fold_vec(v3, *vp++, mults_8v); + v4 = fold_vec(v4, *vp++, mults_8v); + v5 = fold_vec(v5, *vp++, mults_8v); + v6 = fold_vec(v6, *vp++, mults_8v); + v7 = fold_vec(v7, *vp++, mults_8v); len -= 8*VL; } while (len >= 16*VL); @@ -333,58 +334,57 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * Reduce v0-v7 (length 8*VL bytes) to v0 (length VL bytes) * and fold in any VL-byte data segments that remain. */ - v0 = fold_vec(v0, v4, multipliers_4v); - v1 = fold_vec(v1, v5, multipliers_4v); - v2 = fold_vec(v2, v6, multipliers_4v); - v3 = fold_vec(v3, v7, multipliers_4v); + v0 = fold_vec(v0, v4, mults_4v); + v1 = fold_vec(v1, v5, mults_4v); + v2 = fold_vec(v2, v6, mults_4v); + v3 = fold_vec(v3, v7, mults_4v); if (len & (4*VL)) { - v0 = fold_vec(v0, *vp++, multipliers_4v); - v1 = fold_vec(v1, *vp++, multipliers_4v); - v2 = fold_vec(v2, *vp++, multipliers_4v); - v3 = fold_vec(v3, *vp++, multipliers_4v); + v0 = fold_vec(v0, *vp++, mults_4v); + v1 = fold_vec(v1, *vp++, mults_4v); + v2 = fold_vec(v2, *vp++, mults_4v); + v3 = fold_vec(v3, *vp++, mults_4v); } - v0 = fold_vec(v0, v2, multipliers_2v); - v1 = fold_vec(v1, v3, multipliers_2v); + v0 = fold_vec(v0, v2, mults_2v); + v1 = fold_vec(v1, v3, mults_2v); if (len & (2*VL)) { - v0 = fold_vec(v0, *vp++, multipliers_2v); - v1 = fold_vec(v1, *vp++, multipliers_2v); + v0 = fold_vec(v0, *vp++, mults_2v); + v1 = fold_vec(v1, *vp++, mults_2v); } - v0 = fold_vec(v0, v1, multipliers_1v); + v0 = fold_vec(v0, v1, mults_1v); if (len & VL) - v0 = fold_vec(v0, *vp++, multipliers_1v); + v0 = fold_vec(v0, *vp++, mults_1v); p = (const u8 *)vp; } /* - * Reduce v0 (length VL bytes) to x0 (length 16 bytes) - * and fold in any 16-byte data segments that remain. + * Fewer than VL bytes remain. Reduce v0 (length VL bytes) to x0 + * (length 16 bytes) and fold in any 16-byte data segments that remain. */ #if VL == 16 x0 = v0; #else { -# if VL == 32 + #if VL == 32 __m256i y0 = v0; -# else - const __m256i multipliers_256b = + #else + const __m256i mults_256b = _mm256_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG, CRC32_X223_MODG, CRC32_X287_MODG); __m256i y0 = fold_vec256(_mm512_extracti64x4_epi64(v0, 0), _mm512_extracti64x4_epi64(v0, 1), - multipliers_256b); + mults_256b); if (len & 32) { y0 = fold_vec256(y0, _mm256_loadu_si256((const void *)p), - multipliers_256b); + mults_256b); p += 32; } -# endif + #endif x0 = fold_vec128(_mm256_extracti128_si256(y0, 0), - _mm256_extracti128_si256(y0, 1), - multipliers_128b); + _mm256_extracti128_si256(y0, 1), mults_128b); } if (len & 16) { x0 = fold_vec128(x0, _mm_loadu_si128((const void *)p), - multipliers_128b); + mults_128b); p += 16; } #endif @@ -394,9 +394,9 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * If fold_lessthan16bytes() is available, handle any remainder * of 1 to 15 bytes now, before reducing to 32 bits. */ -#if FOLD_LESSTHAN16BYTES +#if USE_SSE4_1 if (len) - x0 = fold_lessthan16bytes(x0, p, len, multipliers_128b); + x0 = fold_lessthan16bytes(x0, p, len, mults_128b); #endif /* @@ -405,12 +405,12 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). */ x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), - _mm_clmulepi64_si128(x0, multipliers_128b, 0x10)); + _mm_clmulepi64_si128(x0, mults_128b, 0x10)); /* Fold 96 => 64 bits. 
*/ x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), - final_multiplier, 0x00)); + final_mult, 0x00)); /* * Reduce 64 => 32 bits using Barrett reduction. @@ -459,7 +459,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), barrett_reduction_constants, 0x10); x0 = _mm_xor_si128(x0, x1); -#if FOLD_LESSTHAN16BYTES +#if USE_SSE4_1 crc = _mm_extract_epi32(x0, 1); #else crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01)); @@ -471,7 +471,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) #undef vec_t #undef fold_vec -#undef VLOAD_UNALIGNED +#undef VLOADU #undef VXOR #undef M128I_TO_VEC #undef MULTS @@ -483,5 +483,5 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) #undef SUFFIX #undef ATTRIBUTES #undef VL -#undef FOLD_LESSTHAN16BYTES -#undef USE_TERNARYLOGIC +#undef USE_SSE4_1 +#undef USE_AVX512
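
The reorganized Adler-32 template relies on the vpdpbusd semantics quoted in the comment above ("multiplies the unsigned bytes of one vector by the signed bytes of another vector and adds the sums in groups of 4 to the 32-bit elements of a third vector"). The sketch below models one 32-bit lane of that operation in scalar C; dpbusd_lane is a hypothetical helper name used only for illustration, not code from libdeflate.

#include <stdint.h>
#include <stdio.h>

/*
 * Scalar model of one 32-bit lane of vpdpbusd: multiply four unsigned bytes
 * of 'a' by the corresponding four signed bytes of 'b' and accumulate the
 * sum of the four products into the 32-bit element 'acc'.
 */
static int32_t
dpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4])
{
	for (int j = 0; j < 4; j++)
		acc += (int32_t)a[j] * (int32_t)b[j];
	return acc;
}

int
main(void)
{
	/* Data bytes (unsigned) and Adler-32 style multipliers [4, 3, 2, 1]. */
	const uint8_t data[4] = { 200, 10, 255, 7 };
	const int8_t mults[4] = { 4, 3, 2, 1 };

	/* 200*4 + 10*3 + 255*2 + 7*1 = 1347 */
	printf("%d\n", dpbusd_lane(0, data, mults));
	return 0;
}

When the signed operand is the constant vector [VL, VL-1, ..., 1] mentioned at the top of ADD_SUFFIX(adler32_x86), each lane produces a position-weighted byte sum of the kind the Adler-32 s2 accumulation needs.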
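
The USE_AVX512 path loads the final 0 < n <= VL bytes of a chunk with a zero-masking load, VMASKZ_LOADU((mask_t)-1 >> (VL - n), p), where (mask_t)-1 >> (VL - n) sets exactly the low n mask bits. Below is a scalar sketch of that behavior, assuming VL = 16 and a hypothetical helper name maskz_loadu_bytes; it is not libdeflate code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Scalar model of a zero-masking byte load: byte i of the result is p[i]
 * if bit i of the mask is set, and 0 otherwise.
 */
static void
maskz_loadu_bytes(uint8_t dst[16], uint16_t mask, const uint8_t *p)
{
	for (int i = 0; i < 16; i++)
		dst[i] = (mask >> i) & 1 ? p[i] : 0;
}

int
main(void)
{
	uint8_t buf[16], v[16];
	unsigned n = 5;		/* 0 < n <= 16 bytes remain */

	memset(buf, 'x', sizeof(buf));
	memcpy(buf, "abcde", n);

	/* (mask_t)-1 >> (VL - n) selects only the low n bytes. */
	maskz_loadu_bytes(v, (uint16_t)(0xFFFFu >> (16 - n)), buf);

	/* Prints "abcde (tail zeroed: 1)": the unselected bytes are zero. */
	printf("%.*s (tail zeroed: %d)\n", (int)n, (const char *)v,
	       v[n] == 0 && v[15] == 0);
	return 0;
}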
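
fold_vec256() and fold_vec512() now note that vpternlog with immediate 0x96 is a three-argument XOR. The immediate acts as an 8-entry truth table selected, per bit position, by the three input bits, and 0x96 (binary 10010110) is exactly the truth table of a ^ b ^ c. The scalar sketch below checks this identity for 32-bit words; ternlog32 is a hypothetical helper for illustration only, not libdeflate code.

#include <stdint.h>
#include <stdio.h>

/*
 * Scalar model of a ternary-logic operation: for every bit position, the
 * three input bits form a 3-bit index that selects one bit of the 8-bit
 * immediate, so the immediate acts as a truth table.
 */
static uint32_t
ternlog32(uint32_t a, uint32_t b, uint32_t c, uint8_t imm)
{
	uint32_t r = 0;

	for (int i = 0; i < 32; i++) {
		unsigned idx = (((a >> i) & 1) << 2) |
			       (((b >> i) & 1) << 1) |
			       ((c >> i) & 1);
		r |= (uint32_t)((imm >> idx) & 1) << i;
	}
	return r;
}

int
main(void)
{
	const uint32_t a = 0x12345678, b = 0x9abcdef0, c = 0x0f0f0f0f;

	/* Immediate 0x96 reproduces a ^ b ^ c, the XOR used when folding. */
	printf("%d\n", ternlog32(a, b, c, 0x96) == (a ^ b ^ c));
	return 0;
}

This is why the USE_AVX512 variants can combine the two carryless products and the destination with a single vpternlog, where the non-AVX512 path in fold_vec256() needs two vpxor instructions.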