lib/{adler32,crc32}: misc cleanups
Various cleanups, including tweaks to make the Adler-32 code more
consistent with the CRC-32 code and vice versa.  No behavior changes.
ebiggers committed Mar 12, 2024
1 parent 511893f commit 8ae3a19
Showing 8 changed files with 215 additions and 202 deletions.
4 changes: 2 additions & 2 deletions lib/arm/adler32_impl.h
@@ -43,7 +43,7 @@
 # endif
 # endif
 # include <arm_neon.h>
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 adler32_arm_neon(u32 adler, const u8 *p, size_t len)
 {
 static const u16 _aligned_attribute(16) mults[64] = {
@@ -225,7 +225,7 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)
 # endif
 # endif
 # include <arm_neon.h>
-static u32 ATTRIBUTES
+static ATTRIBUTES u32
 adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len)
 {
 static const u8 _aligned_attribute(16) mults[64] = {
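
Note on the recurring signature change in this commit: "static u32 ATTRIBUTES MAYBE_UNUSED" becomes "static ATTRIBUTES MAYBE_UNUSED u32". This only moves the attribute macros ahead of the return type; both orderings are valid declaration-specifier orders for GCC and Clang, and code generation is unaffected. A minimal sketch of the pattern, with placeholder macro expansions rather than libdeflate's own:

#include <stddef.h>
#include <stdint.h>

typedef uint8_t u8;
typedef uint32_t u32;

/* Placeholder expansions, for illustration only; the real macros are
 * defined near the top of each file. */
#define ATTRIBUTES   /* e.g. a _target_attribute(...) expansion */
#define MAYBE_UNUSED __attribute__((unused))

/* Old ordering: attribute macros split around the return type. */
static u32 ATTRIBUTES MAYBE_UNUSED
adler32_old_ordering(u32 adler, const u8 *p, size_t len)
{
	(void)p; (void)len;
	return adler;
}

/* New ordering: attributes first, then the return type, so the
 * adler32 and crc32 files all declare functions the same way. */
static ATTRIBUTES MAYBE_UNUSED u32
adler32_new_ordering(u32 adler, const u8 *p, size_t len)
{
	(void)p; (void)len;
	return adler;
}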
6 changes: 3 additions & 3 deletions lib/arm/crc32_impl.h
@@ -113,7 +113,7 @@ combine_crcs_slow(u32 crc0, u32 crc1, u32 crc2, u32 crc3)
 }
 
 #define crc32_arm_crc crc32_arm_crc
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 crc32_arm_crc(u32 crc, const u8 *p, size_t len)
 {
 if (len >= 64) {
@@ -289,7 +289,7 @@ combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i)
 }
 
 #define crc32_arm_crc_pmullcombine crc32_arm_crc_pmullcombine
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len)
 {
 const size_t align = -(uintptr_t)p & 7;
@@ -470,7 +470,7 @@ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len)
 # define ENABLE_EOR3 0
 # include "crc32_pmull_helpers.h"
 
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 {
 static const u64 _aligned_attribute(16) mults[3][2] = {
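
The lines of the form "#define crc32_arm_crc crc32_arm_crc" are how these files advertise which implementations got compiled in: a macro that expands to itself is a no-op in the code, but dispatch logic can test for it with #ifdef. A simplified sketch of the idiom follows; this is not libdeflate's actual dispatcher, and the compile-time gate shown is just an example:

#include <stddef.h>
#include <stdint.h>

typedef uint8_t u8;
typedef uint32_t u32;
typedef u32 (*crc32_func_t)(u32 crc, const u8 *p, size_t len);

#ifdef __ARM_FEATURE_CRC32          /* example compile-time gate */
#  define crc32_arm_crc crc32_arm_crc  /* expands to itself: a marker */
static u32
crc32_arm_crc(u32 crc, const u8 *p, size_t len)
{
	/* Real code would use the __crc32b/__crc32d intrinsics here. */
	(void)p; (void)len;
	return crc;
}
#endif

static crc32_func_t
select_crc32_impl(void)
{
#ifdef crc32_arm_crc
	/* Only reachable if the implementation above was compiled in;
	 * a real dispatcher would also check CPU features at runtime. */
	return crc32_arm_crc;
#endif
	return NULL;  /* caller falls back to the generic implementation */
}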
2 changes: 1 addition & 1 deletion lib/arm/crc32_pmull_wide.h
@@ -52,7 +52,7 @@
 
 #include "crc32_pmull_helpers.h"
 
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
 {
 uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
2 changes: 1 addition & 1 deletion lib/decompress_template.h
@@ -41,7 +41,7 @@
 # define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count)))
 #endif
 
-static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED enum libdeflate_result
 FUNCNAME(struct libdeflate_decompressor * restrict d,
 const void * restrict in, size_t in_nbytes,
 void * restrict out, size_t out_nbytes_avail,
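
The hunk's context line defines EXTRACT_VARBITS8 in terms of BITMASK, which is defined elsewhere in the decompressor; the u8 cast on the count apparently lets the compiler see that the shift amount fits in a byte. A plausible shape of that macro, stated here as an assumption for illustration and not quoted from the file, is:

#include <stdint.h>

typedef uint64_t bitbuf_t;  /* assumed bit-buffer type */
typedef uint8_t u8;

/* Assumed definition, for illustration only: */
#define BITMASK(n)  (((bitbuf_t)1 << (n)) - 1)

/* Then EXTRACT_VARBITS8(word, 3) == (word & 0x7), i.e. the low
 * 3 bits of the bit buffer. */
#define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count)))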
20 changes: 10 additions & 10 deletions lib/x86/adler32_impl.h
@@ -33,19 +33,19 @@
 /* SSE2 and AVX2 implementations. Used on older CPUs. */
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 # define adler32_x86_sse2 adler32_x86_sse2
-# define SUFFIX _x86_sse2
+# define SUFFIX _sse2
 # define ATTRIBUTES _target_attribute("sse2")
 # define VL 16
 # define USE_VNNI 0
-# define USE_MASKING 0
+# define USE_AVX512 0
 # include "adler32_template.h"
 
 # define adler32_x86_avx2 adler32_x86_avx2
-# define SUFFIX _x86_avx2
+# define SUFFIX _avx2
 # define ATTRIBUTES _target_attribute("avx2")
 # define VL 32
 # define USE_VNNI 0
-# define USE_MASKING 0
+# define USE_AVX512 0
 # include "adler32_template.h"
 #endif
 
@@ -55,11 +55,11 @@
  */
 #if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930)
 # define adler32_x86_avx2_vnni adler32_x86_avx2_vnni
-# define SUFFIX _x86_avx2_vnni
+# define SUFFIX _avx2_vnni
 # define ATTRIBUTES _target_attribute("avx2,avxvnni")
 # define VL 32
 # define USE_VNNI 1
-# define USE_MASKING 0
+# define USE_AVX512 0
 # include "adler32_template.h"
 #endif
 
@@ -72,11 +72,11 @@
  * that support AVX10/256 but not AVX10/512.
  */
 # define adler32_x86_avx512_vl256_vnni adler32_x86_avx512_vl256_vnni
-# define SUFFIX _x86_avx512_vl256_vnni
+# define SUFFIX _avx512_vl256_vnni
 # define ATTRIBUTES _target_attribute("avx512bw,avx512vl,avx512vnni")
 # define VL 32
 # define USE_VNNI 1
-# define USE_MASKING 1
+# define USE_AVX512 1
 # include "adler32_template.h"
 
 /*
@@ -85,11 +85,11 @@
  * the optimal implementation on CPUs that support AVX10/512.
  */
 # define adler32_x86_avx512_vl512_vnni adler32_x86_avx512_vl512_vnni
-# define SUFFIX _x86_avx512_vl512_vnni
+# define SUFFIX _avx512_vl512_vnni
 # define ATTRIBUTES _target_attribute("avx512bw,avx512vnni")
 # define VL 64
 # define USE_VNNI 1
-# define USE_MASKING 1
+# define USE_AVX512 1
 # include "adler32_template.h"
 #endif
 
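
The SUFFIX renames above (_x86_sse2 to _sse2, and so on) pair with the adler32 to adler32_x86 rename in adler32_template.h below, so every generated function keeps its old name; only where the _x86 part comes from changes. A sketch of the token pasting involved, with CONCAT helpers that are illustrative and may differ from libdeflate's equivalents:

/* Illustrative token-pasting helpers (names are assumptions): */
#define CONCAT_IMPL(a, b) a##b
#define CONCAT(a, b)      CONCAT_IMPL(a, b)  /* expands a and b first */
#define ADD_SUFFIX(name)  CONCAT(name, SUFFIX)

/* Before this commit: SUFFIX = _x86_sse2, template base = adler32
 *   ADD_SUFFIX(adler32)     -> adler32_x86_sse2
 * After this commit:  SUFFIX = _sse2, template base = adler32_x86
 *   ADD_SUFFIX(adler32_x86) -> adler32_x86_sse2  (same symbol)
 */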
52 changes: 29 additions & 23 deletions lib/x86/adler32_template.h
@@ -34,20 +34,21 @@
  * ATTRIBUTES:
  * Target function attributes to use. Must satisfy the dependencies of the
  * other parameters as follows:
- * VL=16 && USE_VNNI=0 && USE_MASKING=0: at least sse2
- * VL=32 && USE_VNNI=0 && USE_MASKING=0: at least avx2
- * VL=32 && USE_VNNI=1 && USE_MASKING=0: at least avx2,avxvnni
- * VL=32 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vl,avx512vnni
- * VL=64 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vnni
+ * VL=16 && USE_VNNI=0 && USE_AVX512=0: at least sse2
+ * VL=32 && USE_VNNI=0 && USE_AVX512=0: at least avx2
+ * VL=32 && USE_VNNI=1 && USE_AVX512=0: at least avx2,avxvnni
+ * VL=32 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vl,avx512vnni
+ * VL=64 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vnni
  * (Other combinations are not useful and have not been tested.)
  * VL:
- * Vector length in bytes. Must be 16, 32, and 64.
+ * Vector length in bytes. Must be 16, 32, or 64.
  * USE_VNNI:
  * If 1, use the VNNI dot product based algorithm.
  * If 0, use the legacy SSE2 and AVX2 compatible algorithm.
- * USE_MASKING:
- * If 1, use AVX-512 features such as masking.
- * If 0, assume that the CPU might not support AVX-512.
+ * USE_AVX512:
+ * If 1, take advantage of AVX-512 features such as masking. This doesn't
+ * enable the use of 512-bit vectors; the vector length is controlled by
+ * VL. If 0, assume that the CPU might not support AVX-512.
  */
 
 #if VL == 16
@@ -57,7 +58,7 @@
 # define VADD8(a, b) _mm_add_epi8((a), (b))
 # define VADD16(a, b) _mm_add_epi16((a), (b))
 # define VADD32(a, b) _mm_add_epi32((a), (b))
-# if USE_MASKING
+# if USE_AVX512
 # define VDPBUSD(a, b, c) _mm_dpbusd_epi32((a), (b), (c))
 # else
 # define VDPBUSD(a, b, c) _mm_dpbusd_avx_epi32((a), (b), (c))
@@ -68,20 +69,20 @@
 # define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p))
 # define VMULLO32(a, b) _mm_mullo_epi32((a), (b))
 # define VSAD8(a, b) _mm_sad_epu8((a), (b))
-# define VSET1_32(a) _mm_set1_epi32(a)
 # define VSET1_8(a) _mm_set1_epi8(a)
+# define VSET1_32(a) _mm_set1_epi32(a)
 # define VSETZERO() _mm_setzero_si128()
 # define VSLL32(a, b) _mm_slli_epi32((a), (b))
-# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b))
 # define VUNPACKLO8(a, b) _mm_unpacklo_epi8((a), (b))
+# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b))
 #elif VL == 32
 # define vec_t __m256i
 # define mask_t u32
 # define LOG2_VL 5
 # define VADD8(a, b) _mm256_add_epi8((a), (b))
 # define VADD16(a, b) _mm256_add_epi16((a), (b))
 # define VADD32(a, b) _mm256_add_epi32((a), (b))
-# if USE_MASKING
+# if USE_AVX512
 # define VDPBUSD(a, b, c) _mm256_dpbusd_epi32((a), (b), (c))
 # else
 # define VDPBUSD(a, b, c) _mm256_dpbusd_avx_epi32((a), (b), (c))
@@ -92,27 +93,32 @@
 # define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p))
 # define VMULLO32(a, b) _mm256_mullo_epi32((a), (b))
 # define VSAD8(a, b) _mm256_sad_epu8((a), (b))
-# define VSET1_32(a) _mm256_set1_epi32(a)
 # define VSET1_8(a) _mm256_set1_epi8(a)
+# define VSET1_32(a) _mm256_set1_epi32(a)
 # define VSETZERO() _mm256_setzero_si256()
 # define VSLL32(a, b) _mm256_slli_epi32((a), (b))
-# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b))
 # define VUNPACKLO8(a, b) _mm256_unpacklo_epi8((a), (b))
+# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b))
 #elif VL == 64
 # define vec_t __m512i
 # define mask_t u64
 # define LOG2_VL 6
 # define VADD8(a, b) _mm512_add_epi8((a), (b))
 # define VADD16(a, b) _mm512_add_epi16((a), (b))
 # define VADD32(a, b) _mm512_add_epi32((a), (b))
 # define VDPBUSD(a, b, c) _mm512_dpbusd_epi32((a), (b), (c))
 # define VLOAD(p) _mm512_load_si512((const void *)(p))
 # define VLOADU(p) _mm512_loadu_si512((const void *)(p))
+# define VMADD16(a, b) _mm512_madd_epi16((a), (b))
 # define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p))
 # define VMULLO32(a, b) _mm512_mullo_epi32((a), (b))
-# define VSET1_32(a) _mm512_set1_epi32(a)
+# define VSAD8(a, b) _mm512_sad_epu8((a), (b))
+# define VSET1_8(a) _mm512_set1_epi8(a)
+# define VSET1_32(a) _mm512_set1_epi32(a)
 # define VSETZERO() _mm512_setzero_si512()
 # define VSLL32(a, b) _mm512_slli_epi32((a), (b))
+# define VUNPACKLO8(a, b) _mm512_unpacklo_epi8((a), (b))
+# define VUNPACKHI8(a, b) _mm512_unpackhi_epi8((a), (b))
 #else
 # error "unsupported vector length"
 #endif
@@ -173,8 +179,8 @@ ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p)
 }
 #define reduce_to_32bits ADD_SUFFIX(reduce_to_32bits)
 
-static u32 ATTRIBUTES
-ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
+static ATTRIBUTES u32
+ADD_SUFFIX(adler32_x86)(u32 adler, const u8 *p, size_t len)
 {
 #if USE_VNNI
 /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */
@@ -235,7 +241,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
 
 #if USE_VNNI
 /*
- * This is Adler-32 using the vpdpbusd instruction from AVX512-VNNI or
+ * This is Adler-32 using the vpdpbusd instruction from AVX512VNNI or
  * AVX-VNNI. vpdpbusd multiplies the unsigned bytes of one vector by
  * the signed bytes of another vector and adds the sums in groups of 4
  * to the 32-bit elements of a third vector. We use it in two ways:
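
For reference, the per-lane arithmetic of vpdpbusd described in the comment above can be modeled in scalar code as follows; this is a sketch of the instruction's semantics, not code from this file:

#include <stdint.h>

/* One 32-bit lane of vpdpbusd: multiply 4 unsigned bytes from 'a'
 * by the corresponding 4 signed bytes from 'b', then add all four
 * products to the 32-bit accumulator. */
static int32_t
vpdpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4])
{
	for (int i = 0; i < 4; i++)
		acc += (int32_t)a[i] * (int32_t)b[i];
	return acc;
}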
@@ -369,7 +375,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
  * Process the last 0 < n <= VL bytes of the chunk.
  * Utilize a masked load if it's available.
  */
-#if USE_MASKING
+#if USE_AVX512
 data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p);
 #else
 data = zeroes;
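
The mask expression (mask_t)-1 >> (VL - n) yields exactly n one bits, so the masked load reads only the n remaining bytes and zero-fills the rest of the vector. A worked instance, assuming VL = 32 (so mask_t is u32) and n = 5:

#include <immintrin.h>
#include <stdint.h>

/* (u32)-1 >> (32 - 5) == 0xFFFFFFFF >> 27 == 0x0000001F: five 1 bits. */
__m256i
load_tail_example(const uint8_t *p)
{
	/* Reads only p[0..4]; byte lanes 5..31 are zeroed, so the load
	 * cannot run off the end of the buffer. (Requires AVX512BW+VL.) */
	return _mm256_maskz_loadu_epi8(0x0000001Fu, p);
}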
@@ -414,7 +420,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
  * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX.
  * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are
  * used with pmaddwd which does signed multiplication. In the
- * SSE2 case this limits chunks to 4096 bytes instead of 5504.
+ * SSE2 case this limits chunks to 4096 bytes instead of 5536.
  */
 size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX),
 MAX_CHUNK_LEN) & ~(2*VL - 1));
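
The corrected figure of 5536 falls out of the expression at the end of the hunk above. Assuming MAX_CHUNK_LEN is the usual Adler-32 chunk bound of 5552 (it is defined elsewhere in the adler32 code), the arithmetic for the SSE2 case is:

/* VL = 16 in the SSE2 case:
 *   2 * VL * (INT16_MAX / UINT8_MAX) = 32 * (32767 / 255)
 *                                    = 32 * 128 = 4096
 *   MAX_CHUNK_LEN & ~(2*VL - 1)      = 5552 & ~31 = 5536
 * MIN() then picks 4096, i.e. the pmaddwd limit is what binds;
 * the old comment's 5504 did not match this rounding. */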
@@ -502,11 +508,11 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
 #undef VSET1_32
 #undef VSETZERO
 #undef VSLL32
-#undef VUNPACKHI8
 #undef VUNPACKLO8
+#undef VUNPACKHI8
 
 #undef SUFFIX
 #undef ATTRIBUTES
 #undef VL
 #undef USE_VNNI
-#undef USE_MASKING
+#undef USE_AVX512
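
The #undef block above is what makes this header safe to include more than once: adler32_impl.h sets one parameter set per inclusion, and the template cleans all of it up at the end. In outline, based on the adler32_impl.h diff shown earlier:

/* How adler32_impl.h drives this template: */
#define SUFFIX      _sse2
#define ATTRIBUTES  _target_attribute("sse2")
#define VL          16
#define USE_VNNI    0
#define USE_AVX512  0
#include "adler32_template.h"  /* defines adler32_x86_sse2(); #undefs
                                  SUFFIX, ATTRIBUTES, VL, USE_VNNI,
                                  and USE_AVX512 on the way out */

#define SUFFIX      _avx2
#define ATTRIBUTES  _target_attribute("avx2")
#define VL          32
#define USE_VNNI    0
#define USE_AVX512  0
#include "adler32_template.h"  /* defines adler32_x86_avx2() */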
41 changes: 24 additions & 17 deletions lib/x86/crc32_impl.h
@@ -36,8 +36,8 @@
 # define SUFFIX _pclmulqdq
 # define ATTRIBUTES _target_attribute("pclmul")
 # define VL 16
-# define FOLD_LESSTHAN16BYTES 0
-# define USE_TERNARYLOGIC 0
+# define USE_SSE4_1 0
+# define USE_AVX512 0
 # include "crc32_pclmul_template.h"
 
 /*
@@ -49,55 +49,62 @@
  * non-destructive VEX-encoded instructions. Second, AVX support implies SSSE3
  * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient
  * handling of partial blocks. (We *could* compile a variant with
- * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we don't currently bother.)
+ * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we currently don't bother.)
  */
 # define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx
 # define SUFFIX _pclmulqdq_avx
 # define ATTRIBUTES _target_attribute("pclmul,avx")
 # define VL 16
-# define FOLD_LESSTHAN16BYTES 1
-# define USE_TERNARYLOGIC 0
+# define USE_SSE4_1 1
+# define USE_AVX512 0
 # include "crc32_pclmul_template.h"
 #endif
 
 /*
- * VPCLMULQDQ/AVX2 implementation. Uses 256-bit vectors.
+ * VPCLMULQDQ/AVX2 implementation. This is used on CPUs that have AVX2 and
+ * VPCLMULQDQ but don't have AVX-512, for example Intel Alder Lake.
  *
 * Currently this can't be enabled with MSVC because MSVC has a bug where it
 * incorrectly assumes that VPCLMULQDQ implies AVX-512:
- * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest
+ * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785
  */
 #if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000)
 # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2
 # define SUFFIX _vpclmulqdq_avx2
 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2")
 # define VL 32
-# define FOLD_LESSTHAN16BYTES 1
-# define USE_TERNARYLOGIC 0
+# define USE_SSE4_1 1
+# define USE_AVX512 0
 # include "crc32_pclmul_template.h"
 #endif
 
 #if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)
 /*
- * VPCLMULQDQ/AVX512 implementation with 256-bit vectors. This takes advantage
- * of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit.
- * This can be useful on CPUs where 512-bit vectors cause downclocking.
+ * VPCLMULQDQ/AVX512 implementation using 256-bit vectors. This is very similar
+ * to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog
+ * instruction and more registers. This is used on CPUs that support AVX-512
+ * but where using 512-bit vectors causes downclocking. This should also be the
+ * optimal implementation on CPUs that support AVX10/256 but not AVX10/512.
  */
 # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256
 # define SUFFIX _vpclmulqdq_avx512_vl256
 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl")
 # define VL 32
-# define FOLD_LESSTHAN16BYTES 1
-# define USE_TERNARYLOGIC 1
+# define USE_SSE4_1 1
+# define USE_AVX512 1
 # include "crc32_pclmul_template.h"
 
-/* VPCLMULQDQ/AVX512 implementation with 512-bit vectors */
+/*
+ * VPCLMULQDQ/AVX512 implementation using 512-bit vectors. This is used on CPUs
+ * that have a good AVX-512 implementation including VPCLMULQDQ. This should
+ * also be the optimal implementation on CPUs that support AVX10/512.
+ */
 # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512
 # define SUFFIX _vpclmulqdq_avx512_vl512
 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl")
 # define VL 64
-# define FOLD_LESSTHAN16BYTES 1
-# define USE_TERNARYLOGIC 1
+# define USE_SSE4_1 1
+# define USE_AVX512 1
 # include "crc32_pclmul_template.h"
 #endif
 
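
On the vpternlog advantage mentioned in the new comments: CRC folding repeatedly XORs three vectors together, which AVX-512's vpternlogd can do in a single instruction via its programmable truth table (0x96 encodes the 3-input XOR). A sketch of the difference; the helper names here are illustrative, not libdeflate's:

#include <immintrin.h>

/* Pre-AVX-512: a 3-way XOR takes two instructions. */
static inline __m256i
xor3_avx2(__m256i a, __m256i b, __m256i c)
{
	return _mm256_xor_si256(_mm256_xor_si256(a, b), c);
}

/* With AVX512VL: one vpternlogd. The immediate 0x96 is the truth
 * table of a ^ b ^ c. (Compile with AVX-512 enabled.) */
static inline __m256i
xor3_avx512(__m256i a, __m256i b, __m256i c)
{
	return _mm256_ternarylogic_epi32(a, b, c, 0x96);
}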