ebiggers · ebiggers · Mar 16, 2024 · Mar 10, 2024 · Mar 10, 2024 · Mar 10, 2024
diff --git a/lib/arm/adler32_impl.h b/lib/arm/adler32_impl.h
@@ -43,7 +43,7 @@
 #    endif
 #  endif
 #  include <arm_neon.h>
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 adler32_arm_neon(u32 adler, const u8 *p, size_t len)
 {
 	static const u16 _aligned_attribute(16) mults[64] = {
@@ -225,7 +225,7 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)
 #    endif
 #  endif
 #  include <arm_neon.h>
-static u32 ATTRIBUTES
+static ATTRIBUTES u32
 adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len)
 {
 	static const u8 _aligned_attribute(16) mults[64] = {

diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h
@@ -113,7 +113,7 @@ combine_crcs_slow(u32 crc0, u32 crc1, u32 crc2, u32 crc3)
 }
 
 #define crc32_arm_crc	crc32_arm_crc
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 crc32_arm_crc(u32 crc, const u8 *p, size_t len)
 {
 	if (len >= 64) {
@@ -289,7 +289,7 @@ combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i)
 }
 
 #define crc32_arm_crc_pmullcombine	crc32_arm_crc_pmullcombine
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len)
 {
 	const size_t align = -(uintptr_t)p & 7;
@@ -470,7 +470,7 @@ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len)
 #  define ENABLE_EOR3		0
 #  include "crc32_pmull_helpers.h"
 
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 {
 	static const u64 _aligned_attribute(16) mults[3][2] = {

diff --git a/lib/arm/crc32_pmull_wide.h b/lib/arm/crc32_pmull_wide.h
@@ -52,7 +52,7 @@
 
 #include "crc32_pmull_helpers.h"
 
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
 {
 	uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;

diff --git a/lib/decompress_template.h b/lib/decompress_template.h
@@ -41,7 +41,7 @@
 #  define EXTRACT_VARBITS8(word, count)	((word) & BITMASK((u8)(count)))
 #endif
 
-static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED enum libdeflate_result
 FUNCNAME(struct libdeflate_decompressor * restrict d,
 	 const void * restrict in, size_t in_nbytes,
 	 void * restrict out, size_t out_nbytes_avail,

diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h
@@ -33,19 +33,19 @@
 /* SSE2 and AVX2 implementations.  Used on older CPUs. */
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 #  define adler32_x86_sse2	adler32_x86_sse2
-#  define SUFFIX		       _x86_sse2
+#  define SUFFIX			   _sse2
 #  define ATTRIBUTES		_target_attribute("sse2")
 #  define VL			16
 #  define USE_VNNI		0
-#  define USE_MASKING		0
+#  define USE_AVX512		0
 #  include "adler32_template.h"
 
 #  define adler32_x86_avx2	adler32_x86_avx2
-#  define SUFFIX		       _x86_avx2
+#  define SUFFIX			   _avx2
 #  define ATTRIBUTES		_target_attribute("avx2")
 #  define VL			32
 #  define USE_VNNI		0
-#  define USE_MASKING		0
+#  define USE_AVX512		0
 #  include "adler32_template.h"
 #endif
 
@@ -55,11 +55,11 @@
  */
 #if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930)
 #  define adler32_x86_avx2_vnni	adler32_x86_avx2_vnni
-#  define SUFFIX		       _x86_avx2_vnni
+#  define SUFFIX			   _avx2_vnni
 #  define ATTRIBUTES		_target_attribute("avx2,avxvnni")
 #  define VL			32
 #  define USE_VNNI		1
-#  define USE_MASKING		0
+#  define USE_AVX512		0
 #  include "adler32_template.h"
 #endif
 
@@ -72,11 +72,11 @@
  * that support AVX10/256 but not AVX10/512.
  */
 #  define adler32_x86_avx512_vl256_vnni	adler32_x86_avx512_vl256_vnni
-#  define SUFFIX			       _x86_avx512_vl256_vnni
+#  define SUFFIX				   _avx512_vl256_vnni
 #  define ATTRIBUTES		_target_attribute("avx512bw,avx512vl,avx512vnni")
 #  define VL			32
 #  define USE_VNNI		1
-#  define USE_MASKING		1
+#  define USE_AVX512		1
 #  include "adler32_template.h"
 
 /*
@@ -85,11 +85,11 @@
  * the optimal implementation on CPUs that support AVX10/512.
  */
 #  define adler32_x86_avx512_vl512_vnni	adler32_x86_avx512_vl512_vnni
-#  define SUFFIX			       _x86_avx512_vl512_vnni
+#  define SUFFIX				   _avx512_vl512_vnni
 #  define ATTRIBUTES		_target_attribute("avx512bw,avx512vnni")
 #  define VL			64
 #  define USE_VNNI		1
-#  define USE_MASKING		1
+#  define USE_AVX512		1
 #  include "adler32_template.h"
 #endif
 

diff --git a/lib/x86/adler32_template.h b/lib/x86/adler32_template.h
@@ -34,20 +34,21 @@
  * ATTRIBUTES:
  *	Target function attributes to use.  Must satisfy the dependencies of the
  *	other parameters as follows:
- *	   VL=16 && USE_VNNI=0 && USE_MASKING=0: at least sse2
- *	   VL=32 && USE_VNNI=0 && USE_MASKING=0: at least avx2
- *	   VL=32 && USE_VNNI=1 && USE_MASKING=0: at least avx2,avxvnni
- *	   VL=32 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vl,avx512vnni
- *	   VL=64 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vnni
+ *	   VL=16 && USE_VNNI=0 && USE_AVX512=0: at least sse2
+ *	   VL=32 && USE_VNNI=0 && USE_AVX512=0: at least avx2
+ *	   VL=32 && USE_VNNI=1 && USE_AVX512=0: at least avx2,avxvnni
+ *	   VL=32 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vl,avx512vnni
+ *	   VL=64 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vnni
  *	   (Other combinations are not useful and have not been tested.)
  * VL:
- *	Vector length in bytes.  Must be 16, 32, and 64.
+ *	Vector length in bytes.  Must be 16, 32, or 64.
  * USE_VNNI:
  *	If 1, use the VNNI dot product based algorithm.
  *	If 0, use the legacy SSE2 and AVX2 compatible algorithm.
- * USE_MASKING:
- *	If 1, use AVX-512 features such as masking.
- *	If 0, assume that the CPU might not support AVX-512.
+ * USE_AVX512:
+ *	If 1, take advantage of AVX-512 features such as masking.  This doesn't
+ *	enable the use of 512-bit vectors; the vector length is controlled by
+ *	VL.  If 0, assume that the CPU might not support AVX-512.
  */
 
 #if VL == 16
@@ -57,7 +58,7 @@
 #  define VADD8(a, b)		_mm_add_epi8((a), (b))
 #  define VADD16(a, b)		_mm_add_epi16((a), (b))
 #  define VADD32(a, b)		_mm_add_epi32((a), (b))
-#  if USE_MASKING
+#  if USE_AVX512
 #    define VDPBUSD(a, b, c)	_mm_dpbusd_epi32((a), (b), (c))
 #  else
 #    define VDPBUSD(a, b, c)	_mm_dpbusd_avx_epi32((a), (b), (c))
@@ -68,20 +69,20 @@
 #  define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p))
 #  define VMULLO32(a, b)	_mm_mullo_epi32((a), (b))
 #  define VSAD8(a, b)		_mm_sad_epu8((a), (b))
-#  define VSET1_32(a)		_mm_set1_epi32(a)
 #  define VSET1_8(a)		_mm_set1_epi8(a)
+#  define VSET1_32(a)		_mm_set1_epi32(a)
 #  define VSETZERO()		_mm_setzero_si128()
 #  define VSLL32(a, b)		_mm_slli_epi32((a), (b))
-#  define VUNPACKHI8(a, b)	_mm_unpackhi_epi8((a), (b))
 #  define VUNPACKLO8(a, b)	_mm_unpacklo_epi8((a), (b))
+#  define VUNPACKHI8(a, b)	_mm_unpackhi_epi8((a), (b))
 #elif VL == 32
 #  define vec_t			__m256i
 #  define mask_t		u32
 #  define LOG2_VL		5
 #  define VADD8(a, b)		_mm256_add_epi8((a), (b))
 #  define VADD16(a, b)		_mm256_add_epi16((a), (b))
 #  define VADD32(a, b)		_mm256_add_epi32((a), (b))
-#  if USE_MASKING
+#  if USE_AVX512
 #    define VDPBUSD(a, b, c)	_mm256_dpbusd_epi32((a), (b), (c))
 #  else
 #    define VDPBUSD(a, b, c)	_mm256_dpbusd_avx_epi32((a), (b), (c))
@@ -92,27 +93,32 @@
 #  define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p))
 #  define VMULLO32(a, b)	_mm256_mullo_epi32((a), (b))
 #  define VSAD8(a, b)		_mm256_sad_epu8((a), (b))
-#  define VSET1_32(a)		_mm256_set1_epi32(a)
 #  define VSET1_8(a)		_mm256_set1_epi8(a)
+#  define VSET1_32(a)		_mm256_set1_epi32(a)
 #  define VSETZERO()		_mm256_setzero_si256()
 #  define VSLL32(a, b)		_mm256_slli_epi32((a), (b))
-#  define VUNPACKHI8(a, b)	_mm256_unpackhi_epi8((a), (b))
 #  define VUNPACKLO8(a, b)	_mm256_unpacklo_epi8((a), (b))
+#  define VUNPACKHI8(a, b)	_mm256_unpackhi_epi8((a), (b))
 #elif VL == 64
 #  define vec_t			__m512i
 #  define mask_t		u64
 #  define LOG2_VL		6
 #  define VADD8(a, b)		_mm512_add_epi8((a), (b))
+#  define VADD16(a, b)		_mm512_add_epi16((a), (b))
 #  define VADD32(a, b)		_mm512_add_epi32((a), (b))
 #  define VDPBUSD(a, b, c)	_mm512_dpbusd_epi32((a), (b), (c))
 #  define VLOAD(p)		_mm512_load_si512((const void *)(p))
 #  define VLOADU(p)		_mm512_loadu_si512((const void *)(p))
+#  define VMADD16(a, b)		_mm512_madd_epi16((a), (b))
 #  define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p))
 #  define VMULLO32(a, b)	_mm512_mullo_epi32((a), (b))
-#  define VSET1_32(a)		_mm512_set1_epi32(a)
+#  define VSAD8(a, b)		_mm512_sad_epu8((a), (b))
 #  define VSET1_8(a)		_mm512_set1_epi8(a)
+#  define VSET1_32(a)		_mm512_set1_epi32(a)
 #  define VSETZERO()		_mm512_setzero_si512()
 #  define VSLL32(a, b)		_mm512_slli_epi32((a), (b))
+#  define VUNPACKLO8(a, b)	_mm512_unpacklo_epi8((a), (b))
+#  define VUNPACKHI8(a, b)	_mm512_unpackhi_epi8((a), (b))
 #else
 #  error "unsupported vector length"
 #endif
@@ -173,8 +179,8 @@ ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p)
 }
 #define reduce_to_32bits	ADD_SUFFIX(reduce_to_32bits)
 
-static u32 ATTRIBUTES
-ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
+static ATTRIBUTES u32
+ADD_SUFFIX(adler32_x86)(u32 adler, const u8 *p, size_t len)
 {
 #if USE_VNNI
 	/* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */
@@ -235,7 +241,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
 
 #if USE_VNNI
 	/*
-	 * This is Adler-32 using the vpdpbusd instruction from AVX512-VNNI or
+	 * This is Adler-32 using the vpdpbusd instruction from AVX512VNNI or
 	 * AVX-VNNI.  vpdpbusd multiplies the unsigned bytes of one vector by
 	 * the signed bytes of another vector and adds the sums in groups of 4
 	 * to the 32-bit elements of a third vector.  We use it in two ways:
@@ -369,7 +375,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
 			 * Process the last 0 < n <= VL bytes of the chunk.
 			 * Utilize a masked load if it's available.
 			 */
-		#if USE_MASKING
+		#if USE_AVX512
 			data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p);
 		#else
 			data = zeroes;
@@ -414,7 +420,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
 		 * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX.
 		 * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are
 		 * used with pmaddwd which does signed multiplication.  In the
-		 * SSE2 case this limits chunks to 4096 bytes instead of 5504.
+		 * SSE2 case this limits chunks to 4096 bytes instead of 5536.
 		 */
 		size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX),
 					MAX_CHUNK_LEN) & ~(2*VL - 1));
@@ -502,11 +508,11 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
 #undef VSET1_32
 #undef VSETZERO
 #undef VSLL32
-#undef VUNPACKHI8
 #undef VUNPACKLO8
+#undef VUNPACKHI8
 
 #undef SUFFIX
 #undef ATTRIBUTES
 #undef VL
 #undef USE_VNNI
-#undef USE_MASKING
+#undef USE_AVX512
diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
@@ -82,7 +82,6 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
 	{X86_CPU_FEATURE_AVX2,		"avx2"},
 	{X86_CPU_FEATURE_BMI2,		"bmi2"},
 	{X86_CPU_FEATURE_ZMM,		"zmm"},
-	{X86_CPU_FEATURE_AVX512F,	"avx512f"},
 	{X86_CPU_FEATURE_AVX512BW,	"avx512bw"},
 	{X86_CPU_FEATURE_AVX512VL,	"avx512vl"},
 	{X86_CPU_FEATURE_VPCLMULQDQ,	"vpclmulqdq"},
@@ -163,8 +162,6 @@ void libdeflate_init_x86_cpu_features(void)
 	if (((xcr0 & 0xe6) == 0xe6) &&
 	    allow_512bit_vectors(manufacturer, family, model))
 		features |= X86_CPU_FEATURE_ZMM;
-	if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
-		features |= X86_CPU_FEATURE_AVX512F;
 	if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
 		features |= X86_CPU_FEATURE_AVX512BW;
 	if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))

diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h
@@ -39,17 +39,16 @@
 #define X86_CPU_FEATURE_BMI2		(1 << 4)
 /*
  * ZMM indicates whether 512-bit vectors (zmm registers) should be used.  On
- * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU
- * supports it, i.e. even if AVX512F is set.  On these CPUs, we may still use
- * AVX-512 instructions, but only with ymm and xmm registers.
+ * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU and
+ * operating system support AVX-512.  On these CPUs, we may still use AVX-512
+ * instructions, but only with xmm and ymm registers.
  */
 #define X86_CPU_FEATURE_ZMM		(1 << 5)
-#define X86_CPU_FEATURE_AVX512F		(1 << 6)
-#define X86_CPU_FEATURE_AVX512BW	(1 << 7)
-#define X86_CPU_FEATURE_AVX512VL	(1 << 8)
-#define X86_CPU_FEATURE_VPCLMULQDQ	(1 << 9)
-#define X86_CPU_FEATURE_AVX512VNNI	(1 << 10)
-#define X86_CPU_FEATURE_AVXVNNI		(1 << 11)
+#define X86_CPU_FEATURE_AVX512BW	(1 << 6)
+#define X86_CPU_FEATURE_AVX512VL	(1 << 7)
+#define X86_CPU_FEATURE_VPCLMULQDQ	(1 << 8)
+#define X86_CPU_FEATURE_AVX512VNNI	(1 << 9)
+#define X86_CPU_FEATURE_AVXVNNI		(1 << 10)
 
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 /* Runtime x86 CPU feature detection is supported. */
@@ -135,12 +134,6 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 #  define HAVE_BMI2_NATIVE		0
 #endif
 
-#ifdef __AVX512F__
-#  define HAVE_AVX512F(features)	1
-#else
-#  define HAVE_AVX512F(features)	((features) & X86_CPU_FEATURE_AVX512F)
-#endif
-
 #ifdef __AVX512BW__
 #  define HAVE_AVX512BW(features)	1
 #else