ebiggers · ebiggers · Mar 2, 2024 · Mar 2, 2024 · Mar 2, 2024 · Mar 2, 2024
diff --git a/README.md b/README.md
@@ -77,11 +77,28 @@ You should compile both `lib/*.c` and `lib/*/*.c`.  You don't need to worry
 about excluding irrelevant architecture-specific code, as this is already
 handled in the source files themselves using `#ifdef`s.
 
-It is strongly recommended to use either gcc or clang, and to use `-O2`.
-
 If you are doing a freestanding build with `-ffreestanding`, you must add
 `-DFREESTANDING` as well (matching what the `CMakeLists.txt` does).
 
+## Supported compilers
+
+- gcc: v4.9 and later
+- clang: v3.9 and later (upstream), Xcode 8 and later (Apple)
+- MSVC: Visual Studio 2015 and later
+- Other compilers: any other C99-compatible compiler should work, though if your
+  compiler pretends to be gcc, clang, or MSVC, it needs to be sufficiently
+  compatible with the compiler it pretends to be.
+
+The above are the minimums, but using a newer compiler allows more of the
+architecture-optimized code to be built.  libdeflate is most heavily optimized
+for gcc and clang, but MSVC is supported fairly well now too.
+
+The recommended optimization flag is `-O2`, and the `CMakeLists.txt` sets this
+for release builds.  `-O3` is fine too, but often `-O2` actually gives better
+results.  It's unnecessary to add flags such as `-mavx2` or `/arch:AVX2`, though
+you can do so if you want to.  Most of the relevant optimized functions are
+built regardless of such flags, and appropriate ones are selected at runtime.
+
 # API
 
 libdeflate has a simple API that is not zlib-compatible.  You can create

diff --git a/common_defs.h b/common_defs.h
@@ -135,6 +135,9 @@ typedef size_t machine_word_t;
 #  define GCC_PREREQ(major, minor)		\
 	(__GNUC__ > (major) ||			\
 	 (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+#  if !GCC_PREREQ(4, 9)
+#    error "gcc versions older than 4.9 are no longer supported"
+#  endif
 #else
 #  define GCC_PREREQ(major, minor)	0
 #endif
@@ -147,18 +150,35 @@ typedef size_t machine_word_t;
 	(__clang_major__ > (major) ||			\
 	 (__clang_major__ == (major) && __clang_minor__ >= (minor)))
 #  endif
+#  if !CLANG_PREREQ(3, 9, 8000000)
+#    error "clang versions older than 3.9 are no longer supported"
+#  endif
 #else
 #  define CLANG_PREREQ(major, minor, apple_version)	0
 #endif
+#ifdef _MSC_VER
+#  define MSVC_PREREQ(version)	(_MSC_VER >= (version))
+#  if !MSVC_PREREQ(1900)
+#    error "MSVC versions older than Visual Studio 2015 are no longer supported"
+#  endif
+#else
+#  define MSVC_PREREQ(version)	0
+#endif
 
 /*
- * Macros to check for compiler support for attributes and builtins.  clang
- * implements these macros, but gcc doesn't, so generally any use of one of
- * these macros must also be combined with a gcc version check.
+ * __has_attribute(attribute) - check whether the compiler supports the given
+ * attribute (and also supports doing the check in the first place).  Mostly
+ * useful just for clang, since gcc didn't add this macro until gcc 5.
  */
 #ifndef __has_attribute
 #  define __has_attribute(attribute)	0
 #endif
+
+/*
+ * __has_builtin(builtin) - check whether the compiler supports the given
+ * builtin (and also supports doing the check in the first place).  Mostly
+ * useful just for clang, since gcc didn't add this macro until gcc 10.
+ */
 #ifndef __has_builtin
 #  define __has_builtin(builtin)	0
 #endif
@@ -266,12 +286,10 @@ typedef size_t machine_word_t;
  * code as well as the corresponding intrinsics.  On other compilers this macro
  * expands to nothing, though MSVC allows intrinsics to be used anywhere anyway.
  */
-#if GCC_PREREQ(4, 4) || __has_attribute(target)
+#if defined(__GNUC__) || __has_attribute(target)
 #  define _target_attribute(attrs)	__attribute__((target(attrs)))
-#  define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE	1
 #else
 #  define _target_attribute(attrs)
-#  define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE	0
 #endif
 
 /* ========================================================================== */
@@ -316,7 +334,7 @@ static forceinline bool CPU_IS_LITTLE_ENDIAN(void)
 /* bswap16(v) - swap the bytes of a 16-bit integer */
 static forceinline u16 bswap16(u16 v)
 {
-#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+#if defined(__GNUC__) || __has_builtin(__builtin_bswap16)
 	return __builtin_bswap16(v);
 #elif defined(_MSC_VER)
 	return _byteswap_ushort(v);
@@ -328,7 +346,7 @@ static forceinline u16 bswap16(u16 v)
 /* bswap32(v) - swap the bytes of a 32-bit integer */
 static forceinline u32 bswap32(u32 v)
 {
-#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
+#if defined(__GNUC__) || __has_builtin(__builtin_bswap32)
 	return __builtin_bswap32(v);
 #elif defined(_MSC_VER)
 	return _byteswap_ulong(v);
@@ -343,7 +361,7 @@ static forceinline u32 bswap32(u32 v)
 /* bswap64(v) - swap the bytes of a 64-bit integer */
 static forceinline u64 bswap64(u64 v)
 {
-#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+#if defined(__GNUC__) || __has_builtin(__builtin_bswap64)
 	return __builtin_bswap64(v);
 #elif defined(_MSC_VER)
 	return _byteswap_uint64(v);

diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h
@@ -35,19 +35,19 @@
 #if defined(ARCH_ARM32) || defined(ARCH_ARM64)
 
 #if !defined(FREESTANDING) && \
-    (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER)) && \
+    (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) && \
     (defined(__linux__) || \
      (defined(__APPLE__) && defined(ARCH_ARM64)) || \
      (defined(_WIN32) && defined(ARCH_ARM64)))
 #  undef HAVE_DYNAMIC_ARM_CPU_FEATURES
 #  define HAVE_DYNAMIC_ARM_CPU_FEATURES	1
 #endif
 
-#define ARM_CPU_FEATURE_NEON		0x00000001
-#define ARM_CPU_FEATURE_PMULL		0x00000002
-#define ARM_CPU_FEATURE_CRC32		0x00000004
-#define ARM_CPU_FEATURE_SHA3		0x00000008
-#define ARM_CPU_FEATURE_DOTPROD		0x00000010
+#define ARM_CPU_FEATURE_NEON		(1 << 0)
+#define ARM_CPU_FEATURE_PMULL		(1 << 1)
+#define ARM_CPU_FEATURE_CRC32		(1 << 2)
+#define ARM_CPU_FEATURE_SHA3		(1 << 3)
+#define ARM_CPU_FEATURE_DOTPROD		(1 << 4)
 
 #define HAVE_NEON(features)	(HAVE_NEON_NATIVE    || ((features) & ARM_CPU_FEATURE_NEON))
 #define HAVE_PMULL(features)	(HAVE_PMULL_NATIVE   || ((features) & ARM_CPU_FEATURE_PMULL))
@@ -56,7 +56,7 @@
 #define HAVE_DOTPROD(features)	(HAVE_DOTPROD_NATIVE || ((features) & ARM_CPU_FEATURE_DOTPROD))
 
 #if HAVE_DYNAMIC_ARM_CPU_FEATURES
-#define ARM_CPU_FEATURES_KNOWN		0x80000000
+#define ARM_CPU_FEATURES_KNOWN		(1U << 31)
 extern volatile u32 libdeflate_arm_cpu_features;
 
 void libdeflate_init_arm_cpu_features(void);
@@ -98,8 +98,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
 #if HAVE_PMULL_NATIVE || \
 	(HAVE_DYNAMIC_ARM_CPU_FEATURES && \
 	 HAVE_NEON_INTRIN /* needed to exclude soft float arm32 case */ && \
-	 (GCC_PREREQ(6, 1) || CLANG_PREREQ(3, 5, 6010000) || \
-	  defined(_MSC_VER)) && \
+	 (GCC_PREREQ(6, 1) || defined(__clang__) || defined(_MSC_VER)) && \
 	  /*
 	   * On arm32 with clang, the crypto intrinsics (which include pmull)
 	   * are not defined, even when using -mfpu=crypto-neon-fp-armv8,
@@ -179,9 +178,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
 	!defined(__ARM_ARCH_7EM__)
 #      define HAVE_CRC32_INTRIN	1
 #    endif
-#  elif CLANG_PREREQ(3, 4, 6000000)
-#    define HAVE_CRC32_INTRIN	1
-#  elif defined(_MSC_VER)
+#  elif defined(__clang__) || defined(_MSC_VER)
 #    define HAVE_CRC32_INTRIN	1
 #  endif
 #endif

diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h
@@ -104,8 +104,11 @@
 	(void)a, (void)b, (void)c, (void)d, (void)e, (void)f
 #endif
 
-/* SSE2 implementation */
-#if HAVE_SSE2_INTRIN
+/*
+ * SSE2 and AVX2 implementations.  They are very similar; the AVX2
+ * implementation just uses twice the vector width of the SSE2 one.
+ */
+#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 #  define adler32_sse2		adler32_sse2
 #  define FUNCNAME		adler32_sse2
 #  define FUNCNAME_CHUNK	adler32_sse2_chunk
@@ -117,12 +120,7 @@
  * would behave incorrectly.
  */
 #  define IMPL_MAX_CHUNK_LEN	(32 * (0x7FFF / 0xFF))
-#  if HAVE_SSE2_NATIVE
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		_target_attribute("sse2")
-#  endif
-#  include <emmintrin.h>
+#  define ATTRIBUTES		_target_attribute("sse2")
 static forceinline ATTRIBUTES void
 adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
 {
@@ -202,33 +200,14 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
 	ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, 1);
 }
 #  include "../adler32_vec_template.h"
-#endif /* HAVE_SSE2_INTRIN */
 
-/*
- * AVX2 implementation.  Basically the same as the SSE2 one, but with the vector
- * width doubled.
- */
-#if HAVE_AVX2_INTRIN
 #  define adler32_avx2		adler32_avx2
 #  define FUNCNAME		adler32_avx2
 #  define FUNCNAME_CHUNK	adler32_avx2_chunk
 #  define IMPL_ALIGNMENT	32
 #  define IMPL_SEGMENT_LEN	64
 #  define IMPL_MAX_CHUNK_LEN	(64 * (0x7FFF / 0xFF))
-#  if HAVE_AVX2_NATIVE
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		_target_attribute("avx2")
-#  endif
-#  include <immintrin.h>
-  /*
-   * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
-   * including some sub-headers.
-   */
-#  if defined(__clang__) && defined(_MSC_VER)
-#    include <avxintrin.h>
-#    include <avx2intrin.h>
-#  endif
+#  define ATTRIBUTES		_target_attribute("avx2")
 static forceinline ATTRIBUTES void
 adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
 {
@@ -282,38 +261,21 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
 	ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, 1);
 }
 #  include "../adler32_vec_template.h"
-#endif /* HAVE_AVX2_INTRIN */
+#endif
 
 /*
  * AVX2/AVX-VNNI implementation.  This is similar to the AVX512BW/AVX512VNNI
  * implementation, but instead of using AVX-512 it uses AVX2 plus AVX-VNNI.
  * AVX-VNNI adds dot product instructions to CPUs without AVX-512.
  */
-#if HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN
+#if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930)
 #  define adler32_avx2_vnni	adler32_avx2_vnni
 #  define FUNCNAME		adler32_avx2_vnni
 #  define FUNCNAME_CHUNK	adler32_avx2_vnni_chunk
 #  define IMPL_ALIGNMENT	32
 #  define IMPL_SEGMENT_LEN	128
 #  define IMPL_MAX_CHUNK_LEN	MAX_CHUNK_LEN
-#  if HAVE_AVX2_NATIVE && HAVE_AVXVNNI_NATIVE
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		_target_attribute("avx2,avxvnni")
-#  endif
-#  include <immintrin.h>
-  /*
-   * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
-   * including some sub-headers.
-   */
-#  if defined(__clang__) && defined(_MSC_VER)
-#    include <tmmintrin.h>
-#    include <smmintrin.h>
-#    include <wmmintrin.h>
-#    include <avxintrin.h>
-#    include <avx2intrin.h>
-#    include <avxvnniintrin.h>
-#  endif
+#  define ATTRIBUTES		_target_attribute("avx2,avxvnni")
 static forceinline ATTRIBUTES void
 adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end,
 			u32 *s1, u32 *s2)
@@ -372,39 +334,20 @@ adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end,
 	ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a, 1);
 }
 #  include "../adler32_vec_template.h"
-#endif /* HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN */
+#endif
 
 /*
  * AVX512BW/AVX512VNNI implementation.  Uses the vpdpbusd (dot product)
  * instruction from AVX512VNNI.
  */
-#if HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN
+#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)
 #  define adler32_avx512_vnni	adler32_avx512_vnni
 #  define FUNCNAME		adler32_avx512_vnni
 #  define FUNCNAME_CHUNK	adler32_avx512_vnni_chunk
 #  define IMPL_ALIGNMENT	64
 #  define IMPL_SEGMENT_LEN	128
 #  define IMPL_MAX_CHUNK_LEN	MAX_CHUNK_LEN
-#  if HAVE_AVX512BW_NATIVE && HAVE_AVX512VNNI_NATIVE
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		_target_attribute("avx512bw,avx512vnni")
-#  endif
-#  include <immintrin.h>
-  /*
-   * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
-   * including some sub-headers.
-   */
-#  if defined(__clang__) && defined(_MSC_VER)
-#    include <tmmintrin.h>
-#    include <smmintrin.h>
-#    include <wmmintrin.h>
-#    include <avxintrin.h>
-#    include <avx2intrin.h>
-#    include <avx512fintrin.h>
-#    include <avx512bwintrin.h>
-#    include <avx512vnniintrin.h>
-#  endif
+#  define ATTRIBUTES		_target_attribute("avx512bw,avx512vnni")
 static forceinline ATTRIBUTES void
 adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end,
 			  u32 *s1, u32 *s2)
@@ -452,7 +395,7 @@ adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end,
 	ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a, 0);
 }
 #  include "../adler32_vec_template.h"
-#endif /* HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN */
+#endif
 
 static inline adler32_func_t
 arch_select_adler32_func(void)

diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
@@ -30,16 +30,6 @@
 
 #if HAVE_DYNAMIC_X86_CPU_FEATURES
 
-/*
- * With old GCC versions we have to manually save and restore the x86_32 PIC
- * register (ebx).  See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602
- */
-#if defined(ARCH_X86_32) && defined(__PIC__)
-#  define EBX_CONSTRAINT "=&r"
-#else
-#  define EBX_CONSTRAINT "=b"
-#endif
-
 /* Execute the CPUID instruction. */
 static inline void
 cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
@@ -56,7 +46,7 @@ cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
 	__asm__ volatile(".ifnc %%ebx, %1; mov  %%ebx, %1; .endif\n"
 			 "cpuid                                  \n"
 			 ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
-			 : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
+			 : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
 			 : "a" (leaf), "c" (subleaf));
 #endif
 }