diff --git a/README.md b/README.md
index 81cfc867..d7fa34cb 100644
--- a/README.md
+++ b/README.md
@@ -77,11 +77,28 @@ You should compile both `lib/*.c` and `lib/*/*.c`.  You don't need to worry
 about excluding irrelevant architecture-specific code, as this is already
 handled in the source files themselves using `#ifdef`s.
 
-It is strongly recommended to use either gcc or clang, and to use `-O2`.
-
 If you are doing a freestanding build with `-ffreestanding`, you must add
 `-DFREESTANDING` as well (matching what the `CMakeLists.txt` does).
 
+## Supported compilers
+
+- gcc: v4.9 and later
+- clang: v3.9 and later (upstream), Xcode 8 and later (Apple)
+- MSVC: Visual Studio 2015 and later
+- Other compilers: any other C99-compatible compiler should work, though if your
+  compiler pretends to be gcc, clang, or MSVC, it needs to be sufficiently
+  compatible with the compiler it pretends to be.
+
+The above are the minimums, but using a newer compiler allows more of the
+architecture-optimized code to be built.  libdeflate is most heavily optimized
+for gcc and clang, but MSVC is supported fairly well now too.
+
+The recommended optimization flag is `-O2`, and the `CMakeLists.txt` sets this
+for release builds.  `-O3` is fine too, but often `-O2` actually gives better
+results.  It's unnecessary to add flags such as `-mavx2` or `/arch:AVX2`, though
+you can do so if you want to.  Most of the relevant optimized functions are
+built regardless of such flags, and appropriate ones are selected at runtime.
+
 # API
 
 libdeflate has a simple API that is not zlib-compatible.  You can create
diff --git a/common_defs.h b/common_defs.h
index 0a155371..a3773c4e 100644
--- a/common_defs.h
+++ b/common_defs.h
@@ -135,6 +135,9 @@ typedef size_t machine_word_t;
 #  define GCC_PREREQ(major, minor)		\
 	(__GNUC__ > (major) ||			\
 	 (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+#  if !GCC_PREREQ(4, 9)
+#    error "gcc versions older than 4.9 are no longer supported"
+#  endif
 #else
 #  define GCC_PREREQ(major, minor)	0
 #endif
@@ -147,18 +150,35 @@ typedef size_t machine_word_t;
 	(__clang_major__ > (major) ||			\
 	 (__clang_major__ == (major) && __clang_minor__ >= (minor)))
 #  endif
+#  if !CLANG_PREREQ(3, 9, 8000000)
+#    error "clang versions older than 3.9 are no longer supported"
+#  endif
 #else
 #  define CLANG_PREREQ(major, minor, apple_version)	0
 #endif
+#ifdef _MSC_VER
+#  define MSVC_PREREQ(version)	(_MSC_VER >= (version))
+#  if !MSVC_PREREQ(1900)
+#    error "MSVC versions older than Visual Studio 2015 are no longer supported"
+#  endif
+#else
+#  define MSVC_PREREQ(version)	0
+#endif
 
 /*
- * Macros to check for compiler support for attributes and builtins.  clang
- * implements these macros, but gcc doesn't, so generally any use of one of
- * these macros must also be combined with a gcc version check.
+ * __has_attribute(attribute) - check whether the compiler supports the given
+ * attribute (and also supports doing the check in the first place).  Mostly
+ * useful just for clang, since gcc didn't add this macro until gcc 5.
  */
 #ifndef __has_attribute
 #  define __has_attribute(attribute)	0
 #endif
+
+/*
+ * __has_builtin(builtin) - check whether the compiler supports the given
+ * builtin (and also supports doing the check in the first place).  Mostly
+ * useful just for clang, since gcc didn't add this macro until gcc 10.
+ */
 #ifndef __has_builtin
 #  define __has_builtin(builtin)	0
 #endif
@@ -266,12 +286,10 @@ typedef size_t machine_word_t;
  * code as well as the corresponding intrinsics.  On other compilers this macro
  * expands to nothing, though MSVC allows intrinsics to be used anywhere anyway.
  */
-#if GCC_PREREQ(4, 4) || __has_attribute(target)
+#if defined(__GNUC__) || __has_attribute(target)
 #  define _target_attribute(attrs)	__attribute__((target(attrs)))
-#  define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE	1
 #else
 #  define _target_attribute(attrs)
-#  define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE	0
 #endif
 
 /* ========================================================================== */
@@ -316,7 +334,7 @@ static forceinline bool CPU_IS_LITTLE_ENDIAN(void)
 /* bswap16(v) - swap the bytes of a 16-bit integer */
 static forceinline u16 bswap16(u16 v)
 {
-#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+#if defined(__GNUC__) || __has_builtin(__builtin_bswap16)
 	return __builtin_bswap16(v);
 #elif defined(_MSC_VER)
 	return _byteswap_ushort(v);
@@ -328,7 +346,7 @@ static forceinline u16 bswap16(u16 v)
 /* bswap32(v) - swap the bytes of a 32-bit integer */
 static forceinline u32 bswap32(u32 v)
 {
-#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
+#if defined(__GNUC__) || __has_builtin(__builtin_bswap32)
 	return __builtin_bswap32(v);
 #elif defined(_MSC_VER)
 	return _byteswap_ulong(v);
@@ -343,7 +361,7 @@ static forceinline u32 bswap32(u32 v)
 /* bswap64(v) - swap the bytes of a 64-bit integer */
 static forceinline u64 bswap64(u64 v)
 {
-#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+#if defined(__GNUC__) || __has_builtin(__builtin_bswap64)
 	return __builtin_bswap64(v);
 #elif defined(_MSC_VER)
 	return _byteswap_uint64(v);
diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h
index c55f007c..30920c6b 100644
--- a/lib/arm/cpu_features.h
+++ b/lib/arm/cpu_features.h
@@ -35,7 +35,7 @@
 #if defined(ARCH_ARM32) || defined(ARCH_ARM64)
 
 #if !defined(FREESTANDING) && \
-    (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER)) && \
+    (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) && \
     (defined(__linux__) || \
      (defined(__APPLE__) && defined(ARCH_ARM64)) || \
      (defined(_WIN32) && defined(ARCH_ARM64)))
@@ -43,11 +43,11 @@
 #  define HAVE_DYNAMIC_ARM_CPU_FEATURES	1
 #endif
 
-#define ARM_CPU_FEATURE_NEON		0x00000001
-#define ARM_CPU_FEATURE_PMULL		0x00000002
-#define ARM_CPU_FEATURE_CRC32		0x00000004
-#define ARM_CPU_FEATURE_SHA3		0x00000008
-#define ARM_CPU_FEATURE_DOTPROD		0x00000010
+#define ARM_CPU_FEATURE_NEON		(1 << 0)
+#define ARM_CPU_FEATURE_PMULL		(1 << 1)
+#define ARM_CPU_FEATURE_CRC32		(1 << 2)
+#define ARM_CPU_FEATURE_SHA3		(1 << 3)
+#define ARM_CPU_FEATURE_DOTPROD		(1 << 4)
 
 #define HAVE_NEON(features)	(HAVE_NEON_NATIVE    || ((features) & ARM_CPU_FEATURE_NEON))
 #define HAVE_PMULL(features)	(HAVE_PMULL_NATIVE   || ((features) & ARM_CPU_FEATURE_PMULL))
@@ -56,7 +56,7 @@
 #define HAVE_DOTPROD(features)	(HAVE_DOTPROD_NATIVE || ((features) & ARM_CPU_FEATURE_DOTPROD))
 
 #if HAVE_DYNAMIC_ARM_CPU_FEATURES
-#define ARM_CPU_FEATURES_KNOWN		0x80000000
+#define ARM_CPU_FEATURES_KNOWN		(1U << 31)
 extern volatile u32 libdeflate_arm_cpu_features;
 
 void libdeflate_init_arm_cpu_features(void);
@@ -98,8 +98,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
 #if HAVE_PMULL_NATIVE || \
 	(HAVE_DYNAMIC_ARM_CPU_FEATURES && \
 	 HAVE_NEON_INTRIN /* needed to exclude soft float arm32 case */ && \
-	 (GCC_PREREQ(6, 1) || CLANG_PREREQ(3, 5, 6010000) || \
-	  defined(_MSC_VER)) && \
+	 (GCC_PREREQ(6, 1) || defined(__clang__) || defined(_MSC_VER)) && \
 	  /*
 	   * On arm32 with clang, the crypto intrinsics (which include pmull)
 	   * are not defined, even when using -mfpu=crypto-neon-fp-armv8,
@@ -179,9 +178,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
 	!defined(__ARM_ARCH_7EM__)
 #      define HAVE_CRC32_INTRIN	1
 #    endif
-#  elif CLANG_PREREQ(3, 4, 6000000)
-#    define HAVE_CRC32_INTRIN	1
-#  elif defined(_MSC_VER)
+#  elif defined(__clang__) || defined(_MSC_VER)
 #    define HAVE_CRC32_INTRIN	1
 #  endif
 #endif
diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h
index 618c30cc..7b3f02ac 100644
--- a/lib/x86/adler32_impl.h
+++ b/lib/x86/adler32_impl.h
@@ -104,8 +104,11 @@
 	(void)a, (void)b, (void)c, (void)d, (void)e, (void)f
 #endif
 
-/* SSE2 implementation */
-#if HAVE_SSE2_INTRIN
+/*
+ * SSE2 and AVX2 implementations.  They are very similar; the AVX2
+ * implementation just uses twice the vector width as the SSE2 one.
+ */
+#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 #  define adler32_sse2		adler32_sse2
 #  define FUNCNAME		adler32_sse2
 #  define FUNCNAME_CHUNK	adler32_sse2_chunk
@@ -117,12 +120,7 @@
  * would behave incorrectly.
  */
 #  define IMPL_MAX_CHUNK_LEN	(32 * (0x7FFF / 0xFF))
-#  if HAVE_SSE2_NATIVE
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		_target_attribute("sse2")
-#  endif
-#  include <emmintrin.h>
+#  define ATTRIBUTES		_target_attribute("sse2")
 static forceinline ATTRIBUTES void
 adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
 {
@@ -202,33 +200,14 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
 	ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, 1);
 }
 #  include "../adler32_vec_template.h"
-#endif /* HAVE_SSE2_INTRIN */
 
-/*
- * AVX2 implementation.  Basically the same as the SSE2 one, but with the vector
- * width doubled.
- */
-#if HAVE_AVX2_INTRIN
 #  define adler32_avx2		adler32_avx2
 #  define FUNCNAME		adler32_avx2
 #  define FUNCNAME_CHUNK	adler32_avx2_chunk
 #  define IMPL_ALIGNMENT	32
 #  define IMPL_SEGMENT_LEN	64
 #  define IMPL_MAX_CHUNK_LEN	(64 * (0x7FFF / 0xFF))
-#  if HAVE_AVX2_NATIVE
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		_target_attribute("avx2")
-#  endif
-#  include <immintrin.h>
-  /*
-   * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
-   * including some sub-headers.
-   */
-#  if defined(__clang__) && defined(_MSC_VER)
-#    include <avxintrin.h>
-#    include <avx2intrin.h>
-#  endif
+#  define ATTRIBUTES		_target_attribute("avx2")
 static forceinline ATTRIBUTES void
 adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
 {
@@ -282,38 +261,21 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
 	ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, 1);
 }
 #  include "../adler32_vec_template.h"
-#endif /* HAVE_AVX2_INTRIN */
+#endif
 
 /*
  * AVX2/AVX-VNNI implementation.  This is similar to the AVX512BW/AVX512VNNI
  * implementation, but instead of using AVX-512 it uses AVX2 plus AVX-VNNI.
  * AVX-VNNI adds dot product instructions to CPUs without AVX-512.
  */
-#if HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN
+#if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930)
 #  define adler32_avx2_vnni	adler32_avx2_vnni
 #  define FUNCNAME		adler32_avx2_vnni
 #  define FUNCNAME_CHUNK	adler32_avx2_vnni_chunk
 #  define IMPL_ALIGNMENT	32
 #  define IMPL_SEGMENT_LEN	128
 #  define IMPL_MAX_CHUNK_LEN	MAX_CHUNK_LEN
-#  if HAVE_AVX2_NATIVE && HAVE_AVXVNNI_NATIVE
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		_target_attribute("avx2,avxvnni")
-#  endif
-#  include <immintrin.h>
-  /*
-   * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
-   * including some sub-headers.
-   */
-#  if defined(__clang__) && defined(_MSC_VER)
-#    include <tmmintrin.h>
-#    include <smmintrin.h>
-#    include <wmmintrin.h>
-#    include <avxintrin.h>
-#    include <avx2intrin.h>
-#    include <avxvnniintrin.h>
-#  endif
+#  define ATTRIBUTES		_target_attribute("avx2,avxvnni")
 static forceinline ATTRIBUTES void
 adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end,
 			u32 *s1, u32 *s2)
@@ -372,39 +334,20 @@ adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end,
 	ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a, 1);
 }
 #  include "../adler32_vec_template.h"
-#endif /* HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN */
+#endif
 
 /*
  * AVX512BW/AVX512VNNI implementation.  Uses the vpdpbusd (dot product)
  * instruction from AVX512VNNI.
  */
-#if HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN
+#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)
 #  define adler32_avx512_vnni	adler32_avx512_vnni
 #  define FUNCNAME		adler32_avx512_vnni
 #  define FUNCNAME_CHUNK	adler32_avx512_vnni_chunk
 #  define IMPL_ALIGNMENT	64
 #  define IMPL_SEGMENT_LEN	128
 #  define IMPL_MAX_CHUNK_LEN	MAX_CHUNK_LEN
-#  if HAVE_AVX512BW_NATIVE && HAVE_AVX512VNNI_NATIVE
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		_target_attribute("avx512bw,avx512vnni")
-#  endif
-#  include <immintrin.h>
-  /*
-   * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
-   * including some sub-headers.
-   */
-#  if defined(__clang__) && defined(_MSC_VER)
-#    include <tmmintrin.h>
-#    include <smmintrin.h>
-#    include <wmmintrin.h>
-#    include <avxintrin.h>
-#    include <avx2intrin.h>
-#    include <avx512fintrin.h>
-#    include <avx512bwintrin.h>
-#    include <avx512vnniintrin.h>
-#  endif
+#  define ATTRIBUTES		_target_attribute("avx512bw,avx512vnni")
 static forceinline ATTRIBUTES void
 adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end,
 			  u32 *s1, u32 *s2)
@@ -452,7 +395,7 @@ adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end,
 	ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a, 0);
 }
 #  include "../adler32_vec_template.h"
-#endif /* HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN */
+#endif
 
 static inline adler32_func_t
 arch_select_adler32_func(void)
diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
index 227fd221..ebe5c569 100644
--- a/lib/x86/cpu_features.c
+++ b/lib/x86/cpu_features.c
@@ -30,16 +30,6 @@
 
 #if HAVE_DYNAMIC_X86_CPU_FEATURES
 
-/*
- * With old GCC versions we have to manually save and restore the x86_32 PIC
- * register (ebx).  See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602
- */
-#if defined(ARCH_X86_32) && defined(__PIC__)
-#  define EBX_CONSTRAINT "=&r"
-#else
-#  define EBX_CONSTRAINT "=b"
-#endif
-
 /* Execute the CPUID instruction. */
 static inline void
 cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
@@ -56,7 +46,7 @@ cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
 	__asm__ volatile(".ifnc %%ebx, %1; mov  %%ebx, %1; .endif\n"
 			 "cpuid                                  \n"
 			 ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
-			 : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
+			 : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
 			 : "a" (leaf), "c" (subleaf));
 #endif
 }
diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h
index 4e14f2a8..b4c00118 100644
--- a/lib/x86/cpu_features.h
+++ b/lib/x86/cpu_features.h
@@ -34,44 +34,32 @@
 
 #if defined(ARCH_X86_32) || defined(ARCH_X86_64)
 
-#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER)
+#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 #  undef HAVE_DYNAMIC_X86_CPU_FEATURES
 #  define HAVE_DYNAMIC_X86_CPU_FEATURES	1
 #endif
 
-#define X86_CPU_FEATURE_SSE2		0x00000001
-#define X86_CPU_FEATURE_PCLMULQDQ	0x00000002
-#define X86_CPU_FEATURE_AVX		0x00000004
-#define X86_CPU_FEATURE_AVX2		0x00000008
-#define X86_CPU_FEATURE_BMI2		0x00000010
+#define X86_CPU_FEATURE_SSE2		(1 << 0)
+#define X86_CPU_FEATURE_PCLMULQDQ	(1 << 1)
+#define X86_CPU_FEATURE_AVX		(1 << 2)
+#define X86_CPU_FEATURE_AVX2		(1 << 3)
+#define X86_CPU_FEATURE_BMI2		(1 << 4)
 /*
  * ZMM indicates whether 512-bit vectors (zmm registers) should be used.  On
  * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU
  * supports it, i.e. even if AVX512F is set.  On these CPUs, we may still use
  * AVX-512 instructions, but only with ymm and xmm registers.
  */
-#define X86_CPU_FEATURE_ZMM		0x00000020
-#define X86_CPU_FEATURE_AVX512F		0x00000040
-#define X86_CPU_FEATURE_AVX512BW	0x00000080
-#define X86_CPU_FEATURE_AVX512VL	0x00000100
-#define X86_CPU_FEATURE_VPCLMULQDQ	0x00000200
-#define X86_CPU_FEATURE_AVX512VNNI	0x00000400
-#define X86_CPU_FEATURE_AVXVNNI		0x00000800
-
-#define HAVE_SSE2(features)	  (HAVE_SSE2_NATIVE	  || ((features) & X86_CPU_FEATURE_SSE2))
-#define HAVE_PCLMULQDQ(features)  (HAVE_PCLMULQDQ_NATIVE  || ((features) & X86_CPU_FEATURE_PCLMULQDQ))
-#define HAVE_AVX(features)	  (HAVE_AVX_NATIVE	  || ((features) & X86_CPU_FEATURE_AVX))
-#define HAVE_AVX2(features)	  (HAVE_AVX2_NATIVE	  || ((features) & X86_CPU_FEATURE_AVX2))
-#define HAVE_BMI2(features)	  (HAVE_BMI2_NATIVE	  || ((features) & X86_CPU_FEATURE_BMI2))
-#define HAVE_AVX512F(features)	  (HAVE_AVX512F_NATIVE	  || ((features) & X86_CPU_FEATURE_AVX512F))
-#define HAVE_AVX512BW(features)	  (HAVE_AVX512BW_NATIVE	  || ((features) & X86_CPU_FEATURE_AVX512BW))
-#define HAVE_AVX512VL(features)	  (HAVE_AVX512VL_NATIVE	  || ((features) & X86_CPU_FEATURE_AVX512VL))
-#define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ))
-#define HAVE_AVX512VNNI(features) (HAVE_AVX512VNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VNNI))
-#define HAVE_AVXVNNI(features)	  (HAVE_AVXVNNI_NATIVE	  || ((features) & X86_CPU_FEATURE_AVXVNNI))
+#define X86_CPU_FEATURE_ZMM		(1 << 5)
+#define X86_CPU_FEATURE_AVX512F		(1 << 6)
+#define X86_CPU_FEATURE_AVX512BW	(1 << 7)
+#define X86_CPU_FEATURE_AVX512VL	(1 << 8)
+#define X86_CPU_FEATURE_VPCLMULQDQ	(1 << 9)
+#define X86_CPU_FEATURE_AVX512VNNI	(1 << 10)
+#define X86_CPU_FEATURE_AVXVNNI		(1 << 11)
 
 #if HAVE_DYNAMIC_X86_CPU_FEATURES
-#define X86_CPU_FEATURES_KNOWN		0x80000000
+#define X86_CPU_FEATURES_KNOWN		(1U << 31)
 extern volatile u32 libdeflate_x86_cpu_features;
 
 void libdeflate_init_x86_cpu_features(void);
@@ -86,171 +74,103 @@ static inline u32 get_x86_cpu_features(void)
 static inline u32 get_x86_cpu_features(void) { return 0; }
 #endif /* !HAVE_DYNAMIC_X86_CPU_FEATURES */
 
-/*
- * Prior to gcc 4.9 (r200349) and clang 3.8 (r239883), x86 intrinsics not
- * available in the main target couldn't be used in 'target' attribute
- * functions.  Unfortunately clang has no feature test macro for this, so we
- * have to check its version.
- */
-#if HAVE_DYNAMIC_X86_CPU_FEATURES && \
-	(GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000) || defined(_MSC_VER))
-#  define HAVE_TARGET_INTRINSICS	1
-#else
-#  define HAVE_TARGET_INTRINSICS	0
-#endif
-
-/* SSE2 */
 #if defined(__SSE2__) || \
 	(defined(_MSC_VER) && \
 	 (defined(ARCH_X86_64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)))
-#  define HAVE_SSE2_NATIVE	1
+#  define HAVE_SSE2(features)		1
+#  define HAVE_SSE2_NATIVE		1
 #else
-#  define HAVE_SSE2_NATIVE	0
+#  define HAVE_SSE2(features)		((features) & X86_CPU_FEATURE_SSE2)
+#  define HAVE_SSE2_NATIVE		0
 #endif
-#define HAVE_SSE2_INTRIN	(HAVE_SSE2_NATIVE || HAVE_TARGET_INTRINSICS)
 
-/* PCLMULQDQ */
 #if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__))
-#  define HAVE_PCLMULQDQ_NATIVE	1
-#else
-#  define HAVE_PCLMULQDQ_NATIVE	0
-#endif
-#if HAVE_PCLMULQDQ_NATIVE || (HAVE_TARGET_INTRINSICS && \
-			      (GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \
-			       defined(_MSC_VER)))
-#  define HAVE_PCLMULQDQ_INTRIN	1
+#  define HAVE_PCLMULQDQ(features)	1
 #else
-#  define HAVE_PCLMULQDQ_INTRIN	0
+#  define HAVE_PCLMULQDQ(features)	((features) & X86_CPU_FEATURE_PCLMULQDQ)
 #endif
 
-/* AVX */
 #ifdef __AVX__
-#  define HAVE_AVX_NATIVE	1
+#  define HAVE_AVX(features)		1
 #else
-#  define HAVE_AVX_NATIVE	0
-#endif
-#if HAVE_AVX_NATIVE || (HAVE_TARGET_INTRINSICS && \
-			(GCC_PREREQ(4, 6) || CLANG_PREREQ(3, 0, 0) || \
-			 defined(_MSC_VER)))
-#  define HAVE_AVX_INTRIN	1
-#else
-#  define HAVE_AVX_INTRIN	0
+#  define HAVE_AVX(features)		((features) & X86_CPU_FEATURE_AVX)
 #endif
 
-/* AVX2 */
 #ifdef __AVX2__
-#  define HAVE_AVX2_NATIVE	1
-#else
-#  define HAVE_AVX2_NATIVE	0
-#endif
-#if HAVE_AVX2_NATIVE || (HAVE_TARGET_INTRINSICS && \
-			 (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \
-			  defined(_MSC_VER)))
-#  define HAVE_AVX2_INTRIN	1
+#  define HAVE_AVX2(features)		1
 #else
-#  define HAVE_AVX2_INTRIN	0
+#  define HAVE_AVX2(features)		((features) & X86_CPU_FEATURE_AVX2)
 #endif
 
-/* BMI2 */
 #if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
-#  define HAVE_BMI2_NATIVE	1
+#  define HAVE_BMI2(features)		1
+#  define HAVE_BMI2_NATIVE		1
 #else
-#  define HAVE_BMI2_NATIVE	0
-#endif
-#if HAVE_BMI2_NATIVE || (HAVE_TARGET_INTRINSICS && \
-			 (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \
-			  defined(_MSC_VER)))
-#  define HAVE_BMI2_INTRIN	1
-#else
-#  define HAVE_BMI2_INTRIN	0
-#endif
-/*
- * MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*()
- * intrinsics.  It seems to be fixed in VS2022.
- */
-#if defined(_MSC_VER) && _MSC_VER < 1930 /* older than VS2022 (toolset v143) */
-#  undef HAVE_BMI2_NATIVE
-#  undef HAVE_BMI2_INTRIN
-#  define HAVE_BMI2_NATIVE	0
-#  define HAVE_BMI2_INTRIN	0
+#  define HAVE_BMI2(features)		((features) & X86_CPU_FEATURE_BMI2)
+#  define HAVE_BMI2_NATIVE		0
 #endif
 
-/* AVX512F */
 #ifdef __AVX512F__
-#  define HAVE_AVX512F_NATIVE	1
-#else
-#  define HAVE_AVX512F_NATIVE	0
-#endif
-#if HAVE_AVX512F_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0) || \
-			   defined(_MSC_VER)
-#  define HAVE_AVX512F_INTRIN	1
+#  define HAVE_AVX512F(features)	1
 #else
-#  define HAVE_AVX512F_INTRIN	0
+#  define HAVE_AVX512F(features)	((features) & X86_CPU_FEATURE_AVX512F)
 #endif
 
-/* AVX512BW */
 #ifdef __AVX512BW__
-#  define HAVE_AVX512BW_NATIVE	1
+#  define HAVE_AVX512BW(features)	1
 #else
-#  define HAVE_AVX512BW_NATIVE	0
-#endif
-#if HAVE_AVX512BW_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 9, 0) || \
-			    defined(_MSC_VER)
-#  define HAVE_AVX512BW_INTRIN	1
-#else
-#  define HAVE_AVX512BW_INTRIN	0
+#  define HAVE_AVX512BW(features)	((features) & X86_CPU_FEATURE_AVX512BW)
 #endif
 
-/* AVX512VL */
 #ifdef __AVX512VL__
-#  define HAVE_AVX512VL_NATIVE	1
-#else
-#  define HAVE_AVX512VL_NATIVE	0
-#endif
-#if HAVE_AVX512VL_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0) || \
-			    defined(_MSC_VER)
-#  define HAVE_AVX512VL_INTRIN	1
+#  define HAVE_AVX512VL(features)	1
 #else
-#  define HAVE_AVX512VL_INTRIN	0
+#  define HAVE_AVX512VL(features)	((features) & X86_CPU_FEATURE_AVX512VL)
 #endif
 
-/* VPCLMULQDQ */
 #ifdef __VPCLMULQDQ__
-#  define HAVE_VPCLMULQDQ_NATIVE	1
+#  define HAVE_VPCLMULQDQ(features)	1
 #else
-#  define HAVE_VPCLMULQDQ_NATIVE	0
-#endif
-#if HAVE_VPCLMULQDQ_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \
-			      defined(_MSC_VER)
-#  define HAVE_VPCLMULQDQ_INTRIN	1
-#else
-#  define HAVE_VPCLMULQDQ_INTRIN	0
+#  define HAVE_VPCLMULQDQ(features)	((features) & X86_CPU_FEATURE_VPCLMULQDQ)
 #endif
 
-/* AVX512VNNI */
 #ifdef __AVX512VNNI__
-#  define HAVE_AVX512VNNI_NATIVE	1
-#else
-#  define HAVE_AVX512VNNI_NATIVE	0
-#endif
-#if HAVE_AVX512VNNI_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \
-			      defined(_MSC_VER)
-#  define HAVE_AVX512VNNI_INTRIN	1
+#  define HAVE_AVX512VNNI(features)	1
 #else
-#  define HAVE_AVX512VNNI_INTRIN	0
+#  define HAVE_AVX512VNNI(features)	((features) & X86_CPU_FEATURE_AVX512VNNI)
 #endif
 
-/* AVX-VNNI */
 #ifdef __AVXVNNI__
-#  define HAVE_AVXVNNI_NATIVE	1
-#else
-#  define HAVE_AVXVNNI_NATIVE	0
-#endif
-#if HAVE_AVXVNNI_NATIVE || GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 0) || \
-			   defined(_MSC_VER)
-#  define HAVE_AVXVNNI_INTRIN	1
-#else
-#  define HAVE_AVXVNNI_INTRIN	0
+#  define HAVE_AVXVNNI(features)	1
+#else
+#  define HAVE_AVXVNNI(features)	((features) & X86_CPU_FEATURE_AVXVNNI)
+#endif
+
+#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
+#  include <immintrin.h>
+#endif
+#if defined(_MSC_VER) && defined(__clang__)
+  /*
+   * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
+   * including some sub-headers.
+   */
+#  include <tmmintrin.h>
+#  include <smmintrin.h>
+#  include <wmmintrin.h>
+#  include <avxintrin.h>
+#  include <avx2intrin.h>
+#  include <avx512fintrin.h>
+#  include <avx512bwintrin.h>
+#  include <avx512vlintrin.h>
+#  if __has_include(<vpclmulqdqintrin.h>)
+#    include <vpclmulqdqintrin.h>
+#  endif
+#  if __has_include(<avx512vnniintrin.h>)
+#    include <avx512vnniintrin.h>
+#  endif
+#  if __has_include(<avxvnniintrin.h>)
+#    include <avxvnniintrin.h>
+#  endif
 #endif
 
 #endif /* ARCH_X86_32 || ARCH_X86_64 */
diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h
index d50547bd..3d8e254d 100644
--- a/lib/x86/crc32_impl.h
+++ b/lib/x86/crc32_impl.h
@@ -30,20 +30,15 @@
 
 #include "cpu_features.h"
 
+#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 /* PCLMULQDQ implementation */
-#if HAVE_PCLMULQDQ_INTRIN
 #  define crc32_x86_pclmulqdq	crc32_x86_pclmulqdq
 #  define SUFFIX			 _pclmulqdq
-#  if HAVE_PCLMULQDQ_NATIVE
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		_target_attribute("pclmul")
-#  endif
+#  define ATTRIBUTES		_target_attribute("pclmul")
 #  define VL			16
 #  define FOLD_LESSTHAN16BYTES	0
 #  define USE_TERNARYLOGIC	0
 #  include "crc32_pclmul_template.h"
-#endif
 
 /*
  * PCLMULQDQ/AVX implementation.  Compared to the regular PCLMULQDQ
@@ -55,84 +50,57 @@
  * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient
  * handling of partial blocks.  (We *could* compile a variant with
  * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we don't currently bother.)
- *
- * FIXME: with MSVC, this isn't actually compiled with AVX code generation
- * enabled yet.  That would require that this be moved to its own .c file.
  */
-#if HAVE_PCLMULQDQ_INTRIN && HAVE_AVX_INTRIN
 #  define crc32_x86_pclmulqdq_avx	crc32_x86_pclmulqdq_avx
 #  define SUFFIX				 _pclmulqdq_avx
-#  if HAVE_PCLMULQDQ_NATIVE && HAVE_AVX_NATIVE
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		_target_attribute("pclmul,avx")
-#  endif
+#  define ATTRIBUTES		_target_attribute("pclmul,avx")
 #  define VL			16
 #  define FOLD_LESSTHAN16BYTES	1
 #  define USE_TERNARYLOGIC	0
 #  include "crc32_pclmul_template.h"
 #endif
 
-/* VPCLMULQDQ/AVX2 implementation.  Uses 256-bit vectors. */
-#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && HAVE_AVX2_INTRIN && \
-	/*
-	 * This has to be disabled on MSVC because MSVC has a bug where it
-	 * incorrectly assumes that VPCLMULQDQ implies AVX-512:
-	 * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest
-	 */ \
-	!(defined(_MSC_VER) && !defined(__clang__))
+/*
+ * VPCLMULQDQ/AVX2 implementation.  Uses 256-bit vectors.
+ *
+ * Currently this can't be enabled with MSVC because MSVC has a bug where it
+ * incorrectly assumes that VPCLMULQDQ implies AVX-512:
+ * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest
+ */
+#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000)
 #  define crc32_x86_vpclmulqdq_avx2	crc32_x86_vpclmulqdq_avx2
 #  define SUFFIX				 _vpclmulqdq_avx2
-#  if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && HAVE_AVX2_NATIVE
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx2")
-#  endif
+#  define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx2")
 #  define VL			32
 #  define FOLD_LESSTHAN16BYTES	1
 #  define USE_TERNARYLOGIC	0
 #  include "crc32_pclmul_template.h"
 #endif
 
+#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)
 /*
  * VPCLMULQDQ/AVX512 implementation with 256-bit vectors.  This takes advantage
  * of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit.
  * This can be useful on CPUs where 512-bit vectors cause downclocking.
  */
-#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \
-	HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN
 #  define crc32_x86_vpclmulqdq_avx512_vl256  crc32_x86_vpclmulqdq_avx512_vl256
 #  define SUFFIX				      _vpclmulqdq_avx512_vl256
-#  if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \
-	HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512vl")
-#  endif
+#  define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512vl")
 #  define VL			32
 #  define FOLD_LESSTHAN16BYTES	1
 #  define USE_TERNARYLOGIC	1
 #  include "crc32_pclmul_template.h"
-#endif
 
 /* VPCLMULQDQ/AVX512 implementation with 512-bit vectors */
-#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \
-	HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN
 #  define crc32_x86_vpclmulqdq_avx512_vl512  crc32_x86_vpclmulqdq_avx512_vl512
 #  define SUFFIX				      _vpclmulqdq_avx512_vl512
-#  if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \
-	HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE
-#    define ATTRIBUTES
-#  else
-#    define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512vl")
-#  endif
+#  define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512vl")
 #  define VL			64
 #  define FOLD_LESSTHAN16BYTES	1
 #  define USE_TERNARYLOGIC	1
 #  include "crc32_pclmul_template.h"
 #endif
 
-/* Choose the best implementation at runtime. */
 static inline crc32_func_t
 arch_select_crc32_func(void)
 {
diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h
index eb06262b..4257d449 100644
--- a/lib/x86/crc32_pclmul_template.h
+++ b/lib/x86/crc32_pclmul_template.h
@@ -62,22 +62,6 @@
  * or AVX512VL, or four in combination with AVX512F.
  */
 
-#include <immintrin.h>
-/*
- * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
- * including some sub-headers.
- */
-#if defined(__clang__) && defined(_MSC_VER)
-#  include <tmmintrin.h>
-#  include <smmintrin.h>
-#  include <wmmintrin.h>
-#  include <avxintrin.h>
-#  include <avx2intrin.h>
-#  include <avx512fintrin.h>
-#  include <avx512vlintrin.h>
-#  include <vpclmulqdqintrin.h>
-#endif
-
 #undef fold_vec128
 static forceinline ATTRIBUTES __m128i
 ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i multipliers)
diff --git a/lib/x86/decompress_impl.h b/lib/x86/decompress_impl.h
index 3e2ec37e..daedcf2d 100644
--- a/lib/x86/decompress_impl.h
+++ b/lib/x86/decompress_impl.h
@@ -4,17 +4,21 @@
 #include "cpu_features.h"
 
 /*
- * BMI2 optimized version
+ * BMI2 optimized decompression function.
  *
- * FIXME: with MSVC, this isn't actually compiled with BMI2 code generation
- * enabled yet.  That would require that this be moved to its own .c file.
+ * With gcc and clang we just compile the whole function with
+ * __attribute__((target("bmi2"))), and the compiler uses bmi2 automatically.
+ *
+ * With MSVC, there is no target function attribute, but it's still possible to
+ * use bmi2 intrinsics explicitly.  Currently we mostly don't, but there's a
+ * case in which we do (see below), so we at least take advantage of that.
+ * However, MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*()
+ * intrinsics.  It seems to be fixed in VS2022.  Hence, use MSVC_PREREQ(1930).
  */
-#if HAVE_BMI2_INTRIN
+#if defined(__GNUC__) || defined(__clang__) || MSVC_PREREQ(1930)
 #  define deflate_decompress_bmi2	deflate_decompress_bmi2
 #  define FUNCNAME			deflate_decompress_bmi2
-#  if !HAVE_BMI2_NATIVE
-#    define ATTRIBUTES			_target_attribute("bmi2")
-#  endif
+#  define ATTRIBUTES			_target_attribute("bmi2")
    /*
     * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the
     * bzhi instruction for 'word & BITMASK(count)'.  So use the bzhi intrinsic
@@ -24,7 +28,6 @@
     * as the bzhi instruction truncates the count to 8 bits implicitly.
     */
 #  ifndef __clang__
-#    include <immintrin.h>
 #    ifdef ARCH_X86_64
 #      define EXTRACT_VARBITS(word, count)  _bzhi_u64((word), (count))
 #      define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count))
@@ -34,7 +37,7 @@
 #    endif
 #  endif
 #  include "../decompress_template.h"
-#endif /* HAVE_BMI2_INTRIN */
+#endif
 
 #if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE
 #define DEFAULT_IMPL	deflate_decompress_bmi2
diff --git a/lib/x86/matchfinder_impl.h b/lib/x86/matchfinder_impl.h
index 8433b9b1..37a4960a 100644
--- a/lib/x86/matchfinder_impl.h
+++ b/lib/x86/matchfinder_impl.h
@@ -30,8 +30,7 @@
 
 #include "cpu_features.h"
 
-#if HAVE_AVX2_NATIVE
-#  include <immintrin.h>
+#ifdef __AVX2__
 static forceinline void
 matchfinder_init_avx2(mf_pos_t *data, size_t size)
 {
@@ -76,7 +75,6 @@ matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
 #define matchfinder_rebase matchfinder_rebase_avx2
 
 #elif HAVE_SSE2_NATIVE
-#  include <emmintrin.h>
 static forceinline void
 matchfinder_init_sse2(mf_pos_t *data, size_t size)
 {