diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ce71c01..b8c3a978 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -90,6 +90,55 @@ if(LIBDEFLATE_FREESTANDING)
     add_definitions(-DFREESTANDING)
 endif()
 
+# Check for cases where the compiler supports an instruction set extension but
+# the assembler does not, and in those cases print a warning and add an
+# appropriate -DLIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_* flag.  libdeflate's C
+# source files already check the compiler version before using the corresponding
+# intrinsics, but in the rare case of gcc being paired with a binutils much
+# older than itself those checks are insufficient.  There is no way to check the
+# assembler version from C.  The proper fix for too-old binutils is for the user
+# to upgrade binutils.  Unfortunately, as libdeflate has started using newer
+# instructions, binutils incompatibilities have started being seen more
+# frequently.  Hence these checks for assembler support here in CMakeLists.txt
+# to provide a fallback for users who may be unable to fix their toolchain.
+# These don't solve the problem for users not using CMake, though such users can
+# add specific -DLIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_* flags they need.
+function(check_assembler_support  feature assembly_code)
+    execute_process(COMMAND echo "${assembly_code}"
+                    COMMAND ${CMAKE_C_COMPILER} -c -x assembler -o /dev/null -
+                    RESULT_VARIABLE result
+                    ERROR_QUIET)
+    if(NOT ${result} EQUAL 0)
+        add_definitions(-DLIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_${feature})
+        message(STATUS "Your gcc supports ${feature} instructions but it is paired with an assembler that does not.  Upgrading binutils is recommended.")
+    endif()
+endfunction()
+if(UNIX AND CMAKE_C_COMPILER_ID STREQUAL "GNU")
+    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
+                    OUTPUT_VARIABLE machine)
+    if(${machine} MATCHES "^(x86_64|i[3-6]86)")
+        if(${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.1)
+            # Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX_VNNI if needed.
+            check_assembler_support(AVX_VNNI "{vex} vpdpbusd %ymm0, %ymm0, %ymm0")
+        endif()
+        if(${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 8.1)
+            # Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX512VNNI if needed.
+            check_assembler_support(AVX512_VNNI "vpdpbusd %zmm0, %zmm0, %zmm0")
+            # Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ if needed.
+            check_assembler_support(VPCLMULQDQ "vpclmulqdq $0, %zmm0, %zmm0, %zmm0")
+        endif()
+    elseif(${machine} MATCHES "^aarch64")
+        if(${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 8.1)
+            # Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_DOTPROD if needed.
+            check_assembler_support(DOTPROD ".arch armv8-a+dotprod\nudot v0.4s, v0.16b, v0.16b")
+        endif()
+        if(${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 9.1)
+            # Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_SHA3 if needed.
+            check_assembler_support(SHA3 ".arch armv8.2-a+sha3\neor3 v0.16b, v0.16b, v0.16b, v0.16b")
+        endif()
+    endif()
+endif()
+
 # Determine the list of source files and the list of compiler options that will
 # be used for both the static library and the shared library.
 
diff --git a/README.md b/README.md
index 9946cc05..74f7f593 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,19 @@ for release builds.  `-O3` is fine too, but often `-O2` actually gives better
 results.  It's unnecessary to add flags such as `-mavx2` or `/arch:AVX2`, though
 you can do so if you want to.  Most of the relevant optimized functions are
 built regardless of such flags, and appropriate ones are selected at runtime.
+For the same reason, flags like `-mno-avx2` do *not* cause all code using the
+corresponding instruction set extension to be omitted from the binary; this is
+working as intended due to the use of runtime CPU feature detection.
+
+If using gcc, your gcc should always be paired with a binutils version that is
+not much older than itself, to avoid problems where the compiler generates
+instructions the assembler cannot assemble.  Usually systems have their gcc and
+binutils paired properly, but rarely a mismatch can arise in cases such as the
+user installing a newer gcc version without a proper binutils alongside it.
+Since libdeflate v1.22, the CMake-based build system will detect incompatible
+binutils versions and disable some optimized code accordingly.  In older
+versions of libdeflate, or if CMake is not being used, a too-old binutils can
+cause build errors like "no such instruction" coming from the assembler.
 
 # API
 
diff --git a/lib/arm/adler32_impl.h b/lib/arm/adler32_impl.h
index 9e833c39..66d2f2bf 100644
--- a/lib/arm/adler32_impl.h
+++ b/lib/arm/adler32_impl.h
@@ -209,7 +209,8 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)
 #endif /* Regular NEON implementation */
 
 /* NEON+dotprod implementation */
-#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN()
+#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN() && \
+	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_DOTPROD)
 #  define adler32_arm_neon_dotprod	adler32_arm_neon_dotprod
 #  ifdef __clang__
 #    define ATTRIBUTES	_target_attribute("dotprod")
diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h
index 01d0283a..e33ba680 100644
--- a/lib/arm/crc32_impl.h
+++ b/lib/arm/crc32_impl.h
@@ -545,7 +545,8 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
  * This like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from
  * the sha3 extension) for even better performance.
  */
-#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN
+#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN && \
+	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_SHA3)
 #  define crc32_arm_pmullx12_crc_eor3	crc32_arm_pmullx12_crc_eor3
 #  define SUFFIX				 _pmullx12_crc_eor3
 #  ifdef __clang__
diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h
index 00967e87..458a9e09 100644
--- a/lib/x86/adler32_impl.h
+++ b/lib/x86/adler32_impl.h
@@ -60,7 +60,8 @@
  * instead of gcc 11.  (libdeflate supports direct compilation without a
  * configure step, so checking the binutils version is not always an option.)
  */
-#if GCC_PREREQ(12, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930)
+#if (GCC_PREREQ(12, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930)) && \
+	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX_VNNI)
 #  define adler32_x86_avx2_vnni	adler32_x86_avx2_vnni
 #  define SUFFIX			   _avx2_vnni
 #  define ATTRIBUTES		_target_attribute("avx2,avxvnni")
@@ -70,7 +71,8 @@
 #  include "adler32_template.h"
 #endif
 
-#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)
+#if (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
+	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX512VNNI)
 /*
  * AVX512VNNI implementation using 256-bit vectors.  This is very similar to the
  * AVX-VNNI implementation but takes advantage of masking and more registers.
diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h
index 3e4df937..54996d9c 100644
--- a/lib/x86/crc32_impl.h
+++ b/lib/x86/crc32_impl.h
@@ -84,7 +84,8 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
  * gcc 8.1 and 8.2 had a similar bug where they assumed that
  * _mm256_clmulepi64_epi128() always needed AVX512.  It's fixed in gcc 8.3.
  */
-#if GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000)
+#if (GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000)) && \
+	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
 #  define crc32_x86_vpclmulqdq_avx2	crc32_x86_vpclmulqdq_avx2
 #  define SUFFIX				 _vpclmulqdq_avx2
 #  define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx2")
@@ -94,7 +95,8 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 #  include "crc32_pclmul_template.h"
 #endif
 
-#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)
+#if (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
+	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
 /*
  * VPCLMULQDQ/AVX512 implementation using 256-bit vectors.  This is very similar
  * to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog