From 90dc56a16e540af9af18f36224b34207237646da Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 2 Sep 2024 10:42:24 -0700 Subject: [PATCH] Try to work around broken toolchains with too-old binutils --- CMakeLists.txt | 49 ++++++++++++++++++++++++++++++++++++++++++ README.md | 13 +++++++++++ lib/arm/adler32_impl.h | 3 ++- lib/arm/crc32_impl.h | 3 ++- lib/x86/adler32_impl.h | 6 ++++-- lib/x86/crc32_impl.h | 6 ++++-- 6 files changed, 74 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ce71c01..22ff45a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,6 +90,55 @@ if(LIBDEFLATE_FREESTANDING) add_definitions(-DFREESTANDING) endif() +# Check for cases where the compiler supports an instruction set extension but +# the assembler does not, and in those cases print a warning and add an +# appropriate -DLIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_* flag. libdeflate's C +# source files already check the compiler version before using the corresponding +# intrinsics, but in the rare case of gcc being paired with a binutils much +# older than itself those checks are insufficient. There is no way to check the +# assembler version from C. The proper fix for too-old binutils is for the user +# to upgrade binutils. Unfortunately, as libdeflate has started using newer +# instructions, binutils incompatibilities have started being seen more +# frequently. Hence these checks for assembler support here in CMakeLists.txt +# to provide a fallback for users who may be unable to fix their toolchain. +# These don't solve the problem for users not using CMake, though such users can +# add specific -DLIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_* flags they need. +function(check_assembler_support feature assembly_code) + execute_process(COMMAND echo "${assembly_code}" + COMMAND ${CMAKE_C_COMPILER} -c -x assembler -o /dev/null - + RESULT_VARIABLE result + ERROR_QUIET) + if(NOT ${result} EQUAL 0) + add_definitions(-DLIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_${feature}) + message(STATUS "Your gcc supports ${feature} instructions but it is paired with an assembler that does not. Upgrading binutils is recommended.") + endif() +endfunction() +if(UNIX AND CMAKE_C_COMPILER_ID STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine + OUTPUT_VARIABLE machine) + if(${machine} MATCHES "^(x86_64|i[3-6]86)") + if(${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.1) + # Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX_VNNI if needed. + check_assembler_support(AVX_VNNI "{vex} vpdpbusd %ymm0, %ymm0, %ymm0") + endif() + if(${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 8.1) + # Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX512VNNI if needed. + check_assembler_support(AVX512_VNNI "vpdpbusd %zmm0, %zmm0, %zmm0") + # Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ if needed. + check_assembler_support(VPCLMULQDQ "vpclmulqdq $0, %zmm0, %zmm0, %zmm0") + endif() + elseif(${machine} MATCHES "^aarch64") + if(${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 8.1) + # Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_DOTPROD if needed. + check_assembler_support(DOTPROD ".arch armv8.2-a+dotprod\nudot v0.4s, v0.16b, v0.16b") + endif() + if(${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 9.1) + # Set LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_SHA3 if needed. + check_assembler_support(SHA3 ".arch armv8.2-a+sha3\neor3 v0.16b, v0.16b, v0.16b, v0.16b") + endif() + endif() +endif() + # Determine the list of source files and the list of compiler options that will # be used for both the static library and the shared library. diff --git a/README.md b/README.md index 9946cc05..be23ead7 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,19 @@ for release builds. `-O3` is fine too, but often `-O2` actually gives better results. It's unnecessary to add flags such as `-mavx2` or `/arch:AVX2`, though you can do so if you want to. Most of the relevant optimized functions are built regardless of such flags, and appropriate ones are selected at runtime. +For the same reason, flags like `-mno-avx2` do *not* cause all code using the +corresponding instruction set extension to be omitted from the binary; this is +working as intended due to the use of runtime CPU feature detection. + +If using gcc, your gcc should always be paired with a binutils version that is +not much older than itself, to avoid problems where the compiler generates +instructions the assembler cannot assemble. Usually systems have their gcc and +binutils paired properly, but rarely a mismatch can arise in cases such as the +user installing a newer gcc version without a proper binutils alongside it. +Since libdeflate v1.22, the CMake-based build system will detect incompatible +binutils versions and disable some optimized code accordingly. In older +versions of libdeflate, or if CMake is not being used, a too-old binutils can +cause build errors like "no such instruction" from the assembler. # API diff --git a/lib/arm/adler32_impl.h b/lib/arm/adler32_impl.h index 9e833c39..66d2f2bf 100644 --- a/lib/arm/adler32_impl.h +++ b/lib/arm/adler32_impl.h @@ -209,7 +209,8 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len) #endif /* Regular NEON implementation */ /* NEON+dotprod implementation */ -#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN() +#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN() && \ + !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_DOTPROD) # define adler32_arm_neon_dotprod adler32_arm_neon_dotprod # ifdef __clang__ # define ATTRIBUTES _target_attribute("dotprod") diff --git a/lib/arm/crc32_impl.h b/lib/arm/crc32_impl.h index 01d0283a..e33ba680 100644 --- a/lib/arm/crc32_impl.h +++ b/lib/arm/crc32_impl.h @@ -545,7 +545,8 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) * This like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from * the sha3 extension) for even better performance. */ -#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN +#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN && \ + !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_SHA3) # define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3 # define SUFFIX _pmullx12_crc_eor3 # ifdef __clang__ diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index 00967e87..458a9e09 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -60,7 +60,8 @@ * instead of gcc 11. (libdeflate supports direct compilation without a * configure step, so checking the binutils version is not always an option.) */ -#if GCC_PREREQ(12, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930) +#if (GCC_PREREQ(12, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930)) && \ + !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX_VNNI) # define adler32_x86_avx2_vnni adler32_x86_avx2_vnni # define SUFFIX _avx2_vnni # define ATTRIBUTES _target_attribute("avx2,avxvnni") @@ -70,7 +71,8 @@ # include "adler32_template.h" #endif -#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) +#if (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \ + !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_AVX512VNNI) /* * AVX512VNNI implementation using 256-bit vectors. This is very similar to the * AVX-VNNI implementation but takes advantage of masking and more registers. diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h index 3e4df937..54996d9c 100644 --- a/lib/x86/crc32_impl.h +++ b/lib/x86/crc32_impl.h @@ -84,7 +84,8 @@ static const u8 MAYBE_UNUSED shift_tab[48] = { * gcc 8.1 and 8.2 had a similar bug where they assumed that * _mm256_clmulepi64_epi128() always needed AVX512. It's fixed in gcc 8.3. */ -#if GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000) +#if (GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000)) && \ + !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ) # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 # define SUFFIX _vpclmulqdq_avx2 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") @@ -94,7 +95,8 @@ static const u8 MAYBE_UNUSED shift_tab[48] = { # include "crc32_pclmul_template.h" #endif -#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) +#if (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \ + !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ) /* * VPCLMULQDQ/AVX512 implementation using 256-bit vectors. This is very similar * to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog