diff --git a/README.md b/README.md index 81cfc867..d7fa34cb 100644 --- a/README.md +++ b/README.md @@ -77,11 +77,28 @@ You should compile both `lib/*.c` and `lib/*/*.c`. You don't need to worry about excluding irrelevant architecture-specific code, as this is already handled in the source files themselves using `#ifdef`s. -It is strongly recommended to use either gcc or clang, and to use `-O2`. - If you are doing a freestanding build with `-ffreestanding`, you must add `-DFREESTANDING` as well (matching what the `CMakeLists.txt` does). +## Supported compilers + +- gcc: v4.9 and later +- clang: v3.9 and later (upstream), Xcode 8 and later (Apple) +- MSVC: Visual Studio 2015 and later +- Other compilers: any other C99-compatible compiler should work, though if your + compiler pretends to be gcc, clang, or MSVC, it needs to be sufficiently + compatible with the compiler it pretends to be. + +The above are the minimums, but using a newer compiler allows more of the +architecture-optimized code to be built. libdeflate is most heavily optimized +for gcc and clang, but MSVC is supported fairly well now too. + +The recommended optimization flag is `-O2`, and the `CMakeLists.txt` sets this +for release builds. `-O3` is fine too, but often `-O2` actually gives better +results. It's unnecessary to add flags such as `-mavx2` or `/arch:AVX2`, though +you can do so if you want to. Most of the relevant optimized functions are +built regardless of such flags, and appropriate ones are selected at runtime. + # API libdeflate has a simple API that is not zlib-compatible. You can create diff --git a/common_defs.h b/common_defs.h index 0a155371..a3773c4e 100644 --- a/common_defs.h +++ b/common_defs.h @@ -135,6 +135,9 @@ typedef size_t machine_word_t; # define GCC_PREREQ(major, minor) \ (__GNUC__ > (major) || \ (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) +# if !GCC_PREREQ(4, 9) +# error "gcc versions older than 4.9 are no longer supported" +# endif #else # define GCC_PREREQ(major, minor) 0 #endif @@ -147,18 +150,35 @@ typedef size_t machine_word_t; (__clang_major__ > (major) || \ (__clang_major__ == (major) && __clang_minor__ >= (minor))) # endif +# if !CLANG_PREREQ(3, 9, 8000000) +# error "clang versions older than 3.9 are no longer supported" +# endif #else # define CLANG_PREREQ(major, minor, apple_version) 0 #endif +#ifdef _MSC_VER +# define MSVC_PREREQ(version) (_MSC_VER >= (version)) +# if !MSVC_PREREQ(1900) +# error "MSVC versions older than Visual Studio 2015 are no longer supported" +# endif +#else +# define MSVC_PREREQ(version) 0 +#endif /* - * Macros to check for compiler support for attributes and builtins. clang - * implements these macros, but gcc doesn't, so generally any use of one of - * these macros must also be combined with a gcc version check. + * __has_attribute(attribute) - check whether the compiler supports the given + * attribute (and also supports doing the check in the first place). Mostly + * useful just for clang, since gcc didn't add this macro until gcc 5. */ #ifndef __has_attribute # define __has_attribute(attribute) 0 #endif + +/* + * __has_builtin(builtin) - check whether the compiler supports the given + * builtin (and also supports doing the check in the first place). Mostly + * useful just for clang, since gcc didn't add this macro until gcc 10. + */ #ifndef __has_builtin # define __has_builtin(builtin) 0 #endif @@ -266,12 +286,10 @@ typedef size_t machine_word_t; * code as well as the corresponding intrinsics. On other compilers this macro * expands to nothing, though MSVC allows intrinsics to be used anywhere anyway. */ -#if GCC_PREREQ(4, 4) || __has_attribute(target) +#if defined(__GNUC__) || __has_attribute(target) # define _target_attribute(attrs) __attribute__((target(attrs))) -# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 1 #else # define _target_attribute(attrs) -# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0 #endif /* ========================================================================== */ @@ -316,7 +334,7 @@ static forceinline bool CPU_IS_LITTLE_ENDIAN(void) /* bswap16(v) - swap the bytes of a 16-bit integer */ static forceinline u16 bswap16(u16 v) { -#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) +#if defined(__GNUC__) || __has_builtin(__builtin_bswap16) return __builtin_bswap16(v); #elif defined(_MSC_VER) return _byteswap_ushort(v); @@ -328,7 +346,7 @@ static forceinline u16 bswap16(u16 v) /* bswap32(v) - swap the bytes of a 32-bit integer */ static forceinline u32 bswap32(u32 v) { -#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) +#if defined(__GNUC__) || __has_builtin(__builtin_bswap32) return __builtin_bswap32(v); #elif defined(_MSC_VER) return _byteswap_ulong(v); @@ -343,7 +361,7 @@ static forceinline u32 bswap32(u32 v) /* bswap64(v) - swap the bytes of a 64-bit integer */ static forceinline u64 bswap64(u64 v) { -#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) +#if defined(__GNUC__) || __has_builtin(__builtin_bswap64) return __builtin_bswap64(v); #elif defined(_MSC_VER) return _byteswap_uint64(v); diff --git a/lib/arm/cpu_features.h b/lib/arm/cpu_features.h index c55f007c..30920c6b 100644 --- a/lib/arm/cpu_features.h +++ b/lib/arm/cpu_features.h @@ -35,7 +35,7 @@ #if defined(ARCH_ARM32) || defined(ARCH_ARM64) #if !defined(FREESTANDING) && \ - (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER)) && \ + (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) && \ (defined(__linux__) || \ (defined(__APPLE__) && defined(ARCH_ARM64)) || \ (defined(_WIN32) && defined(ARCH_ARM64))) @@ -43,11 +43,11 @@ # define HAVE_DYNAMIC_ARM_CPU_FEATURES 1 #endif -#define ARM_CPU_FEATURE_NEON 0x00000001 -#define ARM_CPU_FEATURE_PMULL 0x00000002 -#define ARM_CPU_FEATURE_CRC32 0x00000004 -#define ARM_CPU_FEATURE_SHA3 0x00000008 -#define ARM_CPU_FEATURE_DOTPROD 0x00000010 +#define ARM_CPU_FEATURE_NEON (1 << 0) +#define ARM_CPU_FEATURE_PMULL (1 << 1) +#define ARM_CPU_FEATURE_CRC32 (1 << 2) +#define ARM_CPU_FEATURE_SHA3 (1 << 3) +#define ARM_CPU_FEATURE_DOTPROD (1 << 4) #define HAVE_NEON(features) (HAVE_NEON_NATIVE || ((features) & ARM_CPU_FEATURE_NEON)) #define HAVE_PMULL(features) (HAVE_PMULL_NATIVE || ((features) & ARM_CPU_FEATURE_PMULL)) @@ -56,7 +56,7 @@ #define HAVE_DOTPROD(features) (HAVE_DOTPROD_NATIVE || ((features) & ARM_CPU_FEATURE_DOTPROD)) #if HAVE_DYNAMIC_ARM_CPU_FEATURES -#define ARM_CPU_FEATURES_KNOWN 0x80000000 +#define ARM_CPU_FEATURES_KNOWN (1U << 31) extern volatile u32 libdeflate_arm_cpu_features; void libdeflate_init_arm_cpu_features(void); @@ -98,8 +98,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; } #if HAVE_PMULL_NATIVE || \ (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ HAVE_NEON_INTRIN /* needed to exclude soft float arm32 case */ && \ - (GCC_PREREQ(6, 1) || CLANG_PREREQ(3, 5, 6010000) || \ - defined(_MSC_VER)) && \ + (GCC_PREREQ(6, 1) || defined(__clang__) || defined(_MSC_VER)) && \ /* * On arm32 with clang, the crypto intrinsics (which include pmull) * are not defined, even when using -mfpu=crypto-neon-fp-armv8, @@ -179,9 +178,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; } !defined(__ARM_ARCH_7EM__) # define HAVE_CRC32_INTRIN 1 # endif -# elif CLANG_PREREQ(3, 4, 6000000) -# define HAVE_CRC32_INTRIN 1 -# elif defined(_MSC_VER) +# elif defined(__clang__) || defined(_MSC_VER) # define HAVE_CRC32_INTRIN 1 # endif #endif diff --git a/lib/x86/adler32_impl.h b/lib/x86/adler32_impl.h index 618c30cc..7b3f02ac 100644 --- a/lib/x86/adler32_impl.h +++ b/lib/x86/adler32_impl.h @@ -104,8 +104,11 @@ (void)a, (void)b, (void)c, (void)d, (void)e, (void)f #endif -/* SSE2 implementation */ -#if HAVE_SSE2_INTRIN +/* + * SSE2 and AVX2 implementations. They are very similar; the AVX2 + * implementation just uses twice the vector width as the SSE2 one. + */ +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # define adler32_sse2 adler32_sse2 # define FUNCNAME adler32_sse2 # define FUNCNAME_CHUNK adler32_sse2_chunk @@ -117,12 +120,7 @@ * would behave incorrectly. */ # define IMPL_MAX_CHUNK_LEN (32 * (0x7FFF / 0xFF)) -# if HAVE_SSE2_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("sse2") -# endif -# include +# define ATTRIBUTES _target_attribute("sse2") static forceinline ATTRIBUTES void adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) { @@ -202,33 +200,14 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, 1); } # include "../adler32_vec_template.h" -#endif /* HAVE_SSE2_INTRIN */ -/* - * AVX2 implementation. Basically the same as the SSE2 one, but with the vector - * width doubled. - */ -#if HAVE_AVX2_INTRIN # define adler32_avx2 adler32_avx2 # define FUNCNAME adler32_avx2 # define FUNCNAME_CHUNK adler32_avx2_chunk # define IMPL_ALIGNMENT 32 # define IMPL_SEGMENT_LEN 64 # define IMPL_MAX_CHUNK_LEN (64 * (0x7FFF / 0xFF)) -# if HAVE_AVX2_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("avx2") -# endif -# include - /* - * With clang in MSVC compatibility mode, immintrin.h incorrectly skips - * including some sub-headers. - */ -# if defined(__clang__) && defined(_MSC_VER) -# include -# include -# endif +# define ATTRIBUTES _target_attribute("avx2") static forceinline ATTRIBUTES void adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) { @@ -282,38 +261,21 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, 1); } # include "../adler32_vec_template.h" -#endif /* HAVE_AVX2_INTRIN */ +#endif /* * AVX2/AVX-VNNI implementation. This is similar to the AVX512BW/AVX512VNNI * implementation, but instead of using AVX-512 it uses AVX2 plus AVX-VNNI. * AVX-VNNI adds dot product instructions to CPUs without AVX-512. */ -#if HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN +#if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930) # define adler32_avx2_vnni adler32_avx2_vnni # define FUNCNAME adler32_avx2_vnni # define FUNCNAME_CHUNK adler32_avx2_vnni_chunk # define IMPL_ALIGNMENT 32 # define IMPL_SEGMENT_LEN 128 # define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN -# if HAVE_AVX2_NATIVE && HAVE_AVXVNNI_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("avx2,avxvnni") -# endif -# include - /* - * With clang in MSVC compatibility mode, immintrin.h incorrectly skips - * including some sub-headers. - */ -# if defined(__clang__) && defined(_MSC_VER) -# include -# include -# include -# include -# include -# include -# endif +# define ATTRIBUTES _target_attribute("avx2,avxvnni") static forceinline ATTRIBUTES void adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) @@ -372,39 +334,20 @@ adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end, ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a, 1); } # include "../adler32_vec_template.h" -#endif /* HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN */ +#endif /* * AVX512BW/AVX512VNNI implementation. Uses the vpdpbusd (dot product) * instruction from AVX512VNNI. */ -#if HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) # define adler32_avx512_vnni adler32_avx512_vnni # define FUNCNAME adler32_avx512_vnni # define FUNCNAME_CHUNK adler32_avx512_vnni_chunk # define IMPL_ALIGNMENT 64 # define IMPL_SEGMENT_LEN 128 # define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN -# if HAVE_AVX512BW_NATIVE && HAVE_AVX512VNNI_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("avx512bw,avx512vnni") -# endif -# include - /* - * With clang in MSVC compatibility mode, immintrin.h incorrectly skips - * including some sub-headers. - */ -# if defined(__clang__) && defined(_MSC_VER) -# include -# include -# include -# include -# include -# include -# include -# include -# endif +# define ATTRIBUTES _target_attribute("avx512bw,avx512vnni") static forceinline ATTRIBUTES void adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end, u32 *s1, u32 *s2) @@ -452,7 +395,7 @@ adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end, ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a, 0); } # include "../adler32_vec_template.h" -#endif /* HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN */ +#endif static inline adler32_func_t arch_select_adler32_func(void) diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c index 227fd221..ebe5c569 100644 --- a/lib/x86/cpu_features.c +++ b/lib/x86/cpu_features.c @@ -30,16 +30,6 @@ #if HAVE_DYNAMIC_X86_CPU_FEATURES -/* - * With old GCC versions we have to manually save and restore the x86_32 PIC - * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 - */ -#if defined(ARCH_X86_32) && defined(__PIC__) -# define EBX_CONSTRAINT "=&r" -#else -# define EBX_CONSTRAINT "=b" -#endif - /* Execute the CPUID instruction. */ static inline void cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) @@ -56,7 +46,7 @@ cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) __asm__ volatile(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" "cpuid \n" ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" - : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) + : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d) : "a" (leaf), "c" (subleaf)); #endif } diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h index 4e14f2a8..b4c00118 100644 --- a/lib/x86/cpu_features.h +++ b/lib/x86/cpu_features.h @@ -34,44 +34,32 @@ #if defined(ARCH_X86_32) || defined(ARCH_X86_64) -#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER) +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) # undef HAVE_DYNAMIC_X86_CPU_FEATURES # define HAVE_DYNAMIC_X86_CPU_FEATURES 1 #endif -#define X86_CPU_FEATURE_SSE2 0x00000001 -#define X86_CPU_FEATURE_PCLMULQDQ 0x00000002 -#define X86_CPU_FEATURE_AVX 0x00000004 -#define X86_CPU_FEATURE_AVX2 0x00000008 -#define X86_CPU_FEATURE_BMI2 0x00000010 +#define X86_CPU_FEATURE_SSE2 (1 << 0) +#define X86_CPU_FEATURE_PCLMULQDQ (1 << 1) +#define X86_CPU_FEATURE_AVX (1 << 2) +#define X86_CPU_FEATURE_AVX2 (1 << 3) +#define X86_CPU_FEATURE_BMI2 (1 << 4) /* * ZMM indicates whether 512-bit vectors (zmm registers) should be used. On * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU * supports it, i.e. even if AVX512F is set. On these CPUs, we may still use * AVX-512 instructions, but only with ymm and xmm registers. */ -#define X86_CPU_FEATURE_ZMM 0x00000020 -#define X86_CPU_FEATURE_AVX512F 0x00000040 -#define X86_CPU_FEATURE_AVX512BW 0x00000080 -#define X86_CPU_FEATURE_AVX512VL 0x00000100 -#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000200 -#define X86_CPU_FEATURE_AVX512VNNI 0x00000400 -#define X86_CPU_FEATURE_AVXVNNI 0x00000800 - -#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2)) -#define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ)) -#define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX)) -#define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2)) -#define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2)) -#define HAVE_AVX512F(features) (HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F)) -#define HAVE_AVX512BW(features) (HAVE_AVX512BW_NATIVE || ((features) & X86_CPU_FEATURE_AVX512BW)) -#define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL)) -#define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ)) -#define HAVE_AVX512VNNI(features) (HAVE_AVX512VNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VNNI)) -#define HAVE_AVXVNNI(features) (HAVE_AVXVNNI_NATIVE || ((features) & X86_CPU_FEATURE_AVXVNNI)) +#define X86_CPU_FEATURE_ZMM (1 << 5) +#define X86_CPU_FEATURE_AVX512F (1 << 6) +#define X86_CPU_FEATURE_AVX512BW (1 << 7) +#define X86_CPU_FEATURE_AVX512VL (1 << 8) +#define X86_CPU_FEATURE_VPCLMULQDQ (1 << 9) +#define X86_CPU_FEATURE_AVX512VNNI (1 << 10) +#define X86_CPU_FEATURE_AVXVNNI (1 << 11) #if HAVE_DYNAMIC_X86_CPU_FEATURES -#define X86_CPU_FEATURES_KNOWN 0x80000000 +#define X86_CPU_FEATURES_KNOWN (1U << 31) extern volatile u32 libdeflate_x86_cpu_features; void libdeflate_init_x86_cpu_features(void); @@ -86,171 +74,103 @@ static inline u32 get_x86_cpu_features(void) static inline u32 get_x86_cpu_features(void) { return 0; } #endif /* !HAVE_DYNAMIC_X86_CPU_FEATURES */ -/* - * Prior to gcc 4.9 (r200349) and clang 3.8 (r239883), x86 intrinsics not - * available in the main target couldn't be used in 'target' attribute - * functions. Unfortunately clang has no feature test macro for this, so we - * have to check its version. - */ -#if HAVE_DYNAMIC_X86_CPU_FEATURES && \ - (GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000) || defined(_MSC_VER)) -# define HAVE_TARGET_INTRINSICS 1 -#else -# define HAVE_TARGET_INTRINSICS 0 -#endif - -/* SSE2 */ #if defined(__SSE2__) || \ (defined(_MSC_VER) && \ (defined(ARCH_X86_64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))) -# define HAVE_SSE2_NATIVE 1 +# define HAVE_SSE2(features) 1 +# define HAVE_SSE2_NATIVE 1 #else -# define HAVE_SSE2_NATIVE 0 +# define HAVE_SSE2(features) ((features) & X86_CPU_FEATURE_SSE2) +# define HAVE_SSE2_NATIVE 0 #endif -#define HAVE_SSE2_INTRIN (HAVE_SSE2_NATIVE || HAVE_TARGET_INTRINSICS) -/* PCLMULQDQ */ #if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__)) -# define HAVE_PCLMULQDQ_NATIVE 1 -#else -# define HAVE_PCLMULQDQ_NATIVE 0 -#endif -#if HAVE_PCLMULQDQ_NATIVE || (HAVE_TARGET_INTRINSICS && \ - (GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \ - defined(_MSC_VER))) -# define HAVE_PCLMULQDQ_INTRIN 1 +# define HAVE_PCLMULQDQ(features) 1 #else -# define HAVE_PCLMULQDQ_INTRIN 0 +# define HAVE_PCLMULQDQ(features) ((features) & X86_CPU_FEATURE_PCLMULQDQ) #endif -/* AVX */ #ifdef __AVX__ -# define HAVE_AVX_NATIVE 1 +# define HAVE_AVX(features) 1 #else -# define HAVE_AVX_NATIVE 0 -#endif -#if HAVE_AVX_NATIVE || (HAVE_TARGET_INTRINSICS && \ - (GCC_PREREQ(4, 6) || CLANG_PREREQ(3, 0, 0) || \ - defined(_MSC_VER))) -# define HAVE_AVX_INTRIN 1 -#else -# define HAVE_AVX_INTRIN 0 +# define HAVE_AVX(features) ((features) & X86_CPU_FEATURE_AVX) #endif -/* AVX2 */ #ifdef __AVX2__ -# define HAVE_AVX2_NATIVE 1 -#else -# define HAVE_AVX2_NATIVE 0 -#endif -#if HAVE_AVX2_NATIVE || (HAVE_TARGET_INTRINSICS && \ - (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ - defined(_MSC_VER))) -# define HAVE_AVX2_INTRIN 1 +# define HAVE_AVX2(features) 1 #else -# define HAVE_AVX2_INTRIN 0 +# define HAVE_AVX2(features) ((features) & X86_CPU_FEATURE_AVX2) #endif -/* BMI2 */ #if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__)) -# define HAVE_BMI2_NATIVE 1 +# define HAVE_BMI2(features) 1 +# define HAVE_BMI2_NATIVE 1 #else -# define HAVE_BMI2_NATIVE 0 -#endif -#if HAVE_BMI2_NATIVE || (HAVE_TARGET_INTRINSICS && \ - (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ - defined(_MSC_VER))) -# define HAVE_BMI2_INTRIN 1 -#else -# define HAVE_BMI2_INTRIN 0 -#endif -/* - * MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*() - * intrinsics. It seems to be fixed in VS2022. - */ -#if defined(_MSC_VER) && _MSC_VER < 1930 /* older than VS2022 (toolset v143) */ -# undef HAVE_BMI2_NATIVE -# undef HAVE_BMI2_INTRIN -# define HAVE_BMI2_NATIVE 0 -# define HAVE_BMI2_INTRIN 0 +# define HAVE_BMI2(features) ((features) & X86_CPU_FEATURE_BMI2) +# define HAVE_BMI2_NATIVE 0 #endif -/* AVX512F */ #ifdef __AVX512F__ -# define HAVE_AVX512F_NATIVE 1 -#else -# define HAVE_AVX512F_NATIVE 0 -#endif -#if HAVE_AVX512F_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0) || \ - defined(_MSC_VER) -# define HAVE_AVX512F_INTRIN 1 +# define HAVE_AVX512F(features) 1 #else -# define HAVE_AVX512F_INTRIN 0 +# define HAVE_AVX512F(features) ((features) & X86_CPU_FEATURE_AVX512F) #endif -/* AVX512BW */ #ifdef __AVX512BW__ -# define HAVE_AVX512BW_NATIVE 1 +# define HAVE_AVX512BW(features) 1 #else -# define HAVE_AVX512BW_NATIVE 0 -#endif -#if HAVE_AVX512BW_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 9, 0) || \ - defined(_MSC_VER) -# define HAVE_AVX512BW_INTRIN 1 -#else -# define HAVE_AVX512BW_INTRIN 0 +# define HAVE_AVX512BW(features) ((features) & X86_CPU_FEATURE_AVX512BW) #endif -/* AVX512VL */ #ifdef __AVX512VL__ -# define HAVE_AVX512VL_NATIVE 1 -#else -# define HAVE_AVX512VL_NATIVE 0 -#endif -#if HAVE_AVX512VL_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0) || \ - defined(_MSC_VER) -# define HAVE_AVX512VL_INTRIN 1 +# define HAVE_AVX512VL(features) 1 #else -# define HAVE_AVX512VL_INTRIN 0 +# define HAVE_AVX512VL(features) ((features) & X86_CPU_FEATURE_AVX512VL) #endif -/* VPCLMULQDQ */ #ifdef __VPCLMULQDQ__ -# define HAVE_VPCLMULQDQ_NATIVE 1 +# define HAVE_VPCLMULQDQ(features) 1 #else -# define HAVE_VPCLMULQDQ_NATIVE 0 -#endif -#if HAVE_VPCLMULQDQ_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \ - defined(_MSC_VER) -# define HAVE_VPCLMULQDQ_INTRIN 1 -#else -# define HAVE_VPCLMULQDQ_INTRIN 0 +# define HAVE_VPCLMULQDQ(features) ((features) & X86_CPU_FEATURE_VPCLMULQDQ) #endif -/* AVX512VNNI */ #ifdef __AVX512VNNI__ -# define HAVE_AVX512VNNI_NATIVE 1 -#else -# define HAVE_AVX512VNNI_NATIVE 0 -#endif -#if HAVE_AVX512VNNI_NATIVE || GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \ - defined(_MSC_VER) -# define HAVE_AVX512VNNI_INTRIN 1 +# define HAVE_AVX512VNNI(features) 1 #else -# define HAVE_AVX512VNNI_INTRIN 0 +# define HAVE_AVX512VNNI(features) ((features) & X86_CPU_FEATURE_AVX512VNNI) #endif -/* AVX-VNNI */ #ifdef __AVXVNNI__ -# define HAVE_AVXVNNI_NATIVE 1 -#else -# define HAVE_AVXVNNI_NATIVE 0 -#endif -#if HAVE_AVXVNNI_NATIVE || GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 0) || \ - defined(_MSC_VER) -# define HAVE_AVXVNNI_INTRIN 1 -#else -# define HAVE_AVXVNNI_INTRIN 0 +# define HAVE_AVXVNNI(features) 1 +#else +# define HAVE_AVXVNNI(features) ((features) & X86_CPU_FEATURE_AVXVNNI) +#endif + +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) +# include +#endif +#if defined(_MSC_VER) && defined(__clang__) + /* + * With clang in MSVC compatibility mode, immintrin.h incorrectly skips + * including some sub-headers. + */ +# include +# include +# include +# include +# include +# include +# include +# include +# if __has_include() +# include +# endif +# if __has_include() +# include +# endif +# if __has_include() +# include +# endif #endif #endif /* ARCH_X86_32 || ARCH_X86_64 */ diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h index d50547bd..3d8e254d 100644 --- a/lib/x86/crc32_impl.h +++ b/lib/x86/crc32_impl.h @@ -30,20 +30,15 @@ #include "cpu_features.h" +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) /* PCLMULQDQ implementation */ -#if HAVE_PCLMULQDQ_INTRIN # define crc32_x86_pclmulqdq crc32_x86_pclmulqdq # define SUFFIX _pclmulqdq -# if HAVE_PCLMULQDQ_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("pclmul") -# endif +# define ATTRIBUTES _target_attribute("pclmul") # define VL 16 # define FOLD_LESSTHAN16BYTES 0 # define USE_TERNARYLOGIC 0 # include "crc32_pclmul_template.h" -#endif /* * PCLMULQDQ/AVX implementation. Compared to the regular PCLMULQDQ @@ -55,84 +50,57 @@ * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient * handling of partial blocks. (We *could* compile a variant with * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we don't currently bother.) - * - * FIXME: with MSVC, this isn't actually compiled with AVX code generation - * enabled yet. That would require that this be moved to its own .c file. */ -#if HAVE_PCLMULQDQ_INTRIN && HAVE_AVX_INTRIN # define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx # define SUFFIX _pclmulqdq_avx -# if HAVE_PCLMULQDQ_NATIVE && HAVE_AVX_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("pclmul,avx") -# endif +# define ATTRIBUTES _target_attribute("pclmul,avx") # define VL 16 # define FOLD_LESSTHAN16BYTES 1 # define USE_TERNARYLOGIC 0 # include "crc32_pclmul_template.h" #endif -/* VPCLMULQDQ/AVX2 implementation. Uses 256-bit vectors. */ -#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && HAVE_AVX2_INTRIN && \ - /* - * This has to be disabled on MSVC because MSVC has a bug where it - * incorrectly assumes that VPCLMULQDQ implies AVX-512: - * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest - */ \ - !(defined(_MSC_VER) && !defined(__clang__)) +/* + * VPCLMULQDQ/AVX2 implementation. Uses 256-bit vectors. + * + * Currently this can't be enabled with MSVC because MSVC has a bug where it + * incorrectly assumes that VPCLMULQDQ implies AVX-512: + * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest + */ +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 # define SUFFIX _vpclmulqdq_avx2 -# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && HAVE_AVX2_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") -# endif +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") # define VL 32 # define FOLD_LESSTHAN16BYTES 1 # define USE_TERNARYLOGIC 0 # include "crc32_pclmul_template.h" #endif +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) /* * VPCLMULQDQ/AVX512 implementation with 256-bit vectors. This takes advantage * of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit. * This can be useful on CPUs where 512-bit vectors cause downclocking. */ -#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \ - HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256 # define SUFFIX _vpclmulqdq_avx512_vl256 -# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \ - HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") -# endif +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") # define VL 32 # define FOLD_LESSTHAN16BYTES 1 # define USE_TERNARYLOGIC 1 # include "crc32_pclmul_template.h" -#endif /* VPCLMULQDQ/AVX512 implementation with 512-bit vectors */ -#if HAVE_VPCLMULQDQ_INTRIN && HAVE_PCLMULQDQ_INTRIN && \ - HAVE_AVX512F_INTRIN && HAVE_AVX512VL_INTRIN # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512 # define SUFFIX _vpclmulqdq_avx512_vl512 -# if HAVE_VPCLMULQDQ_NATIVE && HAVE_PCLMULQDQ_NATIVE && \ - HAVE_AVX512F_NATIVE && HAVE_AVX512VL_NATIVE -# define ATTRIBUTES -# else -# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") -# endif +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") # define VL 64 # define FOLD_LESSTHAN16BYTES 1 # define USE_TERNARYLOGIC 1 # include "crc32_pclmul_template.h" #endif -/* Choose the best implementation at runtime. */ static inline crc32_func_t arch_select_crc32_func(void) { diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h index eb06262b..4257d449 100644 --- a/lib/x86/crc32_pclmul_template.h +++ b/lib/x86/crc32_pclmul_template.h @@ -62,22 +62,6 @@ * or AVX512VL, or four in combination with AVX512F. */ -#include -/* - * With clang in MSVC compatibility mode, immintrin.h incorrectly skips - * including some sub-headers. - */ -#if defined(__clang__) && defined(_MSC_VER) -# include -# include -# include -# include -# include -# include -# include -# include -#endif - #undef fold_vec128 static forceinline ATTRIBUTES __m128i ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i multipliers) diff --git a/lib/x86/decompress_impl.h b/lib/x86/decompress_impl.h index 3e2ec37e..daedcf2d 100644 --- a/lib/x86/decompress_impl.h +++ b/lib/x86/decompress_impl.h @@ -4,17 +4,21 @@ #include "cpu_features.h" /* - * BMI2 optimized version + * BMI2 optimized decompression function. * - * FIXME: with MSVC, this isn't actually compiled with BMI2 code generation - * enabled yet. That would require that this be moved to its own .c file. + * With gcc and clang we just compile the whole function with + * __attribute__((target("bmi2"))), and the compiler uses bmi2 automatically. + * + * With MSVC, there is no target function attribute, but it's still possible to + * use bmi2 intrinsics explicitly. Currently we mostly don't, but there's a + * case in which we do (see below), so we at least take advantage of that. + * However, MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*() + * intrinsics. It seems to be fixed in VS2022. Hence, use MSVC_PREREQ(1930). */ -#if HAVE_BMI2_INTRIN +#if defined(__GNUC__) || defined(__clang__) || MSVC_PREREQ(1930) # define deflate_decompress_bmi2 deflate_decompress_bmi2 # define FUNCNAME deflate_decompress_bmi2 -# if !HAVE_BMI2_NATIVE -# define ATTRIBUTES _target_attribute("bmi2") -# endif +# define ATTRIBUTES _target_attribute("bmi2") /* * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the * bzhi instruction for 'word & BITMASK(count)'. So use the bzhi intrinsic @@ -24,7 +28,6 @@ * as the bzhi instruction truncates the count to 8 bits implicitly. */ # ifndef __clang__ -# include # ifdef ARCH_X86_64 # define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count)) # define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count)) @@ -34,7 +37,7 @@ # endif # endif # include "../decompress_template.h" -#endif /* HAVE_BMI2_INTRIN */ +#endif #if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE #define DEFAULT_IMPL deflate_decompress_bmi2 diff --git a/lib/x86/matchfinder_impl.h b/lib/x86/matchfinder_impl.h index 8433b9b1..37a4960a 100644 --- a/lib/x86/matchfinder_impl.h +++ b/lib/x86/matchfinder_impl.h @@ -30,8 +30,7 @@ #include "cpu_features.h" -#if HAVE_AVX2_NATIVE -# include +#ifdef __AVX2__ static forceinline void matchfinder_init_avx2(mf_pos_t *data, size_t size) { @@ -76,7 +75,6 @@ matchfinder_rebase_avx2(mf_pos_t *data, size_t size) #define matchfinder_rebase matchfinder_rebase_avx2 #elif HAVE_SSE2_NATIVE -# include static forceinline void matchfinder_init_sse2(mf_pos_t *data, size_t size) {