Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Drop support for very old compilers and simplify accordingly #345

Merged
merged 15 commits into from
Mar 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,28 @@ You should compile both `lib/*.c` and `lib/*/*.c`. You don't need to worry
about excluding irrelevant architecture-specific code, as this is already
handled in the source files themselves using `#ifdef`s.

It is strongly recommended to use either gcc or clang, and to use `-O2`.

If you are doing a freestanding build with `-ffreestanding`, you must add
`-DFREESTANDING` as well (matching what the `CMakeLists.txt` does).

## Supported compilers

- gcc: v4.9 and later
- clang: v3.9 and later (upstream), Xcode 8 and later (Apple)
- MSVC: Visual Studio 2015 and later
- Other compilers: any other C99-compatible compiler should work. However, if your
compiler identifies itself as gcc, clang, or MSVC (for example by defining
`__GNUC__`, `__clang__`, or `_MSC_VER`), it needs to be sufficiently compatible
with the compiler it claims to be.

The above are the minimums, but using a newer compiler allows more of the
architecture-optimized code to be built. libdeflate is most heavily optimized
for gcc and clang, but MSVC is supported fairly well now too.

The recommended optimization flag is `-O2`, and the `CMakeLists.txt` sets this
for release builds. `-O3` is fine too, but often `-O2` actually gives better
results. It's unnecessary to add flags such as `-mavx2` or `/arch:AVX2`, though
you can do so if you want to. Most of the relevant optimized functions are
built regardless of such flags, and appropriate ones are selected at runtime.

# API

libdeflate has a simple API that is not zlib-compatible. You can create
Expand Down
36 changes: 27 additions & 9 deletions common_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,9 @@ typedef size_t machine_word_t;
# define GCC_PREREQ(major, minor) \
(__GNUC__ > (major) || \
(__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
# if !GCC_PREREQ(4, 9)
# error "gcc versions older than 4.9 are no longer supported"
# endif
#else
# define GCC_PREREQ(major, minor) 0
#endif
Expand All @@ -147,18 +150,35 @@ typedef size_t machine_word_t;
(__clang_major__ > (major) || \
(__clang_major__ == (major) && __clang_minor__ >= (minor)))
# endif
# if !CLANG_PREREQ(3, 9, 8000000)
# error "clang versions older than 3.9 are no longer supported"
# endif
#else
# define CLANG_PREREQ(major, minor, apple_version) 0
#endif
/*
 * MSVC_PREREQ(version) - nonzero if _MSC_VER is defined and is at least
 * 'version'; always 0 when _MSC_VER is not defined.  Builds where _MSC_VER is
 * below 1900 (Visual Studio 2015) are rejected at compile time.
 */
#ifndef _MSC_VER
# define MSVC_PREREQ(version) 0
#else
# define MSVC_PREREQ(version) (_MSC_VER >= (version))
# if _MSC_VER < 1900
# error "MSVC versions older than Visual Studio 2015 are no longer supported"
# endif
#endif

/*
* Macros to check for compiler support for attributes and builtins. clang
* implements these macros, but gcc doesn't, so generally any use of one of
* these macros must also be combined with a gcc version check.
* __has_attribute(attribute) - check whether the compiler supports the given
* attribute (and also supports doing the check in the first place). Mostly
* useful just for clang, since gcc didn't add this macro until gcc 5.
*/
/* If the compiler lacks __has_attribute itself, report every attribute as unsupported. */
#if !defined(__has_attribute)
# define __has_attribute(attribute) 0
#endif

/*
 * __has_builtin(builtin) - test whether the compiler provides the given
 * builtin.  When the compiler does not implement this check at all, the
 * fallback below makes it evaluate to 0, i.e. "not supported".  In practice
 * this is mainly useful with clang, because gcc only gained __has_builtin in
 * gcc 10.
 */
#if !defined(__has_builtin)
# define __has_builtin(builtin) 0
#endif
Expand Down Expand Up @@ -266,12 +286,10 @@ typedef size_t machine_word_t;
* code as well as the corresponding intrinsics. On other compilers this macro
* expands to nothing, though MSVC allows intrinsics to be used anywhere anyway.
*/
#if GCC_PREREQ(4, 4) || __has_attribute(target)
#if defined(__GNUC__) || __has_attribute(target)
# define _target_attribute(attrs) __attribute__((target(attrs)))
# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 1
#else
# define _target_attribute(attrs)
# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0
#endif

/* ========================================================================== */
Expand Down Expand Up @@ -316,7 +334,7 @@ static forceinline bool CPU_IS_LITTLE_ENDIAN(void)
/* bswap16(v) - swap the bytes of a 16-bit integer */
static forceinline u16 bswap16(u16 v)
{
#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
#if defined(__GNUC__) || __has_builtin(__builtin_bswap16)
return __builtin_bswap16(v);
#elif defined(_MSC_VER)
return _byteswap_ushort(v);
Expand All @@ -328,7 +346,7 @@ static forceinline u16 bswap16(u16 v)
/* bswap32(v) - swap the bytes of a 32-bit integer */
static forceinline u32 bswap32(u32 v)
{
#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
#if defined(__GNUC__) || __has_builtin(__builtin_bswap32)
return __builtin_bswap32(v);
#elif defined(_MSC_VER)
return _byteswap_ulong(v);
Expand All @@ -343,7 +361,7 @@ static forceinline u32 bswap32(u32 v)
/* bswap64(v) - swap the bytes of a 64-bit integer */
static forceinline u64 bswap64(u64 v)
{
#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
#if defined(__GNUC__) || __has_builtin(__builtin_bswap64)
return __builtin_bswap64(v);
#elif defined(_MSC_VER)
return _byteswap_uint64(v);
Expand Down
21 changes: 9 additions & 12 deletions lib/arm/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,19 @@
#if defined(ARCH_ARM32) || defined(ARCH_ARM64)

#if !defined(FREESTANDING) && \
(COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER)) && \
(defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) && \
(defined(__linux__) || \
(defined(__APPLE__) && defined(ARCH_ARM64)) || \
(defined(_WIN32) && defined(ARCH_ARM64)))
# undef HAVE_DYNAMIC_ARM_CPU_FEATURES
# define HAVE_DYNAMIC_ARM_CPU_FEATURES 1
#endif

#define ARM_CPU_FEATURE_NEON 0x00000001
#define ARM_CPU_FEATURE_PMULL 0x00000002
#define ARM_CPU_FEATURE_CRC32 0x00000004
#define ARM_CPU_FEATURE_SHA3 0x00000008
#define ARM_CPU_FEATURE_DOTPROD 0x00000010
#define ARM_CPU_FEATURE_NEON (1 << 0)
#define ARM_CPU_FEATURE_PMULL (1 << 1)
#define ARM_CPU_FEATURE_CRC32 (1 << 2)
#define ARM_CPU_FEATURE_SHA3 (1 << 3)
#define ARM_CPU_FEATURE_DOTPROD (1 << 4)

#define HAVE_NEON(features) (HAVE_NEON_NATIVE || ((features) & ARM_CPU_FEATURE_NEON))
#define HAVE_PMULL(features) (HAVE_PMULL_NATIVE || ((features) & ARM_CPU_FEATURE_PMULL))
Expand All @@ -56,7 +56,7 @@
#define HAVE_DOTPROD(features) (HAVE_DOTPROD_NATIVE || ((features) & ARM_CPU_FEATURE_DOTPROD))

#if HAVE_DYNAMIC_ARM_CPU_FEATURES
#define ARM_CPU_FEATURES_KNOWN 0x80000000
#define ARM_CPU_FEATURES_KNOWN (1U << 31)
extern volatile u32 libdeflate_arm_cpu_features;

void libdeflate_init_arm_cpu_features(void);
Expand Down Expand Up @@ -98,8 +98,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
#if HAVE_PMULL_NATIVE || \
(HAVE_DYNAMIC_ARM_CPU_FEATURES && \
HAVE_NEON_INTRIN /* needed to exclude soft float arm32 case */ && \
(GCC_PREREQ(6, 1) || CLANG_PREREQ(3, 5, 6010000) || \
defined(_MSC_VER)) && \
(GCC_PREREQ(6, 1) || defined(__clang__) || defined(_MSC_VER)) && \
/*
* On arm32 with clang, the crypto intrinsics (which include pmull)
* are not defined, even when using -mfpu=crypto-neon-fp-armv8,
Expand Down Expand Up @@ -179,9 +178,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
!defined(__ARM_ARCH_7EM__)
# define HAVE_CRC32_INTRIN 1
# endif
# elif CLANG_PREREQ(3, 4, 6000000)
# define HAVE_CRC32_INTRIN 1
# elif defined(_MSC_VER)
# elif defined(__clang__) || defined(_MSC_VER)
# define HAVE_CRC32_INTRIN 1
# endif
#endif
Expand Down
85 changes: 14 additions & 71 deletions lib/x86/adler32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,11 @@
(void)a, (void)b, (void)c, (void)d, (void)e, (void)f
#endif

/* SSE2 implementation */
#if HAVE_SSE2_INTRIN
/*
* SSE2 and AVX2 implementations. They are very similar; the AVX2
* implementation just uses twice the vector width as the SSE2 one.
*/
#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
# define adler32_sse2 adler32_sse2
# define FUNCNAME adler32_sse2
# define FUNCNAME_CHUNK adler32_sse2_chunk
Expand All @@ -117,12 +120,7 @@
* would behave incorrectly.
*/
# define IMPL_MAX_CHUNK_LEN (32 * (0x7FFF / 0xFF))
# if HAVE_SSE2_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES _target_attribute("sse2")
# endif
# include <emmintrin.h>
# define ATTRIBUTES _target_attribute("sse2")
static forceinline ATTRIBUTES void
adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
{
Expand Down Expand Up @@ -202,33 +200,14 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, 1);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_SSE2_INTRIN */

/*
* AVX2 implementation. Basically the same as the SSE2 one, but with the vector
* width doubled.
*/
#if HAVE_AVX2_INTRIN
# define adler32_avx2 adler32_avx2
# define FUNCNAME adler32_avx2
# define FUNCNAME_CHUNK adler32_avx2_chunk
# define IMPL_ALIGNMENT 32
# define IMPL_SEGMENT_LEN 64
# define IMPL_MAX_CHUNK_LEN (64 * (0x7FFF / 0xFF))
# if HAVE_AVX2_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES _target_attribute("avx2")
# endif
# include <immintrin.h>
/*
* With clang in MSVC compatibility mode, immintrin.h incorrectly skips
* including some sub-headers.
*/
# if defined(__clang__) && defined(_MSC_VER)
# include <avxintrin.h>
# include <avx2intrin.h>
# endif
# define ATTRIBUTES _target_attribute("avx2")
static forceinline ATTRIBUTES void
adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
{
Expand Down Expand Up @@ -282,38 +261,21 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, 1);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX2_INTRIN */
#endif

/*
* AVX2/AVX-VNNI implementation. This is similar to the AVX512BW/AVX512VNNI
* implementation, but instead of using AVX-512 it uses AVX2 plus AVX-VNNI.
* AVX-VNNI adds dot product instructions to CPUs without AVX-512.
*/
#if HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN
#if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930)
# define adler32_avx2_vnni adler32_avx2_vnni
# define FUNCNAME adler32_avx2_vnni
# define FUNCNAME_CHUNK adler32_avx2_vnni_chunk
# define IMPL_ALIGNMENT 32
# define IMPL_SEGMENT_LEN 128
# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN
# if HAVE_AVX2_NATIVE && HAVE_AVXVNNI_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES _target_attribute("avx2,avxvnni")
# endif
# include <immintrin.h>
/*
* With clang in MSVC compatibility mode, immintrin.h incorrectly skips
* including some sub-headers.
*/
# if defined(__clang__) && defined(_MSC_VER)
# include <tmmintrin.h>
# include <smmintrin.h>
# include <wmmintrin.h>
# include <avxintrin.h>
# include <avx2intrin.h>
# include <avxvnniintrin.h>
# endif
# define ATTRIBUTES _target_attribute("avx2,avxvnni")
static forceinline ATTRIBUTES void
adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end,
u32 *s1, u32 *s2)
Expand Down Expand Up @@ -372,39 +334,20 @@ adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end,
ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a, 1);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN */
#endif

/*
* AVX512BW/AVX512VNNI implementation. Uses the vpdpbusd (dot product)
* instruction from AVX512VNNI.
*/
#if HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN
#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)
# define adler32_avx512_vnni adler32_avx512_vnni
# define FUNCNAME adler32_avx512_vnni
# define FUNCNAME_CHUNK adler32_avx512_vnni_chunk
# define IMPL_ALIGNMENT 64
# define IMPL_SEGMENT_LEN 128
# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN
# if HAVE_AVX512BW_NATIVE && HAVE_AVX512VNNI_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES _target_attribute("avx512bw,avx512vnni")
# endif
# include <immintrin.h>
/*
* With clang in MSVC compatibility mode, immintrin.h incorrectly skips
* including some sub-headers.
*/
# if defined(__clang__) && defined(_MSC_VER)
# include <tmmintrin.h>
# include <smmintrin.h>
# include <wmmintrin.h>
# include <avxintrin.h>
# include <avx2intrin.h>
# include <avx512fintrin.h>
# include <avx512bwintrin.h>
# include <avx512vnniintrin.h>
# endif
# define ATTRIBUTES _target_attribute("avx512bw,avx512vnni")
static forceinline ATTRIBUTES void
adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end,
u32 *s1, u32 *s2)
Expand Down Expand Up @@ -452,7 +395,7 @@ adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end,
ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a, 0);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN */
#endif

static inline adler32_func_t
arch_select_adler32_func(void)
Expand Down
12 changes: 1 addition & 11 deletions lib/x86/cpu_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,6 @@

#if HAVE_DYNAMIC_X86_CPU_FEATURES

/*
* With old GCC versions we have to manually save and restore the x86_32 PIC
* register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602
*/
#if defined(ARCH_X86_32) && defined(__PIC__)
# define EBX_CONSTRAINT "=&r"
#else
# define EBX_CONSTRAINT "=b"
#endif

/* Execute the CPUID instruction. */
static inline void
cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
Expand All @@ -56,7 +46,7 @@ cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
__asm__ volatile(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n"
"cpuid \n"
".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
: "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
: "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
: "a" (leaf), "c" (subleaf));
#endif
}
Expand Down
Loading
Loading