Skip to content

Commit

Permalink
lib/arm: use asm fallback when clang intrinsics unusable
Browse files Browse the repository at this point in the history
Instead of manually defining macros like __ARM_FEATURE_CRC32 to get the
intrinsic headers of clang 15 and earlier to work, just use inline
assembly.  This should be a better solution as it does not rely on clang
implementation details as much.

We already used an inline assembly fallback for veor3q_u8 with gcc 8,
and with clang 7 through 12.  This commit extends the same pattern to
the crc32 and dotprod intrinsics, and extends the version range to clang
15.  It also drops gcc 8 from the veor3q_u8 fallback, as that is just a
single major version and not worth enabling the fallback for.
  • Loading branch information
ebiggers committed Mar 17, 2024
1 parent 5db1ebd commit 0c0530c
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 55 deletions.
106 changes: 64 additions & 42 deletions lib/arm/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
#if (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) && \
(HAVE_NEON_NATIVE || (GCC_PREREQ(6, 1) && defined(__ARM_FP)))
# define HAVE_NEON_INTRIN 1
# include <arm_neon.h>
#else
# define HAVE_NEON_INTRIN 0
#endif
Expand Down Expand Up @@ -124,6 +125,41 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
#if defined(ARCH_ARM64) && \
(defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER))
# define HAVE_CRC32_INTRIN 1
# ifndef _MSC_VER
# include <arm_acle.h>
# endif
/*
* Use an inline assembly fallback for clang 15 and earlier, which only
* defined the crc32 intrinsics when crc32 is enabled in the main target.
*/
# if defined(__clang__) && !CLANG_PREREQ(16, 0, 16000000) && \
!defined(__ARM_FEATURE_CRC32)
# undef __crc32b
# define __crc32b(a, b) \
({ uint32_t res; \
__asm__("crc32b %w0, %w1, %w2" \
: "=r" (res) : "r" (a), "r" (b)); \
res; })
# undef __crc32h
# define __crc32h(a, b) \
({ uint32_t res; \
__asm__("crc32h %w0, %w1, %w2" \
: "=r" (res) : "r" (a), "r" (b)); \
res; })
# undef __crc32w
# define __crc32w(a, b) \
({ uint32_t res; \
__asm__("crc32w %w0, %w1, %w2" \
: "=r" (res) : "r" (a), "r" (b)); \
res; })
# undef __crc32d
# define __crc32d(a, b) \
({ uint32_t res; \
__asm__("crc32x %w0, %w1, %2" \
: "=r" (res) : "r" (a), "r" (b)); \
res; })
# pragma clang diagnostic ignored "-Wgnu-statement-expression"
# endif
#else
# define HAVE_CRC32_INTRIN 0
#endif
Expand All @@ -134,17 +170,24 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
#else
# define HAVE_SHA3_NATIVE 0
#endif
#if defined(ARCH_ARM64) && \
(GCC_PREREQ(8, 1) /* r256478 */ || \
CLANG_PREREQ(7, 0, 10010463) /* r338010 */)
# define HAVE_SHA3_TARGET 1
#else
# define HAVE_SHA3_TARGET 0
#endif
#if defined(ARCH_ARM64) && HAVE_NEON_INTRIN && \
(GCC_PREREQ(9, 1) /* r268049 */ || \
CLANG_PREREQ(13, 0, 13160000))
CLANG_PREREQ(7, 0, 10010463) /* r338010 */)
# define HAVE_SHA3_INTRIN 1
/*
* Use an inline assembly fallback for clang 15 and earlier, which only
* defined the sha3 intrinsics when sha3 is enabled in the main target.
*/
# if defined(__clang__) && !CLANG_PREREQ(16, 0, 16000000) && \
!defined(__ARM_FEATURE_SHA3)
# undef veor3q_u8
# define veor3q_u8(a, b, c) \
({ uint8x16_t res; \
__asm__("eor3 %0.16b, %1.16b, %2.16b, %3.16b" \
: "=w" (res) : "w" (a), "w" (b), "w" (c)); \
res; })
# pragma clang diagnostic ignored "-Wgnu-statement-expression"
# endif
#else
# define HAVE_SHA3_INTRIN 0
#endif
Expand All @@ -158,44 +201,23 @@ static inline u32 get_arm_cpu_features(void) { return 0; }
#if defined(ARCH_ARM64) && HAVE_NEON_INTRIN && \
(GCC_PREREQ(8, 1) || CLANG_PREREQ(7, 0, 10010000) || defined(_MSC_VER))
# define HAVE_DOTPROD_INTRIN 1
/*
* Use an inline assembly fallback for clang 15 and earlier, which only
* defined the dotprod intrinsics when dotprod is enabled in the main target.
*/
# if defined(__clang__) && !CLANG_PREREQ(16, 0, 16000000)
# undef vdotq_u32
# define vdotq_u32(a, b, c) \
({ uint32x4_t res = (a); \
__asm__("udot %0.4s, %1.16b, %2.16b" \
: "+w" (res) : "w" (b), "w" (c)); \
res; })
# pragma clang diagnostic ignored "-Wgnu-statement-expression"
# endif
#else
# define HAVE_DOTPROD_INTRIN 0
#endif

/*
* Work around bugs in arm_acle.h and arm_neon.h where sometimes intrinsics are
* only defined when the corresponding __ARM_FEATURE_* macro is defined. The
* intrinsics actually work in target attribute functions too if they are
* defined, though, so work around this by temporarily defining the
* corresponding __ARM_FEATURE_* macros while including the headers.
*/
#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && defined(__clang__)
# define __ARM_FEATURE_CRC32 1
#endif
#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__)
# define __ARM_FEATURE_SHA3 1
#endif
#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__)
# define __ARM_FEATURE_DOTPROD 1
#endif

#if defined(__GNUC__) || defined(__clang__)
# include <arm_acle.h>
#endif
#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
# include <arm_neon.h>
#endif

#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && defined(__clang__)
# undef __ARM_FEATURE_CRC32
#endif
#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__)
# undef __ARM_FEATURE_SHA3
#endif
#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__)
# undef __ARM_FEATURE_DOTPROD
#endif

#endif /* ARCH_ARM32 || ARCH_ARM64 */

#endif /* LIB_ARM_CPU_FEATURES_H */
5 changes: 1 addition & 4 deletions lib/arm/crc32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -544,11 +544,8 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
*
* This like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from
* the sha3 extension) for even better performance.
*
* Note: we require HAVE_SHA3_TARGET rather than HAVE_SHA3_INTRIN, as we have an
* inline asm fallback for eor3.
*/
#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_TARGET
#if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN
# define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3
# define SUFFIX _pmullx12_crc_eor3
# ifdef __clang__
Expand Down
10 changes: 1 addition & 9 deletions lib/arm/crc32_pmull_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,18 +91,10 @@ static forceinline ATTRIBUTES uint8x16_t
ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c)
{
#if ENABLE_EOR3
#if HAVE_SHA3_INTRIN
return veor3q_u8(a, b, c);
#else
uint8x16_t res;

__asm__("eor3 %0.16b, %1.16b, %2.16b, %3.16b"
: "=w" (res) : "w" (a), "w" (b), "w" (c));
return res;
#endif
#else /* ENABLE_EOR3 */
return veorq_u8(veorq_u8(a, b), c);
#endif /* !ENABLE_EOR3 */
#endif
}
#define eor3 ADD_SUFFIX(eor3)

Expand Down

0 comments on commit 0c0530c

Please sign in to comment.