From 0ee549a106b1ee524fa8059888219c03635e11e6 Mon Sep 17 00:00:00 2001 From: Govind Date: Sat, 7 Dec 2024 12:08:15 +0100 Subject: [PATCH] Make: Inline ASM for detecting CPU features on ARM Closes #143 --- c/lib.c | 49 ++++++++++++++++++++++++------- include/stringzilla/stringzilla.h | 5 ++-- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/c/lib.c b/c/lib.c index ee48400e..f38ac534 100644 --- a/c/lib.c +++ b/c/lib.c @@ -38,6 +38,43 @@ extern void *malloc(size_t length); #endif #endif +// On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API. +#if defined(__APPLE__) && defined(__MACH__) +#define SZ_APPLE 1 +#include <sys/sysctl.h> +#endif + +#if defined(__linux__) +#define SZ_LINUX 1 +#endif + +SZ_INTERNAL sz_capability_t sz_capabilities_arm(void) { + // Reports the ARM SIMD extensions usable at runtime, always OR-ed with `sz_cap_serial_k`. Approach follows SimSIMD: + // https://github.com/ashvardanian/SimSIMD/blob/28e536083602f85ad0c59456782c8864463ffb0e/include/simsimd/simsimd.h#L434 +#if defined(SZ_APPLE) + + // On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API. 
+ uint32_t supports_neon = 0; + size_t size = sizeof(supports_neon); + if (sysctlbyname("hw.optional.neon", &supports_neon, &size, NULL, 0) != 0) supports_neon = 0; + + return (sz_capability_t)( // + (sz_cap_arm_neon_k * (supports_neon != 0)) | // + (sz_cap_serial_k)); + +#elif defined(SZ_LINUX) && defined(__aarch64__) + unsigned long long id_aa64pfr0_el1 = 0; // NEON is mandatory on AArch64, so only SVE is probed + __asm__ __volatile__("mrs %0, ID_AA64PFR0_EL1" : "=r"(id_aa64pfr0_el1)); + unsigned supports_sve = ((id_aa64pfr0_el1 >> 32) & 0xF) >= 1; // SVE field is bits [35:32] + return (sz_capability_t)( // + (sz_cap_arm_neon_k) | // + (sz_cap_arm_sve_k * (supports_sve)) | // + (sz_cap_serial_k)); +#else // Unknown OS or not AArch64: report only the serial backend + return sz_cap_serial_k; +#endif +} + SZ_DYNAMIC sz_capability_t sz_capabilities(void) { #if SZ_USE_X86_AVX512 || SZ_USE_X86_AVX2 @@ -96,22 +133,12 @@ SZ_DYNAMIC sz_capability_t sz_capabilities(void) { #if SZ_USE_ARM_NEON || SZ_USE_ARM_SVE - // Every 64-bit Arm CPU supports NEON - unsigned supports_neon = 1; - unsigned supports_sve = 0; - unsigned supports_sve2 = 0; - sz_unused(supports_sve); - sz_unused(supports_sve2); - - return (sz_capability_t)( // - (sz_cap_arm_neon_k * supports_neon) | // - (sz_cap_serial_k)); + return sz_capabilities_arm(); #endif // SIMSIMD_TARGET_ARM return sz_cap_serial_k; } - typedef struct sz_implementations_t { sz_equal_t equal; sz_order_t order; diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index 7aa9e6da..588a3282 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -260,7 +260,8 @@ typedef enum sz_capability_t { sz_cap_arm_neon_k = 1 << 10, /// ARM NEON capability sz_cap_arm_sve_k = 1 << 11, /// ARM SVE capability TODO: Not yet supported or used - + sz_cap_arm_sve2_k = 1 << 12, /// ARM SVE2 capability + sz_cap_arm_sve2p1_k = 1 << 13, /// ARM SVE2.1 capability sz_cap_x86_avx2_k = 1 << 20, /// x86 AVX2 capability sz_cap_x86_avx512f_k = 1 << 21, /// x86 AVX512 F capability sz_cap_x86_avx512bw_k = 1 << 22, /// x86 AVX512 BW instruction capability @@ -268,8 +269,6 @@ 
typedef enum sz_capability_t { sz_cap_x86_avx512vbmi_k = 1 << 24, /// x86 AVX512 VBMI instruction capability sz_cap_x86_gfni_k = 1 << 25, /// x86 AVX512 GFNI instruction capability - sz_cap_x86_avx512vbmi2_k = 1 << 26, /// x86 AVX512 VBMI 2 instruction capability - } sz_capability_t; /**