diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index e79d0607..9e101ccc 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -202,11 +202,11 @@ jobs: name: Cross Compilation runs-on: ubuntu-22.04 env: - CC: clang - CXX: clang++ - AR: llvm-ar - NM: llvm-nm - RANLIB: llvm-ranlib + CC: clang-16 + CXX: clang++-16 + AR: llvm-ar-16 + NM: llvm-nm-16 + RANLIB: llvm-ranlib-16 strategy: fail-fast: false @@ -222,11 +222,15 @@ jobs: # C/C++ # We need to install the cross-compilation toolchain for ARM64 and ARMHF + # Clang 16 isn't available from default repos on Ubuntu 22.04, so we have to install it manually - name: Install dependencies run: | sudo apt-get update - sudo apt-get install -y clang lld make crossbuild-essential-arm64 crossbuild-essential-armhf - + sudo apt-get install -y make build-essential crossbuild-essential-arm64 crossbuild-essential-armhf libjemalloc-dev + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 16 + - name: Build C/C++ run: | cmake -B build_artifacts \ diff --git a/CMakeLists.txt b/CMakeLists.txt index c62613fe..ddad3ea1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,14 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) "MinSizeRel" "RelWithDebInfo") endif() +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64") + SET(SZ_PLATFORM_X86 TRUE) + message(STATUS "Platform: x86") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm64|ARM64") + SET(SZ_PLATFORM_ARM TRUE) + message(STATUS "Platform: ARM") +endif() + # Determine if StringZilla is built as a subproject (using `add_subdirectory`) # or if it is the main project set(STRINGZILLA_IS_MAIN_PROJECT OFF) @@ -99,8 +107,17 @@ if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER 3.13) enable_testing() endif() +if (MSVC) + # Remove /RTC* from MSVC debug flags by default (it will be added back in the set_compiler_flags function) + # Because /RTC* cannot be used 
without the crt so it needs to be disabled for that specific target + string(REGEX REPLACE "/RTC[^ ]*" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") + string(REGEX REPLACE "/RTC[^ ]*" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") +endif() + # Function to set compiler-specific flags function(set_compiler_flags target cpp_standard target_arch) + get_target_property(target_type ${target} TYPE) + target_include_directories(${target} PRIVATE scripts) target_link_libraries(${target} PRIVATE ${STRINGZILLA_TARGET_NAME}) @@ -152,10 +169,18 @@ function(set_compiler_flags target cpp_standard target_arch) "$<$,$,$>>:/Zi>" ) + if(NOT target_type STREQUAL "SHARED_LIBRARY") + if(MSVC) + target_compile_options(${target} PRIVATE "$<$:/RTC1>") + endif() + endif() + # If available, enable Position Independent Code - if(CMAKE_POSITION_INDEPENDENT_CODE) + get_target_property(target_pic ${target} POSITION_INDEPENDENT_CODE) + if(target_pic) target_compile_options(${target} PRIVATE "$<$:-fPIC>") target_link_options(${target} PRIVATE "$<$:-fPIC>") + target_compile_definitions(${target} PRIVATE "$<$:SZ_PIC>") endif() # Avoid builtin functions where we know what we are doing. 
@@ -163,6 +188,7 @@ function(set_compiler_flags target cpp_standard target_arch) target_compile_options(${target} PRIVATE "$<$:-fno-builtin-memchr>") target_compile_options(${target} PRIVATE "$<$:-fno-builtin-memcpy>") target_compile_options(${target} PRIVATE "$<$:-fno-builtin-memset>") + target_compile_options(${target} PRIVATE "$<$:/Oi->") # Check for ${target_arch} and set it or use the current system if not defined if("${target_arch}" STREQUAL "") @@ -202,17 +228,19 @@ function(set_compiler_flags target cpp_standard target_arch) # Sanitizer options for Debug mode if(CMAKE_BUILD_TYPE STREQUAL "Debug") - target_compile_options( - ${target} - PRIVATE - "$<$:-fsanitize=address;-fsanitize=leak>" - "$<$:/fsanitize=address>") - - target_link_options( - ${target} - PRIVATE - "$<$:-fsanitize=address;-fsanitize=leak>" - "$<$:/fsanitize=address>") + if(NOT target_type STREQUAL "SHARED_LIBRARY") + target_compile_options( + ${target} + PRIVATE + "$<$:-fsanitize=address;-fsanitize=leak>" + "$<$:/fsanitize=address>") + + target_link_options( + ${target} + PRIVATE + "$<$:-fsanitize=address;-fsanitize=leak>" + "$<$:/fsanitize=address>") + endif() # Define SZ_DEBUG macro based on build configuration target_compile_definitions( @@ -248,7 +276,7 @@ if(${STRINGZILLA_BUILD_TEST}) # Check system architecture to avoid complex cross-compilation workflows, but # compile multiple backends: disabling all SIMD, enabling only AVX2, only AVX-512, only Arm Neon. 
- if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64") + if(SZ_PLATFORM_X86) # x86 specific backends if (MSVC) define_launcher(stringzilla_test_cpp20_x86_serial scripts/test.cpp 20 "AVX") @@ -259,7 +287,7 @@ if(${STRINGZILLA_BUILD_TEST}) define_launcher(stringzilla_test_cpp20_x86_avx2 scripts/test.cpp 20 "haswell") define_launcher(stringzilla_test_cpp20_x86_avx512 scripts/test.cpp 20 "sapphirerapids") endif() - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64|arm64|ARM64") + elseif(SZ_PLATFORM_ARM) # ARM specific backends define_launcher(stringzilla_test_cpp20_arm_serial scripts/test.cpp 20 "armv8-a") define_launcher(stringzilla_test_cpp20_arm_neon scripts/test.cpp 20 "armv8-a+simd") @@ -267,29 +295,52 @@ if(${STRINGZILLA_BUILD_TEST}) endif() if(${STRINGZILLA_BUILD_SHARED}) - add_library(stringzilla_shared SHARED c/lib.c) - set_compiler_flags(stringzilla_shared "" "${STRINGZILLA_TARGET_ARCH}") + + function(define_shared target) + add_library(${target} SHARED c/lib.c) + + set_target_properties(${target} PROPERTIES + VERSION ${PROJECT_VERSION} + SOVERSION 1 + POSITION_INDEPENDENT_CODE ON + PUBLIC_HEADER include/stringzilla/stringzilla.h) + + if (SZ_PLATFORM_X86) + if (MSVC) + set_compiler_flags(${target} "" "SSE2") + else() + set_compiler_flags(${target} "" "ivybridge") + endif() + + target_compile_definitions(${target} PRIVATE + "SZ_USE_X86_AVX512=1" + "SZ_USE_X86_AVX2=1" + "SZ_USE_ARM_NEON=0" + "SZ_USE_ARM_SVE=0") + elseif(SZ_PLATFORM_ARM) + set_compiler_flags(${target} "" "armv8-a") + + target_compile_definitions(${target} PRIVATE + "SZ_USE_X86_AVX512=0" + "SZ_USE_X86_AVX2=0" + "SZ_USE_ARM_NEON=1" + "SZ_USE_ARM_SVE=1") + endif() + endfunction() + + define_shared(stringzilla_shared) target_compile_definitions(stringzilla_shared PRIVATE "SZ_AVOID_LIBC=0") target_compile_definitions(stringzilla_shared PRIVATE "SZ_OVERRIDE_LIBC=1") - set_target_properties(stringzilla_shared PROPERTIES - VERSION ${PROJECT_VERSION} - SOVERSION 1 - POSITION_INDEPENDENT_CODE ON - 
PUBLIC_HEADER include/stringzilla/stringzilla.h) - + # Try compiling a version without linking the LibC - add_library(stringzillite SHARED c/lib.c) - set_compiler_flags(stringzillite "" "${STRINGZILLA_TARGET_ARCH}") + define_shared(stringzillite) target_compile_definitions(stringzillite PRIVATE "SZ_AVOID_LIBC=1") target_compile_definitions(stringzillite PRIVATE "SZ_OVERRIDE_LIBC=1") - set_target_properties(stringzillite PROPERTIES - VERSION ${PROJECT_VERSION} - SOVERSION 1 - POSITION_INDEPENDENT_CODE ON - PUBLIC_HEADER include/stringzilla/stringzilla.h) # Avoid built-ins on MSVC and other compilers, as that will cause compilation errors target_compile_options(stringzillite PRIVATE "$<$:-fno-builtin;-nostdlib>" - "$<$:/Oi->") -endif() \ No newline at end of file + "$<$:/Oi-;/GS->") + target_link_options(stringzillite PRIVATE "$<$:-nostdlib>") + target_link_options(stringzillite PRIVATE "$<$:/NODEFAULTLIB>") +endif() diff --git a/c/lib.c b/c/lib.c index 3a66f6bd..edffd14e 100644 --- a/c/lib.c +++ b/c/lib.c @@ -224,6 +224,14 @@ BOOL WINAPI DllMain(HINSTANCE hints, DWORD forward_reason, LPVOID lp) { case DLL_PROCESS_DETACH: return TRUE; } } + +#if SZ_AVOID_LIBC +BOOL WINAPI _DllMainCRTStartup(HINSTANCE hints, DWORD forward_reason, LPVOID lp) { + DllMain(hints, forward_reason, lp); + return TRUE; +} +#endif + #else __attribute__((constructor)) static void sz_dispatch_table_init_on_gcc_or_clang(void) { sz_dispatch_table_init(); } #endif @@ -356,30 +364,54 @@ SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_ sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data); } -// It's much harder to override the C standard library on Windows and MSVC, -// so we'll just provide the symbols for other Operating Systems. 
-#if SZ_OVERRIDE_LIBC && !(defined(_WIN32) || defined(__CYGWIN__)) +// Provide overrides for the libc mem* functions +#if SZ_OVERRIDE_LIBC && !(defined(__CYGWIN__)) +// SZ_DYNAMIC can't be used here for MSVC, because MSVC complains about different linkage (C2375), probably due to the +// CRT headers specifying the function as __declspec(dllimport), there might be a combination of defines that works. But +// for now they will be manually exported using linker flags + +#if defined(_MSC_VER) +#pragma comment(linker, "/export:memchr") +void *__cdecl memchr(void const *s, int c_wide, size_t n) { +#else SZ_DYNAMIC void *memchr(void const *s, int c_wide, size_t n) { +#endif sz_u8_t c = (sz_u8_t)c_wide; return (void *)sz_find_byte(s, n, (sz_cptr_t)&c); } +#if defined(_MSC_VER) +#pragma comment(linker, "/export:memcpy") +void *__cdecl memcpy(void *dest, void const *src, size_t n) { +#else SZ_DYNAMIC void *memcpy(void *dest, void const *src, size_t n) { +#endif sz_copy(dest, src, n); return (void *)dest; } +#if defined(_MSC_VER) +#pragma comment(linker, "/export:memmove") +void *__cdecl memmove(void *dest, void const *src, size_t n) { +#else SZ_DYNAMIC void *memmove(void *dest, void const *src, size_t n) { +#endif sz_move(dest, src, n); return (void *)dest; } +#if defined(_MSC_VER) +#pragma comment(linker, "/export:memset") +void *__cdecl memset(void *s, int c, size_t n) { +#else SZ_DYNAMIC void *memset(void *s, int c, size_t n) { +#endif sz_fill(s, n, c); return (void *)s; } +#if !defined(_MSC_VER) SZ_DYNAMIC void *memmem(void const *h, size_t h_len, void const *n, size_t n_len) { return (void *)sz_find(h, h_len, n, n_len); } @@ -393,5 +425,5 @@ SZ_DYNAMIC void memfrob(void *s, size_t n) { char const *base64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; sz_generate(base64, 64, s, n, SZ_NULL, SZ_NULL); } - +#endif #endif // SZ_OVERRIDE_LIBC diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index b160c327..fe5356d9 
100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -1144,11 +1144,15 @@ SZ_PUBLIC void sz_sort_intro(sz_sequence_t *sequence, sz_sequence_comparator_t l #include #endif // SZ_USE_X86... #if SZ_USE_ARM_NEON +#if !defined(_MSC_VER) #include +#endif #include #endif // SZ_USE_ARM_NEON #if SZ_USE_ARM_SVE +#if !defined(_MSC_VER) #include +#endif #endif // SZ_USE_ARM_SVE #pragma region Hardware - Specific API @@ -1282,7 +1286,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_charset_neon(sz_cptr_t text, sz_size_t length, sz_c * to check the invariants of the library. It's a no-op in the SZ_RELEASE mode. * @note If you want to catch it, put a breakpoint at @b `__GI_exit` */ -#if SZ_DEBUG +#if SZ_DEBUG && defined(SZ_AVOID_LIBC) && !SZ_AVOID_LIBC && !defined(SZ_PIC) #include // `fprintf` #include // `EXIT_FAILURE` #define sz_assert(condition) \ @@ -1341,7 +1345,11 @@ SZ_INTERNAL int sz_u32_popcount(sz_u32_t x) { return (int)__popcnt(x); } #endif SZ_INTERNAL int sz_u32_ctz(sz_u32_t x) { return (int)_tzcnt_u32(x); } SZ_INTERNAL int sz_u32_clz(sz_u32_t x) { return (int)_lzcnt_u32(x); } +// Force the byteswap functions to be intrinsics, because when /Oi- is given, these will turn into CRT function calls, +// which breaks when SZ_AVOID_LIBC is given +#pragma intrinsic(_byteswap_uint64) SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return _byteswap_uint64(val); } +#pragma intrinsic(_byteswap_ulong) SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) { return _byteswap_ulong(val); } #else SZ_INTERNAL int sz_u64_popcount(sz_u64_t x) { return __builtin_popcountll(x); } @@ -3352,6 +3360,11 @@ SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *alloca sz_string_init(string); } +// When overriding libc, disable optimisations for this function because MSVC will optimize the loops into a memset. Which then causes a stack overflow due to infinite recursion (memset -> sz_fill_serial -> memset). 
+#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC +#pragma optimize("", off) +#endif SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) { sz_ptr_t end = target + length; // Dealing with short strings, a single sequential pass would be faster. @@ -3368,6 +3381,9 @@ SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) while (target != end) *(target++) = value; } } +#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC +#pragma optimize("", on) +#endif SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { #if SZ_USE_MISALIGNED_LOADS @@ -4139,8 +4155,8 @@ SZ_PUBLIC sz_ordering_t sz_order_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr // The rare case, when both string are very long. while ((a_length >= 64) & (b_length >= 64)) { - a_vec.zmm = _mm512_loadu_epi8(a); - b_vec.zmm = _mm512_loadu_epi8(b); + a_vec.zmm = _mm512_loadu_si512(a); + b_vec.zmm = _mm512_loadu_si512(b); mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); if (mask_not_equal != 0) { sz_u64_t first_diff = _tzcnt_u64(mask_not_equal); @@ -4181,8 +4197,8 @@ SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length) sz_u512_vec_t a_vec, b_vec; while (length >= 64) { - a_vec.zmm = _mm512_loadu_epi8(a); - b_vec.zmm = _mm512_loadu_epi8(b); + a_vec.zmm = _mm512_loadu_si512(a); + b_vec.zmm = _mm512_loadu_si512(b); mask = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm); if (mask != 0) return sz_false_k; a += 64, b += 64, length -= 64; @@ -4201,14 +4217,14 @@ SZ_PUBLIC sz_bool_t sz_equal_avx512(sz_cptr_t a, sz_cptr_t b, sz_size_t length) } SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - for (; length >= 64; target += 64, length -= 64) _mm512_storeu_epi8(target, _mm512_set1_epi8(value)); + for (; length >= 64; target += 64, length -= 64) _mm512_storeu_si512(target, _mm512_set1_epi8(value)); // At this point the length is 
guaranteed to be under 64. _mm512_mask_storeu_epi8(target, _sz_u64_mask_until(length), _mm512_set1_epi8(value)); } SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { for (; length >= 64; target += 64, source += 64, length -= 64) - _mm512_storeu_epi8(target, _mm512_loadu_epi8(source)); + _mm512_storeu_si512(target, _mm512_loadu_si512(source)); // At this point the length is guaranteed to be under 64. __mmask64 mask = _sz_u64_mask_until(length); _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); @@ -4217,7 +4233,7 @@ SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { if (target < source || target >= source + length) { for (; length >= 64; target += 64, source += 64, length -= 64) - _mm512_storeu_epi8(target, _mm512_loadu_epi8(source)); + _mm512_storeu_si512(target, _mm512_loadu_si512(source)); // At this point the length is guaranteed to be under 64. __mmask64 mask = _sz_u64_mask_until(length); _mm512_mask_storeu_epi8(target, mask, _mm512_maskz_loadu_epi8(mask, source)); @@ -4225,7 +4241,7 @@ SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt else { // Jump to the end and walk backwards. for (target += length, source += length; length >= 64; length -= 64) - _mm512_storeu_epi8(target -= 64, _mm512_loadu_epi8(source -= 64)); + _mm512_storeu_si512(target -= 64, _mm512_loadu_si512(source -= 64)); // At this point the length is guaranteed to be under 64. 
__mmask64 mask = _sz_u64_mask_until(length); _mm512_mask_storeu_epi8(target - length, mask, _mm512_maskz_loadu_epi8(mask, source - length)); @@ -4238,7 +4254,7 @@ SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr n_vec.zmm = _mm512_set1_epi8(n[0]); while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_epi8(h); + h_vec.zmm = _mm512_loadu_si512(h); mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); if (mask) return h + sz_u64_ctz(mask); h += 64, h_length -= 64; @@ -4275,9 +4291,9 @@ SZ_PUBLIC sz_cptr_t sz_find_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, // Scan through the string. for (; h_length >= n_length + 64; h += 64, h_length -= 64) { - h_first_vec.zmm = _mm512_loadu_epi8(h + offset_first); - h_mid_vec.zmm = _mm512_loadu_epi8(h + offset_mid); - h_last_vec.zmm = _mm512_loadu_epi8(h + offset_last); + h_first_vec.zmm = _mm512_loadu_si512(h + offset_first); + h_mid_vec.zmm = _mm512_loadu_si512(h + offset_mid); + h_last_vec.zmm = _mm512_loadu_si512(h + offset_last); matches = _kand_mask64(_kand_mask64( // Intersect the masks _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), @@ -4317,7 +4333,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_byte_avx512(sz_cptr_t h, sz_size_t h_length, sz_cpt n_vec.zmm = _mm512_set1_epi8(n[0]); while (h_length >= 64) { - h_vec.zmm = _mm512_loadu_epi8(h + h_length - 64); + h_vec.zmm = _mm512_loadu_si512(h + h_length - 64); mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm); if (mask) return h + h_length - 1 - sz_u64_clz(mask); h_length -= 64; @@ -4356,9 +4372,9 @@ SZ_PUBLIC sz_cptr_t sz_rfind_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n sz_cptr_t h_reversed; for (; h_length >= n_length + 64; h_length -= 64) { h_reversed = h + h_length - n_length - 64 + 1; - h_first_vec.zmm = _mm512_loadu_epi8(h_reversed + offset_first); - h_mid_vec.zmm = _mm512_loadu_epi8(h_reversed + offset_mid); - h_last_vec.zmm = _mm512_loadu_epi8(h_reversed + 
offset_last); + h_first_vec.zmm = _mm512_loadu_si512(h_reversed + offset_first); + h_mid_vec.zmm = _mm512_loadu_si512(h_reversed + offset_mid); + h_last_vec.zmm = _mm512_loadu_si512(h_reversed + offset_last); matches = _kand_mask64(_kand_mask64( // Intersect the masks _mm512_cmpeq_epi8_mask(h_first_vec.zmm, n_first_vec.zmm), _mm512_cmpeq_epi8_mask(h_mid_vec.zmm, n_mid_vec.zmm)), @@ -4880,10 +4896,10 @@ SZ_INTERNAL sz_ssize_t _sz_alignment_score_wagner_fisher_upto17m_avx512( // // Load one row of the substitution matrix into four ZMM registers. sz_error_cost_t const *row_subs = subs + shorter_unsigned[idx_shorter] * 256u; - row_first_subs_vec.zmm = _mm512_loadu_epi8(row_subs + 64 * 0); - row_second_subs_vec.zmm = _mm512_loadu_epi8(row_subs + 64 * 1); - row_third_subs_vec.zmm = _mm512_loadu_epi8(row_subs + 64 * 2); - row_fourth_subs_vec.zmm = _mm512_loadu_epi8(row_subs + 64 * 3); + row_first_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 0); + row_second_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 1); + row_third_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 2); + row_fourth_subs_vec.zmm = _mm512_loadu_si512(row_subs + 64 * 3); // In the serial version we have one forward pass, that computes the deletion, // insertion, and substitution costs at once. diff --git a/pyproject.toml b/pyproject.toml index c0e25016..8fcdcfb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ # = meaning 16 platforms * 7 Python versions = 112 builds. # = meaning over 500,000 tests. 
[build-system] -requires = ["setuptools>=42", "wheel", "cmake>=3.22"] +requires = ["setuptools>=42", "wheel"] build-backend = "setuptools.build_meta" [tool.pytest.ini_options] @@ -74,3 +74,45 @@ repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest [tool.cibuildwheel.windows] before-build = ["rd /s /q {project}\\build || echo Done"] + +# Detect x86 64-bit builds +[[tool.cibuildwheel.overrides]] +select = "*-win_amd64" +inherit.environment = "append" +environment.SZ_X86_64="1" + +[[tool.cibuildwheel.overrides]] +select = "*-manylinux*_x86_64" +inherit.environment = "append" +environment.SZ_X86_64="1" + +[[tool.cibuildwheel.overrides]] +select = "*-musllinux*_x86_64" +inherit.environment = "append" +environment.SZ_X86_64="1" + +[[tool.cibuildwheel.overrides]] +select = "*-macos*_x86_64" +inherit.environment = "append" +environment.SZ_X86_64="1" + +# Detect ARM 64-bit builds +[[tool.cibuildwheel.overrides]] +select = "*-win_arm64" +inherit.environment = "append" +environment.SZ_ARM64="1" + +[[tool.cibuildwheel.overrides]] +select = "*-manylinux*_aarch64" +inherit.environment = "append" +environment.SZ_ARM64="1" + +[[tool.cibuildwheel.overrides]] +select = "*-musllinux*_aarch64" +inherit.environment = "append" +environment.SZ_ARM64="1" + +[[tool.cibuildwheel.overrides]] +select = "*-macos*_arm64" +inherit.environment = "append" +environment.SZ_ARM64="1" diff --git a/setup.py b/setup.py index 8ab44991..e2b3e09f 100644 --- a/setup.py +++ b/setup.py @@ -14,14 +14,21 @@ def get_compiler() -> str: return "" +using_cibuildwheels = os.environ.get("CIBUILDWHEEL", "0") == "1" + + def is_64bit_x86() -> bool: + if using_cibuildwheels: + return "SZ_X86_64" in os.environ arch = platform.machine() - return arch == "x86_64" or arch == "i386" + return arch in ["x86_64", "x64", "AMD64"] def is_64bit_arm() -> bool: + if using_cibuildwheels: + return "SZ_ARM64" in os.environ arch = platform.machine() - return arch.startswith("arm") + return arch in ["arm64", 
"aarch64", "ARM64"] def is_big_endian() -> bool: @@ -54,9 +61,6 @@ def linux_settings() -> Tuple[List[str], List[str], List[Tuple[str]]]: ("SZ_DETECT_BIG_ENDIAN", "1" if is_big_endian() else "0"), ] - if is_64bit_arm(): - compile_args.append("-march=armv8-a+simd") - return compile_args, link_args, macros_args @@ -98,14 +102,14 @@ def windows_settings() -> Tuple[List[str], List[str], List[Tuple[str]]]: "/O2", # maximum optimization level ] - # Detect supported architectures for MSVC. - macros_args = [] - if "AVX512" in platform.processor(): - macros_args.append(("SZ_USE_X86_AVX512", "1")) - compile_args.append("/arch:AVX512") - if "AVX2" in platform.processor(): - macros_args.append(("SZ_USE_X86_AVX2", "1")) - compile_args.append("/arch:AVX2") + # When packaging the library, even if the current machine doesn't support AVX-512 or SVE, still precompile those. + macros_args = [ + ("SZ_USE_X86_AVX512", "1" if is_64bit_x86() else "0"), + ("SZ_USE_X86_AVX2", "1" if is_64bit_x86() else "0"), + ("SZ_USE_ARM_SVE", "1" if is_64bit_arm() else "0"), + ("SZ_USE_ARM_NEON", "1" if is_64bit_arm() else "0"), + ("SZ_DETECT_BIG_ENDIAN", "1" if is_big_endian() else "0"), + ] link_args = [] return compile_args, link_args, macros_args