diff --git a/.github/workflows/Dockerfile.libs b/.github/workflows/Dockerfile.libs deleted file mode 100644 index dfee1a26..00000000 --- a/.github/workflows/Dockerfile.libs +++ /dev/null @@ -1,48 +0,0 @@ -FROM ubuntu:22.04 -ENV DEBIAN_FRONTEND=noninteractive - -ENV user_name="runner" -ARG TARGETPLATFORM -ARG docker_ip -ARG user_pass - -RUN apt-get update && \ - apt-get install -y --no-install-recommends cmake g++-12 gcc-12 libc++-dev libc++abi-dev build-essential openssh-client sshpass && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -ENV CC=gcc-12 -ENV CXX=g++-12 - -# Copy only the parts we need to build the library. -COPY CMakeLists.txt CMakeLists.txt -COPY include/ include/ -COPY c/ c/ -COPY VERSION VERSION - -# As we are building from inside the container, the "main project" resolution -# may not work as expected, so override `STRINGZILLA_BUILD_SHARED=1`. -RUN cmake -DCMAKE_BUILD_TYPE=Release \ - -DSTRINGZILLA_BUILD_SHARED=1 \ - -DSTRINGZILLA_BUILD_BENCHMARK=0 \ - -DSTRINGZILLA_BUILD_TEST=0 \ - -B ./build_release -RUN cmake --build ./build_release --config Release --target stringzillite - -RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \ - export arch="amd"; \ - elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - export arch="arm"; \ - fi && \ - version=$(cat VERSION) && \ - mkdir -p stringzillite_linux_"$arch"_"$version"/DEBIAN && \ - touch stringzillite_linux_"$arch"_"$version"/DEBIAN/control && \ - mkdir -p stringzillite_linux_"$arch"_"$version"/usr/local/lib && \ - mkdir stringzillite_linux_"$arch"_"$version"/usr/local/include && \ - cp include/stringzilla/stringzilla.h stringzillite_linux_"$arch"_"$version"/usr/local/include/ && \ - cp ./build_release/libstringzillite.so stringzillite_linux_"$arch"_"$version"/usr/local/lib/ && \ - echo "Package: stringzilla\nVersion: $version\nMaintainer: Ash Vardanian\nArchitecture: "$arch"\nDescription: SIMD-accelerated string search, sort, hashes, fingerprints, & edit distances" > 
stringzillite_linux_"$arch"_"$version"/DEBIAN/control && \ - dpkg-deb --build stringzillite_linux_"$arch"_"$version" && \ - ls -l . && \ - sshpass -p "$user_pass" ssh -o StrictHostKeyChecking=no "$user_name"@"$docker_ip" ls -l /home/$user_name/work/stringzilla/ && \ - sshpass -p "$user_pass" scp -o StrictHostKeyChecking=no stringzillite_linux_"$arch"_"$version".deb "$user_name"@"$docker_ip":/home/$user_name/work/stringzilla/stringzillite_linux_"$arch"_"$version".deb diff --git a/.github/workflows/build_tools.sh b/.github/workflows/build_tools.sh index e1cde117..e2d980a2 100755 --- a/.github/workflows/build_tools.sh +++ b/.github/workflows/build_tools.sh @@ -27,11 +27,11 @@ esac # Set build type case "$BUILD_TYPE" in "Debug") - BUILD_DIR="./build_debug" + BUILD_DIR="build_debug" BUILD_FLAGS="-DCMAKE_BUILD_TYPE=Debug" ;; "Release") - BUILD_DIR="./build_release" + BUILD_DIR="build_release" BUILD_FLAGS="-DCMAKE_BUILD_TYPE=RelWithDebInfo" ;; *) diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index 0213f679..47a7c872 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -31,11 +31,13 @@ jobs: # C/C++ # If the compilation fails, we want to log the compilation commands in addition to # the standard output. - - name: Build C/C++ + - name: Install dependencies run: | sudo apt update sudo apt install -y cmake build-essential libjemalloc-dev libomp-dev gcc-12 g++-12 + - name: Build C/C++ + run: | cmake -B build_artifacts \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ @@ -56,14 +58,14 @@ jobs: exit 1 } - name: Test C++ - run: ./build_artifacts/stringzilla_test_cpp20 + run: build_artifacts/stringzilla_test_cpp20 - name: Test on Real World Data run: | - ./build_artifacts/stringzilla_bench_search ${DATASET_PATH} # for substring search - ./build_artifacts/stringzilla_bench_token ${DATASET_PATH} # for hashing, equality comparisons, etc. 
- ./build_artifacts/stringzilla_bench_similarity ${DATASET_PATH} # for edit distances and alignment scores - ./build_artifacts/stringzilla_bench_sort ${DATASET_PATH} # for sorting arrays of strings - ./build_artifacts/stringzilla_bench_container ${DATASET_PATH} # for STL containers with string keys + build_artifacts/stringzilla_bench_search ${DATASET_PATH} # for substring search + build_artifacts/stringzilla_bench_token ${DATASET_PATH} # for hashing, equality comparisons, etc. + build_artifacts/stringzilla_bench_similarity ${DATASET_PATH} # for edit distances and alignment scores + build_artifacts/stringzilla_bench_sort ${DATASET_PATH} # for sorting arrays of strings + build_artifacts/stringzilla_bench_container ${DATASET_PATH} # for STL containers with string keys env: DATASET_PATH: ./README.md # Don't overload GitHub with our benchmarks. @@ -104,13 +106,21 @@ jobs: env: CC: clang-16 CXX: clang++-16 + strategy: + fail-fast: false + matrix: + include: + - arch: amd64 + target: x86_64-linux-gnu + - arch: arm64 + target: aarch64-linux-gnu steps: - uses: actions/checkout@v4 # C/C++ # Clang 16 isn't available from default repos on Ubuntu 22.04, so we have to install it manually - - name: Build C/C++ + - name: Install dependencies run: | sudo apt update sudo apt install -y cmake build-essential libjemalloc-dev @@ -118,6 +128,8 @@ jobs: chmod +x llvm.sh sudo ./llvm.sh 16 + - name: Build C/C++ + run: | cmake -B build_artifacts \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ @@ -138,14 +150,14 @@ jobs: exit 1 } - name: Test C++ - run: ./build_artifacts/stringzilla_test_cpp20 + run: build_artifacts/stringzilla_test_cpp20 - name: Test on Real World Data run: | - ./build_artifacts/stringzilla_bench_search ${DATASET_PATH} # for substring search - ./build_artifacts/stringzilla_bench_token ${DATASET_PATH} # for hashing, equality comparisons, etc. 
- ./build_artifacts/stringzilla_bench_similarity ${DATASET_PATH} # for edit distances and alignment scores - ./build_artifacts/stringzilla_bench_sort ${DATASET_PATH} # for sorting arrays of strings - ./build_artifacts/stringzilla_bench_container ${DATASET_PATH} # for STL containers with string keys + build_artifacts/stringzilla_bench_search ${DATASET_PATH} # for substring search + build_artifacts/stringzilla_bench_token ${DATASET_PATH} # for hashing, equality comparisons, etc. + build_artifacts/stringzilla_bench_similarity ${DATASET_PATH} # for edit distances and alignment scores + build_artifacts/stringzilla_bench_sort ${DATASET_PATH} # for sorting arrays of strings + build_artifacts/stringzilla_bench_container ${DATASET_PATH} # for STL containers with string keys env: DATASET_PATH: ./README.md # Don't overload GitHub with our benchmarks. @@ -186,7 +198,7 @@ jobs: # Temporary workaround to run Swift tests on Linux # Based on: https://github.com/swift-actions/setup-swift/issues/591#issuecomment-1685710678 test_ubuntu_swift: - name: Ubuntu (Swift) + name: Swift on Linux runs-on: ubuntu-22.04 container: swift:5.9 steps: @@ -194,6 +206,57 @@ jobs: - name: Test Swift run: swift test + test_ubuntu_cross_compilation: + name: Cross Compilation + runs-on: ubuntu-22.04 + env: + CC: clang + CXX: clang++ + AR: llvm-ar + NM: llvm-nm + RANLIB: llvm-ranlib + + strategy: + fail-fast: false + matrix: + include: + - arch: amd64 + target: x86_64-linux-gnu + - arch: arm64 + target: aarch64-linux-gnu + + steps: + - uses: actions/checkout@v4 + + # C/C++ + # We need to install the cross-compilation toolchain for ARM64 and ARMHF + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y clang lld make crossbuild-essential-arm64 crossbuild-essential-armhf + + - name: Build C/C++ + run: | + cmake -B build_artifacts \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ + -DCMAKE_C_COMPILER_TARGET=${{ matrix.target }} \ + 
-DCMAKE_CXX_COMPILER_TARGET=${{ matrix.target }} \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_SYSTEM_PROCESSOR=${{ matrix.arch }} \ + -DSTRINGZILLA_BUILD_SHARED=1 \ + -DSTRINGZILLA_BUILD_BENCHMARK=1 \ + -DSTRINGZILLA_BUILD_TEST=1 + + cmake --build build_artifacts --config RelWithDebInfo + + # We can't run the produced builds, but we can make sure they exist + - name: Test artifacts presence + run: | + test -f build_artifacts/stringzilla_test_cpp20 + test -f build_artifacts/libstringzilla.so + test -f build_artifacts/libstringzillite.so + test_macos: name: MacOS runs-on: macos-12 @@ -202,10 +265,12 @@ jobs: - uses: actions/checkout@v4 # C/C++ - - name: Build C/C++ + - name: Install dependencies run: | brew update brew install cmake + - name: Build C/C++ + run: | cmake -B build_artifacts \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ @@ -213,14 +278,14 @@ jobs: -DSTRINGZILLA_BUILD_TEST=1 cmake --build build_artifacts --config RelWithDebInfo - name: Test C++ - run: ./build_artifacts/stringzilla_test_cpp17 + run: build_artifacts/stringzilla_test_cpp17 - name: Test on Real World Data run: | - ./build_artifacts/stringzilla_bench_search ${DATASET_PATH} # for substring search - ./build_artifacts/stringzilla_bench_token ${DATASET_PATH} # for hashing, equality comparisons, etc. 
+ build_artifacts/stringzilla_bench_similarity ${DATASET_PATH} # for edit distances and alignment scores + build_artifacts/stringzilla_bench_sort ${DATASET_PATH} # for sorting arrays of strings + build_artifacts/stringzilla_bench_container ${DATASET_PATH} # for STL containers with string keys env: DATASET_PATH: ./README.md # Don't overload GitHub with our benchmarks. @@ -272,7 +337,7 @@ jobs: -DCMAKE_EXPORT_COMPILE_COMMANDS=1 ^ -DSTRINGZILLA_BUILD_BENCHMARK=1 ^ -DSTRINGZILLA_BUILD_TEST=1 - + cmake --build build_artifacts --config RelWithDebInfo > build_artifacts/logs.txt 2>&1 || ( echo "Compilation failed. Here are the logs:" type build_artifacts\logs.txt @@ -297,7 +362,7 @@ jobs: # Don't overload GitHub with our benchmarks. # The results in such an unstable environment will be meaningless anyway. if: 0 - + # Python - name: Set up Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 @@ -331,7 +396,7 @@ jobs: -DSTRINGZILLA_BUILD_TEST=1 cmake --build build_artifacts --config RelWithDebInfo - name: Test C++ - run: ./build_artifacts/stringzilla_test_cpp20 + run: build_artifacts/stringzilla_test_cpp20 # Python - name: Build Python diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 44fcae17..40bf009a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -155,47 +155,75 @@ jobs: name: Create Debian Package for ${{ matrix.arch }} runs-on: ubuntu-22.04 needs: versioning + env: + CC: clang + CXX: clang++ + AR: llvm-ar + NM: llvm-nm + RANLIB: llvm-ranlib + strategy: fail-fast: false matrix: - arch: ["linux/amd64", "linux/arm64"] + include: + - arch: amd64 + target: x86_64-linux-gnu + - arch: arm64 + target: aarch64-linux-gnu + steps: - uses: actions/checkout@v4 with: persist-credentials: false ref: "main" - - run: git submodule update --init --recursive - - - name: Setup QEMU - uses: docker/setup-qemu-action@v3 - - name: Setup Docker - uses: docker/setup-buildx-action@v3 - - name: Get Docker daemon IP - id: 
dockerip + - name: Get version + id: version run: | - docker_ip=$(ip addr show docker0 | grep -Po 'inet \K[\d.]+') && \ - echo "docker_ip=$docker_ip" >> "$GITHUB_OUTPUT" + version=$(cat VERSION) + echo "version=$version" >> $GITHUB_OUTPUT - - name: Set runner user password - id: userpass + - name: Install dependencies run: | - user_pass="1122" - echo "runner:$user_pass" | sudo chpasswd - echo "user_pass=$user_pass" >> "$GITHUB_OUTPUT" + sudo apt-get update + sudo apt-get install -y clang lld make crossbuild-essential-arm64 crossbuild-essential-armhf - - name: Build library + - name: Build C/C++ run: | - docker buildx create --use - docker buildx build \ - --platform ${{ matrix.arch }} \ - --build-arg docker_ip=${{ steps.dockerip.outputs.docker_ip }} \ - --build-arg user_pass=${{ steps.userpass.outputs.user_pass }} \ - --file .github/workflows/Dockerfile.libs \ - --tag ashvardanian/stringzillite-library:latest \ - --load . - - name: Clear Docker buildx cache - run: docker buildx prune --all --force + # Configure and build the project + cmake -B build_release \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ + -DCMAKE_C_COMPILER_TARGET=${{ matrix.target }} \ + -DCMAKE_CXX_COMPILER_TARGET=${{ matrix.target }} \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_SYSTEM_PROCESSOR=${{ matrix.arch }} \ + -DSTRINGZILLA_BUILD_SHARED=1 \ + -DSTRINGZILLA_BUILD_BENCHMARK=1 \ + -DSTRINGZILLA_BUILD_TEST=1 + + cmake --build build_release --config Release + + # Copy and package the library + echo "ARTIFACT_NAME=stringzillite_linux_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}" >> $GITHUB_ENV + cp build_release/libstringzillite.so "$ARTIFACT_NAME.so" + + mkdir -p "$ARTIFACT_NAME/DEBIAN" + touch "$ARTIFACT_NAME/DEBIAN/control" + mkdir -p "$ARTIFACT_NAME/usr/local/lib" + mkdir "$ARTIFACT_NAME/usr/local/include" + cp include/stringzilla/stringzilla.h "$ARTIFACT_NAME/usr/local/include/" + cp build_release/libstringzillite.so "$ARTIFACT_NAME/usr/local/lib/" + 
echo "Package: stringzilla\nVersion: ${{ steps.set_version.outputs.version }}\nMaintainer: Ash Vardanian\nArchitecture: ${{ matrix.arch }}\nDescription: SIMD-accelerated string search, sort, hashes, fingerprints, & edit distances" > "$ARTIFACT_NAME/DEBIAN/control" + dpkg-deb --build "$ARTIFACT_NAME" + + - name: Upload library + uses: xresloader/upload-to-github-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + file: "*.so" + update_latest_release: true - name: Upload archive uses: xresloader/upload-to-github-release@v1 @@ -238,9 +266,9 @@ jobs: - name: Build library run: | - cmake -DCMAKE_BUILD_TYPE=Release -B ./build_release - cmake --build ./build_release --config Release - tar -cvf "stringzillite_windows_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}.tar" "./build_release/stringzillite.dll" "./include/stringzilla/stringzilla.h" + cmake -DCMAKE_BUILD_TYPE=Release -B build_release + cmake --build build_release --config Release + tar -cvf "stringzillite_windows_${{ matrix.arch }}_${{ steps.set_version.outputs.version }}.tar" "build_release/stringzillite.dll" "./include/stringzilla/stringzilla.h" - name: Upload archive uses: xresloader/upload-to-github-release@v1 @@ -273,8 +301,8 @@ jobs: - name: Build library run: | - cmake -DCMAKE_BUILD_TYPE=Release -B ./build_release - cmake --build ./build_release --config Release + cmake -DCMAKE_BUILD_TYPE=Release -B build_release + cmake --build build_release --config Release zip -r stringzillite_macos_${{ matrix.arch }}_${{ steps.version.outputs.version }}.zip build_release/libstringzillite.dylib include/stringzilla/stringzilla.h - name: Upload archive diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 75bc203e..f8f156d0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -113,10 +113,10 @@ Using modern syntax, this is how you build and run the test suite: ```bash cmake -DSTRINGZILLA_BUILD_TEST=1 -B build_debug -cmake --build ./build_debug --config Debug # Which will produce the 
following targets: -./build_debug/stringzilla_test_cpp20 # Unit test for the entire library compiled for current hardware -./build_debug/stringzilla_test_cpp20_x86_serial # x86 variant compiled for IvyBridge - last arch. before AVX2 -./build_debug/stringzilla_test_cpp20_arm_serial # Arm variant compiled without Neon +cmake --build build_debug --config Debug # Which will produce the following targets: +build_debug/stringzilla_test_cpp20 # Unit test for the entire library compiled for current hardware +build_debug/stringzilla_test_cpp20_x86_serial # x86 variant compiled for IvyBridge - last arch. before AVX2 +build_debug/stringzilla_test_cpp20_arm_serial # Arm variant compiled without Neon ``` To use CppCheck for static analysis make sure to export the compilation commands. @@ -148,12 +148,12 @@ For benchmarks, you can use the following commands: ```bash cmake -DSTRINGZILLA_BUILD_BENCHMARK=1 -B build_release -cmake --build ./build_release --config Release # Which will produce the following targets: -./build_release/stringzilla_bench_search # for substring search -./build_release/stringzilla_bench_token # for hashing, equality comparisons, etc. -./build_release/stringzilla_bench_similarity # for edit distances and alignment scores -./build_release/stringzilla_bench_sort # for sorting arrays of strings -./build_release/stringzilla_bench_container # for STL containers with string keys +cmake --build build_release --config Release # Which will produce the following targets: +build_release/stringzilla_bench_search # for substring search +build_release/stringzilla_bench_token # for hashing, equality comparisons, etc. 
+build_release/stringzilla_bench_similarity # for edit distances and alignment scores +build_release/stringzilla_bench_sort # for sorting arrays of strings +build_release/stringzilla_bench_container # for STL containers with string keys ``` ### Benchmarking Hardware-Specific Optimizations @@ -199,14 +199,14 @@ cmake -DSTRINGZILLA_BUILD_BENCHMARK=1 \ -DSTRINGZILLA_BUILD_SHARED=1 \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -B build_profile -cmake --build ./build_profile --config Release --target stringzilla_bench_token +cmake --build build_profile --config Release --target stringzilla_bench_token # Check that the debugging symbols are there with your favorite tool -readelf --sections ./build_profile/stringzilla_bench_token | grep debug -objdump -h ./build_profile/stringzilla_bench_token | grep debug +readelf --sections build_profile/stringzilla_bench_token | grep debug +objdump -h build_profile/stringzilla_bench_token | grep debug # Profile -sudo perf record -g ./build_profile/stringzilla_bench_token ./leipzig1M.txt +sudo perf record -g build_profile/stringzilla_bench_token ./leipzig1M.txt sudo perf report ``` @@ -225,8 +225,8 @@ sudo docker run -it --rm -v "$(pwd)":/workspace/StringZilla alpine:latest /bin/a cd /workspace/StringZilla apk add --update make cmake g++ gcc cmake -DSTRINGZILLA_BUILD_TEST=1 -B build_debug -cmake --build ./build_debug --config Debug -./build_debug/stringzilla_test_cpp20 +cmake --build build_debug --config Debug +build_debug/stringzilla_test_cpp20 ``` #### Intel Clear Linux @@ -241,15 +241,15 @@ cd /workspace/StringZilla swupd update swupd bundle-add c-basic dev-utils cmake -DSTRINGZILLA_BUILD_TEST=1 -B build_debug -cmake --build ./build_debug --config Debug -./build_debug/stringzilla_test_cpp20 +cmake --build build_debug --config Debug +build_debug/stringzilla_test_cpp20 ``` For benchmarks: ```bash cmake -DSTRINGZILLA_BUILD_TEST=1 -DSTRINGZILLA_BUILD_BENCHMARK=1 -B build_release -cmake --build ./build_release --config Release +cmake --build 
build_release --config Release ``` #### Amazon Linux @@ -261,7 +261,7 @@ sudo docker run -it --rm -v "$(pwd)":/workspace/StringZilla amazonlinux:2023 bas cd /workspace/StringZilla yum install -y make cmake3 gcc g++ cmake3 -DSTRINGZILLA_BUILD_TEST=1 -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DSTRINGZILLA_TARGET_ARCH="ivybridge" -B build_debug -cmake3 --build ./build_debug --config Debug --target stringzilla_test_cpp11 +cmake3 --build build_debug --config Debug --target stringzilla_test_cpp11 build_debug/stringzilla_test_cpp11 ``` @@ -273,7 +273,7 @@ sudo docker run -it --rm -v "$(pwd)":/workspace/StringZilla amazonlinux:2 bash cd /workspace/StringZilla yum install -y make cmake3 gcc10 gcc10-c++ cmake3 -DSTRINGZILLA_BUILD_TEST=1 -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DSTRINGZILLA_TARGET_ARCH="ivybridge" -B build_debug -cmake3 --build ./build_debug --config Debug --target stringzilla_test_cpp11 +cmake3 --build build_debug --config Debug --target stringzilla_test_cpp11 build_debug/stringzilla_test_cpp11 ``` @@ -290,6 +290,44 @@ Don't forget to clean up Docker afterwards. docker system prune -a --volumes ``` +### Cross Compilation + +Unlike GCC, LLVM handles cross compilation very easily. +You just need to pass the right `TARGET_ARCH` and `BUILD_ARCH` to CMake. 
+The [cross-compilation toolchain packages available on Ubuntu](https://packages.ubuntu.com/search?keywords=crossbuild-essential&searchon=names) include: + +- `crossbuild-essential-amd64` for 64-bit x86 +- `crossbuild-essential-arm64` for 64-bit Arm +- `crossbuild-essential-armhf` for 32-bit ARM hard-float +- `crossbuild-essential-armel` for 32-bit ARM soft-float (emulates `float`) +- `crossbuild-essential-riscv64` for RISC-V +- `crossbuild-essential-powerpc` for PowerPC +- `crossbuild-essential-s390x` for IBM Z +- `crossbuild-essential-mips` for MIPS +- `crossbuild-essential-ppc64el` for PowerPC 64-bit little-endian + +Here is an example for cross-compiling for Arm64 on an x86_64 machine: + +```sh +sudo apt-get update +sudo apt-get install -y clang lld make crossbuild-essential-arm64 crossbuild-essential-armhf +export CC="clang" +export CXX="clang++" +export AR="llvm-ar" +export NM="llvm-nm" +export RANLIB="llvm-ranlib" +export TARGET_ARCH="aarch64-linux-gnu" # Or "x86_64-linux-gnu" +export BUILD_ARCH="arm64" # Or "amd64" + +cmake -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER_TARGET=${TARGET_ARCH} \ + -DCMAKE_CXX_COMPILER_TARGET=${TARGET_ARCH} \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_SYSTEM_PROCESSOR=${BUILD_ARCH} \ + -B build_artifacts +cmake --build build_artifacts --config Release +``` + ## Contributing in Python Python bindings are implemented using pure CPython, so you wouldn't need to install SWIG, PyBind11, or any other third-party library. diff --git a/README.md b/README.md index 3c07735e..02fed7a0 100644 --- a/README.md +++ b/README.md @@ -348,7 +348,7 @@ x: Strs = text.splitlines(keeplinebreaks=False, maxsplit=sys.maxsize) It's important to note, that the last function behavior is slightly different from Python's `str.splitlines`. The [native version][faq-splitlines] matches `\n`, `\r`, `\v` or `\x0b`, `\f` or `\x0c`, `\x1c`, `\x1d`, `\x1e`, `\x85`, `\r\n`, `\u2028`, `\u2029`, including 3x two-bytes-long runes. 
-The StringZilla version matches only `\n` and is practically a shortcut for `text.split('\n')`. +The StringZilla version matches only `\n`, `\v`, `\f`, `\r`, `\x1c`, `\x1d`, `\x1e`, `\x85`, avoiding two-byte-long runes. [faq-splitlines]: https://docs.python.org/3/library/stdtypes.html#str.splitlines diff --git a/python/lib.c b/python/lib.c index a8645940..70db9d3b 100644 --- a/python/lib.c +++ b/python/lib.c @@ -2323,14 +2323,30 @@ static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs } } - // TODO: Support arbitrary newline characters: - // https://docs.python.org/3/library/stdtypes.html#str.splitlines - // \n, \r, \r\n, \v or \x0b, \f or \x0c, \x1c, \x1d, \x1e, \x85, \u2028, \u2029 - // https://github.com/ashvardanian/StringZilla/issues/29 + // The Unicode standard defines a number of characters that conforming applications + // should recognize as line terminators: + // + // LF: Line Feed, U+000A - 1 byte (\n) + // VT: Vertical Tab, U+000B - 1 byte (\v) + // FF: Form Feed, U+000C - 1 byte (\f) + // CR: Carriage Return, U+000D - 1 byte (\r) + // NEL: Next Line, U+0085 - 1 byte (\x85) + // LS: Line Separator, U+2028 - 2 bytes + // PS: Paragraph Separator, U+2029 - 2 bytes + // CR+LF: CR (U+000D) followed by LF (U+000A) - 2 bytes + // + // The Python standard is different, it also includes: + // + // FS: File Separator, U+001C - 1 byte (\x1C) + // GS: Group Separator, U+001D - 1 byte (\x1D) + // RS: Record Separator, U+001E - 1 byte (\x1E) + // + // We avoid all 2-byte sequences and only consider 1-byte delimiters. 
+ // CPython docs: https://docs.python.org/3/library/stdtypes.html#str.splitlines sz_string_view_t separator; - separator.start = "\n"; - separator.length = 1; - return Str_split_(text_object, text, separator, keeplinebreaks, maxsplit, &sz_find, 1); + separator.start = "\x0A\x0B\x0C\x0D\x85\x1C\x1D\x1E"; + separator.length = 8; + return Str_split_(text_object, text, separator, keeplinebreaks, maxsplit, &sz_find_char_from, 1); } static PyObject *Str_concat(PyObject *self, PyObject *other) { diff --git a/scripts/bench_search.cpp b/scripts/bench_search.cpp index 6fdd70ae..ca869334 100644 --- a/scripts/bench_search.cpp +++ b/scripts/bench_search.cpp @@ -297,9 +297,9 @@ int main(int argc, char const **argv) { bench_finds(dataset.text, {"\n"}, find_functions()); bench_rfinds(dataset.text, {"\n"}, rfind_functions()); - std::printf("Benchmarking for an [\\n\\r] RegEx:\n"); - bench_finds(dataset.text, {{sz::newlines(), sizeof(sz::newlines())}}, find_charset_functions()); - bench_rfinds(dataset.text, {{sz::newlines(), sizeof(sz::newlines())}}, rfind_charset_functions()); + std::printf("Benchmarking for an [\\n\\r\\v\\f] RegEx:\n"); + bench_finds(dataset.text, {"\n\r\v\f"}, find_charset_functions()); + bench_rfinds(dataset.text, {"\n\r\v\f"}, rfind_charset_functions()); // Typical ASCII tokenization and validation benchmarks std::printf("Benchmarking for whitespaces:\n");