Merge pull request #111 from ashvardanian/main-dev

String decoding & MSVC builds
ashvardanian · Mar 3, 2024 · a86ae00 · a86ae00
2 parents 66fd8a9 + 184c1e6
commit a86ae00
Show file tree

Hide file tree

Showing 10 changed files with 226 additions and 51 deletions.
diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml
@@ -262,7 +262,43 @@ jobs:
     runs-on: windows-latest
     steps:
       - uses: actions/checkout@v4
+      - uses: ilammy/msvc-dev-cmd@v1
 
+      - name: Build C/C++
+        shell: cmd
+        run: |
+          cmake -GNinja -B build_artifacts ^
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo ^
+            -DCMAKE_EXPORT_COMPILE_COMMANDS=1 ^
+            -DSTRINGZILLA_BUILD_BENCHMARK=1 ^
+            -DSTRINGZILLA_BUILD_TEST=1
+          
+          cmake --build build_artifacts --config RelWithDebInfo > build_artifacts/logs.txt 2>&1 || (
+            echo "Compilation failed. Here are the logs:"
+            type build_artifacts\logs.txt
+            echo "The original compilation commands:"
+            type build_artifacts\compile_commands.json
+            echo:
+            echo "CPU Features:"
+            wmic cpu list /format:list
+            exit 1
+          )
+      - name: Test C++
+        run: .\build_artifacts\stringzilla_test_cpp20.exe
+      # Smoke-tests every benchmark binary against a small dataset.
+      # No `shell:` is set for this step, so the Windows runner executes it with PowerShell.
+      # In PowerShell `${NAME}` is a regular (here undefined) variable that expands to nothing,
+      # so the environment variable from `env:` below must be read as `$env:NAME`.
+      - name: Test on Real World Data
+        run: |
+          .\build_artifacts\stringzilla_bench_search.exe $env:DATASET_PATH     # for substring search
+          .\build_artifacts\stringzilla_bench_token.exe $env:DATASET_PATH      # for hashing, equality comparisons, etc.
+          .\build_artifacts\stringzilla_bench_similarity.exe $env:DATASET_PATH # for edit distances and alignment scores
+          .\build_artifacts\stringzilla_bench_sort.exe $env:DATASET_PATH       # for sorting arrays of strings
+          .\build_artifacts\stringzilla_bench_container.exe $env:DATASET_PATH  # for STL containers with string keys
+        env:
+          DATASET_PATH: ./README.md
+        # Don't overload GitHub with our benchmarks.
+        # The results in such an unstable environment will be meaningless anyway.
+        if: 0
+
+        # Python
       - name: Set up Python ${{ env.PYTHON_VERSION }}
         uses: actions/setup-python@v5
         with:

diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,7 @@
 build/
 build_debug/
 build_release/
+build_artifacts*
 
 # Yes, everyone loves keeping this file in the history.
 # But with a very minimalistic binding and just a couple of dependencies 
@@ -27,6 +28,7 @@ CMakeFiles
 *.pyd
 .venv/*
 node_modules/
+.vs/
 
 # Recommended datasets
 leipzig1M.txt

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -252,7 +252,8 @@
     "xmemory": "cpp",
     "xtr1common": "cpp",
     "xtree": "cpp",
-    "xutility": "cpp"
+    "xutility": "cpp",
+    "errno.h": "c"
   },
   "python.pythonPath": "~/miniconda3/bin/python"
 }
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -239,9 +239,15 @@ if(${STRINGZILLA_BUILD_TEST})
   # compile multiple backends: disabling all SIMD, enabling only AVX2, only AVX-512, only Arm Neon.
   if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64")
     # x86 specific backends
-    define_launcher(stringzilla_test_cpp20_x86_serial scripts/test.cpp 20 "ivybridge")
-    define_launcher(stringzilla_test_cpp20_x86_avx2 scripts/test.cpp 20 "haswell")
-    define_launcher(stringzilla_test_cpp20_x86_avx512 scripts/test.cpp 20 "sapphirerapids")
+    if (MSVC)
+      define_launcher(stringzilla_test_cpp20_x86_serial scripts/test.cpp 20 "AVX")
+      define_launcher(stringzilla_test_cpp20_x86_avx2 scripts/test.cpp 20 "AVX2")
+      define_launcher(stringzilla_test_cpp20_x86_avx512 scripts/test.cpp 20 "AVX512")
+    else()
+      define_launcher(stringzilla_test_cpp20_x86_serial scripts/test.cpp 20 "ivybridge")
+      define_launcher(stringzilla_test_cpp20_x86_avx2 scripts/test.cpp 20 "haswell")
+      define_launcher(stringzilla_test_cpp20_x86_avx512 scripts/test.cpp 20 "sapphirerapids")
+    endif()
   elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
     # ARM specific backends
     define_launcher(stringzilla_test_cpp20_arm_serial scripts/test.cpp 20 "armv8-a")

diff --git a/README.md b/README.md
@@ -309,14 +309,14 @@ Consider contributing, if you need a feature that's not yet implemented.
 
 ### Basic Usage
 
-If you've ever used the Python `str` or `bytes` class, you'll know what to expect.
+If you've ever used the Python `str`, `bytes`, `bytearray`, or `memoryview` classes, you'll know what to expect.
 StringZilla's `Str` class is a hybrid of those, providing `str`-like interface to byte-arrays.
 
 ```python
 from stringzilla import Str, File
 
-text_from_str = Str('some-string')
-text_from_file = Str(File('some-file.txt'))
+text_from_str = Str('some-string') # no copies, just a view
+text_from_file = Str(File('some-file.txt')) # memory-mapped file
 ```
 
 The `File` class memory-maps a file from persistent memory without loading its copy into RAM.
@@ -328,18 +328,23 @@ A standard dataset pre-processing use case would be to map a sizeable textual da
 - Length: `len(text) -> int`
 - Indexing: `text[42] -> str`
 - Slicing: `text[42:46] -> Str`
-- String conversion: `str(text) -> str`
 - Substring check: `'substring' in text -> bool`
 - Hashing: `hash(text) -> int`
+- String conversion: `str(text) -> str`
 
 ### Advanced Operations
 
-- `text.contains('substring', start=0, end=9223372036854775807) -> bool`
-- `text.find('substring', start=0, end=9223372036854775807) -> int`
-- `text.count('substring', start=0, end=9223372036854775807, allowoverlap=False) -> int`
-- `text.split(separator=' ', maxsplit=9223372036854775807, keepseparator=False) -> Strs`
-- `text.rsplit(separator=' ', maxsplit=9223372036854775807, keepseparator=False) -> Strs`
-- `text.splitlines(keeplinebreaks=False, maxsplit=9223372036854775807) -> Strs`
+```py
+import sys
+
+x: bool = text.contains('substring', start=0, end=sys.maxsize)
+x: int = text.find('substring', start=0, end=sys.maxsize)
+x: int = text.count('substring', start=0, end=sys.maxsize, allowoverlap=False)
+x: str = text.decode(encoding='utf-8', errors='strict')
+x: Strs = text.split(separator=' ', maxsplit=sys.maxsize, keepseparator=False)
+x: Strs = text.rsplit(separator=' ', maxsplit=sys.maxsize, keepseparator=False)
+x: Strs = text.splitlines(keeplinebreaks=False, maxsplit=sys.maxsize)
+```
 
 It's important to note that the last function's behavior is slightly different from Python's `str.splitlines`.
 The [native version][faq-splitlines] matches `\n`, `\r`, `\v` or `\x0b`, `\f` or `\x0c`, `\x1c`, `\x1d`, `\x1e`, `\x85`, `\r\n`, `\u2028`, `\u2029`, including 3x two-bytes-long runes.
@@ -353,15 +358,14 @@ Python strings don't natively support character set operations.
 This forces people to use regular expressions, which are slow and hard to read.
 To avoid the need for `re.finditer`, StringZilla provides the following interfaces:
 
-- `text.find_first_of('chars', start=0, end=9223372036854775807) -> int`
-- `text.find_last_of('chars', start=0, end=9223372036854775807) -> int`
-- `text.find_first_not_of('chars', start=0, end=9223372036854775807) -> int`
-- `text.find_last_not_of('chars', start=0, end=9223372036854775807) -> int`
-
-Similarly, for splitting operations:
-
-- `text.split_charset(separator='chars', maxsplit=9223372036854775807, keepseparator=False) -> Strs`
-- `text.rsplit_charset(separator='chars', maxsplit=9223372036854775807, keepseparator=False) -> Strs`
+```py
+x: int = text.find_first_of('chars', start=0, end=sys.maxsize)
+x: int = text.find_last_of('chars', start=0, end=sys.maxsize)
+x: int = text.find_first_not_of('chars', start=0, end=sys.maxsize)
+x: int = text.find_last_not_of('chars', start=0, end=sys.maxsize)
+x: Strs = text.split_charset(separator='chars', maxsplit=sys.maxsize, keepseparator=False)
+x: Strs = text.rsplit_charset(separator='chars', maxsplit=sys.maxsize, keepseparator=False)
+```
 
 ### Collection-Level Operations
 
@@ -420,9 +424,9 @@ Assuming StringZilla CPython bindings are implemented [without any intermediate
 ```py
 import stringzilla as sz
 
-contains: bool = sz.contains("haystack", "needle", start=0, end=9223372036854775807)
-offset: int = sz.find("haystack", "needle", start=0, end=9223372036854775807)
-count: int = sz.count("haystack", "needle", start=0, end=9223372036854775807, allowoverlap=False)
+contains: bool = sz.contains("haystack", "needle", start=0, end=sys.maxsize)
+offset: int = sz.find("haystack", "needle", start=0, end=sys.maxsize)
+count: int = sz.count("haystack", "needle", start=0, end=sys.maxsize, allowoverlap=False)
 ```
 
 ### Edit Distances
@@ -515,6 +519,20 @@ next_doc_offset = next_doc.offset_within(web_archieve)
 web_archieve.write_to("next_doc.html")
 ```
 
+#### PyArrow
+
+A `Str` is easy to cast to [PyArrow](https://arrow.apache.org/docs/python/arrays.html#string-and-binary-types) buffers.
+
+```py
+from pyarrow import foreign_buffer
+from stringzilla import Str
+
+original = "hello"
+view = Str(original)
+arrow = foreign_buffer(view.address, view.nbytes, view)
+```
+
+That means you can convert `Str` to `pyarrow.Buffer` and `Strs` to `pyarrow.Array` without extra copies.
 
 ## Quick Start: C/C++ 🛠️
 
@@ -1369,13 +1387,16 @@ Another one is the [Fibonacci hash trick](https://probablydance.com/2018/06/16/f
 
 ### Unicode, UTF-8, and Wide Characters
 
-StringZilla does not __yet__ implement any Unicode-specific algorithms.
-The content is addressed at byte-level, and the string is assumed to be encoded in UTF-8 or extended ASCII.
-Refer to [simdutf](https://github.com/simdutf/simdutf) for fast conversions and [icu](https://github.com/unicode-org/icu) for character metadata.
+Most StringZilla operations are byte-level, so they work well with ASCII and UTF-8 content out of the box.
+In some cases, like edit-distance computation, the result of byte-level evaluation and character-level evaluation may differ.
+So StringZilla provides the following functions to work with Unicode:
+
+- `sz_edit_distance_utf8` - computes the Levenshtein distance between two UTF-8 strings.
+- `sz_hamming_distance_utf8` - computes the Hamming distance between two UTF-8 strings.
 
-This may introduce frictions, when binding to some programming languages.
-Namely, Java, JavaScript, Python 2, C#, and Objective-C use wide characters (`wchar`) - two byte long codes.
+Java, JavaScript, Python 2, C#, and Objective-C, however, use wide characters (`wchar`) - two-byte-long codes, instead of the more reasonable fixed-length UTF-32 or variable-length UTF-8.
 This leads [to all kinds of offset-counting issues][wide-char-offsets] when facing four-byte-long Unicode characters.
+So consider transcoding with [simdutf](https://github.com/simdutf/simdutf), if you are coming from such environments.
 
 [wide-char-offsets]: https://josephg.com/blog/string-length-lies/
 

diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h
@@ -4507,10 +4507,10 @@ SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t win
         chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm);
 
         // ... and prefetch the next four characters into Level 2 or higher.
-        _mm_prefetch(text_fourth + 1, _MM_HINT_T1);
-        _mm_prefetch(text_third + 1, _MM_HINT_T1);
-        _mm_prefetch(text_second + 1, _MM_HINT_T1);
-        _mm_prefetch(text_first + 1, _MM_HINT_T1);
+        _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1);
+        _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1);
+        _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1);
+        _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1);
 
         // 3. Add the incoming characters.
         hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm);

diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp
@@ -458,8 +458,9 @@ class range_matches {
             return temp;
         }
 
-        bool operator!=(iterator const &other) const noexcept { return remaining_.begin() != other.remaining_.begin(); }
-        bool operator==(iterator const &other) const noexcept { return remaining_.begin() == other.remaining_.begin(); }
+        // Assumes both iterators point to the same underlying string.
+        bool operator!=(iterator const &other) const noexcept { return remaining_.data() != other.remaining_.data(); }
+        bool operator==(iterator const &other) const noexcept { return remaining_.data() == other.remaining_.data(); }
         bool operator!=(end_sentinel_type) const noexcept { return !remaining_.empty(); }
         bool operator==(end_sentinel_type) const noexcept { return remaining_.empty(); }
     };
@@ -550,8 +551,14 @@ class range_rmatches {
             return temp;
         }
 
-        bool operator!=(iterator const &other) const noexcept { return remaining_.end() != other.remaining_.end(); }
-        bool operator==(iterator const &other) const noexcept { return remaining_.end() == other.remaining_.end(); }
+        // Assumes both iterators point to the same underlying string.
+        // This has to be `.data() + .size()`, to be compatible with `std::string_view` on MSVC.
+        bool operator!=(iterator const &other) const noexcept {
+            return remaining_.data() + remaining_.size() != other.remaining_.data() + other.remaining_.size();
+        }
+        bool operator==(iterator const &other) const noexcept {
+            return remaining_.data() + remaining_.size() == other.remaining_.data() + other.remaining_.size();
+        }
         bool operator!=(end_sentinel_type) const noexcept { return !remaining_.empty(); }
         bool operator==(end_sentinel_type) const noexcept { return remaining_.empty(); }
     };