Commit
Merge pull request #106 from ashvardanian/main-dev
ashvardanian authored Mar 2, 2024
2 parents c433698 + 34997a3 commit 6fc0649
Showing 10 changed files with 2,353 additions and 661 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/prerelease.yml
@@ -78,7 +78,7 @@ jobs:
       - name: Build Python
         run: |
           python -m pip install --upgrade pip
-          pip install pytest pytest-repeat
+          pip install pytest pytest-repeat numpy pyarrow
           python -m pip install .
       - name: Test Python
         run: pytest scripts/test.py -s -x
@@ -160,7 +160,7 @@ jobs:
       - name: Build Python
         run: |
           python -m pip install --upgrade pip
-          pip install pytest pytest-repeat
+          pip install pytest pytest-repeat numpy pyarrow
           python -m pip install .
       - name: Test Python
         run: pytest scripts/test.py -s -x
@@ -235,7 +235,7 @@ jobs:
      - name: Build Python
        run: |
          python -m pip install --upgrade pip
-         pip install pytest pytest-repeat
+         pip install pytest pytest-repeat numpy pyarrow
          python -m pip install .
      - name: Test Python
        run: pytest scripts/test.py -s -x
@@ -270,7 +270,7 @@ jobs:
      - name: Build Python
        run: |
          python -m pip install --upgrade pip
-         pip install pytest pytest-repeat
+         pip install pytest pytest-repeat numpy pyarrow
          python -m pip install .
      - name: Test Python
        run: pytest scripts/test.py -s -x
6 changes: 6 additions & 0 deletions CONTRIBUTING.md
@@ -136,6 +136,12 @@ cppcheck --project=build_artifacts/compile_commands.json --enable=all
clang-tidy-11 -p build_artifacts
```

I'd recommend setting the following breakpoints:

- `__asan::ReportGenericError` - to detect illegal memory accesses.
- `__GI_exit` - to stop at exit points, at the end of any executable's run.
- `__builtin_unreachable` - to catch all the places where the code is expected to be unreachable.

### Benchmarking

For benchmarks, you can use the following commands:
2 changes: 1 addition & 1 deletion Cargo.lock


125 changes: 111 additions & 14 deletions README.md
@@ -337,34 +337,81 @@ A standard dataset pre-processing use case would be to map a sizeable textual da
- `text.contains('substring', start=0, end=9223372036854775807) -> bool`
- `text.find('substring', start=0, end=9223372036854775807) -> int`
- `text.count('substring', start=0, end=9223372036854775807, allowoverlap=False) -> int`
- `text.splitlines(keeplinebreaks=False, separator='\n') -> Strs`
- `text.split(separator=' ', maxsplit=9223372036854775807, keepseparator=False) -> Strs`
- `text.rsplit(separator=' ', maxsplit=9223372036854775807, keepseparator=False) -> Strs`
- `text.splitlines(keeplinebreaks=False, maxsplit=9223372036854775807) -> Strs`

It's important to note that the last function's behavior differs slightly from Python's `str.splitlines`.
The [native version][faq-splitlines] matches `\n`, `\r`, `\v` or `\x0b`, `\f` or `\x0c`, `\x1c`, `\x1d`, `\x1e`, `\x85`, `\r\n`, `\u2028`, `\u2029`, including three two-byte-long runes.
The StringZilla version matches only `\n` and is practically a shortcut for `text.split('\n')`, as the sketch below shows.

[faq-splitlines]: https://docs.python.org/3/library/stdtypes.html#str.splitlines
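
A minimal sketch of that difference, together with the `allowoverlap` flag of `count` from the list above; this assumes a locally installed `stringzilla` package:

```py
from stringzilla import Str

text = Str("line1\rline2\nline3")
# StringZilla's `splitlines` matches only `\n`, so `\r` stays inside the first chunk
assert len(text.splitlines()) == 2
# CPython's `str.splitlines` also treats `\r` as a line break
assert len("line1\rline2\nline3".splitlines()) == 3

# `count` counts non-overlapping matches by default
assert Str("aaaa").count("aa") == 2
assert Str("aaaa").count("aa", allowoverlap=True) == 3
```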

### Character Set Operations

Python strings don't natively support character set operations.
This forces people to use regular expressions, which are slow and hard to read.
To avoid the need for `re.finditer`, StringZilla provides the following interfaces (see the sketch below):

- `text.find_first_of('chars', start=0, end=9223372036854775807) -> int`
- `text.find_last_of('chars', start=0, end=9223372036854775807) -> int`
- `text.find_first_not_of('chars', start=0, end=9223372036854775807) -> int`
- `text.find_last_not_of('chars', start=0, end=9223372036854775807) -> int`

Similarly, for splitting operations:

- `text.split_charset(separator='chars', maxsplit=9223372036854775807, keepseparator=False) -> Strs`
- `text.rsplit_charset(separator='chars', maxsplit=9223372036854775807, keepseparator=False) -> Strs`
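
A short hedged sketch of both families; the index values in the comments follow from the example string:

```py
from stringzilla import Str

text = Str("Hello, world! How are you?")
# First and last vowel positions, with no regular expressions involved
assert text.find_first_of("aeiou") == 1   # 'e' in "Hello"
assert text.find_last_of("aeiou") == 24   # 'u' in "you"

# Split on any character from the set, keeping views instead of copies
words = text.split_charset(separator=" ,!?")
```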

### Collection-Level Operations

Once split into a `Strs` object, you can sort, shuffle, and reorganize the slices.
Once split into a `Strs` object, you can sort, shuffle, and reorganize the slices, with minimum memory footprint.
If all the chunks are located in consecutive memory regions, the memory overhead can be as low as 4 bytes per chunk.

```python
lines: Strs = text.split(separator='\n') # 4 bytes per line overhead for under 4 GB of text
lines.sort() # explodes to 16 bytes per line overhead for any length text
lines.shuffle(seed=42) # reproducing dataset shuffling with a seed
batch: Strs = lines.sample(seed=42) # 10x faster than `random.choices`
lines.shuffle(seed=42) # or shuffle all lines in place and shard with slices
# WIP: lines.sort() # explodes to 16 bytes per line overhead for any length text
# WIP: sorted_order: tuple = lines.argsort() # similar to `numpy.argsort`
```

Assuming superior search speed, splitting should also work 3x faster than with native Python strings.
Need copies?
Working with [RedPajama][redpajama], addressing 20 billion annotated English documents, one will need only 160 GB of RAM instead of terabytes.
Once loaded, the data will be memory-mapped, and can be reused between multiple Python processes without copies.
And of course, you can use slices to navigate the dataset and shard it between multiple workers, as the sharding sketch below shows.

```python
sorted_copy: Strs = lines.sorted()
shuffled_copy: Strs = lines.shuffled(seed=42)
lines[::3] # every third line
lines[1::2] # every odd-indexed line
lines[:-100:-1] # last 100 lines in reverse order
```

Those collections of `Strs` are designed to keep the memory consumption low.
If all the chunks are located in consecutive memory regions, the memory overhead can be as low as 4 bytes per chunk.
That's designed to handle very large datasets, like [RedPajama][redpajama].
To address all 20 billion annotated English documents in it, one will need only 160 GB of RAM instead of terabytes.

[redpajama]: https://github.com/togethercomputer/RedPajama-Data
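
For instance, sharding the corpus between workers needs nothing more than the strided slices shown above; a hedged sketch with an illustrative worker count:

```py
from stringzilla import Str, Strs

text = Str("doc1\ndoc2\ndoc3\ndoc4\ndoc5\ndoc6")
lines: Strs = text.split(separator='\n')

num_workers = 3  # illustrative; any strided slice works the same way
shards = [lines[worker::num_workers] for worker in range(num_workers)]
# Every shard is a view into the same underlying buffer, so nothing is copied
assert sum(len(shard) for shard in shards) == len(lines)
```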

### Iterators and Memory Efficiency

Python's operations like `split()` and `readlines()` immediately materialize a `list` of copied parts.
This can be very memory-inefficient for large datasets.
StringZilla saves a lot of memory by viewing existing memory regions as substrings, but even more memory can be saved by using lazily evaluated iterators.

- `text.split_iter(separator=' ', keepseparator=False) -> SplitIterator[Str]`
- `text.rsplit_iter(separator=' ', keepseparator=False) -> SplitIterator[Str]`
- `text.split_charset_iter(separator='chars', keepseparator=False) -> SplitIterator[Str]`
- `text.rsplit_charset_iter(separator='chars', keepseparator=False) -> SplitIterator[Str]`

StringZilla can easily be 10x more memory efficient than native Python classes for tokenization.
With lazy operations, it practically becomes free.

```py
import stringzilla as sz
%load_ext memory_profiler

text = open("enwik9.txt", "r").read() # 1 GB, mean word length 7.73 bytes
%memit text.split() # increment: 8670.12 MiB (152 ms)
%memit sz.split(text) # increment: 530.75 MiB (25 ms)
%memit sum(1 for _ in sz.split_iter(text)) # increment: 0.00 MiB
```

### Low-Level Python API

Aside from calling the methods on the `Str` and `Strs` classes, you can also call the global functions directly on `str` and `bytes` instances.
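
For instance, a hedged sketch of the module-level calls, with signatures assumed to mirror the `Str` methods above:

```py
import stringzilla as sz

sz.find("the quick brown fox", "brown")  # 10
sz.count(b"abababab", b"aba")            # 2, non-overlapping by default
sz.edit_distance("kitten", "sitting")    # 3
```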
@@ -454,6 +501,21 @@ assert sz.alignment_score(

</details>

### Serialization

#### Filesystem

Similar to how `File` can be used to read a large file, other interfaces can be used to dump strings to disk faster.
The `Str` class has `write_to` to write the string to a file, and `offset_within` to obtain the integer offset of a substring view within a larger string for navigation.

```py
web_archive = Str("<html>...</html><html>...</html>")
_, end_tag, next_doc = web_archive.partition("</html>") # or use `find`
next_doc_offset = next_doc.offset_within(web_archive)
web_archive.write_to("next_doc.html")
```
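
To read that dump back without copying it into RAM, the `File` interface mentioned above should suffice; a hedged sketch, assuming `Str` accepts a memory-mapped `File`:

```py
from stringzilla import File, Str

mapped = Str(File("next_doc.html"))  # a memory-mapped view, not an in-RAM copy
assert mapped.find("<html>") == 0
```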


## Quick Start: C/C++ 🛠️

The C library is header-only, so you can just copy the `stringzilla.h` header into your project.
@@ -1031,9 +1093,27 @@ __`STRINGZILLA_BUILD_SHARED`, `STRINGZILLA_BUILD_TEST`, `STRINGZILLA_BUILD_BENCH
## Quick Start: Rust 🦀

StringZilla is available as a Rust crate.
It currently covers only the most basic functionality, but is planned to be extended to cover the full C++ API.
Some of the interfaces will look familiar to the users of the `memchr` crate.

```rust
use stringzilla::sz;

// Identical to `memchr::memmem::find` and `memchr::memmem::rfind` functions
sz::find("Hello, world!", "world") // 7
sz::rfind("Hello, world!", "world") // 7

// Generalizations of `memchr::memchr[123]` and `memchr::memrchr[123]`
sz::find_char_from("Hello, world!", "world"); // 2
sz::rfind_char_from("Hello, world!", "world"); // 11
```

Unlike `memchr`, the throughput of `stringzilla` is [high in both normal and reverse-order searches][memchr-benchmarks].
It also places no constraints on the size of the character set, while `memchr` supports only sets of 1, 2, or 3 characters.
In addition to global functions, `stringzilla` provides a `StringZilla` extension trait:

```rust
use std::borrow::Cow;
use stringzilla::StringZilla;

let my_string: String = String::from("Hello, world!");
let my_str = my_string.as_str();
let my_cow_str = Cow::from(&my_string);
@@ -1051,6 +1131,23 @@ assert_eq!(my_str.sz_find("world"), Some(7));
assert_eq!(my_cow_str.as_ref().sz_find("world"), Some(7));
```

The library also exposes Levenshtein and Hamming edit distances for byte arrays and UTF-8 strings, as well as Needleman-Wunsch alignment scores.

```rust
use stringzilla::sz;

// Handling arbitrary byte arrays:
sz::edit_distance("Hello, world!", "Hello, world?"); // 1
sz::hamming_distance("Hello, world!", "Hello, world?"); // 1
sz::alignment_score("Hello, world!", "Hello, world?", sz::unary_substitution_costs(), -1); // -1

// Handling UTF-8 strings:
sz::hamming_distance_utf8("αβγδ", "αγγδ"); // 1
sz::edit_distance_utf8("façade", "facade"); // 1
```

[memchr-benchmarks]: https://github.com/ashvardanian/memchr_vs_stringzilla

## Quick Start: Swift 🍏

StringZilla is available as a Swift package.
33 changes: 33 additions & 0 deletions c/lib.c
@@ -23,6 +23,7 @@ typedef sz_size_t size_t; // Reuse the type definition we've inferred from `stri
#else
typedef __SIZE_TYPE__ size_t; // For GCC/Clang
#endif
int rand(void) { return 0; }
void free(void *start) { sz_unused(start); }
void *malloc(size_t length) {
sz_unused(length);
@@ -255,13 +256,34 @@ SZ_DYNAMIC sz_cptr_t sz_rfind_charset(sz_cptr_t text, sz_size_t length, sz_chars
return sz_dispatch_table.rfind_from_set(text, length, set);
}

SZ_DYNAMIC sz_size_t sz_hamming_distance( //
sz_cptr_t a, sz_size_t a_length, //
sz_cptr_t b, sz_size_t b_length, //
sz_size_t bound) {
return sz_hamming_distance_serial(a, a_length, b, b_length, bound);
}

SZ_DYNAMIC sz_size_t sz_hamming_distance_utf8( //
sz_cptr_t a, sz_size_t a_length, //
sz_cptr_t b, sz_size_t b_length, //
sz_size_t bound) {
return sz_hamming_distance_utf8_serial(a, a_length, b, b_length, bound);
}

SZ_DYNAMIC sz_size_t sz_edit_distance( //
sz_cptr_t a, sz_size_t a_length, //
sz_cptr_t b, sz_size_t b_length, //
sz_size_t bound, sz_memory_allocator_t *alloc) {
return sz_dispatch_table.edit_distance(a, a_length, b, b_length, bound, alloc);
}

SZ_DYNAMIC sz_size_t sz_edit_distance_utf8( //
sz_cptr_t a, sz_size_t a_length, //
sz_cptr_t b, sz_size_t b_length, //
sz_size_t bound, sz_memory_allocator_t *alloc) {
return _sz_edit_distance_wagner_fisher_serial(a, a_length, b, b_length, bound, sz_true_k, alloc);
}

SZ_DYNAMIC sz_ssize_t sz_alignment_score(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length,
sz_error_cost_t const *subs, sz_error_cost_t gap,
sz_memory_allocator_t *alloc) {
@@ -302,3 +324,14 @@ SZ_DYNAMIC sz_cptr_t sz_rfind_char_not_from(sz_cptr_t h, sz_size_t h_length, sz_
sz_charset_invert(&set);
return sz_rfind_charset(h, h_length, &set);
}

sz_u64_t _sz_random_generator(void *empty_state) {
sz_unused(empty_state);
return (sz_u64_t)rand();
}

SZ_DYNAMIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length,
sz_random_generator_t generator, void *generator_user_data) {
if (!generator) generator = _sz_random_generator;
sz_generate_serial(alphabet, alphabet_size, result, result_length, generator, generator_user_data);
}