ashvardanian
diff --git a/‎CONTRIBUTING.md
Lines changed: 14 additions & 0 deletions b/‎CONTRIBUTING.md
Lines changed: 14 additions & 0 deletions
diff --git a/‎Package.swift
Lines changed: 6 additions & 6 deletions b/‎Package.swift
Lines changed: 6 additions & 6 deletions
diff --git a/‎README.md
Lines changed: 30 additions & 30 deletions b/‎README.md
Lines changed: 30 additions & 30 deletions
@@ -432,6 +432,13 @@ npm ci && npm test
 swift build && swift test
 ```
 
+To format, consider using [SwiftFormat](https://github.com/nicklockwood/SwiftFormat):
+
+```bash
+brew install swiftformat
+swiftformat .
+```
+
 Running Swift on Linux requires a couple of extra steps, as the Swift compiler is not available in the default repositories.
 Please get the most recent Swift tarball from the [official website](https://www.swift.org/install/).
 At the time of writing, for a 64-bit Arm CPU running Ubuntu 22.04, the following commands would work:
@@ -467,6 +474,13 @@ sudo docker run --rm -v "$PWD:/workspace" -w /workspace swift:5.9 /bin/bash -cl
 cargo test
 ```
 
+If you need to isolate a failing test:
+
+```bash
+export RUST_BACKTRACE=full
+cargo test -- --test-threads=1 --nocapture
+```
+
 If you are updating the package contents, you can validate the list of included files using the following command:
 
 ```bash
 
@@ -5,16 +5,16 @@ let package = Package(
     name: "StringZilla",
     platforms: [
         // Linux doesn't have to be explicitly listed
-        .iOS(.v13),      // For iOS, version 13 and later
-        .tvOS(.v13),     // For tvOS, version 13 and later
+        .iOS(.v13), // For iOS, version 13 and later
+        .tvOS(.v13), // For tvOS, version 13 and later
         .macOS(.v10_15), // For macOS, version 10.15 (Catalina) and later
-        .watchOS(.v6)    // For watchOS, version 6 and later
+        .watchOS(.v6), // For watchOS, version 6 and later
     ],
     products: [
         .library(
             name: "StringZilla",
             targets: ["StringZillaC", "StringZilla"]
-        )
+        ),
     ],
     targets: [
         .target(
@@ -27,7 +27,7 @@ let package = Package(
                 .define("SZ_AVOID_LIBC", to: "0"), // We need `malloc` from LibC
                 .define("SZ_DEBUG", to: "0"), // We don't need any extra assertions in the C layer
                 .headerSearchPath("include/stringzilla"), // Specify header search paths
-                .unsafeFlags(["-Wall"]) // Use with caution: specify custom compiler flags
+                .unsafeFlags(["-Wall"]), // Use with caution: specify custom compiler flags
             ]
         ),
         .target(
@@ -41,7 +41,7 @@ let package = Package(
             dependencies: ["StringZilla"],
             path: "swift",
             sources: ["Test.swift"]
-        )
+        ),
     ],
     cLanguageStandard: CLanguageStandard.c99
 )
@@ -137,7 +137,7 @@ __Who is this for?__
       <span style="color:#ABABAB;">arm:</span> <b>0.02</b> GB/s
     </td>
     <td align="center">
-      <code>sz_find_charset</code><br/>
+      <code>sz_find_byteset</code><br/>
       <span style="color:#ABABAB;">x86:</span> <b>4.08</b> &centerdot;
       <span style="color:#ABABAB;">arm:</span> <b>3.22</b> GB/s
     </td>
@@ -155,7 +155,7 @@ __Who is this for?__
     </td>
     <td align="center">⚪</td>
     <td align="center">
-      <code>sz_rfind_charset</code><br/>
+      <code>sz_rfind_byteset</code><br/>
       <span style="color:#ABABAB;">x86:</span> <b>0.43</b> &centerdot;
       <span style="color:#ABABAB;">arm:</span> <b>0.23</b> GB/s
     </td>
@@ -181,7 +181,7 @@ __Who is this for?__
       <span style="color:#ABABAB;">arm:</span> <b>5.9</b> MB/s
     </td>
     <td align="center">
-      <code>sz_generate</code><br/>
+      <code>sz_fill_random</code><br/>
       <span style="color:#ABABAB;">x86:</span> <b>56.2</b> &centerdot;
       <span style="color:#ABABAB;">arm:</span> <b>25.8</b> MB/s
     </td>
@@ -203,7 +203,7 @@ __Who is this for?__
       <span style="color:#ABABAB;">arm:</span> <b>140.0</b> MB/s
     </td>
     <td align="center">
-      <code>sz_look_up_transform</code><br/>
+      <code>sz_lookup</code><br/>
       <span style="color:#ABABAB;">x86:</span> <b>21.2</b> &centerdot;
       <span style="color:#ABABAB;">arm:</span> <b>8.5</b> GB/s
     </td>
@@ -247,7 +247,7 @@ __Who is this for?__
       <span style="color:#ABABAB;">arm:</span> <b>2,220</b> ns
     </td>
     <td align="center">
-      <code>sz_edit_distance</code><br/>
+      <code>sz_levenshtein_distance</code><br/>
       <span style="color:#ABABAB;">x86:</span> <b>99</b> &centerdot;
       <span style="color:#ABABAB;">arm:</span> <b>180</b> ns
     </td>
@@ -265,7 +265,7 @@ __Who is this for?__
       <span style="color:#ABABAB;">arm:</span> <b>367</b> ms
     </td>
     <td align="center">
-      <code>sz_alignment_score</code><br/>
+      <code>sz_needleman_wunsch_score</code><br/>
       <span style="color:#ABABAB;">x86:</span> <b>73</b> &centerdot;
       <span style="color:#ABABAB;">arm:</span> <b>177</b> ms
     </td>
@@ -396,8 +396,8 @@ x: int = text.find_first_of('chars', start=0, end=sys.maxsize)
 x: int = text.find_last_of('chars', start=0, end=sys.maxsize)
 x: int = text.find_first_not_of('chars', start=0, end=sys.maxsize)
 x: int = text.find_last_not_of('chars', start=0, end=sys.maxsize)
-x: Strs = text.split_charset(separator='chars', maxsplit=sys.maxsize, keepseparator=False)
-x: Strs = text.rsplit_charset(separator='chars', maxsplit=sys.maxsize, keepseparator=False)
+x: Strs = text.split_byteset(separator='chars', maxsplit=sys.maxsize, keepseparator=False)
+x: Strs = text.rsplit_byteset(separator='chars', maxsplit=sys.maxsize, keepseparator=False)
 ```
 
 You can also transform the string using Look-Up Tables (LUTs), mapping it to a different character set.
@@ -453,8 +453,8 @@ StringZilla saves a lot of memory by viewing existing memory regions as substrin
 ```py
 x: SplitIterator[Str] = text.split_iter(separator=' ', keepseparator=False)
 x: SplitIterator[Str] = text.rsplit_iter(separator=' ', keepseparator=False)
-x: SplitIterator[Str] = text.split_charset_iter(separator='chars', keepseparator=False)
-x: SplitIterator[Str] = text.rsplit_charset_iter(separator='chars', keepseparator=False)
+x: SplitIterator[Str] = text.split_byteset_iter(separator='chars', keepseparator=False)
+x: SplitIterator[Str] = text.rsplit_byteset_iter(separator='chars', keepseparator=False)
 ```
 
 StringZilla can easily be 10x more memory efficient than native Python classes for tokenization.
@@ -654,7 +654,7 @@ By design, StringZilla has a couple of notable differences from LibC:
 
 That way `sz_find` and `sz_rfind` are similar to `strstr` and `strrstr` in LibC.
 Similarly, `sz_find_byte` and `sz_rfind_byte` replace `memchr` and `memrchr`.
-The `sz_find_charset` maps to `strspn` and `strcspn`, while `sz_rfind_charset` has no sibling in LibC.
+The `sz_find_byteset` maps to `strspn` and `strcspn`, while `sz_rfind_byteset` has no sibling in LibC.
 
 <table>
     <tr>
@@ -679,11 +679,11 @@ The `sz_find_charset` maps to `strspn` and `strcspn`, while `sz_rfind_charset` h
     </tr>
     <tr>
         <td><code>strcspn(haystack, needles)</code></td>
-        <td><code>sz_rfind_charset(haystack, haystack_length, needles_bitset)</code></td>
+        <td><code>sz_rfind_byteset(haystack, haystack_length, needles_bitset)</code></td>
     </tr>
     <tr>
         <td><code>strspn(haystack, needles)</code></td>
-        <td><code>sz_find_charset(haystack, haystack_length, needles_bitset)</code></td>
+        <td><code>sz_find_byteset(haystack, haystack_length, needles_bitset)</code></td>
     </tr>
     <tr>
         <td><code>memmem(haystack, haystack_length, needle, needle_length)</code>, <code>strstr</code></td>
@@ -923,7 +923,7 @@ StringZilla provides a convenient `partition` function, which returns a tuple of
 ```cpp
 auto parts = haystack.partition(':'); // Matching a character
 auto [before, match, after] = haystack.partition(':'); // Structure unpacking
-auto [before, match, after] = haystack.partition(sz::char_set(":;")); // Character-set argument
+auto [before, match, after] = haystack.partition(sz::byteset(":;")); // Character-set argument
 auto [before, match, after] = haystack.partition(" : "); // String argument
 auto [before, match, after] = haystack.rpartition(sz::whitespaces_set()); // Split around the last whitespace
 ```
@@ -951,8 +951,8 @@ Here is a sneak peek of the most useful ones.
 ```cpp
 text.hash(); // -> 64 bit unsigned integer 
 text.ssize(); // -> 64 bit signed length to avoid `static_cast<std::ssize_t>(text.size())`
-text.contains_only(" \w\t"); // == text.find_first_not_of(sz::char_set(" \w\t")) == npos;
-text.contains(sz::whitespaces_set()); // == text.find(sz::char_set(sz::whitespaces_set())) != npos;
+text.contains_only(" \w\t"); // == text.find_first_not_of(sz::byteset(" \w\t")) == npos;
+text.contains(sz::whitespaces_set()); // == text.find(sz::byteset(sz::whitespaces_set())) != npos;
 
 // Simpler slicing than `substr`
 text.front(10); // -> sz::string_view
@@ -997,7 +997,7 @@ To avoid those, StringZilla provides lazily-evaluated ranges, compatible with th
 
 ```cpp
 for (auto line : haystack.split("\r\n"))
-    for (auto word : line.split(sz::char_set(" \w\t.,;:!?")))
+    for (auto word : line.split(sz::byteset(" \w\t.,;:!?")))
         std::cout << word << std::endl;
 ```
 
@@ -1006,9 +1006,9 @@ It also allows interleaving matches, if you want both inclusions of `xx` in `xxx
 Debugging pointer offsets is not a pleasant exercise, so keep the following functions in mind.
 
 - `haystack.[r]find_all(needle, interleaving)`
-- `haystack.[r]find_all(sz::char_set(""))`
+- `haystack.[r]find_all(sz::byteset(""))`
 - `haystack.[r]split(needle)`
-- `haystack.[r]split(sz::char_set(""))`
+- `haystack.[r]split(sz::byteset(""))`
 
 For $N$ matches the split functions will report $N+1$ matches, potentially including empty strings.
 Ranges have a few convenience methods as well:
@@ -1065,7 +1065,7 @@ sz::string random_string(std::size_t length, char const *alphabet, std::size_t c
 ```
 
 Mouthful and slow.
-StringZilla provides a C native method - `sz_generate` and a convenient C++ wrapper - `sz::generate`.
+StringZilla provides a native C function, `sz_fill_random`, and a convenient C++ wrapper, `sz::generate`.
 Similar to Python, it also defines the commonly used character sets.
 
 ```cpp
@@ -1085,9 +1085,9 @@ In text processing, it's often necessary to replace all occurrences of a specifi
 Standard library functions may not offer the most efficient or convenient methods for performing bulk replacements, especially when dealing with large strings or performance-critical applications.
 
 - `haystack.replace_all(needle_string, replacement_string)`
-- `haystack.replace_all(sz::char_set(""), replacement_string)`
+- `haystack.replace_all(sz::byteset(""), replacement_string)`
 - `haystack.try_replace_all(needle_string, replacement_string)`
-- `haystack.try_replace_all(sz::char_set(""), replacement_string)`
+- `haystack.try_replace_all(sz::byteset(""), replacement_string)`
 - `haystack.transform(sz::look_up_table::identity())`
 - `haystack.transform(sz::look_up_table::identity(), haystack.data())`
 
@@ -1250,8 +1250,8 @@ sz::find("Hello, world!", "world") // 7
 sz::rfind("Hello, world!", "world") // 7
 
 // Generalizations of `memchr::memrchr[123]`
-sz::find_char_from("Hello, world!", "world") // 2
-sz::rfind_char_from("Hello, world!", "world") // 11
+sz::find_byte_from("Hello, world!", "world") // 2
+sz::rfind_byte_from("Hello, world!", "world") // 11
 ```
 
 Unlike `memchr`, the throughput of `stringzilla` is [high in both normal and reverse-order searches][memchr-benchmarks].
@@ -1268,10 +1268,10 @@ let my_cow_str = Cow::from(&my_string);
 // Use the generic function with a String
 assert_eq!(my_string.sz_find("world"), Some(7));
 assert_eq!(my_string.sz_rfind("world"), Some(7));
-assert_eq!(my_string.sz_find_char_from("world"), Some(2));
-assert_eq!(my_string.sz_rfind_char_from("world"), Some(11));
-assert_eq!(my_string.sz_find_char_not_from("world"), Some(0));
-assert_eq!(my_string.sz_rfind_char_not_from("world"), Some(12));
+assert_eq!(my_string.sz_find_byte_from("world"), Some(2));
+assert_eq!(my_string.sz_rfind_byte_from("world"), Some(11));
+assert_eq!(my_string.sz_find_byte_not_from("world"), Some(0));
+assert_eq!(my_string.sz_rfind_byte_not_from("world"), Some(12));
 
 // Same works for &str and Cow<'_, str>
 assert_eq!(my_str.sz_find("world"), Some(7));
@@ -1315,7 +1315,7 @@ s[s.findLast(substring: "o")!...] // "o StringZilla. 👋")
 s[s.findFirst(characterFrom: "aeiou")!...] // "ello, world! Welcome to StringZilla. 👋"
 s[s.findLast(characterFrom: "aeiou")!...] // "a. 👋"
 s[s.findFirst(characterNotFrom: "aeiou")!...] // "Hello, world! Welcome to StringZilla. 👋"
-s.editDistance(from: "Hello, world!")! // 29
+s.levenshteinDistance(from: "Hello, world!")! // 29
 ```
 
 ## Algorithms & Design Decisions 📚
@@ -1561,7 +1561,7 @@ Most StringZilla operations are byte-level, so they work well with ASCII and UTF
 In some cases, like edit-distance computation, the result of byte-level evaluation and character-level evaluation may differ.
 So StringZilla provides following functions to work with Unicode:
 
-- `sz_edit_distance_utf8` - computes the Levenshtein distance between two UTF-8 strings.
+- `sz_levenshtein_distance_utf8` - computes the Levenshtein distance between two UTF-8 strings.
 - `sz_hamming_distance_utf8` - computes the Hamming distance between two UTF-8 strings.
 
 Java, JavaScript, Python 2, C#, and Objective-C, however, use wide characters (`wchar`) - two-byte-long codes, instead of the more reasonable fixed-length UTF-32 or variable-length UTF-8.