From 245da6df67df2a51cca453ff8fd5dd48f20f5994 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 7 Oct 2023 18:12:38 -0700
Subject: [PATCH 1/8] Improve: drop `ctype`, `stddef`, `stdint` headers

---
 .vscode/settings.json     |   2 +
 README.md                 |   4 +-
 python/lib.c              |  42 +++---
 scripts/bench.ipynb       |   2 +-
 stringzilla/stringzilla.h | 274 +++++++++++++++++++++++++-------------
 5 files changed, 203 insertions(+), 121 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 886d1d22..08c5bb65 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -151,6 +151,7 @@
     "NOMINMAX",
     "NOTIMPLEMENTED",
     "numpy",
+    "octogram",
     "pytest",
     "Pythonic",
     "quadgram",
@@ -166,6 +167,7 @@
     "substr",
     "SWAR",
     "TPFLAGS",
+    "unigram",
     "Vardanian",
     "vectorcallfunc",
     "XDECREF",
diff --git a/README.md b/README.md
index 3c04c219..85032c34 100644
--- a/README.md
+++ b/README.md
@@ -116,11 +116,11 @@ There is an ABI-stable C 99 interface, in case you have a database, an operating
 
 // Initialize your haystack and needle
 sz_haystack_t haystack = {your_text, your_text_length};
-sz_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset};
+sz_needle_t needle = {your_subtext, your_subtext_length, your_quadgram_offset};
 
 // Perform string-level operations
 size_t character_count = sz_count_char(haystack, 'a');
-size_t character_position = sz_find_char(haystack, 'a');
+size_t character_position = sz_find_unigram(haystack, 'a');
 size_t substring_position = sz_find_substr(haystack, needle);
 
 // Perform collection level operations
diff --git a/python/lib.c b/python/lib.c
index ad10f196..a0f6caca 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -48,12 +48,12 @@ static struct {
  *          native `mmap` module, as it exposes the address of the mapping in memory.
  */
 typedef struct {
-    PyObject_HEAD;
+    PyObject_HEAD
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
-    HANDLE file_handle;
+        HANDLE file_handle;
     HANDLE mapping_handle;
 #else
-    int file_descriptor;
+        int file_descriptor;
 #endif
     void *start;
     size_t length;
@@ -72,8 +72,7 @@ typedef struct {
  *      - Str(File("some-path.txt"), from=0, to=sys.maxint)
  */
 typedef struct {
-    PyObject_HEAD;
-    PyObject *parent;
+    PyObject_HEAD PyObject *parent;
     char const *start;
     size_t length;
 } Str;
@@ -83,14 +82,14 @@ typedef struct {
  *          for faster sorting, shuffling, joins, and lookups.
  */
 typedef struct {
-    PyObject_HEAD;
+    PyObject_HEAD
 
-    enum {
-        STRS_CONSECUTIVE_32,
-        STRS_CONSECUTIVE_64,
-        STRS_REORDERED,
-        STRS_MULTI_SOURCE,
-    } type;
+        enum {
+            STRS_CONSECUTIVE_32,
+            STRS_CONSECUTIVE_64,
+            STRS_REORDERED,
+            STRS_MULTI_SOURCE,
+        } type;
 
     union {
         /**
@@ -641,7 +640,7 @@ static void Str_releasebuffer(PyObject *_, Py_buffer *view) {
 static int Str_in(Str *self, PyObject *arg) {
 
     sz_needle_t needle_struct;
-    needle_struct.anomaly_offset = 0;
+    needle_struct.quadgram_offset = 0;
     if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) {
         PyErr_SetString(PyExc_TypeError, "Unsupported argument type");
         return -1;
@@ -851,7 +850,7 @@ static int Str_find_( //
     Py_ssize_t start, end;
 
     // Validate and convert `haystack` and `needle`
-    needle.anomaly_offset = 0;
+    needle.quadgram_offset = 0;
     if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
         !export_string_like(needle_obj, &needle.start, &needle.length)) {
         PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
@@ -1000,7 +999,7 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
     Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX;
     int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0;
 
-    needle.anomaly_offset = 0;
+    needle.quadgram_offset = 0;
     if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
         !export_string_like(needle_obj, &needle.start, &needle.length))
         return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL;
@@ -1287,7 +1286,7 @@ static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) {
     sz_needle_t separator;
     int keepseparator;
     Py_ssize_t maxsplit;
-    separator.anomaly_offset = 0;
+    separator.quadgram_offset = 0;
 
     // Validate and convert `text`
     if (!export_string_like(text_obj, &text.start, &text.length)) {
@@ -1565,14 +1564,9 @@ static boolean_t Strs_sort_(Strs *self,
     }
 
     // Get the parts and their count
-    sz_haystack_t *parts = NULL;
-    size_t count = 0;
-    switch (self->type) {
-    case STRS_REORDERED:
-        parts = self->data.reordered.parts;
-        count = self->data.reordered.count;
-        break;
-    }
+    // The only possible `self->type` by now is the `STRS_REORDERED`
+    sz_haystack_t *parts = self->data.reordered.parts;
+    size_t count = self->data.reordered.count;
 
     // Allocate temporary memory to store the ordering offsets
     size_t memory_needed = sizeof(sz_size_t) * count;
diff --git a/scripts/bench.ipynb b/scripts/bench.ipynb
index b69d2f8f..b3bc4392 100644
--- a/scripts/bench.ipynb
+++ b/scripts/bench.ipynb
@@ -176,7 +176,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.11"
+   "version": "3.10.13"
   },
   "orig_nbformat": 4
  },
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 7b664ca6..51319f01 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -1,10 +1,7 @@
 #ifndef STRINGZILLA_H_
 #define STRINGZILLA_H_
 
-#include <ctype.h>  // `tolower`
 #include <search.h> // `qsort_s`
-#include <stddef.h> // `sz_size_t`
-#include <stdint.h> // `uint8_t`
 #include <stdlib.h> // `qsort_r`
 #include <string.h> // `memcpy`
 
@@ -30,11 +27,71 @@
 extern "C" {
 #endif
 
-typedef uint32_t sz_anomaly_t;
-typedef uint64_t sz_size_t;
+#if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64)
+typedef unsigned long sz_size_t; // 64-bit on most platforms when pointers are 64-bit
+#else
+typedef unsigned sz_size_t; // 32-bit on most platforms when pointers are 32-bit
+#endif
+
+typedef unsigned sz_u32_t;           // Always 32 bits
+typedef unsigned long long sz_u64_t; // Always 64 bits
+
+typedef union sz_quadgram_t {
+    unsigned u32;
+    unsigned char u8s[4];
+} sz_quadgram_t; // Always 32-bit unsigned integer, representing 4 bytes/characters
+
+typedef union sz_octogram_t {
+    unsigned long long u64;
+    unsigned char u8s[8];
+} sz_octogram_t; // Always 64-bit unsigned integer, representing 8 bytes/characters
 
 inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { return (x + (divisor - 1)) / divisor; }
 
+inline static sz_size_t sz_tolower_ascii(char c) {
+    static char lowered[256] = {
+        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
+        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
+        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
+        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
+        64,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
+        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91,  92,  93,  94,  95,  //
+        96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
+        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, //
+        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
+        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
+        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
+        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, //
+    };
+    return lowered[(int)c];
+}
+
+inline static sz_size_t sz_toupper_ascii(char c) {
+    static char upped[256] = {
+        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
+        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
+        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
+        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
+        64,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
+        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91,  92,  93,  94,  95,  //
+        96,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  //
+        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  123, 124, 125, 126, 127, //
+        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
+        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
+        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
+        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, //
+    };
+    return upped[(int)c];
+}
+
 /**
  *  @brief This is a faster alternative to `strncmp(a, b, length) == 0`.
  *  @return 1 for `true`, and 0 for `false`.
@@ -53,28 +110,29 @@ typedef struct sz_haystack_t {
 typedef struct sz_needle_t {
     char const *start;
     sz_size_t length;
-    sz_size_t anomaly_offset;
+    sz_size_t quadgram_offset;
 } sz_needle_t;
 
 /**
  *  @brief  SWAR single-character counting procedure, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_count_char_swar(sz_haystack_t h, char n) {
+inline static sz_size_t sz_count_unigram_swar(sz_haystack_t h, char n) {
 
     sz_size_t result = 0;
     char const *text = h.start;
     char const *end = h.start + h.length;
 
-    for (; (uint64_t)text % 8 != 0 && text < end; ++text) result += *text == n;
+    // Process the misaligned head, to avoid UB on unaligned 64-bit loads.
+    for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == n;
 
     // This code simulates hyper-scalar execution, comparing 8 characters at a time.
-    uint64_t nnnnnnnn = n;
+    sz_u64_t nnnnnnnn = n;
     nnnnnnnn |= nnnnnnnn << 8;
     nnnnnnnn |= nnnnnnnn << 16;
     nnnnnnnn |= nnnnnnnn << 32;
     for (; text + 8 <= end; text += 8) {
-        uint64_t text_slice = *(uint64_t const *)text;
-        uint64_t match_indicators = ~(text_slice ^ nnnnnnnn);
+        sz_u64_t text_slice = *(sz_u64_t const *)text;
+        sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn);
         match_indicators &= match_indicators >> 1;
         match_indicators &= match_indicators >> 2;
         match_indicators &= match_indicators >> 4;
@@ -89,22 +147,23 @@ inline static sz_size_t sz_count_char_swar(sz_haystack_t h, char n) {
 /**
  *  @brief  SWAR single-character search in string, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_find_char_swar(sz_haystack_t h, char n) {
+inline static sz_size_t sz_find_unigram_swar(sz_haystack_t h, char n) {
 
     char const *text = h.start;
     char const *end = h.start + h.length;
 
-    for (; (uint64_t)text % 8 != 0 && text < end; ++text)
+    // Process the misaligned head, to avoid UB on unaligned 64-bit loads.
+    for (; ((unsigned long)text & 7ul) && text < end; ++text)
         if (*text == n) return text - h.start;
 
     // This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
-    uint64_t nnnnnnnn = n;
+    sz_u64_t nnnnnnnn = n;
     nnnnnnnn |= nnnnnnnn << 8;  // broadcast `n` into `nnnnnnnn`
     nnnnnnnn |= nnnnnnnn << 16; // broadcast `n` into `nnnnnnnn`
     nnnnnnnn |= nnnnnnnn << 32; // broadcast `n` into `nnnnnnnn`
     for (; text + 8 <= end; text += 8) {
-        uint64_t text_slice = *(uint64_t const *)text;
-        uint64_t match_indicators = ~(text_slice ^ nnnnnnnn);
+        sz_u64_t text_slice = *(sz_u64_t const *)text;
+        sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn);
         match_indicators &= match_indicators >> 1;
         match_indicators &= match_indicators >> 2;
         match_indicators &= match_indicators >> 4;
@@ -121,26 +180,31 @@ inline static sz_size_t sz_find_char_swar(sz_haystack_t h, char n) {
 /**
  *  @brief  SWAR character-bigram search in string, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) {
+inline static sz_size_t sz_find_bigram_swar(sz_haystack_t h, char const *n) {
 
     char const *text = h.start;
     char const *end = h.start + h.length;
 
+    // Process the misaligned head, to avoid UB on unaligned 64-bit loads.
+    for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text)
+        if (text[0] == n[0] && text[1] == n[1]) return text - h.start;
+
     // This code simulates hyper-scalar execution, analyzing 7 offsets at a time.
-    uint64_t nnnn = ((uint64_t)(n[0]) << 0) | ((uint64_t)(n[1]) << 8); // broadcast `n` into `nnnn`
+    sz_u64_t nnnn = ((sz_u64_t)(n[0]) << 0) | ((sz_u64_t)(n[1]) << 8); // broadcast `n` into `nnnn`
     nnnn |= nnnn << 16;                                                // broadcast `n` into `nnnn`
     nnnn |= nnnn << 32;                                                // broadcast `n` into `nnnn`
-    uint64_t text_slice;
     for (; text + 8 <= end; text += 7) {
-        memcpy(&text_slice, text, 8);
-        uint64_t even_indicators = ~(text_slice ^ nnnn);
-        uint64_t odd_indicators = ~((text_slice << 8) ^ nnnn);
+        sz_u64_t text_slice = *(sz_u64_t const *)text;
+        sz_u64_t even_indicators = ~(text_slice ^ nnnn);
+        sz_u64_t odd_indicators = ~((text_slice << 8) ^ nnnn);
+
         // For every even match - 2 char (16 bits) must be identical.
         even_indicators &= even_indicators >> 1;
         even_indicators &= even_indicators >> 2;
         even_indicators &= even_indicators >> 4;
         even_indicators &= even_indicators >> 8;
         even_indicators &= 0x0001000100010001;
+
         // For every odd match - 2 char (16 bits) must be identical.
         odd_indicators &= odd_indicators >> 1;
         odd_indicators &= odd_indicators >> 2;
@@ -149,7 +213,7 @@ inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) {
         odd_indicators &= 0x0001000100010000;
 
         if (even_indicators + odd_indicators) {
-            uint64_t match_indicators = even_indicators | (odd_indicators >> 8);
+            sz_u64_t match_indicators = even_indicators | (odd_indicators >> 8);
             return text - h.start + ctz64(match_indicators) / 8;
         }
     }
@@ -162,23 +226,26 @@ inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) {
 /**
  *  @brief  SWAR character-trigram search in string, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) {
+inline static sz_size_t sz_find_trigram_swar(sz_haystack_t h, char const *n) {
 
     char const *text = h.start;
     char const *end = h.start + h.length;
 
+    // Process the misaligned head, to avoid UB on unaligned 64-bit loads.
+    for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text)
+        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start;
+
     // This code simulates hyper-scalar execution, analyzing 6 offsets at a time.
     // We have two unused bytes at the end.
-    uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16); // broadcast `n` into `nn`
+    sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16); // broadcast `n` into `nn`
     nn |= nn << 24;                                                                           // broadcast `n` into `nn`
     nn <<= 16;                                                                                // broadcast `n` into `nn`
 
     for (; text + 8 <= end; text += 6) {
-        uint64_t text_slice;
-        memcpy(&text_slice, text, 8);
-        uint64_t first_indicators = ~(text_slice ^ nn);
-        uint64_t second_indicators = ~((text_slice << 8) ^ nn);
-        uint64_t third_indicators = ~((text_slice << 16) ^ nn);
+        sz_u64_t text_slice = *(sz_u64_t const *)text;
+        sz_u64_t first_indicators = ~(text_slice ^ nn);
+        sz_u64_t second_indicators = ~((text_slice << 8) ^ nn);
+        sz_u64_t third_indicators = ~((text_slice << 16) ^ nn);
         // For every first match - 3 chars (24 bits) must be identical.
         // For that merge every byte state and then combine those three-way.
         first_indicators &= first_indicators >> 1;
@@ -203,7 +270,7 @@ inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) {
         third_indicators =
             (third_indicators >> 16) & (third_indicators >> 8) & (third_indicators >> 0) & 0x0000010000010000;
 
-        uint64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16);
+        sz_u64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16);
         if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8;
     }
 
@@ -215,29 +282,32 @@ inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) {
 /**
  *  @brief  SWAR character-quadgram search in string, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) {
+inline static sz_size_t sz_find_quadgram_swar(sz_haystack_t h, char const *n) {
 
     char const *text = h.start;
     char const *end = h.start + h.length;
 
+    // Process the misaligned head, to avoid UB on unaligned 64-bit loads.
+    for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text)
+        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start;
+
     // This code simulates hyper-scalar execution, analyzing 4 offsets at a time.
-    uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16) | ((uint64_t)(n[3]) << 24);
+    sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16) | ((sz_u64_t)(n[3]) << 24);
     nn |= nn << 32;
 
     //
-    uint8_t lookup[16] = {0};
-    lookup[0b0010] = lookup[0b0110] = lookup[0b1010] = lookup[0b1110] = 1;
-    lookup[0b0100] = lookup[0b1100] = 2;
-    lookup[0b1000] = 3;
+    unsigned char lookup[16] = {0};
+    lookup[0x2] = lookup[0x6] = lookup[0xA] = lookup[0xE] = 1;
+    lookup[0x4] = lookup[0xC] = 2;
+    lookup[0x8] = 3;
 
     // We can perform 5 comparisons per load, but it's easier to perform 4, minimizing the size of the lookup table.
     for (; text + 8 <= end; text += 4) {
-        uint64_t text_slice;
-        memcpy(&text_slice, text, 8);
-        uint64_t text01 = (text_slice & 0x00000000FFFFFFFF) | ((text_slice & 0x000000FFFFFFFF00) << 24);
-        uint64_t text23 = ((text_slice & 0x0000FFFFFFFF0000) >> 16) | ((text_slice & 0x00FFFFFFFF000000) << 8);
-        uint64_t text01_indicators = ~(text01 ^ nn);
-        uint64_t text23_indicators = ~(text23 ^ nn);
+        sz_u64_t text_slice = *(sz_u64_t const *)text;
+        sz_u64_t text01 = (text_slice & 0x00000000FFFFFFFF) | ((text_slice & 0x000000FFFFFFFF00) << 24);
+        sz_u64_t text23 = ((text_slice & 0x0000FFFFFFFF0000) >> 16) | ((text_slice & 0x00FFFFFFFF000000) << 8);
+        sz_u64_t text01_indicators = ~(text01 ^ nn);
+        sz_u64_t text23_indicators = ~(text23 ^ nn);
 
         // For every first match - 4 chars (32 bits) must be identical.
         text01_indicators &= text01_indicators >> 1;
@@ -258,7 +328,7 @@ inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) {
         if (text01_indicators + text23_indicators) {
             // Assuming we have performed 4 comparisons, we can only have 2^4=16 outcomes.
             // Which is small enough for a lookup table.
-            uint8_t match_indicators = (uint8_t)(                      //
+            unsigned char match_indicators = (unsigned char)(          //
                 (text01_indicators >> 31) | (text01_indicators << 0) | //
                 (text23_indicators >> 29) | (text23_indicators << 2));
             return text - h.start + lookup[match_indicators];
@@ -272,7 +342,7 @@ inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) {
 
 /**
  *  @brief  Trivial substring search with scalar code. Instead of comparing characters one-by-one
- *          it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper.
+ *          it compares 4-byte quadgrams first, most commonly prefixes. It's computationally cheaper.
  *          Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core.
  */
 inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) {
@@ -281,26 +351,36 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) {
 
     switch (n.length) {
     case 0: return 0;
-    case 1: return sz_find_char_swar(h, *n.start);
-    case 2: return sz_find_2chars_swar(h, n.start);
-    case 3: return sz_find_3chars_swar(h, n.start);
-    case 4: return sz_find_4chars_swar(h, n.start);
+    case 1: return sz_find_unigram_swar(h, *n.start);
+    case 2: return sz_find_bigram_swar(h, n.start);
+    case 3: return sz_find_trigram_swar(h, n.start);
+    case 4: return sz_find_quadgram_swar(h, n.start);
     default: {
         char const *text = h.start;
         char const *const end = h.start + h.length;
 
-        sz_anomaly_t n_anomaly, h_anomaly;
-        sz_size_t const n_suffix_len = n.length - 4 - n.anomaly_offset;
-        char const *n_suffix_ptr = n.start + 4 + n.anomaly_offset;
-        memcpy(&n_anomaly, n.start + n.anomaly_offset, 4);
-
-        text += n.anomaly_offset;
-        for (; text + n.length <= end; text++) {
-            memcpy(&h_anomaly, text, 4);
-            if (h_anomaly == n_anomaly)                                               // Match anomaly.
-                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))                   // Match suffix.
-                    if (sz_equal(text - n.anomaly_offset, n.start, n.anomaly_offset)) // Match prefix.
-                        return text - h.start - n.anomaly_offset;
+        sz_quadgram_t n_quadgram, h_quadgram;
+        sz_size_t const n_suffix_len = n.length - 4 - n.quadgram_offset;
+        char const *n_suffix_ptr = n.start + 4 + n.quadgram_offset;
+        n_quadgram.u8s[0] = n.start[n.quadgram_offset];
+        n_quadgram.u8s[1] = n.start[n.quadgram_offset + 1];
+        n_quadgram.u8s[2] = n.start[n.quadgram_offset + 2];
+        n_quadgram.u8s[3] = n.start[n.quadgram_offset + 3];
+        h_quadgram.u8s[0] = h.start[0];
+        h_quadgram.u8s[1] = h.start[1];
+        h_quadgram.u8s[2] = h.start[2];
+        h_quadgram.u8s[3] = h.start[3];
+
+        text += n.quadgram_offset;
+        while (text + n.length <= end) {
+            if (h_quadgram.u32 == n_quadgram.u32)                                       // Match quadgram.
+                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))                     // Match suffix.
+                    if (sz_equal(text - n.quadgram_offset, n.start, n.quadgram_offset)) // Match prefix.
+                        return text - h.start - n.quadgram_offset;
+
+            h_quadgram.u32 <<= 8;
+            h_quadgram.u8s[3] = *text;
+            ++text;
         }
         return h.length;
     }
@@ -319,17 +399,17 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
     char const *const end = h.start + h.length;
-    uint32_t anomaly = 0;
-    uint32_t mask = 0;
+    sz_quadgram_t quadgram = 0;
+    sz_quadgram_t mask = 0;
     switch (n.length) {
-    case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.start, 1); break;
-    case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.start, 2); break;
-    case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.start, 3); break;
-    default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.start, 4); break;
+    case 1: memset(&mask, 0xFF, 1), memcpy(&quadgram, n.start, 1); break;
+    case 2: memset(&mask, 0xFF, 2), memcpy(&quadgram, n.start, 2); break;
+    case 3: memset(&mask, 0xFF, 3), memcpy(&quadgram, n.start, 3); break;
+    default: memset(&mask, 0xFF, 4), memcpy(&quadgram, n.start, 4); break;
     }
 
-    __m256i const anomalies = _mm256_set1_epi32(*(uint32_t const *)&anomaly);
-    __m256i const masks = _mm256_set1_epi32(*(uint32_t const *)&mask);
+    __m256i const quadgrams = _mm256_set1_epi32(quadgram.u32);
+    __m256i const masks = _mm256_set1_epi32(mask.u32);
 
     // Top level for-loop changes dramatically.
     // In sequential computing model for 32 offsets we would do:
@@ -345,13 +425,13 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
 
         // Performing many unaligned loads ends up being faster than loading once and shuffling around.
         __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks);
-        int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, anomalies));
+        int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, quadgrams));
         __m256i texts1 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 1)), masks);
-        int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, anomalies));
+        int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, quadgrams));
         __m256i text2 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 2)), masks);
-        int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, anomalies));
+        int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, quadgrams));
         __m256i texts3 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 3)), masks);
-        int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies));
+        int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, quadgrams));
 
         if (matches0 | matches1 | matches2 | matches3) {
             for (sz_size_t i = 0; i < 32; i++) {
@@ -382,16 +462,22 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
     char const *const end = h.start + h.length;
-    uint32_t anomaly = 0;
-    uint32_t mask = 0;
+    sz_quadgram_t quadgram = {};
+    sz_quadgram_t mask = {};
     switch (n.length) {
-    case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.start, 1); break;
-    case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.start, 2); break;
-    case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.start, 3); break;
-    default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.start, 4); break;
+    case 1: mask.u8s[0] = 0xFF, quadgram.u8s[0] = n.start[0]; break;
+    case 2: mask.u8s[0] = mask.u8s[1] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1]; break;
+    case 3:
+        mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1],
+        quadgram.u8s[2] = n.start[2];
+        break;
+    default:
+        mask.u32 = 0xFFFFFFFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2],
+        quadgram.u8s[3] = n.start[3];
+        break;
     }
 
-    uint32x4_t const anomalies = vld1q_dup_u32(&anomaly);
+    uint32x4_t const quadgrams = vld1q_dup_u32(&quadgram.u32);
     uint32x4_t const masks = vld1q_dup_u32(&mask);
     uint32x4_t matches, matches0, matches1, matches2, matches3;
 
@@ -400,10 +486,10 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
         // Each of the following `matchesX` contains only 4 relevant bits - one per word.
         // Each signifies a match at the given offset.
-        matches0 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 0)), masks), anomalies);
-        matches1 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 1)), masks), anomalies);
-        matches2 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 2)), masks), anomalies);
-        matches3 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 3)), masks), anomalies);
+        matches0 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 0)), masks), quadgrams);
+        matches1 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 1)), masks), quadgrams);
+        matches2 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 2)), masks), quadgrams);
+        matches3 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 3)), masks), quadgrams);
         matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3));
 
         if (vmaxvq_u32(matches)) {
@@ -448,8 +534,8 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
 #endif // Arm Neon
 
-inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_char_swar(h, n); }
-inline static sz_size_t sz_find_char(sz_haystack_t h, char n) { return sz_find_char_swar(h, n); }
+inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_unigram_swar(h, n); }
+inline static sz_size_t sz_find_unigram(sz_haystack_t h, char n) { return sz_find_unigram_swar(h, n); }
 
 inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) {
     if (h.length < n.length) return h.length;
@@ -665,10 +751,10 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
         char *prefix = (char *)&sequence->order[i];
         for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j];
         if (case_insensitive) {
-            prefix[0] = tolower(prefix[0]);
-            prefix[1] = tolower(prefix[1]);
-            prefix[2] = tolower(prefix[2]);
-            prefix[3] = tolower(prefix[3]);
+            prefix[0] = sz_tolower_ascii(prefix[0]);
+            prefix[1] = sz_tolower_ascii(prefix[1]);
+            prefix[2] = sz_tolower_ascii(prefix[2]);
+            prefix[3] = sz_tolower_ascii(prefix[3]);
         }
     }
 
@@ -679,7 +765,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
     _sz_sort_recursion(sequence, 0, 32, comparator);
 }
 
-typedef uint8_t levenstein_distance_t;
+typedef unsigned char levenstein_distance_t;
 
 /**
  *  @return Amount of temporary memory (in bytes) needed to efficiently compute
@@ -758,11 +844,11 @@ inline static levenstein_distance_t sz_levenstein( //
 /**
  *  @brief  Hashes provided string using hardware-accelerated CRC32 instructions.
  */
-inline static uint32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; }
 
-inline static uint32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; }
 
-inline static uint32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; }
 
 #ifdef __cplusplus
 }

From b62b9c666c8970bb4219f1227b225ecb44a0d707 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 7 Oct 2023 19:42:11 -0700
Subject: [PATCH 2/8] Fix: SWAR search bug

---
 stringzilla/stringzilla.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 51319f01..7353024a 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -373,13 +373,13 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) {
 
         text += n.quadgram_offset;
         while (text + n.length <= end) {
+            h_quadgram.u8s[3] = text[3];
             if (h_quadgram.u32 == n_quadgram.u32)                                       // Match quadgram.
                 if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))                     // Match suffix.
                     if (sz_equal(text - n.quadgram_offset, n.start, n.quadgram_offset)) // Match prefix.
                         return text - h.start - n.quadgram_offset;
 
-            h_quadgram.u32 <<= 8;
-            h_quadgram.u8s[3] = *text;
+            h_quadgram.u32 >>= 8;
             ++text;
         }
         return h.length;

From ac7012a2796e613af75fde91e205ef55fb84944b Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 7 Oct 2023 20:39:44 -0700
Subject: [PATCH 3/8] Improve: avoiding nested loop in AVX2

---
 stringzilla/stringzilla.h | 93 +++++++++++++++++++++++++--------------
 1 file changed, 61 insertions(+), 32 deletions(-)

diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 7353024a..6b481dda 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -387,6 +387,40 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) {
     }
 }
 
+/**
+ *  Helper function, used in substring search operations.
+ */
+inline static void _sz_find_substr_populate_quadgram( //
+    sz_haystack_t h,
+    sz_needle_t n,
+    sz_quadgram_t *quadgram_out,
+    sz_quadgram_t *mask_out) {
+
+    sz_quadgram_t quadgram;
+    sz_quadgram_t mask;
+    switch (n.length) {
+    case 1:
+        mask.u8s[0] = 0xFF, mask.u8s[1] = mask.u8s[2] = mask.u8s[3] = 0;
+        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = quadgram.u8s[2] = quadgram.u8s[3] = 0;
+        break;
+    case 2:
+        mask.u8s[0] = mask.u8s[1] = 0xFF, mask.u8s[2] = mask.u8s[3] = 0;
+        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = quadgram.u8s[3] = 0;
+        break;
+    case 3:
+        mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, mask.u8s[3] = 0;
+        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], quadgram.u8s[3] = 0;
+        break;
+    default:
+        mask.u32 = 0xFFFFFFFF;
+        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2],
+        quadgram.u8s[3] = n.start[3];
+        break;
+    }
+    *quadgram_out = quadgram;
+    *mask_out = mask;
+}
+
 #if defined(__AVX2__)
 
 /**
@@ -399,15 +433,9 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
     char const *const end = h.start + h.length;
-    sz_quadgram_t quadgram = 0;
-    sz_quadgram_t mask = 0;
-    switch (n.length) {
-    case 1: memset(&mask, 0xFF, 1), memcpy(&quadgram, n.start, 1); break;
-    case 2: memset(&mask, 0xFF, 2), memcpy(&quadgram, n.start, 2); break;
-    case 3: memset(&mask, 0xFF, 3), memcpy(&quadgram, n.start, 3); break;
-    default: memset(&mask, 0xFF, 4), memcpy(&quadgram, n.start, 4); break;
-    }
-
+    sz_quadgram_t quadgram;
+    sz_quadgram_t mask;
+    _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask);
     __m256i const quadgrams = _mm256_set1_epi32(quadgram.u32);
     __m256i const masks = _mm256_set1_epi32(mask.u32);
 
@@ -421,7 +449,7 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
     //  + 3 bitwise ANDs.
     //  + 1 heavy (but very unlikely) branch.
     char const *text = h.start;
-    for (; (text + n.length + 32) <= end; text += 32) {
+    while (text + n.length + 32 <= end) {
 
         // Performing many unaligned loads ends up being faster than loading once and shuffling around.
         __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks);
@@ -434,10 +462,23 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
         int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, quadgrams));
 
         if (matches0 | matches1 | matches2 | matches3) {
-            for (sz_size_t i = 0; i < 32; i++) {
-                if (sz_equal(text + i, n.start, n.length)) return i + (text - h.start);
+            int matches =                   //
+                (matches0 & 0x1111'1111u) | //
+                (matches1 & 0x2222'2222u) | //
+                (matches2 & 0x4444'4444u) | //
+                (matches3 & 0x8888'8888u);
+            size_t first_match_offset = _tzcnt_u32(matches);
+            if (n.length > 4) {
+                if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4))
+                    return text + first_match_offset - h.start;
+                else
+                    text += first_match_offset + 1;
             }
-        }
+            else
+                return text + first_match_offset - h.start;
+            }
+        else
+            text += 32;
     }
 
     // Don't forget the last (up to 35) characters.
@@ -462,21 +503,9 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
     char const *const end = h.start + h.length;
-    sz_quadgram_t quadgram = {};
-    sz_quadgram_t mask = {};
-    switch (n.length) {
-    case 1: mask.u8s[0] = 0xFF, quadgram.u8s[0] = n.start[0]; break;
-    case 2: mask.u8s[0] = mask.u8s[1] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1]; break;
-    case 3:
-        mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1],
-        quadgram.u8s[2] = n.start[2];
-        break;
-    default:
-        mask.u32 = 0xFFFFFFFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2],
-        quadgram.u8s[3] = n.start[3];
-        break;
-    }
-
+    sz_quadgram_t quadgram;
+    sz_quadgram_t mask;
+    _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask);
     uint32x4_t const quadgrams = vld1q_dup_u32(&quadgram.u32);
     uint32x4_t const masks = vld1q_dup_u32(&mask);
     uint32x4_t matches, matches0, matches1, matches2, matches3;
@@ -486,10 +515,10 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
         // Each of the following `matchesX` contains only 4 relevant bits - one per word.
         // Each signifies a match at the given offset.
-        matches0 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 0)), masks), quadgrams);
-        matches1 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 1)), masks), quadgrams);
-        matches2 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 2)), masks), quadgrams);
-        matches3 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 3)), masks), quadgrams);
+        matches0 = vceqq_u32(vandq_u32(vld1q_u32(text + 0), masks), quadgrams);
+        matches1 = vceqq_u32(vandq_u32(vld1q_u32(text + 1), masks), quadgrams);
+        matches2 = vceqq_u32(vandq_u32(vld1q_u32(text + 2), masks), quadgrams);
+        matches3 = vceqq_u32(vandq_u32(vld1q_u32(text + 3), masks), quadgrams);
         matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3));
 
         if (vmaxvq_u32(matches)) {

From efafbbf0687f1d315c94b54b08e5b93f91e88be0 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 13:01:13 -0700
Subject: [PATCH 4/8] Break: Avoiding LibC and new API

---
 stringzilla/stringzilla.h | 789 +++++++++++++++++++++-----------------
 1 file changed, 446 insertions(+), 343 deletions(-)

diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 6b481dda..0aa8774b 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -1,13 +1,10 @@
 #ifndef STRINGZILLA_H_
 #define STRINGZILLA_H_
 
-#include <search.h> // `qsort_s`
-#include <stdlib.h> // `qsort_r`
-#include <string.h> // `memcpy`
-
 #if defined(__AVX2__)
 #include <x86intrin.h>
 #endif
+
 #if defined(__ARM_NEON)
 #include <arm_neon.h>
 #endif
@@ -16,117 +13,88 @@
 #include <intrin.h>
 #define popcount64 __popcnt64
 #define ctz64 _tzcnt_u64
+#define clz64 _lzcnt_u64
 #define strncasecmp _strnicmp
 #define strcasecmp _stricmp
 #else
 #define popcount64 __builtin_popcountll
 #define ctz64 __builtin_ctzll
+#define clz64 __builtin_clzll
+#endif
+
+// Fallback `NULL`: normally provided by stddef.h, stdio.h, stdlib.h, string.h, and other standard headers.
+// In C++ a `void *` doesn't implicitly convert to other pointer types, so a plain zero is used there.
+#if !defined(NULL) && defined(__cplusplus)
+#define NULL 0
+#elif !defined(NULL)
+#define NULL ((void *)0)
+#endif
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+/**
+ *  @brief  Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size.
+ *          64-bit on most platforms where pointers are 64-bit.
+ *          32-bit on platforms where pointers are 32-bit.
+ */
 #if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64)
-typedef unsigned long sz_size_t; // 64-bit on most platforms when pointers are 64-bit
+typedef unsigned long sz_size_t;
 #else
-typedef unsigned sz_size_t; // 32-bit on most platforms when pointers are 32-bit
+typedef unsigned sz_size_t;
 #endif
 
+typedef int sz_bool_t;               // Only one relevant bit
 typedef unsigned sz_u32_t;           // Always 32 bits
 typedef unsigned long long sz_u64_t; // Always 64 bits
+typedef char const *sz_string_ptr_t; // A type alias for `char const * `
+
+/**
+ *  @brief  Helper construct for higher-level bindings.
+ */
+typedef struct sz_string_view_t {
+    sz_string_ptr_t start;
+    sz_size_t length;
+} sz_string_view_t;
 
-typedef union sz_quadgram_t {
+/**
+ *  @brief  Internal data-structure, used to address "anomalies" (often prefixes),
+ *          during substring search. Always a 32-bit unsigned integer, containing 4 chars.
+ */
+typedef union _sz_anomaly_t {
     unsigned u32;
     unsigned char u8s[4];
-} sz_quadgram_t; // Always 32-bit unsigned integer, representing 8 bytes/characters
-
-typedef union sz_octogram_t {
-    unsigned long long u64;
-    unsigned char u8s[8];
-} sz_octogram_t; // Always 64-bit unsigned integer, representing 8 bytes/characters
-
-inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { return (x + (divisor - 1)) / divisor; }
-
-inline static sz_size_t sz_tolower_ascii(char c) {
-    static char lowered[256] = {
-        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
-        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
-        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
-        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
-        64,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
-        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91,  92,  93,  94,  95,  //
-        96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
-        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, //
-        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
-        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
-        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
-        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
-        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
-        240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, //
-        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
-        240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, //
-    };
-    return lowered[(int)c];
-}
-
-inline static sz_size_t sz_toupper_ascii(char c) {
-    static char upped[256] = {
-        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
-        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
-        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
-        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
-        64,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
-        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91,  92,  93,  94,  95,  //
-        96,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  //
-        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  123, 124, 125, 126, 127, //
-        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
-        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
-        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
-        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
-        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
-        240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, //
-        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
-        240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, //
-    };
-    return upped[(int)c];
-}
+} _sz_anomaly_t;
 
 /**
- *  @brief This is a faster alternative to `strncmp(a, b, length) == 0`.
+ *  @brief  This is a slightly faster alternative to `strncmp(a, b, length) == 0`.
+ *          Doesn't provide major performance improvements, but helps avoid the LibC dependency.
  *  @return 1 for `true`, and 0 for `false`.
  */
-inline static int sz_equal(char const *a, char const *b, sz_size_t length) {
-    char const *const a_end = a + length;
+inline static sz_bool_t sz_equal(sz_string_ptr_t a, sz_string_ptr_t b, sz_size_t length) {
+    sz_string_ptr_t const a_end = a + length;
     while (a != a_end && *a == *b) a++, b++;
     return a_end == a;
 }
 
-typedef struct sz_haystack_t {
-    char const *start;
-    sz_size_t length;
-} sz_haystack_t;
-
-typedef struct sz_needle_t {
-    char const *start;
-    sz_size_t length;
-    sz_size_t quadgram_offset;
-} sz_needle_t;
-
 /**
- *  @brief  SWAR single-character counting procedure, jumping 8 bytes at a time.
+ *  @brief  Count the number of occurrences of a @b single-character needle in an arbitrary length haystack.
+ *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_size_t sz_count_unigram_swar(sz_haystack_t h, char n) {
+inline static sz_size_t sz_count_char_swar(sz_string_ptr_t const haystack,
+                                           sz_size_t const haystack_length,
+                                           sz_string_ptr_t const needle) {
 
     sz_size_t result = 0;
-    char const *text = h.start;
-    char const *end = h.start + h.length;
+    sz_string_ptr_t text = haystack;
+    sz_string_ptr_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
-    for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == n;
+    for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == *needle;
 
     // This code simulates hyper-scalar execution, comparing 8 characters at a time.
-    sz_u64_t nnnnnnnn = n;
+    sz_u64_t nnnnnnnn = *needle;
     nnnnnnnn |= nnnnnnnn << 8;
     nnnnnnnn |= nnnnnnnn << 16;
     nnnnnnnn |= nnnnnnnn << 32;
@@ -140,27 +108,31 @@ inline static sz_size_t sz_count_unigram_swar(sz_haystack_t h, char n) {
         result += popcount64(match_indicators);
     }
 
-    for (; text < end; ++text) result += *text == n;
+    for (; text < end; ++text) result += *text == *needle;
     return result;
 }
 
 /**
- *  @brief  SWAR single-character search in string, jumping 8 bytes at a time.
+ *  @brief  Find the first occurrence of a @b single-character needle in an arbitrary length haystack.
+ *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
+ *          Identical to `memchr(haystack, needle[0], haystack_length)`.
  */
-inline static sz_size_t sz_find_unigram_swar(sz_haystack_t h, char n) {
+inline static sz_string_ptr_t sz_find_1char_swar(sz_string_ptr_t const haystack,
+                                                 sz_size_t const haystack_length,
+                                                 sz_string_ptr_t const needle) {
 
-    char const *text = h.start;
-    char const *end = h.start + h.length;
+    sz_string_ptr_t text = haystack;
+    sz_string_ptr_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text < end; ++text)
-        if (*text == n) return text - h.start;
+        if (*text == *needle) return text;
 
     // This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
-    sz_u64_t nnnnnnnn = n;
-    nnnnnnnn |= nnnnnnnn << 8;  // broadcast `n` into `nnnnnnnn`
-    nnnnnnnn |= nnnnnnnn << 16; // broadcast `n` into `nnnnnnnn`
-    nnnnnnnn |= nnnnnnnn << 32; // broadcast `n` into `nnnnnnnn`
+    sz_u64_t nnnnnnnn = *needle;
+    nnnnnnnn |= nnnnnnnn << 8;  // broadcast `needle` into `nnnnnnnn`
+    nnnnnnnn |= nnnnnnnn << 16; // broadcast `needle` into `nnnnnnnn`
+    nnnnnnnn |= nnnnnnnn << 32; // broadcast `needle` into `nnnnnnnn`
     for (; text + 8 <= end; text += 8) {
         sz_u64_t text_slice = *(sz_u64_t const *)text;
         sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn);
@@ -169,30 +141,70 @@ inline static sz_size_t sz_find_unigram_swar(sz_haystack_t h, char n) {
         match_indicators &= match_indicators >> 4;
         match_indicators &= 0x0101010101010101;
 
-        if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8;
+        if (match_indicators != 0) return text + ctz64(match_indicators) / 8;
     }
 
     for (; text < end; ++text)
-        if (*text == n) return text - h.start;
-    return h.length;
+        if (*text == *needle) return text;
+    return NULL;
+}
+
+/**
+ *  @brief  Find the last occurrence of a @b single-character needle in an arbitrary length haystack.
+ *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
+ *          Identical to `memrchr(haystack, needle[0], haystack_length)`. Returns `NULL` if absent.
+ */
+inline static sz_string_ptr_t sz_rfind_1char_swar(sz_string_ptr_t const haystack,
+                                                  sz_size_t const haystack_length,
+                                                  sz_string_ptr_t const needle) {
+
+    // `text` is an exclusive upper bound: every byte at or after it has already been checked.
+    sz_string_ptr_t text = haystack + haystack_length;
+
+    // Process the misaligned tail byte-by-byte, so the 64-bit loads below are aligned and in-bounds.
+    for (; ((unsigned long)text & 7ul) && text > haystack; --text)
+        if (text[-1] == *needle) return text - 1;
+
+    // This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
+    sz_u64_t nnnnnnnn = (unsigned char)*needle; // avoid sign-extension of bytes above 0x7F
+    nnnnnnnn |= nnnnnnnn << 8;                  // broadcast `needle` into `nnnnnnnn`
+    nnnnnnnn |= nnnnnnnn << 16;                 // broadcast `needle` into `nnnnnnnn`
+    nnnnnnnn |= nnnnnnnn << 32;                 // broadcast `needle` into `nnnnnnnn`
+    for (; (sz_size_t)(text - haystack) >= 8; text -= 8) {
+        sz_u64_t text_slice = *(sz_u64_t const *)(text - 8);
+        sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn);
+        match_indicators &= match_indicators >> 1;
+        match_indicators &= match_indicators >> 2;
+        match_indicators &= match_indicators >> 4;
+        match_indicators &= 0x0101010101010101;
+        // Assuming little-endian order, byte `k` of the slice sets bit `8 * k`, so `clz64 / 8 == 7 - k`.
+        if (match_indicators != 0) return text - 1 - clz64(match_indicators) / 8;
+    }
+
+    // Fewer than 8 bytes remain at the front of the haystack.
+    for (; text > haystack; --text)
+        if (text[-1] == *needle) return text - 1;
+    return NULL;
+}
 
 /**
- *  @brief  SWAR character-bigram search in string, jumping 8 bytes at a time.
+ *  @brief  Find the first occurrence of a @b two-character needle in an arbitrary length haystack.
+ *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_size_t sz_find_bigram_swar(sz_haystack_t h, char const *n) {
+inline static sz_string_ptr_t sz_find_2char_swar(sz_string_ptr_t const haystack,
+                                                 sz_size_t const haystack_length,
+                                                 sz_string_ptr_t const needle) {
 
-    char const *text = h.start;
-    char const *end = h.start + h.length;
+    sz_string_ptr_t text = haystack;
+    sz_string_ptr_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1]) return text - h.start;
+        if (text[0] == needle[0] && text[1] == needle[1]) return text;
 
     // This code simulates hyper-scalar execution, analyzing 7 offsets at a time.
-    sz_u64_t nnnn = ((sz_u64_t)(n[0]) << 0) | ((sz_u64_t)(n[1]) << 8); // broadcast `n` into `nnnn`
-    nnnn |= nnnn << 16;                                                // broadcast `n` into `nnnn`
-    nnnn |= nnnn << 32;                                                // broadcast `n` into `nnnn`
+    sz_u64_t nnnn = ((sz_u64_t)(needle[0]) << 0) | ((sz_u64_t)(needle[1]) << 8); // broadcast `needle` into `nnnn`
+    nnnn |= nnnn << 16;                                                          // broadcast `needle` into `nnnn`
+    nnnn |= nnnn << 32;                                                          // broadcast `needle` into `nnnn`
     for (; text + 8 <= end; text += 7) {
         sz_u64_t text_slice = *(sz_u64_t const *)text;
         sz_u64_t even_indicators = ~(text_slice ^ nnnn);
@@ -214,32 +226,38 @@ inline static sz_size_t sz_find_bigram_swar(sz_haystack_t h, char const *n) {
 
         if (even_indicators + odd_indicators) {
             sz_u64_t match_indicators = even_indicators | (odd_indicators >> 8);
-            return text - h.start + ctz64(match_indicators) / 8;
+            return text + ctz64(match_indicators) / 8;
         }
     }
 
     for (; text + 2 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1]) return text - h.start;
-    return h.length;
+        if (text[0] == needle[0] && text[1] == needle[1]) return text;
+    return NULL;
 }
 
 /**
- *  @brief  SWAR character-trigram search in string, jumping 8 bytes at a time.
+ *  @brief  Find the first occurrence of a three-character needle in an arbitrary length haystack.
+ *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_size_t sz_find_trigram_swar(sz_haystack_t h, char const *n) {
+inline static sz_string_ptr_t sz_find_3char_swar(sz_string_ptr_t const haystack,
+                                                 sz_size_t const haystack_length,
+                                                 sz_string_ptr_t const needle) {
 
-    char const *text = h.start;
-    char const *end = h.start + h.length;
+    sz_string_ptr_t text = haystack;
+    sz_string_ptr_t end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start;
+        if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2]) return text;
 
     // This code simulates hyper-scalar execution, analyzing 6 offsets at a time.
     // We have two unused bytes at the end.
-    sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16); // broadcast `n` into `nn`
-    nn |= nn << 24;                                                                           // broadcast `n` into `nn`
-    nn <<= 16;                                                                                // broadcast `n` into `nn`
+    sz_u64_t nn =                      // broadcast `needle` into `nn`
+        (sz_u64_t)(needle[0] << 0) |   // broadcast `needle` into `nn`
+        ((sz_u64_t)(needle[1]) << 8) | // broadcast `needle` into `nn`
+        ((sz_u64_t)(needle[2]) << 16); // broadcast `needle` into `nn`
+    nn |= nn << 24;                    // broadcast `needle` into `nn`
+    nn <<= 16;                         // broadcast `needle` into `nn`
 
     for (; text + 8 <= end; text += 6) {
         sz_u64_t text_slice = *(sz_u64_t const *)text;
@@ -271,35 +289,39 @@ inline static sz_size_t sz_find_trigram_swar(sz_haystack_t h, char const *n) {
             (third_indicators >> 16) & (third_indicators >> 8) & (third_indicators >> 0) & 0x0000010000010000;
 
         sz_u64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16);
-        if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8;
+        if (match_indicators != 0) return text + ctz64(match_indicators) / 8;
     }
 
     for (; text + 3 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start;
-    return h.length;
+        if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2]) return text;
+    return NULL;
 }
 
 /**
- *  @brief  SWAR character-quadgram search in string, jumping 8 bytes at a time.
+ *  @brief  Find the first occurrence of a @b four-character needle in an arbitrary length haystack.
+ *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_size_t sz_find_quadgram_swar(sz_haystack_t h, char const *n) {
+inline static sz_string_ptr_t sz_find_4char_swar(sz_string_ptr_t const haystack,
+                                                 sz_size_t const haystack_length,
+                                                 sz_string_ptr_t const needle) {
 
-    char const *text = h.start;
-    char const *end = h.start + h.length;
+    sz_string_ptr_t text = haystack;
+    sz_string_ptr_t end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start;
+        if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2] && text[3] == needle[3]) return text;
 
     // This code simulates hyper-scalar execution, analyzing 4 offsets at a time.
-    sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16) | ((sz_u64_t)(n[3]) << 24);
+    sz_u64_t nn = (sz_u64_t)(needle[0] << 0) | ((sz_u64_t)(needle[1]) << 8) | ((sz_u64_t)(needle[2]) << 16) |
+                  ((sz_u64_t)(needle[3]) << 24);
     nn |= nn << 32;
 
     //
-    unsigned char lookup[16] = {0};
-    lookup[0x2] = lookup[0x6] = lookup[0xA] = lookup[0xE] = 1;
-    lookup[0x4] = lookup[0xC] = 2;
-    lookup[0x8] = 3;
+    unsigned char offset_in_slice[16] = {0};
+    offset_in_slice[0x2] = offset_in_slice[0x6] = offset_in_slice[0xA] = offset_in_slice[0xE] = 1;
+    offset_in_slice[0x4] = offset_in_slice[0xC] = 2;
+    offset_in_slice[0x8] = 3;
 
     // We can perform 5 comparisons per load, but it's easier to perform 4, minimizing the size of the lookup table.
     for (; text + 8 <= end; text += 4) {
@@ -331,58 +353,63 @@ inline static sz_size_t sz_find_quadgram_swar(sz_haystack_t h, char const *n) {
             unsigned char match_indicators = (unsigned char)(          //
                 (text01_indicators >> 31) | (text01_indicators << 0) | //
                 (text23_indicators >> 29) | (text23_indicators << 2));
-            return text - h.start + lookup[match_indicators];
+            return text + offset_in_slice[match_indicators];
         }
     }
 
     for (; text + 4 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start;
-    return h.length;
+        if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2] && text[3] == needle[3]) return text;
+    return NULL;
 }
 
 /**
- *  @brief  Trivial substring search with scalar code. Instead of comparing characters one-by-one
- *          it compares 4-byte quadgrams first, most commonly prefixes. It's computationally cheaper.
+ *  @brief  Trivial substring search with scalar SWAR code. Instead of comparing characters one-by-one
+ *          it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper.
  *          Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core.
  */
-inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) {
-
-    if (h.length < n.length) return h.length;
-
-    switch (n.length) {
-    case 0: return 0;
-    case 1: return sz_find_unigram_swar(h, *n.start);
-    case 2: return sz_find_bigram_swar(h, n.start);
-    case 3: return sz_find_trigram_swar(h, n.start);
-    case 4: return sz_find_quadgram_swar(h, n.start);
+inline static sz_string_ptr_t sz_find_substr_swar( //
+    sz_string_ptr_t const haystack,
+    sz_size_t const haystack_length,
+    sz_string_ptr_t const needle,
+    sz_size_t const needle_length) {
+
+    if (haystack_length < needle_length) return NULL;
+
+    sz_size_t anomaly_offset = 0;
+    switch (needle_length) {
+    case 0: return NULL;
+    case 1: return sz_find_1char_swar(haystack, haystack_length, needle);
+    case 2: return sz_find_2char_swar(haystack, haystack_length, needle);
+    case 3: return sz_find_3char_swar(haystack, haystack_length, needle);
+    case 4: return sz_find_4char_swar(haystack, haystack_length, needle);
     default: {
-        char const *text = h.start;
-        char const *const end = h.start + h.length;
-
-        sz_quadgram_t n_quadgram, h_quadgram;
-        sz_size_t const n_suffix_len = n.length - 4 - n.quadgram_offset;
-        char const *n_suffix_ptr = n.start + 4 + n.quadgram_offset;
-        n_quadgram.u8s[0] = n.start[n.quadgram_offset];
-        n_quadgram.u8s[1] = n.start[n.quadgram_offset + 1];
-        n_quadgram.u8s[2] = n.start[n.quadgram_offset + 2];
-        n_quadgram.u8s[3] = n.start[n.quadgram_offset + 3];
-        h_quadgram.u8s[0] = h.start[0];
-        h_quadgram.u8s[1] = h.start[1];
-        h_quadgram.u8s[2] = h.start[2];
-        h_quadgram.u8s[3] = h.start[3];
-
-        text += n.quadgram_offset;
-        while (text + n.length <= end) {
-            h_quadgram.u8s[3] = text[3];
-            if (h_quadgram.u32 == n_quadgram.u32)                                       // Match quadgram.
-                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))                     // Match suffix.
-                    if (sz_equal(text - n.quadgram_offset, n.start, n.quadgram_offset)) // Match prefix.
-                        return text - h.start - n.quadgram_offset;
-
-            h_quadgram.u32 >>= 8;
+        sz_string_ptr_t text = haystack;
+        sz_string_ptr_t const end = haystack + haystack_length;
+
+        _sz_anomaly_t n_anomaly, h_anomaly;
+        sz_size_t const n_suffix_len = needle_length - 4 - anomaly_offset;
+        sz_string_ptr_t n_suffix_ptr = needle + 4 + anomaly_offset;
+        n_anomaly.u8s[0] = needle[anomaly_offset];
+        n_anomaly.u8s[1] = needle[anomaly_offset + 1];
+        n_anomaly.u8s[2] = needle[anomaly_offset + 2];
+        n_anomaly.u8s[3] = needle[anomaly_offset + 3];
+        h_anomaly.u8s[0] = haystack[0];
+        h_anomaly.u8s[1] = haystack[1];
+        h_anomaly.u8s[2] = haystack[2];
+        h_anomaly.u8s[3] = haystack[3];
+
+        text += anomaly_offset;
+        while (text + needle_length <= end) {
+            h_anomaly.u8s[3] = text[3];
+            if (h_anomaly.u32 == n_anomaly.u32)                                  // Match anomaly.
+                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))              // Match suffix.
+                    if (sz_equal(text - anomaly_offset, needle, anomaly_offset)) // Match prefix - optimized out.
+                        return text - anomaly_offset;
+
+            h_anomaly.u32 >>= 8;
             ++text;
         }
-        return h.length;
+        return NULL;
     }
     }
 }
@@ -390,34 +417,33 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) {
 /**
  *  Helper function, used in substring search operations.
  */
-inline static void _sz_find_substr_populate_quadgram( //
-    sz_haystack_t h,
-    sz_needle_t n,
-    sz_quadgram_t *quadgram_out,
-    sz_quadgram_t *mask_out) {
-
-    sz_quadgram_t quadgram;
-    sz_quadgram_t mask;
-    switch (n.length) {
+inline static void _sz_find_substr_populate_anomaly( //
+    sz_string_ptr_t const needle,
+    sz_size_t const needle_length,
+    _sz_anomaly_t *anomaly_out,
+    _sz_anomaly_t *mask_out) {
+
+    _sz_anomaly_t anomaly;
+    _sz_anomaly_t mask;
+    switch (needle_length) {
     case 1:
         mask.u8s[0] = 0xFF, mask.u8s[1] = mask.u8s[2] = mask.u8s[3] = 0;
-        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = quadgram.u8s[2] = quadgram.u8s[3] = 0;
+        anomaly.u8s[0] = needle[0], anomaly.u8s[1] = anomaly.u8s[2] = anomaly.u8s[3] = 0;
         break;
     case 2:
         mask.u8s[0] = mask.u8s[1] = 0xFF, mask.u8s[2] = mask.u8s[3] = 0;
-        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = quadgram.u8s[3] = 0;
+        anomaly.u8s[0] = needle[0], anomaly.u8s[1] = needle[1], anomaly.u8s[2] = anomaly.u8s[3] = 0;
         break;
     case 3:
         mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, mask.u8s[3] = 0;
-        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], quadgram.u8s[3] = 0;
+        anomaly.u8s[0] = needle[0], anomaly.u8s[1] = needle[1], anomaly.u8s[2] = needle[2], anomaly.u8s[3] = 0;
         break;
     default:
         mask.u32 = 0xFFFFFFFF;
-        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2],
-        quadgram.u8s[3] = n.start[3];
+        anomaly.u8s[0] = needle[0], anomaly.u8s[1] = needle[1], anomaly.u8s[2] = needle[2], anomaly.u8s[3] = needle[3];
         break;
     }
-    *quadgram_out = quadgram;
+    *anomaly_out = anomaly;
     *mask_out = mask;
 }
 
@@ -429,14 +455,17 @@ inline static void _sz_find_substr_populate_quadgram( //
  *          was practically more efficient than loading once and shifting around, as introduces
  *          less data dependencies.
  */
-inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
+inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack,
+                                                  sz_size_t const haystack_length,
+                                                  sz_string_ptr_t const needle,
+                                                  sz_size_t const needle_length) {
 
     // Precomputed constants
-    char const *const end = h.start + h.length;
-    sz_quadgram_t quadgram;
-    sz_quadgram_t mask;
-    _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask);
-    __m256i const quadgrams = _mm256_set1_epi32(quadgram.u32);
+    sz_string_ptr_t const end = haystack + haystack_length;
+    _sz_anomaly_t anomaly;
+    _sz_anomaly_t mask;
+    _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask);
+    __m256i const anomalies = _mm256_set1_epi32(anomaly.u32);
     __m256i const masks = _mm256_set1_epi32(mask.u32);
 
     // Top level for-loop changes dramatically.
@@ -448,18 +477,18 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
     //  + 4 movemasks.
     //  + 3 bitwise ANDs.
     //  + 1 heavy (but very unlikely) branch.
-    char const *text = h.start;
-    while (text + n.length + 32 <= end) {
+    sz_string_ptr_t text = haystack;
+    while (text + needle_length + 32 <= end) {
 
         // Performing many unaligned loads ends up being faster than loading once and shuffling around.
         __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks);
-        int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, quadgrams));
+        int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, anomalies));
         __m256i texts1 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 1)), masks);
-        int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, quadgrams));
+        int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, anomalies));
         __m256i text2 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 2)), masks);
-        int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, quadgrams));
+        int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, anomalies));
         __m256i texts3 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 3)), masks);
-        int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, quadgrams));
+        int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies));
 
         if (matches0 | matches1 | matches2 | matches3) {
             int matches =                   //
@@ -468,25 +497,21 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
                 (matches2 & 0x4444'4444u) | //
                 (matches3 & 0x8888'8888u);
             size_t first_match_offset = _tzcnt_u32(matches);
-            if (n.length > 4) {
-                if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4))
-                    return text + first_match_offset - h.start;
+            if (needle_length > 4) {
+                if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4))
+                    return text + first_match_offset;
                 else
                     text += first_match_offset + 1;
             }
             else
-                return text + first_match_offset - h.start;
-            }
+                return text + first_match_offset;
+        }
         else
             text += 32;
     }
 
     // Don't forget the last (up to 35) characters.
-    sz_haystack_t tail;
-    tail.start = text;
-    tail.length = end - text;
-    size_t tail_match = sz_find_substr_swar(tail, n);
-    return text + tail_match - h.start;
+    return sz_find_substr_swar(text, end - text, needle, needle_length);
 }
 
 #endif // x86 AVX2
@@ -499,26 +524,29 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
  *          was practically more efficient than loading once and shifting around, as introduces
  *          less data dependencies.
  */
-inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
+inline static sz_string_ptr_t sz_find_substr_neon(sz_string_ptr_t const haystack,
+                                                  sz_size_t const haystack_length,
+                                                  sz_string_ptr_t const needle,
+                                                  sz_size_t const needle_length) {
 
     // Precomputed constants
-    char const *const end = h.start + h.length;
-    sz_quadgram_t quadgram;
-    sz_quadgram_t mask;
-    _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask);
-    uint32x4_t const quadgrams = vld1q_dup_u32(&quadgram.u32);
-    uint32x4_t const masks = vld1q_dup_u32(&mask);
+    sz_string_ptr_t const end = haystack + haystack_length;
+    _sz_anomaly_t anomaly;
+    _sz_anomaly_t mask;
+    _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask);
+    uint32x4_t const anomalies = vld1q_dup_u32(&anomaly.u32);
+    uint32x4_t const masks = vld1q_dup_u32(&mask.u32);
     uint32x4_t matches, matches0, matches1, matches2, matches3;
 
-    char const *text = h.start;
-    while (text + n.length + 16 <= end) {
+    sz_string_ptr_t text = haystack;
+    while (text + needle_length + 16 <= end) {
 
         // Each of the following `matchesX` contains only 4 relevant bits - one per word.
         // Each signifies a match at the given offset.
-        matches0 = vceqq_u32(vandq_u32(vld1q_u32(text + 0), masks), quadgrams);
-        matches1 = vceqq_u32(vandq_u32(vld1q_u32(text + 1), masks), quadgrams);
-        matches2 = vceqq_u32(vandq_u32(vld1q_u32(text + 2), masks), quadgrams);
-        matches3 = vceqq_u32(vandq_u32(vld1q_u32(text + 3), masks), quadgrams);
+        matches0 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 0)), masks), anomalies);
+        matches1 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 1)), masks), anomalies);
+        matches2 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 2)), masks), anomalies);
+        matches3 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 3)), masks), anomalies);
         matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3));
 
         if (vmaxvq_u32(matches)) {
@@ -540,73 +568,172 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
             // Find the first match
             size_t first_match_offset = __builtin_ctz(matches_u16);
-            if (n.length > 4) {
-                if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4))
-                    return text + first_match_offset - h.start;
+            if (needle_length > 4) {
+                if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4))
+                    return text + first_match_offset;
                 else
                     text += first_match_offset + 1;
             }
             else
-                return text + first_match_offset - h.start;
+                return text + first_match_offset;
         }
         else
             text += 16;
     }
 
     // Don't forget the last (up to 16+3=19) characters.
-    sz_haystack_t tail;
-    tail.start = text;
-    tail.length = end - text;
-    size_t tail_match = sz_find_substr_swar(tail, n);
-    return text + tail_match - h.start;
+    return sz_find_substr_swar(text, end - text, needle, needle_length);
 }
 
 #endif // Arm Neon
 
-inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_unigram_swar(h, n); }
-inline static sz_size_t sz_find_unigram(sz_haystack_t h, char n) { return sz_find_unigram_swar(h, n); }
+inline static sz_size_t sz_count_char(sz_string_ptr_t const haystack,
+                                      sz_size_t const haystack_length,
+                                      sz_string_ptr_t const needle) {
+    return sz_count_char_swar(haystack, haystack_length, needle);
+}
 
-inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) {
-    if (h.length < n.length) return h.length;
+inline static sz_string_ptr_t sz_find_1char(sz_string_ptr_t const haystack,
+                                            sz_size_t const haystack_length,
+                                            sz_string_ptr_t const needle) {
+    return sz_find_1char_swar(haystack, haystack_length, needle);
+}
 
+inline static sz_string_ptr_t sz_rfind_1char(sz_string_ptr_t const haystack,
+                                             sz_size_t const haystack_length,
+                                             sz_string_ptr_t const needle) {
+    return sz_rfind_1char_swar(haystack, haystack_length, needle);
+}
+
+inline static sz_string_ptr_t sz_find_substr(sz_string_ptr_t const haystack,
+                                             sz_size_t const haystack_length,
+                                             sz_string_ptr_t const needle,
+                                             sz_size_t const needle_length) {
+    if (haystack_length < needle_length) return NULL;
 #if defined(__ARM_NEON)
-    return sz_find_substr_neon(h, n);
+    return sz_find_substr_neon(haystack, haystack_length, needle, needle_length);
 #elif defined(__AVX2__)
-    return sz_find_substr_avx2(h, n);
+    return sz_find_substr_avx2(haystack, haystack_length, needle, needle_length);
 #else
-    return sz_find_substr_swar(h, n);
+    return sz_find_substr_swar(haystack, haystack_length, needle, needle_length);
 #endif
 }
 
-inline static void sz_swap(sz_size_t *a, sz_size_t *b) {
-    sz_size_t t = *a;
+/**
+ *  @brief  Maps a byte to its lowercase variant, if the table has one: 'A'-'Z' and 192-222 (except 215) shift by +32.
+ */
+inline static char sz_tolower_ascii(char c) {
+    static unsigned char const lowered[256] = {
+        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
+        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
+        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
+        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
+        64,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
+        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91,  92,  93,  94,  95,  //
+        96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
+        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, //
+        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
+        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
+        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
+        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, //
+    };
+    return *(char const *)&lowered[(unsigned char)c]; // `char` may be signed: never index with a negative value.
+}
+
+/**
+ *  @brief  Maps a byte to its uppercase variant, if the table has one: 'a'-'z' and 224-254 (except 247) shift by -32.
+ */
+inline static char sz_toupper_ascii(char c) {
+    static unsigned char const upped[256] = {
+        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
+        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
+        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
+        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
+        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  //
+        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  //
+        96,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  //
+        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  123, 124, 125, 126, 127, //
+        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
+        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
+        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
+        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
+        192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, //
+        208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, //
+        192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, //
+        208, 209, 210, 211, 212, 213, 214, 247, 216, 217, 218, 219, 220, 221, 222, 255, //
+    };
+    return *(char const *)&upped[(unsigned char)c]; // `char` may be signed: never index with a negative value.
+}
+
+/**
+ *  @brief  Byte-level lexicographic "less than" for two strings; helps avoid the LibC dependency.
+ *          Bytes compare as `unsigned char`, like `strncmp`, so the order doesn't depend on `char` signedness.
+ */
+inline static sz_bool_t sz_is_less_ascii(sz_string_ptr_t const a,
+                                         sz_size_t const a_length,
+                                         sz_string_ptr_t const b,
+                                         sz_size_t const b_length) {
+
+    sz_size_t min_length = (a_length < b_length) ? a_length : b_length;
+    for (sz_size_t i = 0; i < min_length; ++i) {
+        unsigned char const a_byte = (unsigned char)a[i], b_byte = (unsigned char)b[i];
+        if (a_byte != b_byte) return a_byte < b_byte; // The first differing byte decides the order.
+    }
+    return a_length < b_length; // Equal up to `min_length`: the shorter string is the smaller one.
+}
+
+/**
+ *  @brief  Byte-level lexicographic "less than" for two strings, after lowering every byte with `sz_tolower_ascii`.
+ *          Doesn't provide major performance improvements, but helps avoid the LibC dependency.
+ */
+inline static sz_bool_t sz_is_less_uncased_ascii(sz_string_ptr_t const a,
+                                                 sz_size_t const a_length,
+                                                 sz_string_ptr_t const b,
+                                                 sz_size_t const b_length) {
+
+    sz_size_t min_length = (a_length < b_length) ? a_length : b_length;
+    for (sz_size_t i = 0; i < min_length; ++i) {
+        // Lowered bytes compare as `unsigned char`, so the order doesn't depend on `char` signedness.
+        unsigned char const a_lower = (unsigned char)sz_tolower_ascii(a[i]);
+        unsigned char const b_lower = (unsigned char)sz_tolower_ascii(b[i]);
+        if (a_lower != b_lower) return a_lower < b_lower; // The first differing byte decides the order.
+    }
+    return a_length < b_length; // Equal up to `min_length`: the shorter string is the smaller one.
+}
+
+/**
+ *  @brief  Helper, that swaps two 64-bit integers representing the order of elements in the sequence.
+ */
+inline static void _sz_swap_order(sz_u64_t *a, sz_u64_t *b) {
+    sz_u64_t t = *a;
     *a = *b;
     *b = t;
 }
 
-typedef char const *(*sz_sequence_get_start_t)(void const *, sz_size_t);
-typedef sz_size_t (*sz_sequence_get_length_t)(void const *, sz_size_t);
-typedef int (*sz_sequence_predicate_t)(void const *, sz_size_t);
-typedef int (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t);
+struct sz_sequence_s;
 
-// Define a type for the comparison function, depending on the platform.
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || defined(__APPLE__)
-typedef int (*sz_qsort_comparison_func_t)(void *, void const *, void const *);
-#else
-typedef int (*sz_qsort_comparison_func_t)(void const *, void const *, void *);
-#endif
+typedef sz_string_ptr_t (*sz_sequence_member_start_t)(void const *, sz_size_t);
+typedef sz_size_t (*sz_sequence_member_length_t)(void const *, sz_size_t);
+typedef sz_bool_t (*sz_sequence_predicate_t)(void const *, sz_size_t);
+typedef sz_bool_t (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t);
+typedef sz_bool_t (*sz_string_is_less_t)(sz_string_ptr_t, sz_size_t, sz_string_ptr_t, sz_size_t);
 
-typedef struct sz_sequence_t {
-    sz_size_t *order;
+typedef struct sz_sequence_s {
+    sz_u64_t *order;
     sz_size_t count;
-    sz_sequence_get_start_t get_start;
-    sz_sequence_get_length_t get_length;
+    sz_sequence_member_start_t get_start;
+    sz_sequence_member_length_t get_length;
     void const *handle;
 } sz_sequence_t;
 
 /**
- *  @brief  Similar to `std::partition`, given a predicate splits the
- *          sequence into two parts.
+ *  @brief  Similar to `std::partition`, given a predicate splits the sequence into two parts.
+ *          The algorithm is unstable, meaning that elements may change relative order, as long
+ *          as they are in the right partition. This is the simpler algorithm for partitioning.
  */
 inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) {
 
@@ -615,14 +742,16 @@ inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predic
 
     for (sz_size_t i = matches + 1; i < sequence->count; ++i)
         if (predicate(sequence->handle, sequence->order[i]))
-            sz_swap(sequence->order + i, sequence->order + matches), ++matches;
+            _sz_swap_order(sequence->order + i, sequence->order + matches), ++matches;
 
     return matches;
 }
 
 /**
- *  @brief  Inplace `std::set_union` for two consecutive chunks forming
- *          the same continuous sequence.
+ *  @brief  Inplace `std::set_union` for two consecutive chunks forming the same continuous `sequence`.
+ *
+ *  @param partition The number of elements in the first sub-sequence in `sequence`.
+ *  @param less Comparison function, to determine the lexicographic ordering.
  */
 inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) {
 
@@ -642,10 +771,7 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq
 
             // Shift all the elements between element 1
             // element 2, right by 1.
-            while (index != start_a) {
-                sequence->order[index] = sequence->order[index - 1];
-                index--;
-            }
+            while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; }
             sequence->order[start_a] = value;
 
             // Update all the pointers
@@ -656,112 +782,86 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq
     }
 }
 
+inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t comparator) {
+    sz_u64_t *keys = sequence->order;
+    sz_size_t keys_count = sequence->count;
+    for (sz_size_t i = 1; i < keys_count; i++) {
+        sz_u64_t i_key = keys[i];
+        // Move elements of arr[0..i-1] that are greater than key to one position ahead of their current position
+        sz_size_t j = i;
+        for (; j > 0 && comparator(sequence, keys[j - 1], i) != 0; --j) keys[j] = keys[j - 1];
+        keys[j] = i_key;
+    }
+}
+
+/**
+ *  @brief  Internal Radix sorting procedure.
+ */
 inline static void _sz_sort_recursion( //
     sz_sequence_t *sequence,
     sz_size_t bit_idx,
     sz_size_t bit_max,
-    sz_qsort_comparison_func_t qsort_comparator) {
+    sz_sequence_comparator_t comparator,
+    sz_size_t partial_order_length) {
 
     if (!sequence->count) return;
 
     // Partition a range of integers according to a specific bit value
     sz_size_t split = 0;
     {
-        sz_size_t mask = (1ul << 63) >> bit_idx;
+        sz_u64_t mask = (1ul << 63) >> bit_idx;
         while (split != sequence->count && !(sequence->order[split] & mask)) ++split;
         for (sz_size_t i = split + 1; i < sequence->count; ++i)
-            if (!(sequence->order[i] & mask)) sz_swap(sequence->order + i, sequence->order + split), ++split;
+            if (!(sequence->order[i] & mask)) _sz_swap_order(sequence->order + i, sequence->order + split), ++split;
     }
 
     // Go down recursively
     if (bit_idx < bit_max) {
         sz_sequence_t a = *sequence;
         a.count = split;
-        _sz_sort_recursion(&a, bit_idx + 1, bit_max, qsort_comparator);
+        _sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length);
 
         sz_sequence_t b = *sequence;
         b.order += split;
         b.count -= split;
-        _sz_sort_recursion(&b, bit_idx + 1, bit_max, qsort_comparator);
+        _sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length);
     }
     // Reached the end of recursion
     else {
         // Discard the prefixes
-        for (sz_size_t i = 0; i != sequence->count; ++i) { memset((char *)(&sequence->order[i]) + 4, 0, 4ul); }
-
-        // Perform sorts on smaller chunks instead of the whole handle
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
-        // https://stackoverflow.com/a/39561369
-        // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/qsort-s?view=msvc-170
-        qsort_s(sequence->order, split, sizeof(sz_size_t), qsort_comparator, (void *)sequence);
-        qsort_s(sequence->order + split,
-                sequence->count - split,
-                sizeof(sz_size_t),
-                qsort_comparator,
-                (void *)sequence);
-#elif __APPLE__
-        qsort_r(sequence->order, split, sizeof(sz_size_t), (void *)sequence, qsort_comparator);
-        qsort_r(sequence->order + split,
-                sequence->count - split,
-                sizeof(sz_size_t),
-                (void *)sequence,
-                qsort_comparator);
-#else
-        // https://linux.die.net/man/3/qsort_r
-        qsort_r(sequence->order, split, sizeof(sz_size_t), qsort_comparator, (void *)sequence);
-        qsort_r(sequence->order + split,
-                sequence->count - split,
-                sizeof(sz_size_t),
-                qsort_comparator,
-                (void *)sequence);
-#endif
+        sz_u32_t *order_half_words = (sz_u32_t *)sequence->order;
+        for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; }
+
+        sz_sequence_t a = *sequence;
+        a.count = split;
+        sz_sort_insertion(&a, comparator);
+
+        sz_sequence_t b = *sequence;
+        b.order += split;
+        b.count -= split;
+        sz_sort_insertion(&b, comparator);
     }
 }
 
-inline static int _sz_sort_sequence_strncmp(
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__
-    void *sequence_raw, void const *a_raw, void const *b_raw
-#else
-    void const *a_raw, void const *b_raw, void *sequence_raw
-#endif
-) {
-    // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1
-    // https://www.man7.org/linux/man-pages/man3/strcmp.3.html
-    sz_sequence_t *sequence = (sz_sequence_t *)sequence_raw;
-    sz_size_t a = *(sz_size_t *)a_raw;
-    sz_size_t b = *(sz_size_t *)b_raw;
-    sz_size_t a_len = sequence->get_length(sequence->handle, a);
-    sz_size_t b_len = sequence->get_length(sequence->handle, b);
-    int res = strncmp( //
-        sequence->get_start(sequence->handle, a),
-        sequence->get_start(sequence->handle, b),
-        a_len > b_len ? b_len : a_len);
-    return res ? res : a_len - b_len;
+inline static sz_bool_t _sz_sort_compare_less_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
+    sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key);
+    sz_size_t i_len = sequence->get_length(sequence->handle, i_key);
+    sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key);
+    sz_size_t j_len = sequence->get_length(sequence->handle, j_key);
+    return sz_is_less_ascii(i_str, i_len, j_str, j_len);
 }
 
-inline static int _sz_sort_sequence_strncasecmp(
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__
-    void *sequence_raw, void const *a_raw, void const *b_raw
-#else
-    void const *a_raw, void const *b_raw, void *sequence_raw
-#endif
-) {
-    // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1
-    // https://www.man7.org/linux/man-pages/man3/strcmp.3.html
-    sz_sequence_t *sequence = (sz_sequence_t *)sequence_raw;
-    sz_size_t a = *(sz_size_t *)a_raw;
-    sz_size_t b = *(sz_size_t *)b_raw;
-    sz_size_t a_len = sequence->get_length(sequence->handle, a);
-    sz_size_t b_len = sequence->get_length(sequence->handle, b);
-    int res = strncasecmp( //
-        sequence->get_start(sequence->handle, a),
-        sequence->get_start(sequence->handle, b),
-        a_len > b_len ? b_len : a_len);
-    return res ? res : a_len - b_len;
+inline static sz_bool_t _sz_sort_compare_less_uncased_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
+    sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key);
+    sz_size_t i_len = sequence->get_length(sequence->handle, i_key);
+    sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key);
+    sz_size_t j_len = sequence->get_length(sequence->handle, j_key);
+    return sz_is_less_uncased_ascii(i_str, i_len, j_str, j_len);
 }
 
 typedef struct sz_sort_config_t {
-    int case_insensitive;
+    sz_bool_t case_insensitive;
+    sz_size_t partial_order_length;
 } sz_sort_config_t;
 
 /**
@@ -770,11 +870,13 @@ typedef struct sz_sort_config_t {
  */
 inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *config) {
 
-    int case_insensitive = config && config->case_insensitive;
+    sz_bool_t case_insensitive = config && config->case_insensitive;
+    sz_size_t partial_order_length =
+        config && config->partial_order_length ? config->partial_order_length : sequence->count;
 
     // Export up to 4 bytes into the `sequence` bits themselves
     for (sz_size_t i = 0; i != sequence->count; ++i) {
-        char const *begin = sequence->get_start(sequence->handle, sequence->order[i]);
+        sz_string_ptr_t begin = sequence->get_start(sequence->handle, sequence->order[i]);
         sz_size_t length = sequence->get_length(sequence->handle, sequence->order[i]);
         length = length > 4ul ? 4ul : length;
         char *prefix = (char *)&sequence->order[i];
@@ -787,11 +889,11 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
         }
     }
 
-    sz_qsort_comparison_func_t comparator = _sz_sort_sequence_strncmp;
-    if (case_insensitive) comparator = _sz_sort_sequence_strncasecmp;
+    sz_sequence_comparator_t comparator = (sz_sequence_comparator_t)_sz_sort_compare_less_ascii;
+    if (case_insensitive) comparator = (sz_sequence_comparator_t)_sz_sort_compare_less_uncased_ascii;
 
     // Perform optionally-parallel radix sort on them
-    _sz_sort_recursion(sequence, 0, 32, comparator);
+    _sz_sort_recursion(sequence, 0, 32, comparator, partial_order_length);
 }
 
 typedef unsigned char levenstein_distance_t;
@@ -806,9 +908,9 @@ inline static sz_size_t sz_levenstein_memory_needed(sz_size_t _, sz_size_t b_len
  *  @brief  Auxiliary function, that computes the minimum of three values.
  */
 inline static levenstein_distance_t _sz_levenstein_minimum( //
-    levenstein_distance_t a,
-    levenstein_distance_t b,
-    levenstein_distance_t c) {
+    levenstein_distance_t const a,
+    levenstein_distance_t const b,
+    levenstein_distance_t const c) {
 
     return (a < b ? (a < c ? a : c) : (b < c ? b : c));
 }
@@ -818,11 +920,11 @@ inline static levenstein_distance_t _sz_levenstein_minimum( //
  *          It accepts an upper bound on the possible error. Quadratic complexity in time, linear in space.
  */
 inline static levenstein_distance_t sz_levenstein( //
-    char const *a,
-    sz_size_t a_length,
-    char const *b,
-    sz_size_t b_length,
-    levenstein_distance_t bound,
+    sz_string_ptr_t const a,
+    sz_size_t const a_length,
+    sz_string_ptr_t const b,
+    sz_size_t const b_length,
+    levenstein_distance_t const bound,
     void *buffer) {
 
     // If one of the strings is empty - the edit distance is equal to the length of the other one
@@ -873,11 +975,11 @@ inline static levenstein_distance_t sz_levenstein( //
 /**
  *  @brief  Hashes provided string using hardware-accelerated CRC32 instructions.
  */
-inline static sz_u32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_native(sz_string_ptr_t start, sz_size_t length) { return 0; }
 
-inline static sz_u32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_neon(sz_string_ptr_t start, sz_size_t length) { return 0; }
 
-inline static sz_u32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_sse(sz_string_ptr_t start, sz_size_t length) { return 0; }
 
 #ifdef __cplusplus
 }
@@ -889,5 +991,6 @@ inline static sz_u32_t sz_hash_crc32_sse(char const *start, sz_size_t length) {
 #endif
 #undef popcount64
 #undef ctz64
+#undef clz64
 
 #endif // STRINGZILLA_H_

From 644630b852ad43be6ba092c3091b458446688c4c Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 14:38:44 -0700
Subject: [PATCH 5/8] Improve: Intro-sort

---
 stringzilla/stringzilla.h | 167 ++++++++++++++++++++++++++++++++------
 1 file changed, 142 insertions(+), 25 deletions(-)

diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 0aa8774b..84e864cf 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -714,15 +714,15 @@ inline static void _sz_swap_order(sz_u64_t *a, sz_u64_t *b) {
     *b = t;
 }
 
-struct sz_sequence_s;
+struct sz_sequence_t;
 
-typedef sz_string_ptr_t (*sz_sequence_member_start_t)(void const *, sz_size_t);
-typedef sz_size_t (*sz_sequence_member_length_t)(void const *, sz_size_t);
-typedef sz_bool_t (*sz_sequence_predicate_t)(void const *, sz_size_t);
-typedef sz_bool_t (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t);
+typedef sz_string_ptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t);
+typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t);
+typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t);
+typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t);
 typedef sz_bool_t (*sz_string_is_less_t)(sz_string_ptr_t, sz_size_t, sz_string_ptr_t, sz_size_t);
 
-typedef struct sz_sequence_s {
+typedef struct sz_sequence_t {
     sz_u64_t *order;
     sz_size_t count;
     sz_sequence_member_start_t get_start;
@@ -738,10 +738,10 @@ typedef struct sz_sequence_s {
 inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) {
 
     sz_size_t matches = 0;
-    while (matches != sequence->count && predicate(sequence->handle, sequence->order[matches])) ++matches;
+    while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches;
 
     for (sz_size_t i = matches + 1; i < sequence->count; ++i)
-        if (predicate(sequence->handle, sequence->order[i]))
+        if (predicate(sequence, sequence->order[i]))
             _sz_swap_order(sequence->order + i, sequence->order + matches), ++matches;
 
     return matches;
@@ -758,13 +758,13 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq
     sz_size_t start_b = partition + 1;
 
     // If the direct merge is already sorted
-    if (!less(sequence->handle, sequence->order[start_b], sequence->order[partition])) return;
+    if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return;
 
     sz_size_t start_a = 0;
     while (start_a <= partition && start_b <= sequence->count) {
 
         // If element 1 is in right place
-        if (!less(sequence->handle, sequence->order[start_b], sequence->order[start_a])) { start_a++; }
+        if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; }
         else {
             sz_size_t value = sequence->order[start_b];
             sz_size_t index = start_b;
@@ -782,18 +782,135 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq
     }
 }
 
-inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t comparator) {
+inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) {
     sz_u64_t *keys = sequence->order;
     sz_size_t keys_count = sequence->count;
     for (sz_size_t i = 1; i < keys_count; i++) {
         sz_u64_t i_key = keys[i];
-        // Move elements of arr[0..i-1] that are greater than key to one position ahead of their current position
         sz_size_t j = i;
-        for (; j > 0 && comparator(sequence, keys[j - 1], i) != 0; --j) keys[j] = keys[j - 1];
+        for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1];
         keys[j] = i_key;
     }
 }
 
+// Utility functions
+inline static sz_size_t _sz_log2i(sz_size_t n) {
+    sz_size_t log2 = 0;
+    while (n >>= 1) ++log2;
+    return log2;
+}
+
+inline static void _sz_sift_down(
+    sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, sz_size_t end) {
+    sz_size_t root = start;
+    while (2 * root + 1 <= end) {
+        sz_size_t child = 2 * root + 1;
+        if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; }
+        if (!less(sequence, order[root], order[child])) { return; }
+        _sz_swap_order(order + root, order + child);
+        root = child;
+    }
+}
+
+inline static void _sz_heapify(sz_sequence_t *sequence,
+                               sz_sequence_comparator_t less,
+                               sz_u64_t *order,
+                               sz_size_t count) {
+    sz_size_t start = (count - 2) / 2;
+    while (1) {
+        _sz_sift_down(sequence, less, order, start, count - 1);
+        if (start == 0) return;
+        start--;
+    }
+}
+
+inline static void _sz_heapsort(sz_sequence_t *sequence,
+                                sz_sequence_comparator_t less,
+                                sz_size_t first,
+                                sz_size_t last) {
+    sz_u64_t *order = sequence->order;
+    sz_size_t count = last - first;
+    _sz_heapify(sequence, less, order + first, count);
+    sz_size_t end = count - 1;
+    while (end > 0) {
+        _sz_swap_order(order + first, order + first + end);
+        end--;
+        _sz_sift_down(sequence, less, order + first, 0, end);
+    }
+}
+
+inline static void _sz_introsort(
+    sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last, sz_size_t depth) {
+
+    sz_size_t length = last - first;
+    switch (length) {
+    case 0:
+    case 1: return;
+    case 2:
+        if (less(sequence, sequence->order[first + 1], sequence->order[first]))
+            _sz_swap_order(&sequence->order[first], &sequence->order[first + 1]);
+        return;
+    case 3:
+        sz_u64_t a = sequence->order[first];
+        sz_u64_t b = sequence->order[first + 1];
+        sz_u64_t c = sequence->order[first + 2];
+        if (less(sequence, b, a)) _sz_swap_order(&a, &b);
+        if (less(sequence, c, b)) _sz_swap_order(&c, &b);
+        if (less(sequence, b, a)) _sz_swap_order(&a, &b);
+        sequence->order[first] = a;
+        sequence->order[first + 1] = b;
+        sequence->order[first + 2] = c;
+        return;
+    }
+    // Until a certain length, the quadratic-complexity insertion-sort is fine
+    if (length <= 16) {
+        sz_sequence_t sub_seq = *sequence;
+        sub_seq.order += first;
+        sub_seq.count = length;
+        sz_sort_insertion(&sub_seq, less);
+        return;
+    }
+
+    // Fallback to N-logN-complexity heap-sort
+    if (depth == 0) {
+        _sz_heapsort(sequence, less, first, last);
+        return;
+    }
+
+    --depth;
+
+    // Median-of-three logic to choose pivot
+    sz_size_t median = first + length / 2;
+    if (less(sequence, sequence->order[median], sequence->order[first]))
+        _sz_swap_order(&sequence->order[first], &sequence->order[median]);
+    if (less(sequence, sequence->order[last - 1], sequence->order[first]))
+        _sz_swap_order(&sequence->order[first], &sequence->order[last - 1]);
+    if (less(sequence, sequence->order[median], sequence->order[last - 1]))
+        _sz_swap_order(&sequence->order[median], &sequence->order[last - 1]);
+
+    // Partition using the median-of-three as the pivot
+    sz_u64_t pivot = sequence->order[median];
+    sz_size_t left = first;
+    sz_size_t right = last - 1;
+    while (true) {
+        while (less(sequence, sequence->order[left], pivot)) left++;
+        while (less(sequence, pivot, sequence->order[right])) right--;
+        if (left >= right) break;
+        _sz_swap_order(&sequence->order[left], &sequence->order[right]);
+        left++;
+        right--;
+    }
+
+    // Recursively sort the partitions
+    _sz_introsort(sequence, less, first, left, depth);
+    _sz_introsort(sequence, less, right + 1, last, depth);
+}
+
+inline static void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) {
+    sz_size_t depth_limit = 2 * _sz_log2i(sequence->count);
+    _sz_introsort(sequence, less, 0, sequence->count, depth_limit);
+}
+
 /**
  *  @brief  Internal Radix sorting procedure.
  */
@@ -834,28 +951,28 @@ inline static void _sz_sort_recursion( //
 
         sz_sequence_t a = *sequence;
         a.count = split;
-        sz_sort_insertion(&a, comparator);
+        sz_sort_introsort(&a, comparator);
 
         sz_sequence_t b = *sequence;
         b.order += split;
         b.count -= split;
-        sz_sort_insertion(&b, comparator);
+        sz_sort_introsort(&b, comparator);
     }
 }
 
 inline static sz_bool_t _sz_sort_compare_less_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
-    sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key);
-    sz_size_t i_len = sequence->get_length(sequence->handle, i_key);
-    sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key);
-    sz_size_t j_len = sequence->get_length(sequence->handle, j_key);
+    sz_string_ptr_t i_str = sequence->get_start(sequence, i_key);
+    sz_size_t i_len = sequence->get_length(sequence, i_key);
+    sz_string_ptr_t j_str = sequence->get_start(sequence, j_key);
+    sz_size_t j_len = sequence->get_length(sequence, j_key);
     return sz_is_less_ascii(i_str, i_len, j_str, j_len);
 }
 
 inline static sz_bool_t _sz_sort_compare_less_uncased_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
-    sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key);
-    sz_size_t i_len = sequence->get_length(sequence->handle, i_key);
-    sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key);
-    sz_size_t j_len = sequence->get_length(sequence->handle, j_key);
+    sz_string_ptr_t i_str = sequence->get_start(sequence, i_key);
+    sz_size_t i_len = sequence->get_length(sequence, i_key);
+    sz_string_ptr_t j_str = sequence->get_start(sequence, j_key);
+    sz_size_t j_len = sequence->get_length(sequence, j_key);
     return sz_is_less_uncased_ascii(i_str, i_len, j_str, j_len);
 }
 
@@ -876,8 +993,8 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
 
     // Export up to 4 bytes into the `sequence` bits themselves
     for (sz_size_t i = 0; i != sequence->count; ++i) {
-        sz_string_ptr_t begin = sequence->get_start(sequence->handle, sequence->order[i]);
-        sz_size_t length = sequence->get_length(sequence->handle, sequence->order[i]);
+        sz_string_ptr_t begin = sequence->get_start(sequence, sequence->order[i]);
+        sz_size_t length = sequence->get_length(sequence, sequence->order[i]);
         length = length > 4ul ? 4ul : length;
         char *prefix = (char *)&sequence->order[i];
         for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j];

From 05a409ce7ab1f76582c7936d2a4d2d6c99e7b3ed Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 16:30:38 -0700
Subject: [PATCH 6/8] Refactor: New C API for JS

---
 javascript/lib.c          | 62 +++++++++++++-------------
 javascript/test/find.js   | 14 +++---
 scripts/test.c            | 13 +++---
 scripts/test.cpp          | 92 +++++++++++++++++++++------------------
 stringzilla/stringzilla.h |  9 ++--
 5 files changed, 97 insertions(+), 93 deletions(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index fe1f5f68..18e36a1b 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -8,17 +8,18 @@
  *  @see        NodeJS docs: https://nodejs.org/api/n-api.html
  */
 
-#include <node_api.h>
-#include <stringzilla.h>
+#include <node_api.h>    // `napi_*` functions
+#include <stdlib.h>      // `malloc`
+#include <stringzilla.h> // `sz_*` functions
 
-napi_value FindAPI(napi_env env, napi_callback_info info) {
+napi_value indexOfAPI(napi_env env, napi_callback_info info) {
     size_t argc = 2;
     napi_value args[2];
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    sz_haystack_t haystack_sz = {NULL, 0};
-    sz_needle_t needle_sz = {NULL, 0, 0};
+    sz_string_view_t haystack_sz = {NULL, 0};
+    sz_string_view_t needle_sz = {NULL, 0};
 
     // For haystack
     napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length);
@@ -38,37 +39,32 @@ napi_value FindAPI(napi_env env, napi_callback_info info) {
                                needle_sz.length + 1,
                                (size_t *)&needle_sz.length);
 
-    // Perform the find operation
-    sz_size_t result = sz_find_substr(haystack_sz, needle_sz);
-
-    // Cleanup
-    free((void *)haystack_sz.start);
-    free((void *)needle_sz.start);
-
     // Convert the result to JavaScript BigInt and return
     napi_value js_result;
+    if (needle_sz.length == 0) { napi_create_bigint_int64(env, 0, &js_result); }
+    else {
+        sz_string_ptr_t result =
+            sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length);
 
-    // In JavaScript, if `find` is unable to find the specified value, then it should return -1
-    if (result == haystack_sz.length) napi_create_bigint_int64(env, -1, &js_result);
-    else
-        napi_create_bigint_uint64(env, result, &js_result);
+        // In JavaScript, if `indexOf` is unable to indexOf the specified value, then it should return -1
+        if (result == NULL) { napi_create_bigint_int64(env, -1, &js_result); }
+        else { napi_create_bigint_uint64(env, result - haystack_sz.start, &js_result); }
+    }
 
+    // Cleanup
+    free((void *)haystack_sz.start);
+    free((void *)needle_sz.start);
     return js_result;
 }
 
-size_t count_char(sz_haystack_t haystack_sz, char needle) {
-    size_t result = sz_count_char(haystack_sz, needle);
-    return result;
-}
-
-napi_value CountAPI(napi_env env, napi_callback_info info) {
+napi_value countAPI(napi_env env, napi_callback_info info) {
     size_t argc = 3;
     napi_value args[3];
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    sz_haystack_t haystack_sz = {NULL, 0};
-    sz_needle_t needle_sz = {NULL, 0, 0};
+    sz_string_view_t haystack_sz = {NULL, 0};
+    sz_string_view_t needle_sz = {NULL, 0};
 
     // For haystack
     napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length);
@@ -95,11 +91,13 @@ napi_value CountAPI(napi_env env, napi_callback_info info) {
 
     size_t count = 0;
     if (needle_sz.length == 0 || haystack_sz.length == 0 || haystack_sz.length < needle_sz.length) { count = 0; }
-    else if (needle_sz.length == 1) { count = count_char(haystack_sz, needle_sz.start[0]); }
+    else if (needle_sz.length == 1) { count = sz_count_char(haystack_sz.start, haystack_sz.length, needle_sz.start); }
     else if (overlap) {
         while (haystack_sz.length) {
-            sz_size_t offset = sz_find_substr(haystack_sz, needle_sz);
-            int found = offset != haystack_sz.length;
+            sz_string_ptr_t ptr =
+                sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length);
+            sz_bool_t found = ptr != NULL;
+            sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length;
             count += found;
             haystack_sz.start += offset + found;
             haystack_sz.length -= offset + found;
@@ -107,8 +105,10 @@ napi_value CountAPI(napi_env env, napi_callback_info info) {
     }
     else {
         while (haystack_sz.length) {
-            sz_size_t offset = sz_find_substr(haystack_sz, needle_sz);
-            int found = offset != haystack_sz.length;
+            sz_string_ptr_t ptr =
+                sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length);
+            sz_bool_t found = ptr != NULL;
+            sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length;
             count += found;
             haystack_sz.start += offset + needle_sz.length;
             haystack_sz.length -= offset + needle_sz.length * found;
@@ -129,8 +129,8 @@ napi_value CountAPI(napi_env env, napi_callback_info info) {
 napi_value Init(napi_env env, napi_value exports) {
 
     // Define an array of property descriptors
-    napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0};
-    napi_property_descriptor countDesc = {"count", 0, CountAPI, 0, 0, 0, napi_default, 0};
+    napi_property_descriptor findDesc = {"indexOf", 0, indexOfAPI, 0, 0, 0, napi_default, 0};
+    napi_property_descriptor countDesc = {"count", 0, countAPI, 0, 0, 0, napi_default, 0};
     napi_property_descriptor properties[] = {findDesc, countDesc};
 
     // Define the properties on the `exports` object
diff --git a/javascript/test/find.js b/javascript/test/find.js
index cd2a800d..9fe4e5b7 100644
--- a/javascript/test/find.js
+++ b/javascript/test/find.js
@@ -5,26 +5,26 @@ import assert from 'node:assert';
 const stringzilla = bindings('stringzilla');
 
 test('Find Word in Text - Positive Case', () => {
-    const result = stringzilla.find('hello world, hello john', 'hello');
+    const result = stringzilla.indexOf('hello world, hello john', 'hello');
 
     assert.strictEqual(result, 0n);
 });
 
 test('Find Word in Text - Negative Case (Word Not Found)', () => {
-    const result_1 = stringzilla.find('ha', 'aaa');
+    const result_1 = stringzilla.indexOf('ha', 'aaa');
     assert.strictEqual(result_1, -1n);
 
-    const result_2 = stringzilla.find('g', 'a');
+    const result_2 = stringzilla.indexOf('g', 'a');
     assert.strictEqual(result_2, -1n);
 });
 
 test('Find Word in Text - Negative Case (Empty String Inputs)', () => {
-    const result_1 = stringzilla.find('hello world', '');
+    const result_1 = stringzilla.indexOf('hello world', '');
     assert.strictEqual(result_1, 0n);
 
-    const result_2 = stringzilla.find('', 'a');
+    const result_2 = stringzilla.indexOf('', 'a');
     assert.strictEqual(result_2, -1n);
 
-    const result_3 = stringzilla.find('', '');
-    assert.strictEqual(result_2, -1n);
+    const result_3 = stringzilla.indexOf('', '');
+    assert.strictEqual(result_3, 0n);
 });
diff --git a/scripts/test.c b/scripts/test.c
index a921e76d..127975b0 100644
--- a/scripts/test.c
+++ b/scripts/test.c
@@ -27,24 +27,23 @@ void test_sz_find_substr() {
         for (int variability = 1; variability < VARIABILITY; variability++) {
             populate_random_string(buffer, length, variability);
 
-            struct sz_haystack_t haystack;
+            sz_string_view_t haystack;
             haystack.start = buffer;
             haystack.length = length;
 
             int pattern_length = rand() % 5 + 1;
             populate_random_string(pattern, pattern_length, variability);
 
-            struct sz_needle_t needle;
+            sz_string_view_t needle;
             needle.start = pattern;
             needle.length = pattern_length;
 
             // Comparing the result of your function with the standard library function.
-            const char *result_libc = strstr(buffer, pattern);
-            uint64_t result_stringzilla = sz_find_substr(haystack, needle);
+            sz_string_ptr_t result_libc = strstr(buffer, pattern);
+            sz_string_ptr_t result_stringzilla =
+                sz_find_substr(haystack.start, haystack.length, needle.start, needle.length);
 
-            assert(((result_libc && result_stringzilla == (uint64_t)(result_libc - buffer)) ||
-                    (!result_libc && result_stringzilla == (uint64_t)-1)) &&
-                   "Test failed for sz_find_substr");
+            assert((result_libc == result_stringzilla) && "Test failed for sz_find_substr");
         }
     }
 }
diff --git a/scripts/test.cpp b/scripts/test.cpp
index ddef4e82..8dc1a4d2 100644
--- a/scripts/test.cpp
+++ b/scripts/test.cpp
@@ -1,39 +1,39 @@
-#include <cstring>
+#include <algorithm>
 #include <chrono>
-#include <iostream>
+#include <cstring>
 #include <fstream>
-#include <vector>
-#include <string>
-#include <numeric>
+#include <iostream>
 #include <limits>
-#include <algorithm>
+#include <numeric>
+#include <string>
 #include <strstream>
+#include <vector>
 
 #include <stringzilla.h>
 
 using strings_t = std::vector<std::string>;
 using idx_t = sz_size_t;
-using permute_t = std::vector<idx_t>;
+using permute_t = std::vector<sz_u64_t>;
 
 #pragma region - C callbacks
 
-static char const *get_start(void const *array_c, sz_size_t i) {
-    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c);
+static char const *get_start(sz_sequence_t const *array_c, sz_size_t i) {
+    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c->handle);
     return array[i].c_str();
 }
 
-static sz_size_t get_length(void const *array_c, sz_size_t i) {
-    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c);
+static sz_size_t get_length(sz_sequence_t const *array_c, sz_size_t i) {
+    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c->handle);
     return array[i].size();
 }
 
-static int is_less(void const *array_c, sz_size_t i, sz_size_t j) {
-    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c);
+static int is_less(sz_sequence_t const *array_c, sz_size_t i, sz_size_t j) {
+    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c->handle);
     return array[i] < array[j];
 }
 
-static int has_under_four_chars(void const *array_c, sz_size_t i) {
-    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c);
+static int has_under_four_chars(sz_sequence_t const *array_c, sz_size_t i) {
+    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c->handle);
     return array[i].size() < 4;
 }
 
@@ -64,7 +64,7 @@ void populate_with_test(strings_t &strings) {
 
 constexpr size_t offset_in_word = 0;
 
-inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) {
+inline static idx_t hybrid_sort_cpp(strings_t const &strings, sz_u64_t *order) {
 
     // What if we take up-to 4 first characters and the index
     for (size_t i = 0; i != strings.size(); ++i)
@@ -72,7 +72,7 @@ inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) {
                     strings[order[i]].c_str(),
                     std::min(strings[order[i]].size(), 4ul));
 
-    std::sort(order, order + strings.size(), [&](idx_t i, idx_t j) {
+    std::sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) {
         char *i_bytes = (char *)&i;
         char *j_bytes = (char *)&j;
         return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word);
@@ -80,7 +80,7 @@ inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) {
 
     for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul);
 
-    std::sort(order, order + strings.size(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; });
+    std::sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { return strings[i] < strings[j]; });
 
     return strings.size();
 }
@@ -92,14 +92,14 @@ int hybrid_sort_c_compare_uint32_t(const void *a, const void *b) {
 }
 
 int hybrid_sort_c_compare_strings(void *arg, const void *a, const void *b) {
-    sz_sequence_t *seq = (sz_sequence_t *)arg;
+    sz_sequence_t *sequence = (sz_sequence_t *)arg;
     sz_size_t idx_a = *(sz_size_t *)a;
     sz_size_t idx_b = *(sz_size_t *)b;
 
-    const char *str_a = seq->get_start(seq->handle, idx_a);
-    const char *str_b = seq->get_start(seq->handle, idx_b);
-    sz_size_t len_a = seq->get_length(seq->handle, idx_a);
-    sz_size_t len_b = seq->get_length(seq->handle, idx_b);
+    const char *str_a = sequence->get_start(sequence, idx_a);
+    const char *str_b = sequence->get_start(sequence, idx_b);
+    sz_size_t len_a = sequence->get_length(sequence, idx_a);
+    sz_size_t len_b = sequence->get_length(sequence, idx_b);
 
     int res = strncmp(str_a, str_b, len_a < len_b ? len_a : len_b);
     return res ? res : (int)(len_a - len_b);
@@ -108,8 +108,8 @@ int hybrid_sort_c_compare_strings(void *arg, const void *a, const void *b) {
 sz_size_t hybrid_sort_c(sz_sequence_t *sequence) {
     // Copy up to 4 first characters into the 'order' array.
     for (sz_size_t i = 0; i < sequence->count; ++i) {
-        const char *str = sequence->get_start(sequence->handle, sequence->order[i]);
-        sz_size_t len = sequence->get_length(sequence->handle, sequence->order[i]);
+        const char *str = sequence->get_start(sequence, sequence->order[i]);
+        sz_size_t len = sequence->get_length(sequence, sequence->order[i]);
         len = len > 4 ? 4 : len;
         memcpy((char *)&sequence->order[i] + sizeof(sz_size_t) - 4, str, len);
     }
@@ -128,7 +128,7 @@ sz_size_t hybrid_sort_c(sz_sequence_t *sequence) {
     return sequence->count;
 }
 
-inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *order) {
+inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, sz_u64_t *order) {
 
     // What if we take up-to 4 first characters and the index
     for (size_t i = 0; i != strings.size(); ++i)
@@ -136,7 +136,7 @@ inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *orde
                     strings[order[i]].c_str(),
                     std::min(strings[order[i]].size(), 4ul));
 
-    std::stable_sort(order, order + strings.size(), [&](idx_t i, idx_t j) {
+    std::stable_sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) {
         char *i_bytes = (char *)&i;
         char *j_bytes = (char *)&j;
         return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word);
@@ -144,7 +144,7 @@ inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *orde
 
     for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul);
 
-    std::stable_sort(order, order + strings.size(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; });
+    std::stable_sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { return strings[i] < strings[j]; });
 
     return strings.size();
 }
@@ -209,7 +209,7 @@ int main(int, char const **) {
     std::printf("Hey, Ash!\n");
 
     strings_t strings;
-    populate_from_file("leipzig1M.txt", strings, 10000000);
+    populate_from_file("leipzig1M.txt", strings, 1000000);
     std::size_t mean_bytes = 0;
     for (std::string const &str : strings) mean_bytes += str.size();
     mean_bytes /= strings.size();
@@ -229,26 +229,23 @@ int main(int, char const **) {
     for (std::size_t needle_len = 1; needle_len <= 0; ++needle_len) {
         std::string needle(needle_len, '\4');
         std::printf("---- Needle length: %zu\n", needle_len);
-        bench_search("std::search", full_text, [&]() {
+        bench_search("std::search", full_text, [&]() mutable {
             return std::search(full_text.begin(), full_text.end(), needle.begin(), needle.end()) - full_text.begin();
         });
-        bench_search("sz_find_substr_swar", full_text, [&]() {
-            sz_haystack_t h {full_text.data(), full_text.size()};
-            sz_needle_t n {needle.data(), needle.size()};
-            return sz_find_substr_swar(h, n);
+        bench_search("sz_find_substr_swar", full_text, [&]() mutable {
+            sz_string_ptr_t ptr = sz_find_substr_swar(full_text.data(), full_text.size(), needle.data(), needle.size());
+            return ptr ? ptr - full_text.data() : full_text.size();
         });
 #if defined(__ARM_NEON)
-        bench_search("sz_find_substr_neon", full_text, [&]() {
-            sz_haystack_t h {full_text.data(), full_text.size()};
-            sz_needle_t n {needle.data(), needle.size()};
-            return sz_find_substr_neon(h, n);
+        bench_search("sz_find_substr_neon", full_text, [&]() mutable {
+            sz_string_ptr_t ptr = sz_find_substr_neon(full_text.data(), full_text.size(), needle.data(), needle.size());
+            return ptr ? ptr - full_text.data() : full_text.size();
         });
 #endif
 #if defined(__AVX2__)
-        bench_search("sz_find_substr_avx2", full_text, [&]() {
-            sz_haystack_t h {full_text.data(), full_text.size()};
-            sz_needle_t n {needle.data(), needle.size()};
-            return sz_find_substr_avx2(h, n);
+        bench_search("sz_find_substr_avx2", full_text, [&]() mutable {
+            sz_string_ptr_t ptr = sz_find_substr_avx2(full_text.data(), full_text.size(), needle.data(), needle.size());
+            return ptr ? ptr - full_text.data() : full_text.size();
         });
 #endif
     }
@@ -300,6 +297,17 @@ int main(int, char const **) {
         });
         expect_sorted(strings, permute_new);
 
+        bench_permute("sz_sort_introsort", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
+            sz_sequence_t array;
+            array.order = permute.data();
+            array.count = strings.size();
+            array.handle = &strings;
+            array.get_start = get_start;
+            array.get_length = get_length;
+            sz_sort_introsort(&array, (sz_sequence_comparator_t)_sz_sort_compare_less_ascii);
+        });
+        expect_sorted(strings, permute_new);
+
         bench_permute("hybrid_sort_c", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
             sz_sequence_t array;
             array.order = permute.data();
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 84e864cf..ba7f5f39 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -609,7 +609,7 @@ inline static sz_string_ptr_t sz_find_substr(sz_string_ptr_t const haystack,
                                              sz_size_t const haystack_length,
                                              sz_string_ptr_t const needle,
                                              sz_size_t const needle_length) {
-    if (haystack_length < needle_length) return NULL;
+    if (haystack_length < needle_length || needle_length == 0) return NULL;
 #if defined(__ARM_NEON)
     return sz_find_substr_neon(haystack, haystack_length, needle, needle_length);
 #elif defined(__AVX2__)
@@ -850,7 +850,7 @@ inline static void _sz_introsort(
         if (less(sequence, sequence->order[first + 1], sequence->order[first]))
             _sz_swap_order(&sequence->order[first], &sequence->order[first + 1]);
         return;
-    case 3:
+    case 3: {
         sz_u64_t a = sequence->order[first];
         sz_u64_t b = sequence->order[first + 1];
         sz_u64_t c = sequence->order[first + 2];
@@ -862,6 +862,7 @@ inline static void _sz_introsort(
         sequence->order[first + 2] = c;
         return;
     }
+    }
     // Until a certain length, the quadratic-complexity insertion-sort is fine
     if (length <= 16) {
         sz_sequence_t sub_seq = *sequence;
@@ -1102,10 +1103,6 @@ inline static sz_u32_t sz_hash_crc32_sse(sz_string_ptr_t start, sz_size_t length
 }
 #endif
 
-#ifdef _MSC_VER
-#undef strncasecmp
-#undef strcasecmp
-#endif
 #undef popcount64
 #undef ctz64
 #undef clz64

From cffae4a684437eafe3ed75299d2fb8c82baa1019 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 16:50:13 -0700
Subject: [PATCH 7/8] Refactor: Sync up Py and JS bindings

---
 javascript/lib.c          |  87 ++++++---------
 python/lib.c              | 183 +++++++++++++++---------------
 scripts/test.cpp          |  15 ++-
 stringzilla/stringzilla.h | 226 ++++++++++++++++++++------------------
 4 files changed, 253 insertions(+), 258 deletions(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index 18e36a1b..8ebe72eb 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -18,42 +18,33 @@ napi_value indexOfAPI(napi_env env, napi_callback_info info) {
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    sz_string_view_t haystack_sz = {NULL, 0};
-    sz_string_view_t needle_sz = {NULL, 0};
+    sz_string_view_t haystack = {NULL, 0};
+    sz_string_view_t needle = {NULL, 0};
 
     // For haystack
-    napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length);
-    haystack_sz.start = malloc(haystack_sz.length + 1);
-    napi_get_value_string_utf8(env,
-                               args[0],
-                               (char *)haystack_sz.start,
-                               haystack_sz.length + 1,
-                               (size_t *)&haystack_sz.length);
+    napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack.length);
+    haystack.start = malloc(haystack.length + 1);
+    napi_get_value_string_utf8(env, args[0], (char *)haystack.start, haystack.length + 1, (size_t *)&haystack.length);
 
     // For needle
-    napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle_sz.length);
-    needle_sz.start = malloc(needle_sz.length + 1);
-    napi_get_value_string_utf8(env,
-                               args[1],
-                               (char *)needle_sz.start,
-                               needle_sz.length + 1,
-                               (size_t *)&needle_sz.length);
+    napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle.length);
+    needle.start = malloc(needle.length + 1);
+    napi_get_value_string_utf8(env, args[1], (char *)needle.start, needle.length + 1, (size_t *)&needle.length);
 
     // Convert the result to JavaScript BigInt and return
     napi_value js_result;
-    if (needle_sz.length == 0) { napi_create_bigint_int64(env, 0, &js_result); }
+    if (needle.length == 0) { napi_create_bigint_int64(env, 0, &js_result); }
     else {
-        sz_string_ptr_t result =
-            sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length);
+        sz_string_start_t result = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
 
         // In JavaScript, if `indexOf` is unable to indexOf the specified value, then it should return -1
         if (result == NULL) { napi_create_bigint_int64(env, -1, &js_result); }
-        else { napi_create_bigint_uint64(env, result - haystack_sz.start, &js_result); }
+        else { napi_create_bigint_uint64(env, result - haystack.start, &js_result); }
     }
 
     // Cleanup
-    free((void *)haystack_sz.start);
-    free((void *)needle_sz.start);
+    free((void *)haystack.start);
+    free((void *)needle.start);
     return js_result;
 }
 
@@ -63,55 +54,45 @@ napi_value countAPI(napi_env env, napi_callback_info info) {
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    sz_string_view_t haystack_sz = {NULL, 0};
-    sz_string_view_t needle_sz = {NULL, 0};
+    sz_string_view_t haystack = {NULL, 0};
+    sz_string_view_t needle = {NULL, 0};
 
     // For haystack
-    napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length);
-    haystack_sz.start = malloc(haystack_sz.length + 1);
-    napi_get_value_string_utf8(env,
-                               args[0],
-                               (char *)haystack_sz.start,
-                               haystack_sz.length + 1,
-                               (size_t *)&haystack_sz.length);
+    napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack.length);
+    haystack.start = malloc(haystack.length + 1);
+    napi_get_value_string_utf8(env, args[0], (char *)haystack.start, haystack.length + 1, (size_t *)&haystack.length);
 
     // For needle
-    napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle_sz.length);
-    needle_sz.start = malloc(needle_sz.length + 1);
-    napi_get_value_string_utf8(env,
-                               args[1],
-                               (char *)needle_sz.start,
-                               needle_sz.length + 1,
-                               (size_t *)&needle_sz.length);
+    napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle.length);
+    needle.start = malloc(needle.length + 1);
+    napi_get_value_string_utf8(env, args[1], (char *)needle.start, needle.length + 1, (size_t *)&needle.length);
 
     bool overlap = false;
     if (argc > 2) { napi_get_value_bool(env, args[2], &overlap); }
 
-    void const *haystack_start = haystack_sz.start, *needle_start = needle_sz.start;
+    void const *haystack_start = haystack.start, *needle_start = needle.start;
 
     size_t count = 0;
-    if (needle_sz.length == 0 || haystack_sz.length == 0 || haystack_sz.length < needle_sz.length) { count = 0; }
-    else if (needle_sz.length == 1) { count = sz_count_char(haystack_sz.start, haystack_sz.length, needle_sz.start); }
+    if (needle.length == 0 || haystack.length == 0 || haystack.length < needle.length) { count = 0; }
+    else if (needle.length == 1) { count = sz_count_char(haystack.start, haystack.length, needle.start); }
     else if (overlap) {
-        while (haystack_sz.length) {
-            sz_string_ptr_t ptr =
-                sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length);
+        while (haystack.length) {
+            sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
             sz_bool_t found = ptr != NULL;
-            sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length;
+            sz_size_t offset = found ? ptr - haystack.start : haystack.length;
             count += found;
-            haystack_sz.start += offset + found;
-            haystack_sz.length -= offset + found;
+            haystack.start += offset + found;
+            haystack.length -= offset + found;
         }
     }
     else {
-        while (haystack_sz.length) {
-            sz_string_ptr_t ptr =
-                sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length);
+        while (haystack.length) {
+            sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
             sz_bool_t found = ptr != NULL;
-            sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length;
+            sz_size_t offset = found ? ptr - haystack.start : haystack.length;
             count += found;
-            haystack_sz.start += offset + needle_sz.length;
-            haystack_sz.length -= offset + needle_sz.length * found;
+            haystack.start += offset + needle.length;
+            haystack.length -= offset + needle.length * found;
         }
     }
 
diff --git a/python/lib.c b/python/lib.c
index a0f6caca..c0ad69d4 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -38,10 +38,7 @@ static PyTypeObject FileType;
 static PyTypeObject StrType;
 static PyTypeObject StrsType;
 
-static struct {
-    void *start;
-    size_t length;
-} temporary_memory = {NULL, 0};
+static sz_string_view_t temporary_memory = {NULL, 0};
 
 /**
  *  @brief  Describes an on-disk file mapped into RAM, which is different from Python's
@@ -55,8 +52,8 @@ typedef struct {
 #else
         int file_descriptor;
 #endif
-    void *start;
-    size_t length;
+    sz_string_start_t start;
+    sz_size_t length;
 } File;
 
 /**
@@ -73,8 +70,8 @@ typedef struct {
  */
 typedef struct {
     PyObject_HEAD PyObject *parent;
-    char const *start;
-    size_t length;
+    sz_string_start_t start;
+    sz_size_t length;
 } Str;
 
 /**
@@ -133,7 +130,7 @@ typedef struct {
         struct reordered_slices_t {
             size_t count;
             PyObject *parent;
-            sz_haystack_t *parts;
+            sz_string_view_t *parts;
         } reordered;
 
     } data;
@@ -144,10 +141,13 @@ typedef struct {
 
 #pragma region Helpers
 
-typedef int boolean_t;
+inline static sz_string_start_t haystacks_get_start(sz_sequence_t *seq, sz_size_t i) {
+    return ((sz_string_view_t const *)seq->handle)[i].start;
+}
 
-inline static char const *haystacks_get_start(sz_haystack_t const *parts, sz_size_t i) { return parts[i].start; }
-inline static size_t haystacks_get_length(sz_haystack_t const *parts, sz_size_t i) { return parts[i].length; }
+inline static sz_size_t haystacks_get_length(sz_sequence_t *seq, sz_size_t i) {
+    return ((sz_string_view_t const *)seq->handle)[i].length;
+}
 
 void reverse_offsets(sz_size_t *array, size_t length) {
     size_t i, j;
@@ -159,21 +159,21 @@ void reverse_offsets(sz_size_t *array, size_t length) {
     }
 }
 
-void reverse_haystacks(sz_haystack_t *array, size_t length) {
+void reverse_haystacks(sz_string_view_t *array, size_t length) {
     size_t i, j;
     // Swap array[i] and array[j]
     for (i = 0, j = length - 1; i < j; i++, j--) {
-        sz_haystack_t temp = array[i];
+        sz_string_view_t temp = array[i];
         array[i] = array[j];
         array[j] = temp;
     }
 }
 
-void apply_order(sz_haystack_t *array, sz_size_t *order, size_t length) {
-    for (size_t i = 0; i < length; ++i) {
+void apply_order(sz_string_view_t *array, sz_u64_t *order, size_t length) {
+    for (sz_u64_t i = 0; i < length; ++i) {
         if (i == order[i]) continue;
-        sz_haystack_t temp = array[i];
-        size_t k = i, j;
+        sz_string_view_t temp = array[i];
+        sz_u64_t k = i, j;
         while (i != (j = order[k])) {
             array[k] = array[j];
             order[k] = k;
@@ -205,7 +205,7 @@ void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset,
     *normalized_length = end - start;
 }
 
-boolean_t export_string_like(PyObject *object, char const **start, size_t *length) {
+sz_bool_t export_string_like(PyObject *object, sz_string_start_t *start, sz_size_t *length) {
     if (PyUnicode_Check(object)) {
         // Handle Python str
         Py_ssize_t signed_length;
@@ -277,7 +277,7 @@ get_string_at_offset_t str_at_offset_getter(Strs *strs) {
     }
 }
 
-boolean_t prepare_strings_for_reordering(Strs *strs) {
+sz_bool_t prepare_strings_for_reordering(Strs *strs) {
 
     // Allocate memory for reordered slices
     size_t count = 0;
@@ -306,7 +306,7 @@ boolean_t prepare_strings_for_reordering(Strs *strs) {
         return 0;
     }
 
-    sz_haystack_t *new_parts = (sz_haystack_t *)malloc(count * sizeof(sz_haystack_t));
+    sz_string_view_t *new_parts = (sz_string_view_t *)malloc(count * sizeof(sz_string_view_t));
     if (new_parts == NULL) {
         PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for reordered slices");
         return 0;
@@ -333,7 +333,7 @@ boolean_t prepare_strings_for_reordering(Strs *strs) {
     return 1;
 }
 
-boolean_t prepare_strings_for_extension(Strs *strs, size_t new_parents, size_t new_parts) { return 1; }
+sz_bool_t prepare_strings_for_extension(Strs *strs, size_t new_parents, size_t new_parts) { return 1; }
 
 #pragma endregion
 
@@ -622,8 +622,8 @@ static int Str_getbuffer(Str *self, Py_buffer *view, int flags) {
     view->itemsize = sizeof(char);
     view->format = "c"; // https://docs.python.org/3/library/struct.html#format-characters
     view->ndim = 1;
-    view->shape = &self->length; // 1-D array, so shape is just a pointer to the length
-    view->strides = itemsize;    // strides in a 1-D array is just the item size
+    view->shape = (Py_ssize_t *)&self->length; // 1-D array, so shape is just a pointer to the length
+    view->strides = itemsize;                  // strides in a 1-D array is just the item size
     view->suboffsets = NULL;
     view->internal = NULL;
 
@@ -639,18 +639,13 @@ static void Str_releasebuffer(PyObject *_, Py_buffer *view) {
 
 static int Str_in(Str *self, PyObject *arg) {
 
-    sz_needle_t needle_struct;
-    needle_struct.quadgram_offset = 0;
+    sz_string_view_t needle_struct;
     if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) {
         PyErr_SetString(PyExc_TypeError, "Unsupported argument type");
         return -1;
     }
 
-    sz_haystack_t haystack;
-    haystack.start = self->start;
-    haystack.length = self->length;
-    size_t position = sz_find_substr(haystack, needle_struct);
-    return position != haystack.length;
+    return sz_find_substring(self->start, self->length, needle_struct.start, needle_struct.length) != NULL;
 }
 
 static Py_ssize_t Strs_len(Strs *self) {
@@ -756,12 +751,12 @@ static PyObject *Strs_subscript(Strs *self, PyObject *key) {
             to->count = stop - start;
             to->parent = from->parent;
 
-            to->parts = malloc(sizeof(sz_haystack_t) * to->count);
+            to->parts = malloc(sizeof(sz_string_view_t) * to->count);
             if (to->parts == NULL && PyErr_NoMemory()) {
                 Py_XDECREF(self_slice);
                 return NULL;
             }
-            memcpy(to->parts, from->parts + start, sizeof(sz_haystack_t) * to->count);
+            memcpy(to->parts, from->parts + start, sizeof(sz_string_view_t) * to->count);
             Py_INCREF(to->parent);
             break;
         }
@@ -816,8 +811,8 @@ static int Str_find_( //
     PyObject *args,
     PyObject *kwargs,
     Py_ssize_t *offset_out,
-    sz_haystack_t *haystack_out,
-    sz_needle_t *needle_out) {
+    sz_string_view_t *haystack_out,
+    sz_string_view_t *needle_out) {
 
     int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
     Py_ssize_t nargs = PyTuple_Size(args);
@@ -845,12 +840,11 @@ static int Str_find_( //
         }
     }
 
-    sz_haystack_t haystack;
-    sz_needle_t needle;
+    sz_string_view_t haystack;
+    sz_string_view_t needle;
     Py_ssize_t start, end;
 
     // Validate and convert `haystack` and `needle`
-    needle.quadgram_offset = 0;
     if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
         !export_string_like(needle_obj, &needle.start, &needle.length)) {
         PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
@@ -884,9 +878,9 @@ static int Str_find_( //
     haystack.length = normalized_length;
 
     // Perform contains operation
-    size_t offset = sz_find_substr(haystack, needle);
-    if (offset == haystack.length) { *offset_out = -1; }
-    else { *offset_out = (Py_ssize_t)offset; }
+    sz_string_start_t match = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
+    if (match == NULL) { *offset_out = -1; }
+    else { *offset_out = (Py_ssize_t)(match - haystack.start); }
 
     *haystack_out = haystack;
     *needle_out = needle;
@@ -895,16 +889,16 @@ static int Str_find_( //
 
 static PyObject *Str_find(PyObject *self, PyObject *args, PyObject *kwargs) {
     Py_ssize_t signed_offset;
-    sz_haystack_t text;
-    sz_needle_t separator;
+    sz_string_view_t text;
+    sz_string_view_t separator;
     if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL;
     return PyLong_FromSsize_t(signed_offset);
 }
 
 static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) {
     Py_ssize_t signed_offset;
-    sz_haystack_t text;
-    sz_needle_t separator;
+    sz_string_view_t text;
+    sz_string_view_t separator;
     if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL;
     if (signed_offset == -1) {
         PyErr_SetString(PyExc_ValueError, "substring not found");
@@ -915,8 +909,8 @@ static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) {
 
 static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs) {
     Py_ssize_t signed_offset;
-    sz_haystack_t text;
-    sz_needle_t separator;
+    sz_string_view_t text;
+    sz_string_view_t separator;
     if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL;
     if (signed_offset == -1) { Py_RETURN_FALSE; }
     else { Py_RETURN_TRUE; }
@@ -924,8 +918,8 @@ static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs)
 
 static PyObject *Str_partition(PyObject *self, PyObject *args, PyObject *kwargs) {
     Py_ssize_t separator_index;
-    sz_haystack_t text;
-    sz_needle_t separator;
+    sz_string_view_t text;
+    sz_string_view_t separator;
     PyObject *result_tuple;
 
     // Use Str_find_ to get the index of the separator
@@ -993,13 +987,12 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
                 return NULL;
     }
 
-    sz_haystack_t haystack;
-    sz_needle_t needle;
+    sz_string_view_t haystack;
+    sz_string_view_t needle;
     Py_ssize_t start = start_obj ? PyLong_AsSsize_t(start_obj) : 0;
     Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX;
     int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0;
 
-    needle.quadgram_offset = 0;
     if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
         !export_string_like(needle_obj, &needle.start, &needle.length))
         return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL;
@@ -1013,27 +1006,28 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
 
     size_t count = 0;
     if (needle.length == 0 || haystack.length == 0 || haystack.length < needle.length) { count = 0; }
-    else if (needle.length == 1) { count = sz_count_char(haystack, needle.start[0]); }
-    else if (needle.length != 1) {
-        if (allowoverlap) {
-            while (haystack.length) {
-                sz_size_t offset = sz_find_substr(haystack, needle);
-                int found = offset != haystack.length;
-                count += found;
-                haystack.start += offset + found;
-                haystack.length -= offset + found;
-            }
+    else if (needle.length == 1) { count = sz_count_char(haystack.start, haystack.length, needle.start); }
+    else if (allowoverlap) {
+        while (haystack.length) {
+            sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
+            sz_bool_t found = ptr != NULL;
+            sz_size_t offset = found ? ptr - haystack.start : haystack.length;
+            count += found;
+            haystack.start += offset + found;
+            haystack.length -= offset + found;
         }
-        else {
-            while (haystack.length) {
-                sz_size_t offset = sz_find_substr(haystack, needle);
-                int found = offset != haystack.length;
-                count += found;
-                haystack.start += offset + needle.length;
-                haystack.length -= offset + needle.length * found;
-            }
+    }
+    else {
+        while (haystack.length) {
+            sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
+            sz_bool_t found = ptr != NULL;
+            sz_size_t offset = found ? ptr - haystack.start : haystack.length;
+            count += found;
+            haystack.start += offset + needle.length;
+            haystack.length -= offset + needle.length * found;
         }
     }
+
     return PyLong_FromSize_t(count);
 }
 
@@ -1068,7 +1062,7 @@ static PyObject *Str_levenstein(PyObject *self, PyObject *args, PyObject *kwargs
         return NULL;
     }
 
-    sz_haystack_t str1, str2;
+    sz_string_view_t str1, str2;
     if (!export_string_like(str1_obj, &str1.start, &str1.length) ||
         !export_string_like(str2_obj, &str2.start, &str2.length)) {
         PyErr_Format(PyExc_TypeError, "Both arguments must be string-like");
@@ -1119,7 +1113,7 @@ static PyObject *Str_startswith(PyObject *self, PyObject *args, PyObject *kwargs
         return NULL;
     }
 
-    sz_haystack_t str, prefix;
+    sz_string_view_t str, prefix;
     if (!export_string_like(str_obj, &str.start, &str.length) ||
         !export_string_like(prefix_obj, &prefix.start, &prefix.length)) {
         PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
@@ -1162,7 +1156,7 @@ static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs)
         return NULL;
     }
 
-    sz_haystack_t str, suffix;
+    sz_string_view_t str, suffix;
     if (!export_string_like(str_obj, &str.start, &str.length) ||
         !export_string_like(suffix_obj, &suffix.start, &suffix.length)) {
         PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
@@ -1180,7 +1174,7 @@ static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs)
 }
 
 static Strs *Str_split_(
-    PyObject *parent, sz_haystack_t text, sz_needle_t separator, int keepseparator, Py_ssize_t maxsplit) {
+    PyObject *parent, sz_string_view_t text, sz_string_view_t separator, int keepseparator, Py_ssize_t maxsplit) {
 
     // Create Strs object
     Strs *result = (Strs *)PyObject_New(Strs, &StrsType);
@@ -1209,10 +1203,9 @@ static Strs *Str_split_(
     // Iterate through string, keeping track of the
     sz_size_t last_start = 0;
     while (last_start <= text.length && offsets_count < maxsplit) {
-        sz_haystack_t text_remaining;
-        text_remaining.start = text.start + last_start;
-        text_remaining.length = text.length - last_start;
-        sz_size_t offset_in_remaining = sz_find_substr(text_remaining, separator);
+        sz_string_start_t match =
+            sz_find_substring(text.start + last_start, text.length - last_start, separator.start, separator.length);
+        sz_size_t offset_in_remaining = match ? match - text.start - last_start : text.length - last_start;
 
         // Reallocate offsets array if needed
         if (offsets_count >= offsets_capacity) {
@@ -1232,7 +1225,7 @@ static Strs *Str_split_(
         }
 
         // Export the offset
-        size_t will_continue = offset_in_remaining != text_remaining.length;
+        size_t will_continue = match != NULL;
         size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue;
         if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; }
         else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; }
@@ -1282,11 +1275,10 @@ static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) {
         }
     }
 
-    sz_haystack_t text;
-    sz_needle_t separator;
+    sz_string_view_t text;
+    sz_string_view_t separator;
     int keepseparator;
     Py_ssize_t maxsplit;
-    separator.quadgram_offset = 0;
 
     // Validate and convert `text`
     if (!export_string_like(text_obj, &text.start, &text.length)) {
@@ -1355,7 +1347,7 @@ static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs
         }
     }
 
-    sz_haystack_t text;
+    sz_string_view_t text;
     int keeplinebreaks;
     Py_ssize_t maxsplit = PY_SSIZE_T_MAX; // Default value for maxsplit
 
@@ -1388,14 +1380,14 @@ static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs
     // https://docs.python.org/3/library/stdtypes.html#str.splitlines
     // \n, \r, \r\n, \v or \x0b, \f or \x0c, \x1c, \x1d, \x1e, \x85, \u2028, \u2029
     // https://github.com/ashvardanian/StringZilla/issues/29
-    sz_needle_t separator;
+    sz_string_view_t separator;
     separator.start = "\n";
     separator.length = 1;
     return Str_split_(text_obj, text, separator, keeplinebreaks, maxsplit);
 }
 
 static PyObject *Str_concat(PyObject *self, PyObject *other) {
-    struct sz_haystack_t self_str, other_str;
+    struct sz_string_view_t self_str, other_str;
 
     // Validate and convert `self`
     if (!export_string_like(self, &self_str.start, &self_str.length)) {
@@ -1453,7 +1445,8 @@ static PyNumberMethods Str_as_number = {
 
 #define sz_method_flags_m METH_VARARGS | METH_KEYWORDS
 
-static PyMethodDef Str_methods[] = { //
+static PyMethodDef Str_methods[] = {
+    //
     {"find", Str_find, sz_method_flags_m, "Find the first occurrence of a substring."},
     {"index", Str_index, sz_method_flags_m, "Find the first occurrence of a substring or raise error if missing."},
     {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."},
@@ -1537,14 +1530,14 @@ static PyObject *Strs_shuffle(Strs *self, PyObject *args, PyObject *kwargs) {
 
     // Get the parts and their count
     struct reordered_slices_t *reordered = &self->data.reordered;
-    sz_haystack_t *parts = reordered->parts;
+    sz_string_view_t *parts = reordered->parts;
     size_t count = reordered->count;
 
     // Fisher-Yates Shuffle Algorithm
     for (size_t i = count - 1; i > 0; --i) {
         size_t j = rand() % (i + 1);
         // Swap parts[i] and parts[j]
-        sz_haystack_t temp = parts[i];
+        sz_string_view_t temp = parts[i];
         parts[i] = parts[j];
         parts[j] = temp;
     }
@@ -1552,8 +1545,8 @@ static PyObject *Strs_shuffle(Strs *self, PyObject *args, PyObject *kwargs) {
     Py_RETURN_NONE;
 }
 
-static boolean_t Strs_sort_(Strs *self,
-                            sz_haystack_t **parts_output,
+static sz_bool_t Strs_sort_(Strs *self,
+                            sz_string_view_t **parts_output,
                             sz_size_t **order_output,
                             sz_size_t *count_output) {
 
@@ -1565,7 +1558,7 @@ static boolean_t Strs_sort_(Strs *self,
 
     // Get the parts and their count
     // The only possible `self->type` by now is the `STRS_REORDERED`
-    sz_haystack_t *parts = self->data.reordered.parts;
+    sz_string_view_t *parts = self->data.reordered.parts;
     size_t count = self->data.reordered.count;
 
     // Allocate temporary memory to store the ordering offsets
@@ -1627,7 +1620,7 @@ static PyObject *Strs_sort(Strs *self, PyObject *args, PyObject *kwargs) {
         }
     }
 
-    boolean_t reverse = 0; // Default is False
+    sz_bool_t reverse = 0; // Default is False
     if (reverse_obj) {
         if (!PyBool_Check(reverse_obj)) {
             PyErr_SetString(PyExc_TypeError, "The reverse must be a boolean");
@@ -1636,7 +1629,7 @@ static PyObject *Strs_sort(Strs *self, PyObject *args, PyObject *kwargs) {
         reverse = PyObject_IsTrue(reverse_obj);
     }
 
-    sz_haystack_t *parts = NULL;
+    sz_string_view_t *parts = NULL;
     sz_size_t *order = NULL;
     sz_size_t count = 0;
     if (!Strs_sort_(self, &parts, &order, &count)) return NULL;
@@ -1680,7 +1673,7 @@ static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) {
         }
     }
 
-    boolean_t reverse = 0; // Default is False
+    sz_bool_t reverse = 0; // Default is False
     if (reverse_obj) {
         if (!PyBool_Check(reverse_obj)) {
             PyErr_SetString(PyExc_TypeError, "The reverse must be a boolean");
@@ -1689,7 +1682,7 @@ static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) {
         reverse = PyObject_IsTrue(reverse_obj);
     }
 
-    sz_haystack_t *parts = NULL;
+    sz_string_view_t *parts = NULL;
     sz_size_t *order = NULL;
     sz_size_t count = 0;
     if (!Strs_sort_(self, &parts, &order, &count)) return NULL;
diff --git a/scripts/test.cpp b/scripts/test.cpp
index 8dc1a4d2..b61b7d40 100644
--- a/scripts/test.cpp
+++ b/scripts/test.cpp
@@ -232,19 +232,22 @@ int main(int, char const **) {
         bench_search("std::search", full_text, [&]() mutable {
             return std::search(full_text.begin(), full_text.end(), needle.begin(), needle.end()) - full_text.begin();
         });
-        bench_search("sz_find_substr_swar", full_text, [&]() mutable {
-            sz_string_ptr_t ptr = sz_find_substr_swar(full_text.data(), full_text.size(), needle.data(), needle.size());
+        bench_search("sz_find_substring_swar", full_text, [&]() mutable {
+            sz_string_start_t ptr =
+                sz_find_substring_swar(full_text.data(), full_text.size(), needle.data(), needle.size());
             return ptr ? ptr - full_text.data() : full_text.size();
         });
 #if defined(__ARM_NEON)
-        bench_search("sz_find_substr_neon", full_text, [&]() mutable {
-            sz_string_ptr_t ptr = sz_find_substr_neon(full_text.data(), full_text.size(), needle.data(), needle.size());
+        bench_search("sz_find_substring_neon", full_text, [&]() mutable {
+            sz_string_start_t ptr =
+                sz_find_substring_neon(full_text.data(), full_text.size(), needle.data(), needle.size());
             return ptr ? ptr - full_text.data() : full_text.size();
         });
 #endif
 #if defined(__AVX2__)
-        bench_search("sz_find_substr_avx2", full_text, [&]() mutable {
-            sz_string_ptr_t ptr = sz_find_substr_avx2(full_text.data(), full_text.size(), needle.data(), needle.size());
+        bench_search("sz_find_substring_avx2", full_text, [&]() mutable {
+            sz_string_start_t ptr =
+                sz_find_substring_avx2(full_text.data(), full_text.size(), needle.data(), needle.size());
             return ptr ? ptr - full_text.data() : full_text.size();
         });
 #endif
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index ba7f5f39..c7c0ae49 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -45,16 +45,16 @@ typedef unsigned long sz_size_t;
 typedef unsigned sz_size_t;
 #endif
 
-typedef int sz_bool_t;               // Only one relevant bit
-typedef unsigned sz_u32_t;           // Always 32 bits
-typedef unsigned long long sz_u64_t; // Always 64 bits
-typedef char const *sz_string_ptr_t; // A type alias for `char const * `
+typedef int sz_bool_t;                 // Only one relevant bit
+typedef unsigned sz_u32_t;             // Always 32 bits
+typedef unsigned long long sz_u64_t;   // Always 64 bits
+typedef char const *sz_string_start_t; // A type alias for `char const * `
 
 /**
  *  @brief  Helper construct for higher-level bindings.
  */
 typedef struct sz_string_view_t {
-    sz_string_ptr_t start;
+    sz_string_start_t start;
     sz_size_t length;
 } sz_string_view_t;
 
@@ -72,8 +72,8 @@ typedef union _sz_anomaly_t {
  *          Doesn't provide major performance improvements, but helps avoid the LibC dependency.
  *  @return 1 for `true`, and 0 for `false`.
  */
-inline static sz_bool_t sz_equal(sz_string_ptr_t a, sz_string_ptr_t b, sz_size_t length) {
-    sz_string_ptr_t const a_end = a + length;
+inline static sz_bool_t sz_equal(sz_string_start_t a, sz_string_start_t b, sz_size_t length) {
+    sz_string_start_t const a_end = a + length;
     while (a != a_end && *a == *b) a++, b++;
     return a_end == a;
 }
@@ -82,13 +82,13 @@ inline static sz_bool_t sz_equal(sz_string_ptr_t a, sz_string_ptr_t b, sz_size_t
  *  @brief  Count the number of occurrences of a @b single-character needle in an arbitrary length haystack.
  *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_size_t sz_count_char_swar(sz_string_ptr_t const haystack,
+inline static sz_size_t sz_count_char_swar(sz_string_start_t const haystack,
                                            sz_size_t const haystack_length,
-                                           sz_string_ptr_t const needle) {
+                                           sz_string_start_t const needle) {
 
     sz_size_t result = 0;
-    sz_string_ptr_t text = haystack;
-    sz_string_ptr_t const end = haystack + haystack_length;
+    sz_string_start_t text = haystack;
+    sz_string_start_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == *needle;
@@ -117,12 +117,12 @@ inline static sz_size_t sz_count_char_swar(sz_string_ptr_t const haystack,
  *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  *          Identical to `memchr(haystack, needle[0], haystack_length)`.
  */
-inline static sz_string_ptr_t sz_find_1char_swar(sz_string_ptr_t const haystack,
-                                                 sz_size_t const haystack_length,
-                                                 sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_find_1char_swar(sz_string_start_t const haystack,
+                                                   sz_size_t const haystack_length,
+                                                   sz_string_start_t const needle) {
 
-    sz_string_ptr_t text = haystack;
-    sz_string_ptr_t const end = haystack + haystack_length;
+    sz_string_start_t text = haystack;
+    sz_string_start_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text < end; ++text)
@@ -154,12 +154,12 @@ inline static sz_string_ptr_t sz_find_1char_swar(sz_string_ptr_t const haystack,
  *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  *          Identical to `memrchr(haystack, needle[0], haystack_length)`.
  */
-inline static sz_string_ptr_t sz_rfind_1char_swar(sz_string_ptr_t const haystack,
-                                                  sz_size_t const haystack_length,
-                                                  sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_rfind_1char_swar(sz_string_start_t const haystack,
+                                                    sz_size_t const haystack_length,
+                                                    sz_string_start_t const needle) {
 
-    sz_string_ptr_t const end = haystack + haystack_length;
-    sz_string_ptr_t text = end - 1;
+    sz_string_start_t const end = haystack + haystack_length;
+    sz_string_start_t text = end - 1;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text >= haystack; --text)
@@ -190,12 +190,12 @@ inline static sz_string_ptr_t sz_rfind_1char_swar(sz_string_ptr_t const haystack
  *  @brief  Find the first occurrence of a @b two-character needle in an arbitrary length haystack.
  *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_string_ptr_t sz_find_2char_swar(sz_string_ptr_t const haystack,
-                                                 sz_size_t const haystack_length,
-                                                 sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_find_2char_swar(sz_string_start_t const haystack,
+                                                   sz_size_t const haystack_length,
+                                                   sz_string_start_t const needle) {
 
-    sz_string_ptr_t text = haystack;
-    sz_string_ptr_t const end = haystack + haystack_length;
+    sz_string_start_t text = haystack;
+    sz_string_start_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text)
@@ -239,12 +239,12 @@ inline static sz_string_ptr_t sz_find_2char_swar(sz_string_ptr_t const haystack,
  *  @brief  Find the first occurrence of a three-character needle in an arbitrary length haystack.
  *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_string_ptr_t sz_find_3char_swar(sz_string_ptr_t const haystack,
-                                                 sz_size_t const haystack_length,
-                                                 sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_find_3char_swar(sz_string_start_t const haystack,
+                                                   sz_size_t const haystack_length,
+                                                   sz_string_start_t const needle) {
 
-    sz_string_ptr_t text = haystack;
-    sz_string_ptr_t end = haystack + haystack_length;
+    sz_string_start_t text = haystack;
+    sz_string_start_t end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text)
@@ -301,12 +301,12 @@ inline static sz_string_ptr_t sz_find_3char_swar(sz_string_ptr_t const haystack,
  *  @brief  Find the first occurrence of a @b four-character needle in an arbitrary length haystack.
  *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_string_ptr_t sz_find_4char_swar(sz_string_ptr_t const haystack,
-                                                 sz_size_t const haystack_length,
-                                                 sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_find_4char_swar(sz_string_start_t const haystack,
+                                                   sz_size_t const haystack_length,
+                                                   sz_string_start_t const needle) {
 
-    sz_string_ptr_t text = haystack;
-    sz_string_ptr_t end = haystack + haystack_length;
+    sz_string_start_t text = haystack;
+    sz_string_start_t end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text)
@@ -367,10 +367,10 @@ inline static sz_string_ptr_t sz_find_4char_swar(sz_string_ptr_t const haystack,
  *          it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper.
  *          Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core.
  */
-inline static sz_string_ptr_t sz_find_substr_swar( //
-    sz_string_ptr_t const haystack,
+inline static sz_string_start_t sz_find_substring_swar( //
+    sz_string_start_t const haystack,
     sz_size_t const haystack_length,
-    sz_string_ptr_t const needle,
+    sz_string_start_t const needle,
     sz_size_t const needle_length) {
 
     if (haystack_length < needle_length) return NULL;
@@ -383,12 +383,12 @@ inline static sz_string_ptr_t sz_find_substr_swar( //
     case 3: return sz_find_3char_swar(haystack, haystack_length, needle);
     case 4: return sz_find_4char_swar(haystack, haystack_length, needle);
     default: {
-        sz_string_ptr_t text = haystack;
-        sz_string_ptr_t const end = haystack + haystack_length;
+        sz_string_start_t text = haystack;
+        sz_string_start_t const end = haystack + haystack_length;
 
         _sz_anomaly_t n_anomaly, h_anomaly;
         sz_size_t const n_suffix_len = needle_length - 4 - anomaly_offset;
-        sz_string_ptr_t n_suffix_ptr = needle + 4 + anomaly_offset;
+        sz_string_start_t n_suffix_ptr = needle + 4 + anomaly_offset;
         n_anomaly.u8s[0] = needle[anomaly_offset];
         n_anomaly.u8s[1] = needle[anomaly_offset + 1];
         n_anomaly.u8s[2] = needle[anomaly_offset + 2];
@@ -401,10 +401,9 @@ inline static sz_string_ptr_t sz_find_substr_swar( //
         text += anomaly_offset;
         while (text + needle_length <= end) {
             h_anomaly.u8s[3] = text[3];
-            if (h_anomaly.u32 == n_anomaly.u32)                                  // Match anomaly.
-                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))              // Match suffix.
-                    if (sz_equal(text - anomaly_offset, needle, anomaly_offset)) // Match prefix - optimized out.
-                        return text - anomaly_offset;
+            if (h_anomaly.u32 == n_anomaly.u32)                     // Match anomaly.
+                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix.
+                    return text;
 
             h_anomaly.u32 >>= 8;
             ++text;
@@ -417,8 +416,8 @@ inline static sz_string_ptr_t sz_find_substr_swar( //
 /**
  *  Helper function, used in substring search operations.
  */
-inline static void _sz_find_substr_populate_anomaly( //
-    sz_string_ptr_t const needle,
+inline static void _sz_find_substring_populate_anomaly( //
+    sz_string_start_t const needle,
     sz_size_t const needle_length,
     _sz_anomaly_t *anomaly_out,
     _sz_anomaly_t *mask_out) {
@@ -455,16 +454,16 @@ inline static void _sz_find_substr_populate_anomaly( //
  *          was practically more efficient than loading once and shifting around, as introduces
  *          less data dependencies.
  */
-inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack,
-                                                  sz_size_t const haystack_length,
-                                                  sz_string_ptr_t const needle,
-                                                  sz_size_t const needle_length) {
+inline static sz_string_start_t sz_find_substring_avx2(sz_string_start_t const haystack,
+                                                       sz_size_t const haystack_length,
+                                                       sz_string_start_t const needle,
+                                                       sz_size_t const needle_length) {
 
     // Precomputed constants
-    sz_string_ptr_t const end = haystack + haystack_length;
+    sz_string_start_t const end = haystack + haystack_length;
     _sz_anomaly_t anomaly;
     _sz_anomaly_t mask;
-    _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask);
+    _sz_find_substring_populate_anomaly(needle, needle_length, &anomaly, &mask);
     __m256i const anomalies = _mm256_set1_epi32(anomaly.u32);
     __m256i const masks = _mm256_set1_epi32(mask.u32);
 
@@ -477,7 +476,7 @@ inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack
     //  + 4 movemasks.
     //  + 3 bitwise ANDs.
     //  + 1 heavy (but very unlikely) branch.
-    sz_string_ptr_t text = haystack;
+    sz_string_start_t text = haystack;
     while (text + needle_length + 32 <= end) {
 
         // Performing many unaligned loads ends up being faster than loading once and shuffling around.
@@ -511,7 +510,7 @@ inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack
     }
 
     // Don't forget the last (up to 35) characters.
-    return sz_find_substr_swar(text, end - text, needle, needle_length);
+    return sz_find_substring_swar(text, end - text, needle, needle_length);
 }
 
 #endif // x86 AVX2
@@ -524,21 +523,21 @@ inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack
  *          was practically more efficient than loading once and shifting around, as introduces
  *          less data dependencies.
  */
-inline static sz_string_ptr_t sz_find_substr_neon(sz_string_ptr_t const haystack,
-                                                  sz_size_t const haystack_length,
-                                                  sz_string_ptr_t const needle,
-                                                  sz_size_t const needle_length) {
+inline static sz_string_start_t sz_find_substring_neon(sz_string_start_t const haystack,
+                                                       sz_size_t const haystack_length,
+                                                       sz_string_start_t const needle,
+                                                       sz_size_t const needle_length) {
 
     // Precomputed constants
-    sz_string_ptr_t const end = haystack + haystack_length;
+    sz_string_start_t const end = haystack + haystack_length;
     _sz_anomaly_t anomaly;
     _sz_anomaly_t mask;
-    _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask);
+    _sz_find_substring_populate_anomaly(needle, needle_length, &anomaly, &mask);
     uint32x4_t const anomalies = vld1q_dup_u32(&anomaly.u32);
     uint32x4_t const masks = vld1q_dup_u32(&mask.u32);
     uint32x4_t matches, matches0, matches1, matches2, matches3;
 
-    sz_string_ptr_t text = haystack;
+    sz_string_start_t text = haystack;
     while (text + needle_length + 16 <= end) {
 
         // Each of the following `matchesX` contains only 4 relevant bits - one per word.
@@ -582,40 +581,40 @@ inline static sz_string_ptr_t sz_find_substr_neon(sz_string_ptr_t const haystack
     }
 
     // Don't forget the last (up to 16+3=19) characters.
-    return sz_find_substr_swar(text, end - text, needle, needle_length);
+    return sz_find_substring_swar(text, end - text, needle, needle_length);
 }
 
 #endif // Arm Neon
 
-inline static sz_size_t sz_count_char(sz_string_ptr_t const haystack,
+inline static sz_size_t sz_count_char(sz_string_start_t const haystack,
                                       sz_size_t const haystack_length,
-                                      sz_string_ptr_t const needle) {
+                                      sz_string_start_t const needle) {
     return sz_count_char_swar(haystack, haystack_length, needle);
 }
 
-inline static sz_string_ptr_t sz_find_1char(sz_string_ptr_t const haystack,
-                                            sz_size_t const haystack_length,
-                                            sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_find_1char(sz_string_start_t const haystack,
+                                              sz_size_t const haystack_length,
+                                              sz_string_start_t const needle) {
     return sz_find_1char_swar(haystack, haystack_length, needle);
 }
 
-inline static sz_string_ptr_t sz_rfind_1char(sz_string_ptr_t const haystack,
-                                             sz_size_t const haystack_length,
-                                             sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_rfind_1char(sz_string_start_t const haystack,
+                                               sz_size_t const haystack_length,
+                                               sz_string_start_t const needle) {
     return sz_rfind_1char_swar(haystack, haystack_length, needle);
 }
 
-inline static sz_string_ptr_t sz_find_substr(sz_string_ptr_t const haystack,
-                                             sz_size_t const haystack_length,
-                                             sz_string_ptr_t const needle,
-                                             sz_size_t const needle_length) {
+inline static sz_string_start_t sz_find_substring(sz_string_start_t const haystack,
+                                                  sz_size_t const haystack_length,
+                                                  sz_string_start_t const needle,
+                                                  sz_size_t const needle_length) {
     if (haystack_length < needle_length || needle_length == 0) return NULL;
 #if defined(__ARM_NEON)
-    return sz_find_substr_neon(haystack, haystack_length, needle, needle_length);
+    return sz_find_substring_neon(haystack, haystack_length, needle, needle_length);
 #elif defined(__AVX2__)
-    return sz_find_substr_avx2(haystack, haystack_length, needle, needle_length);
+    return sz_find_substring_avx2(haystack, haystack_length, needle, needle_length);
 #else
-    return sz_find_substr_swar(haystack, haystack_length, needle, needle_length);
+    return sz_find_substring_swar(haystack, haystack_length, needle, needle_length);
 #endif
 }
 
@@ -669,30 +668,46 @@ inline static char sz_toupper_ascii(char c) {
     return *(char *)&upped[(int)c];
 }
 
+inline static sz_u64_t sz_u64_unaligned_load(void const *ptr) { // Reads 8 bytes from a possibly misaligned address.
+#ifdef _MSC_VER // MSVC: the `__unaligned` qualifier marks the pointee as possibly misaligned.
+    return *((__unaligned sz_u64_t *)ptr);
+#else // GCC & Clang: `aligned(1)` must qualify the pointee type, not the pointer variable.
+    typedef sz_u64_t __attribute__((aligned(1), may_alias)) sz_u64_unaligned_t;
+    return *(sz_u64_unaligned_t const *)ptr;
+#endif
+}
+
+inline static sz_u64_t sz_u64_byte_reverse(sz_u64_t val) { // Swaps the byte order of a 64-bit word.
+#if !defined(_MSC_VER)
+    return __builtin_bswap64(val); // Every compiler except MSVC takes the `__builtin_` spelling.
+#else
+    return _byteswap_uint64(val); // MSVC spelling of the same operation.
+#endif
+}
+
 /**
  *  @brief  Char-level lexicographic comparison of two strings.
  *          Doesn't provide major performance improvements, but helps avoid the LibC dependency.
  */
-inline static sz_bool_t sz_is_less_ascii(sz_string_ptr_t const a,
+inline static sz_bool_t sz_is_less_ascii(sz_string_start_t a,
                                          sz_size_t const a_length,
-                                         sz_string_ptr_t const b,
+                                         sz_string_start_t b,
                                          sz_size_t const b_length) {
 
     sz_size_t min_length = (a_length < b_length) ? a_length : b_length;
-    for (sz_size_t i = 0; i < min_length; ++i) {
-        if (a[i] < b[i]) return 1;
-        if (a[i] > b[i]) return 0;
-    }
-    return a_length < b_length;
+    sz_string_start_t const min_end = a + min_length; // Distances to it are compared: `a + 8` may overshoot the buffer.
+    while ((sz_size_t)(min_end - a) >= 8 && sz_u64_unaligned_load(a) == sz_u64_unaligned_load(b)) a += 8, b += 8;
+    while (a != min_end && *a == *b) a++, b++; // Byte-wise tail; also pinpoints the mismatch inside a differing word.
+    return a != min_end ? (*a < *b) : (a_length < b_length); // Whole common prefix equal: the shorter string is less.
 }
 
 /**
  *  @brief  Char-level lexicographic comparison of two strings, insensitive to the case of ASCII symbols.
  *          Doesn't provide major performance improvements, but helps avoid the LibC dependency.
  */
-inline static sz_bool_t sz_is_less_uncased_ascii(sz_string_ptr_t const a,
+inline static sz_bool_t sz_is_less_uncased_ascii(sz_string_start_t const a,
                                                  sz_size_t const a_length,
-                                                 sz_string_ptr_t const b,
+                                                 sz_string_start_t const b,
                                                  sz_size_t const b_length) {
 
     sz_size_t min_length = (a_length < b_length) ? a_length : b_length;
@@ -716,11 +731,11 @@ inline static void _sz_swap_order(sz_u64_t *a, sz_u64_t *b) {
 
 struct sz_sequence_t;
 
-typedef sz_string_ptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t);
+typedef sz_string_start_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t);
 typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t);
 typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t);
 typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t);
-typedef sz_bool_t (*sz_string_is_less_t)(sz_string_ptr_t, sz_size_t, sz_string_ptr_t, sz_size_t);
+typedef sz_bool_t (*sz_string_is_less_t)(sz_string_start_t, sz_size_t, sz_string_start_t, sz_size_t);
 
 typedef struct sz_sequence_t {
     sz_u64_t *order;
@@ -795,9 +810,12 @@ inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_compar
 
 // Utility functions
 inline static sz_size_t _sz_log2i(sz_size_t n) {
-    sz_size_t log2 = 0;
-    while (n >>= 1) ++log2;
-    return log2;
+    if (n == 0) return 0; // Counting leading zeros of zero is undefined, so zero is answered up front.
+#if defined(_MSC_VER) && !defined(__clang__) // MSVC has no `__builtin_clzll`: count the right shifts instead.
+    for (sz_size_t log2 = 0;; ++log2) if (!(n >>= 1)) return log2;
+#else // `unsigned long long` is 64 bits wide here, whatever the width of `sz_size_t`.
+    return 63 - __builtin_clzll((unsigned long long)n);
+#endif
 }
 
 inline static void _sz_sift_down(
@@ -893,7 +911,7 @@ inline static void _sz_introsort(
     sz_u64_t pivot = sequence->order[median];
     sz_size_t left = first;
     sz_size_t right = last - 1;
-    while (true) {
+    while (1) {
         while (less(sequence, sequence->order[left], pivot)) left++;
         while (less(sequence, pivot, sequence->order[right])) right--;
         if (left >= right) break;
@@ -962,17 +980,17 @@ inline static void _sz_sort_recursion( //
 }
 
 inline static sz_bool_t _sz_sort_compare_less_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
-    sz_string_ptr_t i_str = sequence->get_start(sequence, i_key);
+    sz_string_start_t i_str = sequence->get_start(sequence, i_key);
     sz_size_t i_len = sequence->get_length(sequence, i_key);
-    sz_string_ptr_t j_str = sequence->get_start(sequence, j_key);
+    sz_string_start_t j_str = sequence->get_start(sequence, j_key);
     sz_size_t j_len = sequence->get_length(sequence, j_key);
     return sz_is_less_ascii(i_str, i_len, j_str, j_len);
 }
 
 inline static sz_bool_t _sz_sort_compare_less_uncased_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
-    sz_string_ptr_t i_str = sequence->get_start(sequence, i_key);
+    sz_string_start_t i_str = sequence->get_start(sequence, i_key);
     sz_size_t i_len = sequence->get_length(sequence, i_key);
-    sz_string_ptr_t j_str = sequence->get_start(sequence, j_key);
+    sz_string_start_t j_str = sequence->get_start(sequence, j_key);
     sz_size_t j_len = sequence->get_length(sequence, j_key);
     return sz_is_less_uncased_ascii(i_str, i_len, j_str, j_len);
 }
@@ -994,7 +1012,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
 
     // Export up to 4 bytes into the `sequence` bits themselves
     for (sz_size_t i = 0; i != sequence->count; ++i) {
-        sz_string_ptr_t begin = sequence->get_start(sequence, sequence->order[i]);
+        sz_string_start_t begin = sequence->get_start(sequence, sequence->order[i]);
         sz_size_t length = sequence->get_length(sequence, sequence->order[i]);
         length = length > 4ul ? 4ul : length;
         char *prefix = (char *)&sequence->order[i];
@@ -1038,9 +1056,9 @@ inline static levenstein_distance_t _sz_levenstein_minimum( //
  *          It accepts an upper bound on the possible error. Quadratic complexity in time, linear in space.
  */
 inline static levenstein_distance_t sz_levenstein( //
-    sz_string_ptr_t const a,
+    sz_string_start_t const a,
     sz_size_t const a_length,
-    sz_string_ptr_t const b,
+    sz_string_start_t const b,
     sz_size_t const b_length,
     levenstein_distance_t const bound,
     void *buffer) {
@@ -1093,11 +1111,11 @@ inline static levenstein_distance_t sz_levenstein( //
 /**
  *  @brief  Hashes provided string using hardware-accelerated CRC32 instructions.
  */
-inline static sz_u32_t sz_hash_crc32_native(sz_string_ptr_t start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_native(sz_string_start_t start, sz_size_t length) { return 0; }
 
-inline static sz_u32_t sz_hash_crc32_neon(sz_string_ptr_t start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_neon(sz_string_start_t start, sz_size_t length) { return 0; }
 
-inline static sz_u32_t sz_hash_crc32_sse(sz_string_ptr_t start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_sse(sz_string_start_t start, sz_size_t length) { return 0; }
 
 #ifdef __cplusplus
 }

From 416b885429d2eb97e6c677eaac0eba6de5ff9fc4 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 16:51:35 -0700
Subject: [PATCH 8/8] Make: Formatting and docs

---
 .vscode/settings.json |   3 +-
 CMakeLists.txt        | 140 ++++++++++++++++++++++--------------------
 README.md             |  10 +--
 scripts/bench.ipynb   |  20 ++++--
 scripts/test.c        |  14 ++---
 5 files changed, 100 insertions(+), 87 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 08c5bb65..575441f2 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -118,7 +118,8 @@
     "strstream": "cpp",
     "filesystem": "cpp",
     "stringzilla.h": "c",
-    "__memory": "c"
+    "__memory": "c",
+    "charconv": "c"
   },
   "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
   "cSpell.words": [
diff --git a/CMakeLists.txt b/CMakeLists.txt
index df569329..230c2a06 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,105 +1,109 @@
 # This CMake file is heavily inspired by following `stringzilla` CMake:
 # https://github.com/nlohmann/json/blob/develop/CMakeLists.txt
 cmake_minimum_required(VERSION 3.1)
-project(stringzilla VERSION 0.1.0 LANGUAGES C CXX)
+project(
+  stringzilla
+  VERSION 0.1.0
+  LANGUAGES C CXX)
 
-set (CMAKE_C_STANDARD 11)
-set (CMAKE_CXX_STANDARD 17)
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
 
-# Determine if USearch is built as a subproject (using `add_subdirectory`) or if it is the main project
+# Determine if StringZilla is built as a subproject (using `add_subdirectory`)
+# or if it is the main project
 set(STRINGZILLA_IS_MAIN_PROJECT OFF)
-if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
-    set(STRINGZILLA_IS_MAIN_PROJECT ON)
+if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
+  set(STRINGZILLA_IS_MAIN_PROJECT ON)
 endif()
 
 # Options
 option(STRINGZILLA_INSTALL "Install CMake targets" OFF)
-option(STRINGZILLA_BUILD_TEST "Compile a native unit test in C++" ${STRINGZILLA_IS_MAIN_PROJECT})
-option(STRINGZILLA_BUILD_BENCHMARK "Compile a native benchmark in C++" ${STRINGZILLA_IS_MAIN_PROJECT})
+option(STRINGZILLA_BUILD_TEST "Compile a native unit test in C++"
+       ${STRINGZILLA_IS_MAIN_PROJECT})
+option(STRINGZILLA_BUILD_BENCHMARK "Compile a native benchmark in C++"
+       ${STRINGZILLA_IS_MAIN_PROJECT})
 option(STRINGZILLA_BUILD_WOLFRAM "Compile Wolfram Language bindings" OFF)
 
 # Includes
 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
 include(ExternalProject)
 
-# Allow CMake 3.13+ to override options when using FetchContent / add_subdirectory
-if (POLICY CMP0077)
-    cmake_policy(SET CMP0077 NEW)
-endif ()
+# Allow CMake 3.13+ to override options when using FetchContent /
+# add_subdirectory
+if(POLICY CMP0077)
+  cmake_policy(SET CMP0077 NEW)
+endif()
 
 # Configuration
 include(GNUInstallDirs)
-set(STRINGZILLA_TARGET_NAME               ${PROJECT_NAME})
-set(STRINGZILLA_CONFIG_INSTALL_DIR        "${CMAKE_INSTALL_DATADIR}/cmake/${PROJECT_NAME}" CACHE INTERNAL "")
-set(STRINGZILLA_INCLUDE_INSTALL_DIR       "${CMAKE_INSTALL_INCLUDEDIR}")
-set(STRINGZILLA_TARGETS_EXPORT_NAME       "${PROJECT_NAME}Targets")
-set(STRINGZILLA_CMAKE_CONFIG_TEMPLATE     "cmake/config.cmake.in")
-set(STRINGZILLA_CMAKE_CONFIG_DIR          "${CMAKE_CURRENT_BINARY_DIR}")
-set(STRINGZILLA_CMAKE_VERSION_CONFIG_FILE "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}ConfigVersion.cmake")
-set(STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Config.cmake")
-set(STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Targets.cmake")
-set(STRINGZILLA_PKGCONFIG_INSTALL_DIR      "${CMAKE_INSTALL_DATADIR}/pkgconfig")
-
+set(STRINGZILLA_TARGET_NAME ${PROJECT_NAME})
+set(STRINGZILLA_CONFIG_INSTALL_DIR
+    "${CMAKE_INSTALL_DATADIR}/cmake/${PROJECT_NAME}"
+    CACHE INTERNAL "")
+set(STRINGZILLA_INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}")
+set(STRINGZILLA_TARGETS_EXPORT_NAME "${PROJECT_NAME}Targets")
+set(STRINGZILLA_CMAKE_CONFIG_TEMPLATE "cmake/config.cmake.in")
+set(STRINGZILLA_CMAKE_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+set(STRINGZILLA_CMAKE_VERSION_CONFIG_FILE
+    "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}ConfigVersion.cmake")
+set(STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE
+    "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Config.cmake")
+set(STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE
+    "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Targets.cmake")
+set(STRINGZILLA_PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/pkgconfig")
 
 # Define our header-only library
 add_library(${STRINGZILLA_TARGET_NAME} INTERFACE)
-add_library(${PROJECT_NAME}::${STRINGZILLA_TARGET_NAME} ALIAS ${STRINGZILLA_TARGET_NAME})
+add_library(${PROJECT_NAME}::${STRINGZILLA_TARGET_NAME} ALIAS
+            ${STRINGZILLA_TARGET_NAME})
 set(STRINGZILLA_INCLUDE_BUILD_DIR "${PROJECT_SOURCE_DIR}/include/")
 
 target_compile_definitions(
-    ${STRINGZILLA_TARGET_NAME}
-    INTERFACE
-    $<$<NOT:$<BOOL:${JSON_GlobalUDLs}>>:STRINGZILLA_USE_OPENMP=0>
-)
+  ${STRINGZILLA_TARGET_NAME}
+  INTERFACE $<$<NOT:$<BOOL:${JSON_GlobalUDLs}>>:STRINGZILLA_USE_OPENMP=0>)
 target_include_directories(
-    ${STRINGZILLA_TARGET_NAME}
-    ${STRINGZILLA_SYSTEM_INCLUDE} INTERFACE
-    $<BUILD_INTERFACE:${STRINGZILLA_INCLUDE_BUILD_DIR}>
-    $<INSTALL_INTERFACE:include>
-)
+  ${STRINGZILLA_TARGET_NAME} ${STRINGZILLA_SYSTEM_INCLUDE}
+  INTERFACE $<BUILD_INTERFACE:${STRINGZILLA_INCLUDE_BUILD_DIR}>
+            $<INSTALL_INTERFACE:include>)
 
 if(STRINGZILLA_INSTALL)
-    install(
-        DIRECTORY ${STRINGZILLA_INCLUDE_BUILD_DIR}
-        DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR}
-    )
-    install(
-        FILES ${STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE} ${STRINGZILLA_CMAKE_VERSION_CONFIG_FILE}
-        DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR}
-    )
-    export(
-        TARGETS ${STRINGZILLA_TARGET_NAME}
-        NAMESPACE ${PROJECT_NAME}::
-        FILE ${STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE}
-    )
-    install(
-        TARGETS ${STRINGZILLA_TARGET_NAME}
-        EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME}
-        INCLUDES DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR}
-    )
-    install(
-        EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME}
-        NAMESPACE ${PROJECT_NAME}::
-        DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR}
-    )
-    install(
-        FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc"
-        DESTINATION ${STRINGZILLA_PKGCONFIG_INSTALL_DIR}
-    )
+  install(DIRECTORY ${STRINGZILLA_INCLUDE_BUILD_DIR}
+          DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR})
+  install(FILES ${STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE}
+                ${STRINGZILLA_CMAKE_VERSION_CONFIG_FILE}
+          DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR})
+  export(
+    TARGETS ${STRINGZILLA_TARGET_NAME}
+    NAMESPACE ${PROJECT_NAME}::
+    FILE ${STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE})
+  install(
+    TARGETS ${STRINGZILLA_TARGET_NAME}
+    EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME}
+    INCLUDES
+    DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR})
+  install(
+    EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME}
+    NAMESPACE ${PROJECT_NAME}::
+    DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR})
+  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc"
+          DESTINATION ${STRINGZILLA_PKGCONFIG_INSTALL_DIR})
 endif()
 
 if(${STRINGZILLA_BUILD_TEST} OR ${STRINGZILLA_BUILD_BENCHMARK})
-  add_executable(stringzilla_test scripts/test.c)
+  add_executable(stringzilla_test scripts/test.cpp)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+  set(CMAKE_CXX_FLAGS
+      "${CMAKE_CXX_FLAGS} -O3 -flto -march=native -finline-functions -funroll-loops"
+  )
 
   target_include_directories(stringzilla_test PRIVATE stringzilla)
-  set_target_properties(stringzilla_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+  set_target_properties(stringzilla_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+                                                    ${CMAKE_BINARY_DIR})
 
-  if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER 3.13)
-      include(CTest)
-      enable_testing()
-      add_test(NAME stringzilla_test COMMAND stringzilla_test)
+  if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER
+                                            3.13)
+    include(CTest)
+    enable_testing()
+    add_test(NAME stringzilla_test COMMAND stringzilla_test)
   endif()
 endif()
-
diff --git a/README.md b/README.md
index 85032c34..8f0765c3 100644
--- a/README.md
+++ b/README.md
@@ -35,8 +35,8 @@ Coming soon.
 
 ## Quick Start: Python 🐍
 
-1️. Install via pip: `pip install stringzilla`  
-1. Import the classes you need: `from stringzilla import Str, Strs, File`  
+1. Install via pip: `pip install stringzilla`  
+2. Import the classes you need: `from stringzilla import Str, Strs, File`  
 
 ### Basic Usage
 
@@ -115,13 +115,13 @@ There is an ABI-stable C 99 interface, in case you have a database, an operating
 #include "stringzilla.h"
 
 // Initialize your haystack and needle
-sz_haystack_t haystack = {your_text, your_text_length};
-sz_needle_t needle = {your_subtext, your_subtext_length, your_quadgram_offset};
+sz_string_view_t haystack = {your_text, your_text_length};
+sz_string_view_t needle = {your_subtext, your_subtext_length};
 
 // Perform string-level operations
 size_t character_count = sz_count_char(haystack, 'a');
 size_t character_position = sz_find_unigram(haystack, 'a');
-size_t substring_position = sz_find_substr(haystack, needle);
+size_t substring_position = sz_find_substring(haystack, needle);
 
 // Perform collection level operations
 sz_sequence_t array = {your_order, your_count, your_get_start, your_get_length, your_handle};
diff --git a/scripts/bench.ipynb b/scripts/bench.ipynb
index b3bc4392..492db50a 100644
--- a/scripts/bench.ipynb
+++ b/scripts/bench.ipynb
@@ -88,7 +88,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "150 ms ± 2.01 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
+      "152 ms ± 3.24 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -106,7 +106,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "37.8 ms ± 286 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
+      "38.1 ms ± 312 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -124,7 +124,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "173 ns ± 23.7 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
+      "The slowest run took 7.28 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
+      "186 ns ± 41.1 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -142,8 +143,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The slowest run took 82.51 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
-      "94.3 ns ± 108 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
+      "The slowest run took 120.95 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
+      "99.6 ns ± 155 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -152,6 +153,13 @@
     "sz_str.find(pattern)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -176,7 +184,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.10.11"
   },
   "orig_nbformat": 4
  },
diff --git a/scripts/test.c b/scripts/test.c
index 127975b0..b39fd982 100644
--- a/scripts/test.c
+++ b/scripts/test.c
@@ -18,8 +18,8 @@ void populate_random_string(char *buffer, int length, int variability) {
     buffer[length] = '\0';
 }
 
-// Test function for sz_find_substr
-void test_sz_find_substr() {
+// Test function for sz_find_substring
+void test_sz_find_substring() {
     char buffer[MAX_LENGTH + 1];
     char pattern[6]; // Maximum length of 5 + 1 for '\0'
 
@@ -39,11 +39,11 @@ void test_sz_find_substr() {
             needle.length = pattern_length;
 
             // Comparing the result of your function with the standard library function.
-            sz_string_ptr_t result_libc = strstr(buffer, pattern);
-            sz_string_ptr_t result_stringzilla =
-                sz_find_substr(haystack.start, haystack.length, needle.start, needle.length);
+            sz_string_start_t result_libc = strstr(buffer, pattern);
+            sz_string_start_t result_stringzilla =
+                sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
 
-            assert(((result_libc == NULL) ^ (result_stringzilla == NULL)) && "Test failed for sz_find_substr");
+            assert(((result_libc == NULL) == (result_stringzilla == NULL)) && "Test failed for sz_find_substring");
         }
     }
 }
@@ -51,7 +51,7 @@ void test_sz_find_substr() {
 int main() {
     srand((unsigned int)time(NULL));
 
-    test_sz_find_substr();
+    test_sz_find_substring();
     // Add calls to other test functions as you implement them
 
     printf("All tests passed!\n");