From bc4d2e9053d393643a3f5f88371065abc2ca7a69 Mon Sep 17 00:00:00 2001
From: r2rSahakyan <89134416+r2rSahakyan@users.noreply.github.com>
Date: Sat, 2 Sep 2023 23:35:57 +0400
Subject: [PATCH 01/72] Ver First Binding Draft in CPython

Span class with size , hash, comparison, contains and find functions.
all added code under #ifndef PURE_CPYTHON
---
 python/lib.cpp | 186 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 185 insertions(+), 1 deletion(-)
diff --git a/python/lib.cpp b/python/lib.cpp
index 979e1f56..4724e176 100644
--- a/python/lib.cpp
+++ b/python/lib.cpp
@@ -25,6 +25,11 @@ typedef SSIZE_T ssize_t;
 #include <string>      // `std::string`
 #include <string_view> // `std::string_view`
 
+#define PURE_CPYTHON
+#ifdef PURE_CPYTHON
+#include <Python.h>
+#endif
+
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
@@ -647,6 +652,185 @@ void define_slice_ops(py::class_<at, std::shared_ptr<at>> &str_view_struct) {
         py::keep_alive<0, 1>());
 }
 
+#ifdef PURE_CPYTHON
+typedef struct
+{
+        PyObject_HEAD
+        std::shared_ptr<py_span_t>  span;
+} PySpan;
+
+
+static PyMethodDef PySpan_methods[] = {
+    {"size", (PyCFunction)PySpan_size, METH_NOARGS, "Get the size"},
+    {"__hash__", (PyCFunction) PySpan_hash, METH_NOARGS, "Returns the hash value"},
+    {"__eq__", (PyCFunction) PySpan_eq, METH_O, "Equality check"},
+    {"__ne__", (PyCFunction) PySpan_ne, METH_O, "Non-equality check"},
+    {"__gt__", (PyCFunction) PySpan_gt, METH_O, "Greater than check"},
+    {"__lt__", (PyCFunction) PySpan_lt, METH_O, "Less than check"},
+    {"contains", (PyCFunction)PyStrView_contains, METH_VARARGS | METH_KEYWORDS, "Check if contains"},
+    {"find", (PyCFunction)PyStrView_find, METH_VARARGS | METH_KEYWORDS, "Find needle"},
+    {NULL}  // Sentinel
+};
+
+static PyObject *PySpan_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    PySpan *self;
+    self = (PySpan *)type->tp_alloc(type, 0);
+    return (PyObject *)self;
+}
+static int PySpan_init(PySpan *self, PyObject *args, PyObject *kwds)
+{
+    self->span = std::make_shared<py_span_t>();
+    return 0;
+}
+
+static PyObject *PySpan_size(PySpan *self, PyObject *Py_UNUSED(ignored))
+{
+    return PyLong_FromSsize_t(self->span->size());
+}
+
+static void PySpan_dealloc(PySpan *self)
+{
+    // Handle the deallocation of the C++ object
+    self->span.reset();
+    Py_TYPE(self)->tp_free((PyObject *)self);
+}
+
+static PyObject* PySpan_hash(PySpan *self) {
+    return PyLong_FromUnsignedLong(self->span->hash());
+}
+
+static PyObject* PySpan_eq(PySpan *self, PyObject *other) {
+    if (PyUnicode_Check(other)) {
+        return PyBool_FromLong(self->span->operator==(PyUnicode_AsUTF8(other)));
+    } else if (PyObject_TypeCheck(other, &PySpanType)) {
+        return PyBool_FromLong(self->span->operator==(((PySpan *)other)->span));
+    }
+    Py_RETURN_FALSE;
+}
+
+static PyObject* PySpan_ne(PySpan *self, PyObject *other) {
+    if (PyUnicode_Check(other)) {
+        return PyBool_FromLong(self->span->operator!=(PyUnicode_AsUTF8(other)));
+    } else if (PyObject_TypeCheck(other, &PySpanType)) {
+        return PyBool_FromLong(self->span->operator!=(((PySpan *)other)->span));
+    }
+    Py_RETURN_TRUE;
+}
+
+static PyObject* PySpan_gt(PySpan *self, PyObject *other) {
+    if (PyUnicode_Check(other)) {
+        return PyBool_FromLong(self->span->operator>(PyUnicode_AsUTF8(other)));
+    } else if (PyObject_TypeCheck(other, &PySpanType)) {
+        return PyBool_FromLong(self->span->operator>(((PySpan *)other)->span));
+    }
+    Py_RETURN_FALSE;
+}
+
+static PyObject* PySpan_lt(PySpan *self, PyObject *other) {
+    if (PyUnicode_Check(other)) {
+        return PyBool_FromLong(self->span->operator<(PyUnicode_AsUTF8(other)));
+    } else if (PyObject_TypeCheck(other, &PySpanType)) {
+        return PyBool_FromLong(self->span->operator<(((PySpan *)other)->span));
+    }
+    Py_RETURN_FALSE;
+}
+
+static PyObject * PyStrView_contains(PyStrView *self, PyObject *args, PyObject *kwargs) {
+    char *needle;
+    int start = 0, end = INT_MAX;
+    static char *kwlist[] = {"needle", "start", "end", NULL};
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|ii", kwlist, &needle, &start, &end)) {
+        return NULL;
+    }
+    
+    if(self->span->contains(needle,start,end)){
+        Py_INCREF(Py_True);
+        return Py_True;
+    } else {
+        Py_INCREF(Py_False);
+        return Py_False;
+    }
+}
+static PyObject * PyStrView_find(PyStrView *self, PyObject *args, PyObject *kwargs) {
+    char *needle;
+    int start = 0, end = INT_MAX;
+    static char *kwlist[] = {"needle", "start", "end", NULL};
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|ii", kwlist, &needle, &start, &end)) {
+        return NULL;
+    }
+    return PyLong_FromLong(self->span->find(needle,start,end));
+}
+
+static PyTypeObject PySpanType = {
+    PyVarObject_HEAD_INIT(NULL, 0) /* ob_size */
+    "YourModule.Span",             /* tp_name */
+    sizeof(PySpan),                /* tp_basicsize */
+    0,                             /* tp_itemsize */
+    (destructor)PySpan_dealloc,    /* tp_dealloc */
+    0,                             /* tp_print */
+    0,                             /* tp_getattr */
+    0,                             /* tp_setattr */
+    0,                             /* tp_compare */
+    0,                             /* tp_repr */
+    0,                             /* tp_as_number */
+    0,                             /* tp_as_sequence */
+    0,                             /* tp_as_mapping */
+    0,                             /* tp_hash */
+    0,                             /* tp_call */
+    0,                             /* tp_str */
+    0,                             /* tp_getattro */
+    0,                             /* tp_setattro */
+    0,                             /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,            /* tp_flags */
+    "Span objects",                /* tp_doc */
+    0,                             /* tp_traverse */
+    0,                             /* tp_clear */
+    0,                             /* tp_richcompare */
+    0,                             /* tp_weaklistoffset */
+    0,                             /* tp_iter */
+    0,                             /* tp_iternext */
+    PySpan_methods,                /* tp_methods */
+    0,                             /* tp_members */
+    0,                             /* tp_getset */
+    0,                             /* tp_base */
+    0,                             /* tp_dict */
+    0,                             /* tp_descr_get */
+    0,                             /* tp_descr_set */
+    0,                             /* tp_dictoffset */
+    (initproc)PySpan_init,         /* tp_init */
+    0,                             /* tp_alloc */
+    PySpan_new,                    /* tp_new */
+};
+
+static PyModuleDef stringzilla_module = {
+    PyModuleDef_HEAD_INIT,
+    "stringzilla",
+    "Crunch 100+ GB Strings in Python with ease",
+    -1,
+    NULL, NULL, NULL, NULL, NULL
+};
+
+PyMODINIT_FUNC PyInit_stringzilla(void) {
+    PyObject *m;
+
+    if (PyType_Ready(&PySpanType) < 0)
+        return NULL;
+
+    m = PyModule_Create(&stringzilla_module);
+    if (m == NULL)
+        return NULL;
+
+    Py_INCREF(&PySpanType);
+    PyModule_AddObject(m, "Span", (PyObject *)&PySpanType);
+
+    return m;
+}
+
+#endif
+
 PYBIND11_MODULE(stringzilla, m) {
     m.doc() = "Crunch 100+ GB Strings in Python with ease";
 
@@ -732,4 +916,4 @@ PYBIND11_MODULE(stringzilla, m) {
     py_strs.def("append", &py_spans_t::append<py_subspan_t>, py::call_guard<py::gil_scoped_release>());
     py_strs.def("append", &py_spans_t::append_copy);
     py_strs.def("extend", &py_spans_t::extend_copy);
-}
\ No newline at end of file
+}

From f633d12a1346e729d10aa3f1423316b5daa9ef67 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 10 Sep 2023 17:37:59 +0400
Subject: [PATCH 02/72] Refactor: Restarting CPython bindings

---
 python/lib.cpp | 919 +++----------------------------------------------
 1 file changed, 42 insertions(+), 877 deletions(-)

diff --git a/python/lib.cpp b/python/lib.cpp
index 4724e176..77678767 100644
--- a/python/lib.cpp
+++ b/python/lib.cpp
@@ -1,4 +1,6 @@
-
+/**
+ *  @brief
+ */
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
 #define NOMINMAX
 #include <windows.h>
@@ -16,801 +18,53 @@ typedef SSIZE_T ssize_t;
 #include <unistd.h> // `ssize_t`
 #endif
 
-#include <random>      // `std::random_device`
-#include <utility>     // `std::exchange`
-#include <limits>      // `std::numeric_limits`
-#include <numeric>     // `std::iota`
-#include <cmath>       // `std::abs`
-#include <algorithm>   // `std::shuffle`
-#include <string>      // `std::string`
-#include <string_view> // `std::string_view`
-
-#define PURE_CPYTHON
-#ifdef PURE_CPYTHON
 #include <Python.h>
-#endif
-
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "stringzilla.h"
-
-namespace py = pybind11;
-
-struct py_span_t;
-struct py_str_t;
-struct py_file_t;
-struct py_subspan_t;
-struct py_spans_t;
-
-struct span_t {
-    char const *ptr;
-    size_t len;
-
-    explicit operator bool() const noexcept { return ptr; }
-    char const *data() const noexcept { return ptr; }
-    size_t size() const noexcept { return len; }
-    bool contains(char const *fragment) const noexcept { return ptr <= fragment && fragment < (ptr + len); }
-};
-
-static constexpr ssize_t ssize_max_k = std::numeric_limits<ssize_t>::max();
-static constexpr size_t size_max_k = std::numeric_limits<size_t>::max();
-
-inline size_t find_substr(span_t h_span, char n) noexcept {
-    strzl_haystack_t h {h_span.ptr, h_span.len};
-    return strzl_naive_find_char(h, n);
-}
-
-inline size_t find_substr(span_t h_span, span_t n_span) noexcept {
-    strzl_haystack_t h {h_span.ptr, h_span.len};
-    strzl_needle_t n {n_span.ptr, n_span.len, 0};
-
-#if defined(__AVX2__)
-    return strzl_avx2_find_substr(h, n);
-#elif defined(__ARM_NEON)
-    return strzl_neon_find_substr(h, n);
-#else
-    return strzl_naive_find_substr(h, n);
-#endif
-}
-
-inline size_t count_char(span_t h_span, char n) noexcept {
-    strzl_haystack_t h {h_span.ptr, h_span.len};
-    return strzl_naive_count_char(h, n);
-}
-
-inline size_t count_substr(span_t h, span_t n, bool overlap = false) noexcept {
-
-    if (n.len == 1)
-        return count_char(h, *n.ptr);
-    if (h.len < n.len)
-        return 0;
-
-    size_t result = 0;
-    if (overlap) {
-        while (h.len) {
-            size_t offset = find_substr(h, n);
-            bool found = offset != h.len;
-            result += found;
-            h.ptr += offset + found;
-            h.len -= offset + found;
-        }
-    }
-
-    else {
-        while (h.len) {
-            size_t offset = find_substr(h, n);
-            bool found = offset != h.len;
-            result += found;
-            h.ptr += offset + n.len;
-            h.len -= offset + n.len * found;
-        }
-    }
-
-    return result;
-}
-
-span_t to_span(std::string_view s) { return {s.data(), s.size()}; }
-std::string_view to_stl(span_t s) { return {s.data(), s.size()}; }
-
-struct index_span_t {
-    size_t offset;
-    size_t length;
-};
-
-index_span_t slice(size_t length, ssize_t start, ssize_t end) {
-    ssize_t len = static_cast<ssize_t>(length);
-    ssize_t abs_start = std::abs(start);
-    ssize_t abs_end = std::abs(end);
-
-    if (len == 0 || start == end)
-        return {0ul, 0ul};
-
-    if (start > end) {
-        if ((start < 0 && end < 0) || (start >= 0 && end > 0) || len - abs_end < start)
-            return {0ul, 0ul};
-        end = len - abs_end;
-    }
-    else if (start < 0 && end < 0) {
-        if (abs_start <= len && abs_end <= len) {
-            start = len + start;
-            end = len + end;
-        }
-        else if (abs_start > len && abs_end <= len) {
-            start = 0;
-            end = len + end;
-        }
-        else if (abs_start <= len && abs_end > len) {
-            start = len + start;
-            end = len;
-        }
-        else if (abs_start > len && abs_end > len) {
-            start = 0;
-            end = len;
-        }
-    }
-    else if (start < 0 && end >= 0) {
-        end = end == 0 ? len : std::min(end, len);
-        if (!((start = len - abs_start) < end && start >= 0))
-            start = end = 0;
-    }
-    else if (start >= 0 && end < 0) {
-        if (len >= start) {
-            if ((len + end) >= start)
-                end = len + end;
-            else
-                end = len;
-        }
-        else
-            end = start;
-    }
-    else {
-        start = std::min(start, len);
-        end = end == 0 ? len : std::min(end, len);
-    }
-    return {static_cast<size_t>(start), static_cast<size_t>(end - start)};
-}
-
-size_t unsigned_offset(size_t length, ssize_t idx) {
-    if (idx >= 0) {
-        if (static_cast<size_t>(idx) > length)
-            throw std::out_of_range("Accessing beyond content length");
-        return static_cast<size_t>(idx);
-    }
-    else {
-        if (static_cast<size_t>(-idx) > length)
-            throw std::out_of_range("Accessing beyond content length");
-        return static_cast<size_t>(length + idx);
-    }
-}
-
-span_t subspan(span_t span, ssize_t start, ssize_t end = ssize_max_k) {
-    index_span_t index_span = slice(span.size(), start, end);
-    return {span.ptr + index_span.offset, index_span.length};
-}
-
-struct py_span_t : public span_t, public std::enable_shared_from_this<py_span_t> {
-
-    py_span_t(span_t view = {}) : span_t(view) {}
-    virtual ~py_span_t() {}
-
-    using span_t::len;
-    using span_t::ptr;
-
-    span_t span() const { return {ptr, len}; }
-    ssize_t size() const { return static_cast<ssize_t>(len); }
-    bool contains(std::string_view needle, ssize_t start, ssize_t end) const;
-    ssize_t find(std::string_view, ssize_t start, ssize_t end) const;
-    ssize_t count(std::string_view, ssize_t start, ssize_t end, bool allowoverlap) const;
-    std::shared_ptr<py_spans_t> splitlines(bool keeplinebreaks, char separator, size_t maxsplit) const;
-    std::shared_ptr<py_spans_t> split(std::string_view separator, size_t maxsplit, bool keepseparator) const;
-    std::shared_ptr<py_subspan_t> sub(ssize_t start, ssize_t end) const;
-
-    char const *begin() const { return reinterpret_cast<char const *>(ptr); }
-    char const *end() const { return begin() + len; }
-    char at(ssize_t offset) const { return begin()[unsigned_offset(len, offset)]; }
-    py::str to_python() const { return {begin(), len}; }
-    std::size_t hash() const { return std::hash<std::string_view> {}({ptr, len}); }
-
-    bool operator==(py::str const &str) const { return to_stl({ptr, len}) == str.cast<std::string_view>(); }
-    bool operator!=(py::str const &str) const { return !(*this == str); }
-    bool operator==(py_span_t const &other) const { return to_stl({ptr, len}) == to_stl({other.ptr, other.len}); }
-    bool operator!=(py_span_t const &other) const { return !(*this == other); }
-    bool operator>(py::str const &str) const { return to_stl({ptr, len}) > str.cast<std::string_view>(); }
-    bool operator<(py::str const &str) const { return to_stl({ptr, len}) < str.cast<std::string_view>(); }
-    bool operator>(py_span_t const &other) const { return to_stl({ptr, len}) > to_stl({other.ptr, other.len}); }
-    bool operator<(py_span_t const &other) const { return to_stl({ptr, len}) < to_stl({other.ptr, other.len}); }
-
-    span_t after_n(size_t offset) const noexcept {
-        return (offset < len) ? span_t {ptr + offset, len - offset} : span_t {};
-    }
-    span_t before_n(size_t tail) const noexcept {
-        return (tail < len) ? span_t {ptr + len - tail, len - tail} : span_t {};
-    }
-};
-
-struct py_str_t : public py_span_t {
-    std::string copy_;
-
-    py_str_t(std::string_view string = "") : copy_(string) { ptr = to_span(copy_).ptr, len = to_span(copy_).len; }
-    ~py_str_t() {}
-
-    using py_span_t::contains;
-    using py_span_t::count;
-    using py_span_t::find;
-    using py_span_t::size;
-    using py_span_t::split;
-    using py_span_t::splitlines;
-};
-
-struct py_file_t : public py_span_t {
-    std::string path;
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
-    HANDLE file_handle = nullptr;
-    HANDLE mapping_handle = nullptr;
-#else
-    int file_descriptor = 0;
-#endif
-
-  public:
-    py_file_t(std::string const &path) { open(path); }
-    ~py_file_t() { close(); }
-
-    void reopen() { open(path); }
-    void open(std::string const &p) {
-        close();
-        path = p;
-
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
-
-        file_handle =
-            CreateFile(path.c_str(), GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
-        if (file_handle == INVALID_HANDLE_VALUE)
-            throw std::runtime_error("Couldn't map the file!");
-
-        mapping_handle = CreateFileMapping(file_handle, 0, PAGE_READONLY, 0, 0, 0);
-        if (mapping_handle == 0) {
-            CloseHandle(std::exchange(file_handle, nullptr));
-            throw std::runtime_error("Couldn't map the file!");
-        }
-
-        char *file = (char *)MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, 0);
-        if (file == 0) {
-            CloseHandle(std::exchange(mapping_handle, nullptr));
-            CloseHandle(std::exchange(file_handle, nullptr));
-            throw std::runtime_error("Couldn't map the file!");
-        }
-        ptr = file;
-        len = GetFileSize(file_handle, 0);
-#else
-        struct stat sb;
-        file_descriptor = ::open(path.c_str(), O_RDONLY);
-        if (fstat(file_descriptor, &sb) != 0) {
-            ::close(std::exchange(file_descriptor, 0));
-            throw std::runtime_error("Can't retrieve file size!");
-        }
-        size_t file_size = sb.st_size;
-        void *map = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, file_descriptor, 0);
-        if (map == MAP_FAILED) {
-            ::close(std::exchange(file_descriptor, 0));
-            throw std::runtime_error("Couldn't map the file!");
-        }
-        ptr = reinterpret_cast<char const *>(map);
-        len = file_size;
-#endif
-    }
-
-    void close() {
-
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
-        if (ptr)
-            UnmapViewOfFile(std::exchange(ptr, nullptr)), len = 0;
-        if (mapping_handle)
-            CloseHandle(std::exchange(mapping_handle, nullptr));
-        if (file_handle)
-            CloseHandle(std::exchange(file_handle, nullptr));
-
-#else
-        if (ptr)
-            munmap((void *)std::exchange(ptr, nullptr), std::exchange(len, 0));
-        if (file_descriptor != 0)
-            ::close(std::exchange(file_descriptor, 0));
-#endif
-    }
-
-    std::shared_ptr<py_str_t> load() const { return std::make_shared<py_str_t>(to_stl(*this)); }
-
-    using py_span_t::contains;
-    using py_span_t::count;
-    using py_span_t::find;
-    using py_span_t::size;
-    using py_span_t::split;
-    using py_span_t::splitlines;
-};
-
-struct py_subspan_t : public py_span_t {
-    std::shared_ptr<py_span_t const> parent_;
-
-  public:
-    py_subspan_t() = default;
-    py_subspan_t(py_subspan_t &&) = default;
-    py_subspan_t &operator=(py_subspan_t &&) = default;
-    py_subspan_t(std::shared_ptr<py_span_t const> parent, span_t str) : parent_(std::move(parent)) {
-        ptr = str.ptr, len = str.len;
-    }
-
-    using py_span_t::contains;
-    using py_span_t::count;
-    using py_span_t::find;
-    using py_span_t::size;
-    using py_span_t::split;
-    using py_span_t::splitlines;
-};
-
-static std::shared_ptr<py_subspan_t> empty_subspan = std::make_shared<py_subspan_t>();
-
-struct py_spans_t : public std::enable_shared_from_this<py_spans_t> {
-
-    using parent_t = std::shared_ptr<py_span_t const>;
-
-    struct less_address_t {
-        using is_transparent = void;
-        bool operator()(py_span_t const &a, py_span_t const &b) const noexcept { return a.data() < b.data(); }
-        bool operator()(parent_t const &a, parent_t const &b) const noexcept { return a->data() < b->data(); }
-        bool operator()(py_span_t const &a, char const *b) const noexcept {
-            return a.span().contains(b) ? false : a.data() < b;
-        }
-        bool operator()(parent_t const &a, char const *b) const noexcept {
-            return a->span().contains(b) ? false : a->data() < b;
-        }
-        bool operator()(char const *a, py_span_t const &b) const noexcept {
-            return b.span().contains(a) ? false : a < b.data();
-        }
-        bool operator()(char const *a, parent_t const &b) const noexcept {
-            return b->span().contains(a) ? false : a < b->data();
-        }
-    };
-
-    using parents_t = std::set<parent_t, less_address_t>;
-    using parts_t = std::vector<span_t>;
-
-  private:
-    parents_t parents_;
-    parts_t parts_;
 
-    static char const *strzl_array_get_begin(void const *raw, size_t i) { return ((span_t *)raw)[i].ptr; }
-    static size_t strzl_array_get_length(void const *raw, size_t i) { return ((span_t *)raw)[i].len; }
-
-  public:
-    py_spans_t() = default;
-    py_spans_t(py_spans_t &&) = default;
-    py_spans_t &operator=(py_spans_t &&) = default;
-    py_spans_t(parents_t parents, parts_t parts) : parents_(std::move(parents)), parts_(std::move(parts)) {}
-
-    struct iterator_t {
-        py_spans_t const *py_spans_ = nullptr;
-        size_t idx_ = 0;
-
-        bool operator==(iterator_t const &other) const { return idx_ == other.idx_; }
-        bool operator!=(iterator_t const &other) const { return idx_ != other.idx_; }
-        std::shared_ptr<py_subspan_t> operator*() const { return py_spans_->at(idx_); }
-        iterator_t &operator++() {
-            idx_++;
-            return *this;
-        }
-        iterator_t operator++(int) {
-            iterator_t old(*this);
-            ++*this;
-            return old;
-        }
-    };
-
-    std::shared_ptr<py_subspan_t> pop(ssize_t i) {
-        std::size_t offset = unsigned_offset(size(), i);
-        span_t part = parts_[offset];
-        if (!part) {
-            parts_.erase(parts_.begin() + offset);
-            return empty_subspan;
-        }
-        auto parent_iterator = parents_.find(part.data());
-        auto popped = std::make_shared<py_subspan_t>(*parent_iterator, part);
-        parts_.erase(parts_.begin() + offset);
-        return popped;
-    }
-
-    std::shared_ptr<py_subspan_t> at(ssize_t i) const {
-        std::size_t offset = unsigned_offset(size(), i);
-        span_t part = parts_[offset];
-        if (!part)
-            return empty_subspan;
-        auto parent_iterator = parents_.find(part.data());
-        auto popped = std::make_shared<py_subspan_t>(*parent_iterator, part);
-        return popped;
-    }
-
-    std::shared_ptr<py_spans_t> sub(ssize_t start, ssize_t end, ssize_t step, ssize_t length) const {
-        if (step == 1) {
-            auto first_part_it = parts_.begin() + start;
-            std::vector<span_t> sub_parts(first_part_it, first_part_it + length);
-            return std::make_shared<py_spans_t>(parents_, std::move(sub_parts));
-        }
-        std::vector<span_t> sub_parts(length);
-        for (ssize_t parts_idx = start, sub_idx = 0; sub_idx < length; parts_idx += step, ++sub_idx)
-            sub_parts[sub_idx] = parts_[parts_idx];
-        return std::make_shared<py_spans_t>(parents_, std::move(sub_parts));
-    }
-
-    iterator_t begin() const { return {this, 0}; }
-    iterator_t end() const { return {this, parts_.size()}; }
-    ssize_t size() const { return static_cast<ssize_t>(parts_.size()); }
-
-    void sort() {
-        std::vector<std::size_t> permute(parts_.size());
-        std::iota(permute.begin(), permute.end(), 0ul);
-        strzl_array_t array;
-        array.order = permute.data();
-        array.count = permute.size();
-        array.handle = parts_.data();
-        array.get_begin = strzl_array_get_begin;
-        array.get_length = strzl_array_get_length;
-        strzl_sort(&array, nullptr);
-        std::vector<span_t> new_parts(parts_.size());
-        for (std::size_t i = 0; i != parts_.size(); ++i)
-            new_parts[i] = parts_[permute[i]];
-        parts_ = new_parts;
-    }
-
-    void shuffle(std::optional<std::size_t> maybe_seed) {
-        std::random_device device;
-        std::size_t seed = maybe_seed ? *maybe_seed : device();
-        using seed_t = typename std::mt19937::result_type;
-        std::mt19937 generator {static_cast<seed_t>(seed)};
-        std::shuffle(parts_.begin(), parts_.end(), generator);
-    }
-
-    void reverse() { std::reverse(parts_.begin(), parts_.end()); }
-
-    void extend(py_spans_t const &other) {
-        parents_.insert(other.parents_.begin(), other.parents_.end());
-        parts_.insert(parts_.end(), other.parts_.begin(), other.parts_.end());
-    }
-
-    template <typename py_span_or_derived_at>
-    void append(std::shared_ptr<py_span_or_derived_at> const &other) {
-        parents_.insert(std::dynamic_pointer_cast<py_span_t>(other));
-        parts_.push_back(other->span());
-    }
-
-    void append_copy(std::string_view other) { append(std::make_shared<py_str_t>(other)); }
-
-    void extend_copy(std::vector<std::string_view> const &others) {
-        // `std::set` doesn't ahve such an interface:
-        // parents_.reserve(parents_.size() + others.size());
-        parts_.reserve(parts_.size() + others.size());
-        for (std::string_view other : others)
-            append_copy(other);
-    }
-
-    std::shared_ptr<py_spans_t> sorted() const {
-        auto copy = std::make_shared<py_spans_t>(parents_, parts_);
-        copy->sort();
-        return copy;
-    }
-
-    std::shared_ptr<py_spans_t> shuffled(std::optional<std::size_t> maybe_seed) const {
-        auto copy = std::make_shared<py_spans_t>(parents_, parts_);
-        copy->shuffle(maybe_seed);
-        return copy;
-    }
-};
-
-bool py_span_t::contains(std::string_view needle, ssize_t start, ssize_t end) const {
-    if (needle.size() == 0)
-        return true;
-    span_t part = subspan(span(), start, end);
-    size_t offset = needle.size() == 1 //
-                        ? find_substr(part, needle.front())
-                        : find_substr(part, to_span(needle));
-    return offset != part.len;
-}
-
-ssize_t py_span_t::find(std::string_view needle, ssize_t start, ssize_t end) const {
-    if (needle.size() == 0)
-        return 0;
-    span_t part = subspan(span(), start, end);
-    size_t offset = needle.size() == 1 //
-                        ? find_substr(part, needle.front())
-                        : find_substr(part, to_span(needle));
-    return offset != part.len ? offset : -1;
-}
-
-ssize_t py_span_t::count(std::string_view needle, ssize_t start, ssize_t end, bool allowoverlap) const {
-    if (needle.size() == 0)
-        return 0;
-    span_t part = subspan(span(), start, end);
-    auto result = needle.size() == 1 //
-                      ? count_char(part, needle.front())
-                      : count_substr(part, to_span(needle), allowoverlap);
-    return result;
-}
-
-std::shared_ptr<py_spans_t> py_span_t::splitlines(bool keeplinebreaks, char separator, size_t maxsplit) const {
-
-    size_t count_separators = count_char(span(), separator);
-    std::vector<span_t> parts(std::min(count_separators + 1, maxsplit));
-    size_t last_start = 0;
-    for (size_t i = 0; i + 1 < parts.size(); ++i) {
-        span_t remaining = after_n(last_start);
-        size_t offset_in_remaining = find_substr(remaining, separator);
-        parts[i] = span_t {ptr + last_start, offset_in_remaining + keeplinebreaks};
-        last_start += offset_in_remaining + 1;
-    }
-    parts[count_separators] = after_n(last_start);
-    py_spans_t::parent_t parent = shared_from_this();
-    return std::make_shared<py_spans_t>(py_spans_t::parents_t {std::move(parent)}, std::move(parts));
-}
-
-std::shared_ptr<py_spans_t> py_span_t::split(std::string_view separator, size_t maxsplit, bool keepseparator) const {
-
-    if (separator.size() == 1 && maxsplit == ssize_max_k)
-        return splitlines(keepseparator, separator.front(), maxsplit);
-
-    std::vector<span_t> parts;
-    size_t last_start = 0;
-    bool will_continue = true;
-    while (last_start < len && parts.size() + 1 < maxsplit) {
-        span_t remaining = after_n(last_start);
-        size_t offset_in_remaining = find_substr(remaining, to_span(separator));
-        will_continue = offset_in_remaining != remaining.size();
-        size_t part_len = offset_in_remaining + separator.size() * keepseparator * will_continue;
-        parts.emplace_back(span_t {remaining.data(), part_len});
-        last_start += offset_in_remaining + separator.size();
-    }
-    // Python marks includes empy ending as well
-    if (will_continue)
-        parts.emplace_back(after_n(last_start));
-    py_spans_t::parent_t parent = shared_from_this();
-    return std::make_shared<py_spans_t>(py_spans_t::parents_t {std::move(parent)}, std::move(parts));
-}
-
-std::shared_ptr<py_subspan_t> py_span_t::sub(ssize_t start, ssize_t end) const {
-    index_span_t index_span = slice(size(), start, end);
-    return std::make_shared<py_subspan_t>(shared_from_this(), span_t {ptr + index_span.offset, index_span.length});
-}
-
-template <typename at>
-void define_comparsion_ops(py::class_<at, std::shared_ptr<at>> &str_view_struct) {
-    str_view_struct.def("__hash__", [](at const &self) { return self.hash(); });
-    str_view_struct.def("__eq__", [](at const &self, py::str const &str) { return self == str; });
-    str_view_struct.def("__ne__", [](at const &self, py::str const &str) { return self != str; });
-    str_view_struct.def("__eq__", [](at const &self, at const &other) { return self == other; });
-    str_view_struct.def("__ne__", [](at const &self, at const &other) { return self != other; });
-    str_view_struct.def("__gt__", [](at const &self, py::str const &str) { return self > str; });
-    str_view_struct.def("__lt__", [](at const &self, py::str const &str) { return self < str; });
-    str_view_struct.def("__gt__", [](at const &self, at const &other) { return self > other; });
-    str_view_struct.def("__lt__", [](at const &self, at const &other) { return self < other; });
-}
-
-template <typename at>
-void define_slice_ops(py::class_<at, std::shared_ptr<at>> &str_view_struct) {
-
-    str_view_struct.def( //
-        "contains",
-        &at::contains,
-        py::arg("needle"),
-        py::arg("start") = 0,
-        py::arg("end") = ssize_max_k,
-        py::call_guard<py::gil_scoped_release>());
-    str_view_struct.def( //
-        "find",
-        &at::find,
-        py::arg("needle"),
-        py::arg("start") = 0,
-        py::arg("end") = ssize_max_k,
-        py::call_guard<py::gil_scoped_release>());
-    str_view_struct.def( //
-        "count",
-        &at::count,
-        py::arg("needle"),
-        py::arg("start") = 0,
-        py::arg("end") = ssize_max_k,
-        py::arg("allowoverlap") = false,
-        py::call_guard<py::gil_scoped_release>());
-    str_view_struct.def( //
-        "splitlines",
-        &at::splitlines,
-        py::arg("keeplinebreaks") = false,
-        py::arg("separator") = '\n',
-        py::kw_only(),
-        py::arg("maxsplit") = size_max_k,
-        py::call_guard<py::gil_scoped_release>());
-    str_view_struct.def( //
-        "split",
-        &at::split,
-        py::arg("separator") = " ",
-        py::arg("maxsplit") = size_max_k,
-        py::kw_only(),
-        py::arg("keepseparator") = false,
-        py::call_guard<py::gil_scoped_release>());
-    str_view_struct.def( //
-        "sub",
-        &at::sub,
-        py::arg("start") = 0,
-        py::arg("end") = 0);
-
-    // Substring presence operator
-    str_view_struct.def("__contains__",
-                        [](at const &str, std::string_view needle) { return str.contains(needle, 0, ssize_max_k); });
-
-    // Character access operators
-    str_view_struct.def("__str__", &at::to_python);
-    str_view_struct.def("__getitem__", &at::at, py::arg("index"));
-    str_view_struct.def("__len__", &at::size);
-    str_view_struct.def(
-        "__iter__",
-        [](at const &s) { return py::make_iterator(s.begin(), s.end()); },
-        py::keep_alive<0, 1>());
-}
-
-#ifdef PURE_CPYTHON
-typedef struct
-{
-        PyObject_HEAD
-        std::shared_ptr<py_span_t>  span;
-} PySpan;
-
-
-static PyMethodDef PySpan_methods[] = {
-    {"size", (PyCFunction)PySpan_size, METH_NOARGS, "Get the size"},
-    {"__hash__", (PyCFunction) PySpan_hash, METH_NOARGS, "Returns the hash value"},
-    {"__eq__", (PyCFunction) PySpan_eq, METH_O, "Equality check"},
-    {"__ne__", (PyCFunction) PySpan_ne, METH_O, "Non-equality check"},
-    {"__gt__", (PyCFunction) PySpan_gt, METH_O, "Greater than check"},
-    {"__lt__", (PyCFunction) PySpan_lt, METH_O, "Less than check"},
-    {"contains", (PyCFunction)PyStrView_contains, METH_VARARGS | METH_KEYWORDS, "Check if contains"},
-    {"find", (PyCFunction)PyStrView_find, METH_VARARGS | METH_KEYWORDS, "Find needle"},
-    {NULL}  // Sentinel
-};
-
-static PyObject *PySpan_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
-{
-    PySpan *self;
-    self = (PySpan *)type->tp_alloc(type, 0);
-    return (PyObject *)self;
-}
-static int PySpan_init(PySpan *self, PyObject *args, PyObject *kwds)
-{
-    self->span = std::make_shared<py_span_t>();
-    return 0;
-}
-
-static PyObject *PySpan_size(PySpan *self, PyObject *Py_UNUSED(ignored))
-{
-    return PyLong_FromSsize_t(self->span->size());
-}
-
-static void PySpan_dealloc(PySpan *self)
-{
-    // Handle the deallocation of the C++ object
-    self->span.reset();
-    Py_TYPE(self)->tp_free((PyObject *)self);
-}
-
-static PyObject* PySpan_hash(PySpan *self) {
-    return PyLong_FromUnsignedLong(self->span->hash());
-}
-
-static PyObject* PySpan_eq(PySpan *self, PyObject *other) {
-    if (PyUnicode_Check(other)) {
-        return PyBool_FromLong(self->span->operator==(PyUnicode_AsUTF8(other)));
-    } else if (PyObject_TypeCheck(other, &PySpanType)) {
-        return PyBool_FromLong(self->span->operator==(((PySpan *)other)->span));
-    }
-    Py_RETURN_FALSE;
-}
-
-static PyObject* PySpan_ne(PySpan *self, PyObject *other) {
-    if (PyUnicode_Check(other)) {
-        return PyBool_FromLong(self->span->operator!=(PyUnicode_AsUTF8(other)));
-    } else if (PyObject_TypeCheck(other, &PySpanType)) {
-        return PyBool_FromLong(self->span->operator!=(((PySpan *)other)->span));
-    }
-    Py_RETURN_TRUE;
-}
-
-static PyObject* PySpan_gt(PySpan *self, PyObject *other) {
-    if (PyUnicode_Check(other)) {
-        return PyBool_FromLong(self->span->operator>(PyUnicode_AsUTF8(other)));
-    } else if (PyObject_TypeCheck(other, &PySpanType)) {
-        return PyBool_FromLong(self->span->operator>(((PySpan *)other)->span));
-    }
-    Py_RETURN_FALSE;
-}
-
-static PyObject* PySpan_lt(PySpan *self, PyObject *other) {
-    if (PyUnicode_Check(other)) {
-        return PyBool_FromLong(self->span->operator<(PyUnicode_AsUTF8(other)));
-    } else if (PyObject_TypeCheck(other, &PySpanType)) {
-        return PyBool_FromLong(self->span->operator<(((PySpan *)other)->span));
-    }
-    Py_RETURN_FALSE;
-}
-
-static PyObject * PyStrView_contains(PyStrView *self, PyObject *args, PyObject *kwargs) {
-    char *needle;
-    int start = 0, end = INT_MAX;
-    static char *kwlist[] = {"needle", "start", "end", NULL};
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|ii", kwlist, &needle, &start, &end)) {
-        return NULL;
-    }
-    
-    if(self->span->contains(needle,start,end)){
-        Py_INCREF(Py_True);
-        return Py_True;
-    } else {
-        Py_INCREF(Py_False);
-        return Py_False;
-    }
-}
-static PyObject * PyStrView_find(PyStrView *self, PyObject *args, PyObject *kwargs) {
-    char *needle;
-    int start = 0, end = INT_MAX;
-    static char *kwlist[] = {"needle", "start", "end", NULL};
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|ii", kwlist, &needle, &start, &end)) {
-        return NULL;
-    }
-    return PyLong_FromLong(self->span->find(needle,start,end));
-}
-
-static PyTypeObject PySpanType = {
-    PyVarObject_HEAD_INIT(NULL, 0) /* ob_size */
-    "YourModule.Span",             /* tp_name */
-    sizeof(PySpan),                /* tp_basicsize */
-    0,                             /* tp_itemsize */
-    (destructor)PySpan_dealloc,    /* tp_dealloc */
-    0,                             /* tp_print */
-    0,                             /* tp_getattr */
-    0,                             /* tp_setattr */
-    0,                             /* tp_compare */
-    0,                             /* tp_repr */
-    0,                             /* tp_as_number */
-    0,                             /* tp_as_sequence */
-    0,                             /* tp_as_mapping */
-    0,                             /* tp_hash */
-    0,                             /* tp_call */
-    0,                             /* tp_str */
-    0,                             /* tp_getattro */
-    0,                             /* tp_setattro */
-    0,                             /* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT,            /* tp_flags */
-    "Span objects",                /* tp_doc */
-    0,                             /* tp_traverse */
-    0,                             /* tp_clear */
-    0,                             /* tp_richcompare */
-    0,                             /* tp_weaklistoffset */
-    0,                             /* tp_iter */
-    0,                             /* tp_iternext */
-    PySpan_methods,                /* tp_methods */
-    0,                             /* tp_members */
-    0,                             /* tp_getset */
-    0,                             /* tp_base */
-    0,                             /* tp_dict */
-    0,                             /* tp_descr_get */
-    0,                             /* tp_descr_set */
-    0,                             /* tp_dictoffset */
-    (initproc)PySpan_init,         /* tp_init */
-    0,                             /* tp_alloc */
-    PySpan_new,                    /* tp_new */
-};
+/**
+ *  @brief  Type-punned StringZilla-string, that may either be an immutable in-memory string,
+ *          similar to Python's native `str`, or a memory-mapped immutable file from disk,
+ *          or a slice of one of those classes or the Python's native `str` and `bytes` classes.
+ *
+ *  When a slice is being used, the `parent` object's reference count is being incremented.
+ *  When an in-memory string is used - we avoid the second memory allocation and allocate the `HEAD`,
+ *  the length, and the content region in a single continuous chunk.
+ */
+typedef struct {
+    PyObject_HEAD;
+
+    typedef enum {
+        in_memory_k,
+        on_disk_k,
+        slice_k,
+    } variant;
+
+    typedef struct {
+        size_t length;
+    } in_memory_t;
+
+    typedef struct {
+        void *start;
+        size_t length;
+        int file_descriptor;
+    } on_disk_t;
+
+    typedef struct {
+        PyObject *parent;
+        void *start;
+        size_t length;
+    } slice_t;
+} strzl_t;
 
 static PyModuleDef stringzilla_module = {
     PyModuleDef_HEAD_INIT,
     "stringzilla",
     "Crunch 100+ GB Strings in Python with ease",
     -1,
-    NULL, NULL, NULL, NULL, NULL
+    NULL,
+    NULL,
+    NULL,
+    NULL,
+    NULL,
 };
 
 PyMODINIT_FUNC PyInit_stringzilla(void) {
@@ -828,92 +82,3 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
 
     return m;
 }
-
-#endif
-
-PYBIND11_MODULE(stringzilla, m) {
-    m.doc() = "Crunch 100+ GB Strings in Python with ease";
-
-    auto py_span = py::class_<py_span_t, std::shared_ptr<py_span_t>>(m, "Span");
-    define_comparsion_ops(py_span);
-    define_slice_ops(py_span);
-
-    auto py_subspan = py::class_<py_subspan_t, std::shared_ptr<py_subspan_t>>(m, "SubSpan");
-    define_comparsion_ops(py_subspan);
-    define_slice_ops(py_subspan);
-
-    auto py_str = py::class_<py_str_t, std::shared_ptr<py_str_t>>(m, "Str");
-    py_str.def(py::init([](std::string arg) { return std::make_shared<py_str_t>(std::move(arg)); }), py::arg("str"));
-    py_str.def("__getitem__", [](py_str_t &s, py::slice slice) {
-        ssize_t start, stop, step, length;
-        if (!slice.compute(s.size(), &start, &stop, &step, &length))
-            throw py::error_already_set();
-        if (step != 1)
-            throw std::invalid_argument("Step argument is not supported for Str");
-        return s.sub(start, stop);
-    });
-    define_comparsion_ops(py_str);
-    define_slice_ops(py_str);
-
-    auto py_file = py::class_<py_file_t, std::shared_ptr<py_file_t>>(m, "File");
-    py_file.def( //
-        py::init([](std::string path) { return std::make_shared<py_file_t>(std::move(path)); }),
-        py::arg("path"));
-    define_slice_ops(py_file);
-    py_file.def("open", &py_file_t::open, py::arg("path"));
-    py_file.def("open", &py_file_t::reopen);
-    py_file.def("load", &py_file_t::load);
-    py_file.def("close", &py_file_t::close);
-    py_file.def("__getitem__", [](py_file_t &s, py::slice slice) {
-        ssize_t start, stop, step, length;
-        if (!slice.compute(s.size(), &start, &stop, &step, &length))
-            throw py::error_already_set();
-        if (step != 1)
-            throw std::invalid_argument("Step argument is not supported for File");
-        return s.sub(start, stop);
-    });
-
-    auto py_strs = py::class_<py_spans_t, std::shared_ptr<py_spans_t>>(m, "Strs");
-    py_strs.def(py::init([]() { return std::make_shared<py_spans_t>(); }));
-    py_strs.def("__len__", &py_spans_t::size);
-    py_strs.def("__getitem__", &py_spans_t::at, py::arg("index"));
-    py_strs.def(
-        "__iter__",
-        [](py_spans_t const &s) { return py::make_iterator(s.begin(), s.end()); },
-        py::keep_alive<0, 1>());
-    py_strs.def("pop", &py_spans_t::pop, py::call_guard<py::gil_scoped_release>());
-    py_strs.def("sort", &py_spans_t::sort, py::call_guard<py::gil_scoped_release>());
-    py_strs.def("reverse", &py_spans_t::reverse, py::call_guard<py::gil_scoped_release>());
-    py_strs.def("shuffle",
-                &py_spans_t::shuffle,
-                py::arg("seed") = std::nullopt,
-                py::call_guard<py::gil_scoped_release>());
-    py_strs.def("__getitem__", [](py_spans_t &s, py::slice slice) {
-        ssize_t start, stop, step, length;
-        if (!slice.compute(s.size(), &start, &stop, &step, &length))
-            throw py::error_already_set();
-        return s.sub(start, stop, step, length);
-    });
-    py_strs.def( //
-        "sub",
-        [](py_spans_t &s, ssize_t start, ssize_t stop, ssize_t step = 1) {
-            auto index_span = slice(s.size(), start, stop);
-            ssize_t length = stop = index_span.length;
-            start = index_span.offset;
-            return s.sub(start, stop, step, length);
-        });
-
-    py_strs.def("shuffled",
-                &py_spans_t::shuffled,
-                py::arg("seed") = std::nullopt,
-                py::call_guard<py::gil_scoped_release>());
-    py_strs.def("sorted", &py_spans_t::sorted, py::call_guard<py::gil_scoped_release>());
-
-    py_strs.def("extend", &py_spans_t::extend, py::call_guard<py::gil_scoped_release>());
-    py_strs.def("append", &py_spans_t::append<py_span_t>, py::call_guard<py::gil_scoped_release>());
-    py_strs.def("append", &py_spans_t::append<py_str_t>, py::call_guard<py::gil_scoped_release>());
-    py_strs.def("append", &py_spans_t::append<py_file_t>, py::call_guard<py::gil_scoped_release>());
-    py_strs.def("append", &py_spans_t::append<py_subspan_t>, py::call_guard<py::gil_scoped_release>());
-    py_strs.def("append", &py_spans_t::append_copy);
-    py_strs.def("extend", &py_spans_t::extend_copy);
-}

From 963cd4d90f0cf30ed6ba361e73d6915a6eee1ff5 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 11 Sep 2023 18:09:57 +0400
Subject: [PATCH 03/72] Docs: Annotate SWAR methods

---
 .vscode/settings.json     |  2 ++
 stringzilla/stringzilla.h | 25 ++++++++++++++++++-------
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 0ed99251..3a4c79d2 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -121,6 +121,7 @@
   "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
   "cSpell.words": [
     "allowoverlap",
+    "bigram",
     "cibuildwheel",
     "getitem",
     "keeplinebreaks",
@@ -129,6 +130,7 @@
     "maxsplit",
     "memcpy",
     "pytest",
+    "quadgram",
     "readlines",
     "SIMD",
     "splitlines",
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 303c8dbd..3e7df130 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -55,8 +55,7 @@ typedef struct strzl_needle_t {
 } strzl_needle_t;
 
 /**
- *  @brief  A naive subtring matching algorithm with O(|h|*|n|) comparisons.
- *          Matching performance fluctuates between 200 MB/s and 2 GB/s.
+ *  @brief  SWAR single-character counting procedure, jumping 8 bytes at a time.
  */
 inline static size_t strzl_naive_count_char(strzl_haystack_t h, char n) {
 
@@ -67,7 +66,7 @@ inline static size_t strzl_naive_count_char(strzl_haystack_t h, char n) {
     for (; (uint64_t)text % 8 != 0 && text < end; ++text)
         result += *text == n;
 
-    // This code simulates hyperscalar execution, comparing 8 characters at a time.
+    // This code simulates hyper-scalar execution, comparing 8 characters at a time.
     uint64_t nnnnnnnn = n;
     nnnnnnnn |= nnnnnnnn << 8;
     nnnnnnnn |= nnnnnnnn << 16;
@@ -87,6 +86,9 @@ inline static size_t strzl_naive_count_char(strzl_haystack_t h, char n) {
     return result;
 }
 
+/**
+ *  @brief  SWAR single-character search in string, jumping 8 bytes at a time.
+ */
 inline static size_t strzl_naive_find_char(strzl_haystack_t h, char n) {
 
     char const *text = h.ptr;
@@ -96,7 +98,7 @@ inline static size_t strzl_naive_find_char(strzl_haystack_t h, char n) {
         if (*text == n)
             return text - h.ptr;
 
-    // This code simulates hyperscalar execution, analyzing 8 offsets at a time.
+    // This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
     uint64_t nnnnnnnn = n;
     nnnnnnnn |= nnnnnnnn << 8;  // broadcast `n` into `nnnnnnnn`
     nnnnnnnn |= nnnnnnnn << 16; // broadcast `n` into `nnnnnnnn`
@@ -119,12 +121,15 @@ inline static size_t strzl_naive_find_char(strzl_haystack_t h, char n) {
     return h.len;
 }
 
+/**
+ *  @brief  SWAR character-bigram search in string, jumping 8 bytes at a time.
+ */
 inline static size_t strzl_naive_find_2chars(strzl_haystack_t h, char const *n) {
 
     char const *text = h.ptr;
     char const *end = h.ptr + h.len;
 
-    // This code simulates hyperscalar execution, analyzing 7 offsets at a time.
+    // This code simulates hyper-scalar execution, analyzing 7 offsets at a time.
     uint64_t nnnn = (uint64_t(n[0]) << 0) | (uint64_t(n[1]) << 8); // broadcast `n` into `nnnn`
     nnnn |= nnnn << 16;                                            // broadcast `n` into `nnnn`
     nnnn |= nnnn << 32;                                            // broadcast `n` into `nnnn`
@@ -158,12 +163,15 @@ inline static size_t strzl_naive_find_2chars(strzl_haystack_t h, char const *n)
     return h.len;
 }
 
+/**
+ *  @brief  SWAR character-trigram search in string, jumping 8 bytes at a time.
+ */
 inline static size_t strzl_naive_find_3chars(strzl_haystack_t h, char const *n) {
 
     char const *text = h.ptr;
     char const *end = h.ptr + h.len;
 
-    // This code simulates hyperscalar execution, analyzing 6 offsets at a time.
+    // This code simulates hyper-scalar execution, analyzing 6 offsets at a time.
     // We have two unused bytes at the end.
     uint64_t nn = uint64_t(n[0] << 0) | (uint64_t(n[1]) << 8) | (uint64_t(n[2]) << 16); // broadcast `n` into `nn`
     nn |= nn << 24;                                                                     // broadcast `n` into `nn`
@@ -210,12 +218,15 @@ inline static size_t strzl_naive_find_3chars(strzl_haystack_t h, char const *n)
     return h.len;
 }
 
+/**
+ *  @brief  SWAR character-quadgram search in string, jumping 8 bytes at a time.
+ */
 inline static size_t strzl_naive_find_4chars(strzl_haystack_t h, char const *n) {
 
     char const *text = h.ptr;
     char const *end = h.ptr + h.len;
 
-    // This code simulates hyperscalar execution, analyzing 4 offsets at a time.
+    // This code simulates hyper-scalar execution, analyzing 4 offsets at a time.
     uint64_t nn = uint64_t(n[0] << 0) | (uint64_t(n[1]) << 8) | (uint64_t(n[2]) << 16) | (uint64_t(n[3]) << 24);
     nn |= nn << 32;
     nn = nn;

From aa57b4f63e98bab674945a4c4a14a75dcba0e7ee Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 11 Sep 2023 18:11:34 +0400
Subject: [PATCH 04/72] Add: `MemoryMappedFile`

---
 python/lib.cpp | 303 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 270 insertions(+), 33 deletions(-)

diff --git a/python/lib.cpp b/python/lib.cpp
index 77678767..5267cff4 100644
--- a/python/lib.cpp
+++ b/python/lib.cpp
@@ -1,5 +1,6 @@
 /**
- *  @brief
+ *  @brief  Very light-weight CPython wrapper for StringZilla, with support for memory-mapping,
+ *          native Python strings, Apache Arrow collections, and more.
  */
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
 #define NOMINMAX
@@ -20,40 +21,264 @@ typedef SSIZE_T ssize_t;
 
 #include <Python.h>
 
+#pragma region Helpers
+
+void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset, size_t *normalized_length) {
+
+    // clang-format off
+    // Normalize negative indices
+    if (start < 0) start += length;
+    if (end < 0) end += length;
+
+    // Clamp indices to a valid range
+    if (start < 0) start = 0;
+    if (end < 0) end = 0;
+    if (start > length) start = length;
+    if (end > length) end = length;
+
+    // Ensure start <= end
+    if (start > end) start = end;
+    // clang-format on
+
+    *normalized_offset = start;
+    *normalized_length = end - start;
+}
+
+#pragma endregion
+
+#pragma region MemoryMappingFile
+
+/**
+ *  @brief  Describes an on-disk file mapped into RAM, which is different from Python's
+ *          native `mmap` module, as it exposes the address of the mapping in memory.
+ */
+typedef struct {
+    PyObject_HEAD;
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
+    HANDLE file_handle;
+    HANDLE mapping_handle;
+#else
+    int file_descriptor;
+#endif
+    void *ptr;
+    size_t len;
+} MemoryMappedFile;
+
+static void MemoryMappedFile_dealloc(MemoryMappedFile *self) {
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
+    if (self->ptr) {
+        UnmapViewOfFile(self->ptr);
+        self->ptr = NULL;
+    }
+    if (self->mapping_handle) {
+        CloseHandle(self->mapping_handle);
+        self->mapping_handle = NULL;
+    }
+    if (self->file_handle) {
+        CloseHandle(self->file_handle);
+        self->file_handle = NULL;
+    }
+#else
+    if (self->ptr) {
+        munmap(self->ptr, self->len);
+        self->ptr = NULL;
+        self->len = 0;
+    }
+    if (self->file_descriptor != 0) {
+        close(self->file_descriptor);
+        self->file_descriptor = 0;
+    }
+#endif
+    Py_TYPE(self)->tp_free((PyObject *)self);
+}
+
+static PyObject *MemoryMappedFile_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) {
+    MemoryMappedFile *self;
+    self = (MemoryMappedFile *)type->tp_alloc(type, 0);
+    if (self != NULL) {
+        self->ptr = NULL;
+        self->len = 0;
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
+        self->file_handle = NULL;
+        self->mapping_handle = NULL;
+#else
+        self->file_descriptor = 0;
+#endif
+    }
+    return (PyObject *)self;
+}
+
+static int MemoryMappedFile_init(MemoryMappedFile *self, PyObject *positional_args, PyObject *named_args) {
+    const char *path;
+    if (!PyArg_ParseTuple(positional_args, "s", &path)) {
+        return -1;
+    }
+
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
+    self->file_handle = CreateFile(path, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
+    if (self->file_handle == INVALID_HANDLE_VALUE) {
+        PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
+        return -1;
+    }
+
+    self->mapping_handle = CreateFileMapping(self->file_handle, 0, PAGE_READONLY, 0, 0, 0);
+    if (self->mapping_handle == 0) {
+        CloseHandle(self->file_handle);
+        self->file_handle = NULL;
+        PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
+        return -1;
+    }
+
+    char *file = (char *)MapViewOfFile(self->mapping_handle, FILE_MAP_READ, 0, 0, 0);
+    if (file == 0) {
+        CloseHandle(self->mapping_handle);
+        self->mapping_handle = NULL;
+        CloseHandle(self->file_handle);
+        self->file_handle = NULL;
+        PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
+        return -1;
+    }
+    self->ptr = file;
+    self->len = GetFileSize(self->file_handle, 0);
+#else
+    struct stat sb;
+    self->file_descriptor = open(path, O_RDONLY);
+    if (fstat(self->file_descriptor, &sb) != 0) {
+        close(self->file_descriptor);
+        self->file_descriptor = 0;
+        PyErr_SetString(PyExc_RuntimeError, "Can't retrieve file size!");
+        return -1;
+    }
+    size_t file_size = sb.st_size;
+    void *map = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, self->file_descriptor, 0);
+    if (map == MAP_FAILED) {
+        close(self->file_descriptor);
+        self->file_descriptor = 0;
+        PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
+        return -1;
+    }
+    self->ptr = map;
+    self->len = file_size;
+#endif
+
+    return 0;
+}
+
+static PyMethodDef MemoryMappedFile_methods[] = {
+    // Your method definitions here
+    {NULL} /* Sentinel */
+};
+
+static PyTypeObject MemoryMappedFileType = {
+    PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.MemoryMappedFile",
+    .tp_doc = "MemoryMappedFile objects",
+    .tp_basicsize = sizeof(MemoryMappedFile),
+    .tp_itemsize = 0,
+    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,
+    .tp_new = MemoryMappedFile_new,
+    .tp_init = (initproc)MemoryMappedFile_init,
+    .tp_dealloc = (destructor)MemoryMappedFile_dealloc,
+    .tp_methods = MemoryMappedFile_methods,
+};
+
+#pragma endregion
+
+#pragma region Str
+
 /**
- *  @brief  Type-punned StringZilla-string, that may either be an immutable in-memory string,
- *          similar to Python's native `str`, or a memory-mapped immutable file from disk,
- *          or a slice of one of those classes or the Python's native `str` and `bytes` classes.
+ *  @brief  Type-punned StringZilla-string, that points to a slice of an existing Python `str`
+ *          or a `MemoryMappedFile`.
  *
- *  When a slice is being used, the `parent` object's reference count is being incremented.
- *  When an in-memory string is used - we avoid the second memory allocation and allocate the `HEAD`,
- *  the length, and the content region in a single continuous chunk.
+ *  When a slice is constructed, the `parent` object's reference count is being incremented to preserve lifetime.
+ *  It usage in Python would look like:
+ *
+ *      - Str() # Empty string
+ *      - Str("some-string") # Full-range slice of a Python `str`
+ *      - Str(File("some-path.txt")) # Full-range view of a persisted file
+ *      - Str(File("some-path.txt"), from=0, to=sys.maxint)
  */
 typedef struct {
     PyObject_HEAD;
+    PyObject *parent;
+    void *start;
+    size_t length;
+} Str;
+
+static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args) {
+    PyObject *parent = NULL;
+    Py_ssize_t from = 0;
+    Py_ssize_t to = PY_SSIZE_T_MAX;
+
+    // The `named_args` would be `NULL`
+    if (named_args) {
+        static char *names[] = {"parent", "from", "to", NULL};
+        if (!PyArg_ParseTupleAndKeywords(positional_args, named_args, "|Onn", names, &parent, &from, &to))
+            return -1;
+    }
+    else if (!PyArg_ParseTuple(positional_args, "|Onn", &parent, &from, &to))
+        return -1;
 
-    typedef enum {
-        in_memory_k,
-        on_disk_k,
-        slice_k,
-    } variant;
-
-    typedef struct {
-        size_t length;
-    } in_memory_t;
-
-    typedef struct {
-        void *start;
-        size_t length;
-        int file_descriptor;
-    } on_disk_t;
-
-    typedef struct {
-        PyObject *parent;
-        void *start;
-        size_t length;
-    } slice_t;
-} strzl_t;
+    self->parent = parent;
+    if (PyUnicode_Check(parent)) {
+        // Handle Python str
+        self->start = PyUnicode_DATA(parent);
+        self->length = PyUnicode_GET_DATA_SIZE(parent);
+        Py_INCREF(parent); // Increment the reference count of the parent
+    }
+    else if (PyObject_TypeCheck(parent, &MemoryMappedFileType)) {
+        // Handle MemoryMappedFile
+        MemoryMappedFile *file = (MemoryMappedFile *)parent;
+        self->start = file->ptr;
+        self->length = file->len;
+        Py_INCREF(parent); // Increment the reference count of the parent
+    }
+    else if (parent == NULL) {
+        // Handle empty string
+        self->start = NULL;
+        self->length = 0;
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError, "Unsupported parent type");
+        return -1;
+    }
+
+    // Apply slicing
+    size_t normalized_offset, normalized_length;
+    slice(self->length, from, to, &normalized_offset, &normalized_length);
+    self->start = ((char *)self->start) + normalized_offset;
+    self->length = normalized_length;
+    return 0;
+}
+
+static PyObject *Str_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
+    Str *self;
+    self = (Str *)type->tp_alloc(type, 0);
+    if (!self)
+        return NULL;
+
+    self->parent = NULL;
+    self->start = NULL;
+    self->length = 0;
+    return (PyObject *)self;
+}
+
+static void Str_dealloc(Str *self) {
+    if (self->parent)
+        Py_XDECREF(self->parent);
+    Py_TYPE(self)->tp_free((PyObject *)self);
+}
+
+static PyTypeObject StrType = {
+    PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.Str",
+    .tp_doc = "Stringzilla Str objects",
+    .tp_basicsize = sizeof(Str),
+    .tp_itemsize = 0,
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_new = Str_new,
+    .tp_dealloc = (destructor)Str_dealloc,
+};
+
+#pragma endregion
 
 static PyModuleDef stringzilla_module = {
     PyModuleDef_HEAD_INIT,
@@ -70,15 +295,27 @@ static PyModuleDef stringzilla_module = {
 PyMODINIT_FUNC PyInit_stringzilla(void) {
     PyObject *m;
 
-    if (PyType_Ready(&PySpanType) < 0)
+    if (PyType_Ready(&StrType) < 0)
         return NULL;
 
     m = PyModule_Create(&stringzilla_module);
     if (m == NULL)
         return NULL;
 
-    Py_INCREF(&PySpanType);
-    PyModule_AddObject(m, "Span", (PyObject *)&PySpanType);
+    Py_INCREF(&StrType);
+    if (PyModule_AddObject(m, "Str", (PyObject *)&StrType) < 0) {
+        Py_XDECREF(&StrType);
+        Py_XDECREF(m);
+        return NULL;
+    }
+
+    Py_INCREF(&MemoryMappedFileType);
+    if (PyModule_AddObject(m, "MemoryMappedFile", (PyObject *)&MemoryMappedFileType) < 0) {
+        Py_XDECREF(&MemoryMappedFileType);
+        Py_XDECREF(&StrType);
+        Py_XDECREF(m);
+        return NULL;
+    }
 
     return m;
-}
+}
\ No newline at end of file

From 21a3737f5e9050bc5bed1164f76f11a25e83da32 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 11 Sep 2023 18:12:55 +0400
Subject: [PATCH 05/72] Make: Switch to pure C

---
 python/{lib.cpp => lib.c} | 0
 setup.py                  | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename python/{lib.cpp => lib.c} (100%)

diff --git a/python/lib.cpp b/python/lib.c
similarity index 100%
rename from python/lib.cpp
rename to python/lib.c
diff --git a/setup.py b/setup.py
index 32bad412..44cf4cf8 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@
 ext_modules = [
     Pybind11Extension(
         "stringzilla",
-        ["python/lib.cpp"],
+        ["python/lib.c"],
         include_dirs=["stringzilla"],
         extra_compile_args=compile_args,
         extra_link_args=link_args,

From 45cb82b5836033b352c0ac3cc2a1856ec3e532a3 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 16 Sep 2023 20:41:45 +0400
Subject: [PATCH 06/72] Improve: Passing basic tests

---
 .vscode/settings.json     |   9 +
 python/lib.c              | 356 ++++++++++++++++++++++++++++----------
 scripts/test.py           |  22 ++-
 setup.py                  |   4 +-
 stringzilla/stringzilla.h |  63 ++++---
 5 files changed, 336 insertions(+), 118 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 3a4c79d2..a32765cc 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -121,14 +121,22 @@
   "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
   "cSpell.words": [
     "allowoverlap",
+    "basicsize",
     "bigram",
     "cibuildwheel",
+    "endregion",
     "getitem",
+    "getslice",
+    "initproc",
+    "itemsize",
     "keeplinebreaks",
     "keepseparator",
     "levenstein",
     "maxsplit",
     "memcpy",
+    "newfunc",
+    "NOARGS",
+    "NOMINMAX",
     "pytest",
     "quadgram",
     "readlines",
@@ -139,6 +147,7 @@
     "strzl",
     "substr",
     "SWAR",
+    "TPFLAGS",
     "Zilla"
   ]
 }
\ No newline at end of file
diff --git a/python/lib.c b/python/lib.c
index 5267cff4..fb42b7df 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -21,6 +21,50 @@ typedef SSIZE_T ssize_t;
 
 #include <Python.h>
 
+#include <stringzilla.h>
+
+#pragma region Forward Declarations
+
+static PyTypeObject MemoryMappedFileType;
+static PyTypeObject StrType;
+
+/**
+ *  @brief  Describes an on-disk file mapped into RAM, which is different from Python's
+ *          native `mmap` module, as it exposes the address of the mapping in memory.
+ */
+typedef struct {
+    PyObject_HEAD;
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
+    HANDLE file_handle;
+    HANDLE mapping_handle;
+#else
+    int file_descriptor;
+#endif
+    void *start;
+    size_t length;
+} MemoryMappedFile;
+
+/**
+ *  @brief  Type-punned StringZilla-string, that points to a slice of an existing Python `str`
+ *          or a `MemoryMappedFile`.
+ *
+ *  When a slice is constructed, the `parent` object's reference count is being incremented to preserve lifetime.
+ *  It usage in Python would look like:
+ *
+ *      - Str() # Empty string
+ *      - Str("some-string") # Full-range slice of a Python `str`
+ *      - Str(File("some-path.txt")) # Full-range view of a persisted file
+ *      - Str(File("some-path.txt"), from=0, to=sys.maxint)
+ */
+typedef struct {
+    PyObject_HEAD;
+    PyObject *parent;
+    char const *start;
+    size_t length;
+} Str;
+
+#pragma endregion
+
 #pragma region Helpers
 
 void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset, size_t *normalized_length) {
@@ -33,8 +77,8 @@ void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset,
     // Clamp indices to a valid range
     if (start < 0) start = 0;
     if (end < 0) end = 0;
-    if (start > length) start = length;
-    if (end > length) end = length;
+    if (start > (ssize_t)length) start = length;
+    if (end > (ssize_t)length) end = length;
 
     // Ensure start <= end
     if (start > end) start = end;
@@ -48,27 +92,11 @@ void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset,
 
 #pragma region MemoryMappingFile
 
-/**
- *  @brief  Describes an on-disk file mapped into RAM, which is different from Python's
- *          native `mmap` module, as it exposes the address of the mapping in memory.
- */
-typedef struct {
-    PyObject_HEAD;
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
-    HANDLE file_handle;
-    HANDLE mapping_handle;
-#else
-    int file_descriptor;
-#endif
-    void *ptr;
-    size_t len;
-} MemoryMappedFile;
-
 static void MemoryMappedFile_dealloc(MemoryMappedFile *self) {
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
-    if (self->ptr) {
-        UnmapViewOfFile(self->ptr);
-        self->ptr = NULL;
+    if (self->start) {
+        UnmapViewOfFile(self->start);
+        self->start = NULL;
     }
     if (self->mapping_handle) {
         CloseHandle(self->mapping_handle);
@@ -79,10 +107,10 @@ static void MemoryMappedFile_dealloc(MemoryMappedFile *self) {
         self->file_handle = NULL;
     }
 #else
-    if (self->ptr) {
-        munmap(self->ptr, self->len);
-        self->ptr = NULL;
-        self->len = 0;
+    if (self->start) {
+        munmap(self->start, self->length);
+        self->start = NULL;
+        self->length = 0;
     }
     if (self->file_descriptor != 0) {
         close(self->file_descriptor);
@@ -95,24 +123,23 @@ static void MemoryMappedFile_dealloc(MemoryMappedFile *self) {
 static PyObject *MemoryMappedFile_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) {
     MemoryMappedFile *self;
     self = (MemoryMappedFile *)type->tp_alloc(type, 0);
-    if (self != NULL) {
-        self->ptr = NULL;
-        self->len = 0;
+    if (self == NULL)
+        return NULL;
+
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
-        self->file_handle = NULL;
-        self->mapping_handle = NULL;
+    self->file_handle = NULL;
+    self->mapping_handle = NULL;
 #else
-        self->file_descriptor = 0;
+    self->file_descriptor = 0;
 #endif
-    }
-    return (PyObject *)self;
+    self->start = NULL;
+    self->length = 0;
 }
 
 static int MemoryMappedFile_init(MemoryMappedFile *self, PyObject *positional_args, PyObject *named_args) {
     const char *path;
-    if (!PyArg_ParseTuple(positional_args, "s", &path)) {
+    if (!PyArg_ParseTuple(positional_args, "s", &path))
         return -1;
-    }
 
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
     self->file_handle = CreateFile(path, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
@@ -138,8 +165,8 @@ static int MemoryMappedFile_init(MemoryMappedFile *self, PyObject *positional_ar
         PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
         return -1;
     }
-    self->ptr = file;
-    self->len = GetFileSize(self->file_handle, 0);
+    self->start = file;
+    self->length = GetFileSize(self->file_handle, 0);
 #else
     struct stat sb;
     self->file_descriptor = open(path, O_RDONLY);
@@ -157,52 +184,74 @@ static int MemoryMappedFile_init(MemoryMappedFile *self, PyObject *positional_ar
         PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
         return -1;
     }
-    self->ptr = map;
-    self->len = file_size;
+    self->start = map;
+    self->length = file_size;
 #endif
 
     return 0;
 }
 
-static PyMethodDef MemoryMappedFile_methods[] = {
-    // Your method definitions here
-    {NULL} /* Sentinel */
-};
+static PyMethodDef MemoryMappedFile_methods[] = { //
+    {NULL, NULL, 0, NULL}};
 
 static PyTypeObject MemoryMappedFileType = {
-    PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.MemoryMappedFile",
-    .tp_doc = "MemoryMappedFile objects",
+    PyObject_HEAD_INIT(NULL).tp_name = "stringzilla.MemoryMappedFile",
+    .tp_doc = "Memory mapped file class, that exposes the memory range for low-level access",
     .tp_basicsize = sizeof(MemoryMappedFile),
-    .tp_itemsize = 0,
-    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,
-    .tp_new = MemoryMappedFile_new,
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_methods = MemoryMappedFile_methods,
+    .tp_new = (newfunc)MemoryMappedFile_new,
     .tp_init = (initproc)MemoryMappedFile_init,
     .tp_dealloc = (destructor)MemoryMappedFile_dealloc,
-    .tp_methods = MemoryMappedFile_methods,
+
+    // PyBufferProcs *tp_as_buffer;
+
+    // reprfunc tp_repr;
+    // PyNumberMethods *tp_as_number;
+    // PySequenceMethods *tp_as_sequence;
+    // PyMappingMethods *tp_as_mapping;
+    // ternaryfunc tp_call;
+    // reprfunc tp_str;
+    // getattrofunc tp_getattro;
+    // setattrofunc tp_setattro;
 };
 
 #pragma endregion
 
-#pragma region Str
+int export_string_like(PyObject *object, char const **start, size_t *length) {
+    if (PyUnicode_Check(object)) {
+        // Handle Python str
+        Py_ssize_t signed_length;
+        *start = PyUnicode_AsUTF8AndSize(object, &signed_length);
+        *length = (size_t)signed_length;
+        return 1;
+    }
+    else if (PyBytes_Check(object)) {
+        // Handle Python str
+        Py_ssize_t signed_length;
+        if (PyBytes_AsStringAndSize(object, start, signed_length) == -1) {
+            PyErr_SetString(PyExc_TypeError, "Mapping bytes failed");
+            return 0;
+        }
+        *length = (size_t)signed_length;
+        return 1;
+    }
+    else if (PyObject_TypeCheck(object, &StrType)) {
+        Str *str = (Str *)object;
+        *start = str->start;
+        *length = str->length;
+        return 1;
+    }
+    else if (PyObject_TypeCheck(object, &MemoryMappedFileType)) {
+        MemoryMappedFile *file = (MemoryMappedFile *)object;
+        *start = file->start;
+        *length = file->length;
+        return 1;
+    }
+    return 0;
+}
 
-/**
- *  @brief  Type-punned StringZilla-string, that points to a slice of an existing Python `str`
- *          or a `MemoryMappedFile`.
- *
- *  When a slice is constructed, the `parent` object's reference count is being incremented to preserve lifetime.
- *  It usage in Python would look like:
- *
- *      - Str() # Empty string
- *      - Str("some-string") # Full-range slice of a Python `str`
- *      - Str(File("some-path.txt")) # Full-range view of a persisted file
- *      - Str(File("some-path.txt"), from=0, to=sys.maxint)
- */
-typedef struct {
-    PyObject_HEAD;
-    PyObject *parent;
-    void *start;
-    size_t length;
-} Str;
+#pragma region Str
 
 static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args) {
     PyObject *parent = NULL;
@@ -218,25 +267,16 @@ static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args)
     else if (!PyArg_ParseTuple(positional_args, "|Onn", &parent, &from, &to))
         return -1;
 
-    self->parent = parent;
-    if (PyUnicode_Check(parent)) {
-        // Handle Python str
-        self->start = PyUnicode_DATA(parent);
-        self->length = PyUnicode_GET_DATA_SIZE(parent);
-        Py_INCREF(parent); // Increment the reference count of the parent
-    }
-    else if (PyObject_TypeCheck(parent, &MemoryMappedFileType)) {
-        // Handle MemoryMappedFile
-        MemoryMappedFile *file = (MemoryMappedFile *)parent;
-        self->start = file->ptr;
-        self->length = file->len;
-        Py_INCREF(parent); // Increment the reference count of the parent
-    }
-    else if (parent == NULL) {
-        // Handle empty string
+    // Handle empty string
+    if (parent == NULL) {
         self->start = NULL;
         self->length = 0;
     }
+    // Increment the reference count of the parent
+    else if (export_string_like(parent, &self->start, &self->length)) {
+        self->parent = parent;
+        Py_INCREF(parent);
+    }
     else {
         PyErr_SetString(PyExc_TypeError, "Unsupported parent type");
         return -1;
@@ -268,24 +308,157 @@ static void Str_dealloc(Str *self) {
     Py_TYPE(self)->tp_free((PyObject *)self);
 }
 
+static Py_ssize_t Str_len(Str *self) { return self->length; }
+
+static PyObject *Str_getitem(Str *self, Py_ssize_t i) {
+
+    // Negative indexing
+    if (i < 0)
+        i += self->length;
+
+    if (i < 0 || (size_t)i >= self->length) {
+        PyErr_SetString(PyExc_IndexError, "Index out of range");
+        return NULL;
+    }
+
+    // Assuming the underlying data is UTF-8 encoded
+    return PyUnicode_FromStringAndSize(self->start + i, 1);
+}
+
+// Will be called by the `PySequence_Contains`
+static int Str_contains(Str *self, PyObject *arg) {
+
+    struct strzl_needle_t needle_struct;
+    needle_struct.anomaly_offset = 0;
+    if (!export_string_like(arg, &needle_struct.ptr, &needle_struct.len)) {
+        PyErr_SetString(PyExc_TypeError, "Unsupported argument type");
+        return -1;
+    }
+
+    struct strzl_haystack_t haystack;
+    haystack.ptr = self->start;
+    haystack.len = self->length;
+    size_t position = strzl_neon_find_substr(haystack, needle_struct);
+    return position != haystack.len;
+}
+
+static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)strzl_hash_crc32_native(self->start, self->length); }
+
+static PyObject *Str_getslice(Str *self, PyObject *args) {
+    PyObject *start_obj = NULL, *end_obj = NULL;
+    ssize_t start = 0, end = self->length; // Default values
+
+    if (!PyArg_ParseTuple(args, "|OO", &start_obj, &end_obj))
+        return NULL;
+
+    if (start_obj != NULL && start_obj != Py_None) {
+        if (!PyLong_Check(start_obj)) {
+            PyErr_SetString(PyExc_TypeError, "Start index must be an integer or None");
+            return NULL;
+        }
+        start = PyLong_AsSsize_t(start_obj);
+    }
+
+    if (end_obj != NULL && end_obj != Py_None) {
+        if (!PyLong_Check(end_obj)) {
+            PyErr_SetString(PyExc_TypeError, "End index must be an integer or None");
+            return NULL;
+        }
+        end = PyLong_AsSsize_t(end_obj);
+    }
+
+    size_t normalized_offset, normalized_length;
+    slice(self->length, start, end, &normalized_offset, &normalized_length);
+
+    if (normalized_length == 0)
+        return PyUnicode_FromString("");
+
+    // Create a new Str object
+    Str *new_str = (Str *)PyObject_New(Str, &StrType);
+    if (new_str == NULL)
+        return NULL;
+
+    // Set the parent to the original Str object and increment its reference count
+    new_str->parent = (PyObject *)self;
+    Py_INCREF(self);
+
+    // Set the start and length to point to the slice
+    new_str->start = self->start + normalized_offset;
+    new_str->length = normalized_length;
+    return (PyObject *)new_str;
+}
+
+static PyObject *Str_str(Str *self, PyObject *args) { return PyUnicode_FromStringAndSize(self->start, self->length); }
+
+static PyObject *Str_find_substr(Str *self, PyObject *args) {
+    PyObject *needle_obj;
+    if (!PyArg_ParseTuple(args, "O", &needle_obj))
+        return NULL;
+
+    struct strzl_needle_t needle_struct;
+    needle_struct.anomaly_offset = 0;
+
+    if (PyObject_TypeCheck(needle_obj, &StrType)) {
+        Str *needle = (Str *)needle_obj;
+        needle_struct.ptr = needle->start;
+        needle_struct.len = needle->length;
+    }
+    else if (PyUnicode_Check(needle_obj)) {
+        needle_struct.ptr = PyUnicode_AsUTF8AndSize(needle_obj, (Py_ssize_t *)&needle_struct.len);
+        if (needle_struct.ptr == NULL)
+            return NULL; // Error case, likely a UnicodeEncodeError
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError, "Argument must be an instance of Str or a native Python str");
+        return NULL;
+    }
+
+    struct strzl_haystack_t haystack;
+    haystack.ptr = self->start;
+    haystack.len = self->length;
+    size_t position = strzl_neon_find_substr(haystack, needle_struct);
+    return PyLong_FromSize_t(position);
+}
+
+static PySequenceMethods Str_as_sequence = {
+    .sq_length = (lenfunc)Str_len,           //
+    .sq_item = (ssizeargfunc)Str_getitem,    //
+    .sq_contains = (objobjproc)Str_contains, //
+};
+
+static PyMethodDef Str_methods[] = { //
+    {"contains", (PyCFunction)Str_str, METH_NOARGS, "Convert to Python `str`"},
+    {"find", (PyCFunction)Str_len, METH_NOARGS, "Get length"},
+    {"find", (PyCFunction)Str_len, METH_NOARGS, "Get length"},
+    {"__getitem__", (PyCFunction)Str_getitem, METH_O, "Indexing"},
+    {NULL, NULL, 0, NULL}};
+
 static PyTypeObject StrType = {
-    PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.Str",
-    .tp_doc = "Stringzilla Str objects",
+    PyObject_HEAD_INIT(NULL).tp_name = "stringzilla.Str",
+    .tp_doc = "Immutable string/slice class with SIMD and SWAR-accelerated operations",
     .tp_basicsize = sizeof(Str),
-    .tp_itemsize = 0,
     .tp_flags = Py_TPFLAGS_DEFAULT,
-    .tp_new = Str_new,
+    .tp_methods = Str_methods,
+    .tp_new = (newfunc)Str_new,
+    .tp_init = (initproc)Str_init,
     .tp_dealloc = (destructor)Str_dealloc,
+    .tp_as_sequence = &Str_as_sequence,
+    .tp_hash = (hashfunc)Str_hash, // String hashing functions
+    // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer
+    // .tp_vectorcall = (vectorcallfunc)NULL, // Faster function dispatch
 };
 
 #pragma endregion
 
+static PyMethodDef stringzilla_methods[] = { //
+    {NULL, NULL, 0, NULL}};
+
 static PyModuleDef stringzilla_module = {
     PyModuleDef_HEAD_INIT,
     "stringzilla",
     "Crunch 100+ GB Strings in Python with ease",
     -1,
-    NULL,
+    stringzilla_methods,
     NULL,
     NULL,
     NULL,
@@ -298,6 +471,9 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
     if (PyType_Ready(&StrType) < 0)
         return NULL;
 
+    if (PyType_Ready(&MemoryMappedFileType) < 0)
+        return NULL;
+
     m = PyModule_Create(&stringzilla_module);
     if (m == NULL)
         return NULL;
@@ -318,4 +494,4 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
     }
 
     return m;
-}
\ No newline at end of file
+}
diff --git a/scripts/test.py b/scripts/test.py
index 68cb105c..189345e0 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -5,7 +5,27 @@
 
 import pytest
 
-from stringzilla import Str, File, Strs, levenstein
+from stringzilla import Str
+
+
+def test_construct():
+    native = "aaaaa"
+    big = Str(native)
+    assert len(big) == len(native)
+
+
+def test_indexing():
+    native = "abcdef"
+    big = Str(native)
+    for i in range(len(native)):
+        assert big[i] == native[i]
+
+
+def test_contains():
+    big = Str("abcdef")
+    assert "a" in big
+    assert "ab" in big
+    assert "xxx" not in big
 
 
 def get_random_string(
diff --git a/setup.py b/setup.py
index 44cf4cf8..83c22aea 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
 macros_args = []
 
 if sys.platform == "linux":
-    compile_args.append("-std=c++17")
+    compile_args.append("-std=c99")
     compile_args.append("-O3")
     compile_args.append("-pedantic")
     compile_args.append("-Wno-unknown-pragmas")
@@ -36,7 +36,7 @@
 
 
 if sys.platform == "darwin":
-    compile_args.append("-std=c++17")
+    compile_args.append("-std=c99")
     compile_args.append("-O3")
     compile_args.append("-pedantic")
     compile_args.append("-Wno-unknown-pragmas")
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 3e7df130..ca58b1ed 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -35,8 +35,9 @@ inline static size_t strzl_divide_round_up(size_t x, size_t divisor) { return (x
 
 /**
  *  @brief This is a faster alternative to `strncmp(a, b, len) == 0`.
+ *  @return 1 for `true`, and 0 for `false`.
  */
-inline static bool strzl_equal(char const *a, char const *b, size_t len) {
+inline static int strzl_equal(char const *a, char const *b, size_t len) {
     char const *const a_end = a + len;
     while (a != a_end && *a == *b)
         a++, b++;
@@ -130,9 +131,9 @@ inline static size_t strzl_naive_find_2chars(strzl_haystack_t h, char const *n)
     char const *end = h.ptr + h.len;
 
     // This code simulates hyper-scalar execution, analyzing 7 offsets at a time.
-    uint64_t nnnn = (uint64_t(n[0]) << 0) | (uint64_t(n[1]) << 8); // broadcast `n` into `nnnn`
-    nnnn |= nnnn << 16;                                            // broadcast `n` into `nnnn`
-    nnnn |= nnnn << 32;                                            // broadcast `n` into `nnnn`
+    uint64_t nnnn = ((uint64_t)(n[0]) << 0) | ((uint64_t)(n[1]) << 8); // broadcast `n` into `nnnn`
+    nnnn |= nnnn << 16;                                                // broadcast `n` into `nnnn`
+    nnnn |= nnnn << 32;                                                // broadcast `n` into `nnnn`
     uint64_t text_slice;
     for (; text + 8 <= end; text += 7) {
         memcpy(&text_slice, text, 8);
@@ -173,9 +174,9 @@ inline static size_t strzl_naive_find_3chars(strzl_haystack_t h, char const *n)
 
     // This code simulates hyper-scalar execution, analyzing 6 offsets at a time.
     // We have two unused bytes at the end.
-    uint64_t nn = uint64_t(n[0] << 0) | (uint64_t(n[1]) << 8) | (uint64_t(n[2]) << 16); // broadcast `n` into `nn`
-    nn |= nn << 24;                                                                     // broadcast `n` into `nn`
-    nn <<= 16;                                                                          // broadcast `n` into `nn`
+    uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16); // broadcast `n` into `nn`
+    nn |= nn << 24;                                                                           // broadcast `n` into `nn`
+    nn <<= 16;                                                                                // broadcast `n` into `nn`
 
     for (; text + 8 <= end; text += 6) {
         uint64_t text_slice;
@@ -227,9 +228,8 @@ inline static size_t strzl_naive_find_4chars(strzl_haystack_t h, char const *n)
     char const *end = h.ptr + h.len;
 
     // This code simulates hyper-scalar execution, analyzing 4 offsets at a time.
-    uint64_t nn = uint64_t(n[0] << 0) | (uint64_t(n[1]) << 8) | (uint64_t(n[2]) << 16) | (uint64_t(n[3]) << 24);
+    uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16) | ((uint64_t)(n[3]) << 24);
     nn |= nn << 32;
-    nn = nn;
 
     //
     uint8_t lookup[16] = {0};
@@ -264,8 +264,8 @@ inline static size_t strzl_naive_find_4chars(strzl_haystack_t h, char const *n)
 
         if (text01_indicators + text23_indicators) {
             // Assuming we have performed 4 comparisons, we can only have 2^4=16 outcomes.
-            // Which is small enought for a lookup table.
-            uint8_t match_indicators = uint8_t(                        //
+            // Which is small enough for a lookup table.
+            uint8_t match_indicators = (uint8_t)(                      //
                 (text01_indicators >> 31) | (text01_indicators << 0) | //
                 (text23_indicators >> 29) | (text23_indicators << 2));
             return text - h.ptr + lookup[match_indicators];
@@ -370,8 +370,10 @@ size_t strzl_avx2_find_substr(strzl_haystack_t h, strzl_needle_t n) {
     }
 
     // Don't forget the last (up to 35) characters.
-    size_t tail_len = end - text;
-    size_t tail_match = strzl_naive_find_substr({text, tail_len}, n);
+    strzl_haystack_t h_remainder;
+    h_remainder.ptr = text;
+    h_remainder.len = end - text;
+    size_t tail_match = strzl_naive_find_substr(h_remainder, n);
     return text + tail_match - h.ptr;
 }
 
@@ -415,7 +417,7 @@ inline static size_t strzl_neon_find_substr(strzl_haystack_t h, strzl_needle_t n
         // vorrq_u32 (all)
         uint32x4_t matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3));
         uint64x2_t matches64x2 = vreinterpretq_u64_u32(matches);
-        bool has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1);
+        int has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1);
 
         if (has_match) {
             for (size_t i = 0; i < 16; i++) {
@@ -426,8 +428,10 @@ inline static size_t strzl_neon_find_substr(strzl_haystack_t h, strzl_needle_t n
     }
 
     // Don't forget the last (up to 16+3=19) characters.
-    size_t tail_len = end - text;
-    size_t tail_match = strzl_naive_find_substr({text, tail_len}, n);
+    strzl_haystack_t h_remainder;
+    h_remainder.ptr = text;
+    h_remainder.len = end - text;
+    size_t tail_match = strzl_naive_find_substr(h_remainder, n);
     return text + tail_match - h.ptr;
 }
 
@@ -441,16 +445,16 @@ inline static void strzl_swap(size_t *a, size_t *b) {
 
 typedef char const *(*strzl_array_get_begin_t)(void const *, size_t);
 typedef size_t (*strzl_array_get_length_t)(void const *, size_t);
-typedef bool (*strzl_array_predicate_t)(void const *, size_t);
-typedef bool (*strzl_array_comparator_t)(void const *, size_t, size_t);
+typedef int (*strzl_array_predicate_t)(void const *, size_t);
+typedef int (*strzl_array_comparator_t)(void const *, size_t, size_t);
 
-struct strzl_array_t {
+typedef struct strzl_array_t {
     size_t *order;
     size_t count;
     strzl_array_get_begin_t get_begin;
     strzl_array_get_length_t get_length;
     void const *handle;
-};
+} strzl_array_t;
 
 /**
  *  @brief  Similar to `std::partition`, given a predicate splits the
@@ -610,9 +614,9 @@ inline static int _strzl_sort_array_strncasecmp(
     return res ? res : a_len - b_len;
 }
 
-struct strzl_sort_config_t {
-    bool case_insensitive;
-};
+typedef struct strzl_sort_config_t {
+    int case_insensitive;
+} strzl_sort_config_t;
 
 /**
  *  @brief  Sorting algorithm, combining Radix Sort for the first 32 bits of every word
@@ -620,7 +624,7 @@ struct strzl_sort_config_t {
  */
 inline static void strzl_sort(strzl_array_t *array, strzl_sort_config_t const *config) {
 
-    bool case_insensitive = config && config->case_insensitive;
+    int case_insensitive = config && config->case_insensitive;
 
     // Export up to 4 bytes into the `array` bits themselves
     for (size_t i = 0; i != array->count; ++i) {
@@ -657,7 +661,7 @@ typedef uint8_t levenstein_distance_t;
  *  @return Amount of temporary memory (in bytes) needed to efficiently compute
  *          the Levenstein distance between two strings of given size.
  */
-inline static size_t strzl_levenstein_memory_needed(size_t, size_t b_length) { return b_length + b_length + 2; }
+inline static size_t strzl_levenstein_memory_needed(size_t _, size_t b_length) { return b_length + b_length + 2; }
 
 /**
  *  @brief  Auxiliary function, that computes the minimum of three values.
@@ -712,6 +716,15 @@ inline static levenstein_distance_t strzl_levenstein( //
     return previous_distances[b_length];
 }
 
+/**
+ *  @brief  Hashes provided string using hardware-accelerated CRC32 instructions.
+ */
+inline static uint32_t strzl_hash_crc32_native(char const *start, size_t length) { return 0; }
+
+inline static uint32_t strzl_hash_crc32_neon(char const *start, size_t length) { return 0; }
+
+inline static uint32_t strzl_hash_crc32_sse(char const *start, size_t length) { return 0; }
+
 #ifdef __cplusplus
 }
 #endif

From faacdd138bcbbef07381ae81a70be0011c98a2d1 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 16 Sep 2023 21:15:32 +0400
Subject: [PATCH 07/72] Improve: Vectorized function calls

---
 .vscode/settings.json |   2 +
 python/lib.c          | 184 +++++++++++++--------
 scripts/test.py       | 370 +++++++++++++++++++++---------------------
 3 files changed, 310 insertions(+), 246 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index a32765cc..1696d8d4 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -131,9 +131,11 @@
     "itemsize",
     "keeplinebreaks",
     "keepseparator",
+    "kwnames",
     "levenstein",
     "maxsplit",
     "memcpy",
+    "nargsf",
     "newfunc",
     "NOARGS",
     "NOMINMAX",
diff --git a/python/lib.c b/python/lib.c
index fb42b7df..4dfc90e9 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -88,6 +88,80 @@ void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset,
     *normalized_length = end - start;
 }
 
+int export_string_like(PyObject *object, char const **start, size_t *length) {
+    if (PyUnicode_Check(object)) {
+        // Handle Python str
+        Py_ssize_t signed_length;
+        *start = PyUnicode_AsUTF8AndSize(object, &signed_length);
+        *length = (size_t)signed_length;
+        return 1;
+    }
+    else if (PyBytes_Check(object)) {
+        // Handle Python str
+        Py_ssize_t signed_length;
+        if (PyBytes_AsStringAndSize(object, (char **)start, &signed_length) == -1) {
+            PyErr_SetString(PyExc_TypeError, "Mapping bytes failed");
+            return 0;
+        }
+        *length = (size_t)signed_length;
+        return 1;
+    }
+    else if (PyObject_TypeCheck(object, &StrType)) {
+        Str *str = (Str *)object;
+        *start = str->start;
+        *length = str->length;
+        return 1;
+    }
+    else if (PyObject_TypeCheck(object, &MemoryMappedFileType)) {
+        MemoryMappedFile *file = (MemoryMappedFile *)object;
+        *start = file->start;
+        *length = file->length;
+        return 1;
+    }
+    return 0;
+}
+
+#pragma endregion
+
+#pragma region Global Functions
+
+static PyObject *str_find_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
+    // Check the number of arguments and types
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+    if (nargs < 2 || nargs > 4) {
+        PyErr_SetString(PyExc_TypeError, "Invalid arguments");
+        return NULL;
+    }
+
+    // Parse the haystack.
+    PyObject *haystack_obj = args[0];
+    struct strzl_haystack_t haystack;
+    if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len)) {
+        PyErr_SetString(PyExc_TypeError, "First argument (haystack) must be string-like");
+        return NULL;
+    }
+
+    // Parse the needle.
+    PyObject *needle_obj = args[1];
+    struct strzl_needle_t needle;
+    needle.anomaly_offset = 0;
+    if (!export_string_like(needle_obj, &needle.ptr, &needle.len)) {
+        PyErr_SetString(PyExc_TypeError, "Second argument (needle) must be string-like");
+        return NULL;
+    }
+
+    // Limit the haystack range.
+    Py_ssize_t start = (nargs > 2) ? PyLong_AsSsize_t(args[2]) : 0;
+    Py_ssize_t end = (nargs > 3) ? PyLong_AsSsize_t(args[3]) : PY_SSIZE_T_MAX;
+    size_t normalized_offset, normalized_length;
+    slice(haystack.len, start, end, &normalized_offset, &normalized_length);
+
+    haystack.ptr = haystack.ptr + normalized_offset;
+    haystack.len = normalized_length;
+    size_t position = strzl_neon_find_substr(haystack, needle);
+    return PyLong_FromSize_t(position);
+}
+
 #pragma endregion
 
 #pragma region MemoryMappingFile
@@ -218,39 +292,6 @@ static PyTypeObject MemoryMappedFileType = {
 
 #pragma endregion
 
-int export_string_like(PyObject *object, char const **start, size_t *length) {
-    if (PyUnicode_Check(object)) {
-        // Handle Python str
-        Py_ssize_t signed_length;
-        *start = PyUnicode_AsUTF8AndSize(object, &signed_length);
-        *length = (size_t)signed_length;
-        return 1;
-    }
-    else if (PyBytes_Check(object)) {
-        // Handle Python str
-        Py_ssize_t signed_length;
-        if (PyBytes_AsStringAndSize(object, start, signed_length) == -1) {
-            PyErr_SetString(PyExc_TypeError, "Mapping bytes failed");
-            return 0;
-        }
-        *length = (size_t)signed_length;
-        return 1;
-    }
-    else if (PyObject_TypeCheck(object, &StrType)) {
-        Str *str = (Str *)object;
-        *start = str->start;
-        *length = str->length;
-        return 1;
-    }
-    else if (PyObject_TypeCheck(object, &MemoryMappedFileType)) {
-        MemoryMappedFile *file = (MemoryMappedFile *)object;
-        *start = file->start;
-        *length = file->length;
-        return 1;
-    }
-    return 0;
-}
-
 #pragma region Str
 
 static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args) {
@@ -390,36 +431,6 @@ static PyObject *Str_getslice(Str *self, PyObject *args) {
 
 static PyObject *Str_str(Str *self, PyObject *args) { return PyUnicode_FromStringAndSize(self->start, self->length); }
 
-static PyObject *Str_find_substr(Str *self, PyObject *args) {
-    PyObject *needle_obj;
-    if (!PyArg_ParseTuple(args, "O", &needle_obj))
-        return NULL;
-
-    struct strzl_needle_t needle_struct;
-    needle_struct.anomaly_offset = 0;
-
-    if (PyObject_TypeCheck(needle_obj, &StrType)) {
-        Str *needle = (Str *)needle_obj;
-        needle_struct.ptr = needle->start;
-        needle_struct.len = needle->length;
-    }
-    else if (PyUnicode_Check(needle_obj)) {
-        needle_struct.ptr = PyUnicode_AsUTF8AndSize(needle_obj, (Py_ssize_t *)&needle_struct.len);
-        if (needle_struct.ptr == NULL)
-            return NULL; // Error case, likely a UnicodeEncodeError
-    }
-    else {
-        PyErr_SetString(PyExc_TypeError, "Argument must be an instance of Str or a native Python str");
-        return NULL;
-    }
-
-    struct strzl_haystack_t haystack;
-    haystack.ptr = self->start;
-    haystack.len = self->length;
-    size_t position = strzl_neon_find_substr(haystack, needle_struct);
-    return PyLong_FromSize_t(position);
-}
-
 static PySequenceMethods Str_as_sequence = {
     .sq_length = (lenfunc)Str_len,           //
     .sq_item = (ssizeargfunc)Str_getitem,    //
@@ -445,7 +456,6 @@ static PyTypeObject StrType = {
     .tp_as_sequence = &Str_as_sequence,
     .tp_hash = (hashfunc)Str_hash, // String hashing functions
     // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer
-    // .tp_vectorcall = (vectorcallfunc)NULL, // Faster function dispatch
 };
 
 #pragma endregion
@@ -465,6 +475,13 @@ static PyModuleDef stringzilla_module = {
     NULL,
 };
 
+static PyObject *vectorized_find = NULL;
+static PyObject *vectorized_count = NULL;
+static PyObject *vectorized_contains = NULL;
+static PyObject *vectorized_split = NULL;
+static PyObject *vectorized_sort = NULL;
+static PyObject *vectorized_shuffle = NULL;
+
 PyMODINIT_FUNC PyInit_stringzilla(void) {
     PyObject *m;
 
@@ -493,5 +510,44 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
         return NULL;
     }
 
+    // Create the 'find' function
+    vectorized_find = PyObject_Malloc(sizeof(PyCFunctionObject));
+    if (vectorized_find == NULL) {
+        Py_XDECREF(&MemoryMappedFileType);
+        Py_XDECREF(&StrType);
+        Py_XDECREF(m);
+        PyErr_NoMemory();
+        return NULL;
+    }
+    PyObject_Init(vectorized_find, &PyCFunction_Type);
+    ((PyCFunctionObject *)vectorized_find)->m_ml = NULL; // No regular PyMethodDef
+    ((PyCFunctionObject *)vectorized_find)->vectorcall = str_find_vectorcall;
+
+    // Add the 'find' function to the module
+    if (PyModule_AddObject(m, "find", vectorized_find) < 0) {
+        PyObject_Free(vectorized_find);
+        Py_XDECREF(&MemoryMappedFileType);
+        Py_XDECREF(&StrType);
+        Py_XDECREF(m);
+        return NULL;
+    }
+
     return m;
+
+cleanup:
+    if (vectorized_find)
+        Py_XDECREF(vectorized_find);
+    if (vectorized_count)
+        Py_XDECREF(vectorized_count);
+    if (vectorized_contains)
+        Py_XDECREF(vectorized_contains);
+    if (vectorized_split)
+        Py_XDECREF(vectorized_split);
+    if (vectorized_sort)
+        Py_XDECREF(vectorized_sort);
+    if (vectorized_shuffle)
+        Py_XDECREF(vectorized_shuffle);
+    Py_XDECREF(m);
+    PyErr_NoMemory();
+    return NULL;
 }
diff --git a/scripts/test.py b/scripts/test.py
index 189345e0..a846b6ae 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -5,6 +5,7 @@
 
 import pytest
 
+import stringzilla as sz
 from stringzilla import Str
 
 
@@ -28,202 +29,207 @@ def test_contains():
     assert "xxx" not in big
 
 
-def get_random_string(
-    length: Optional[int] = None, variability: Optional[int] = None
-) -> str:
-    if length is None:
-        length = randint(3, 300)
-    if variability is None:
-        variability = len(ascii_lowercase)
-    return "".join(choice(ascii_lowercase[:variability]) for _ in range(length))
+def test_globals():
+    assert sz.find("abcdef", "bcdef") == 1
+    assert sz.find("abcdef", "x") == 6
 
 
-def is_equal_strings(native_strings, big_strings):
-    for native_slice, big_slice in zip(native_strings, big_strings):
-        assert native_slice == big_slice
+# def get_random_string(
+#     length: Optional[int] = None, variability: Optional[int] = None
+# ) -> str:
+#     if length is None:
+#         length = randint(3, 300)
+#     if variability is None:
+#         variability = len(ascii_lowercase)
+#     return "".join(choice(ascii_lowercase[:variability]) for _ in range(length))
 
 
-def check_identical(
-    native: str,
-    big: Union[Str, File],
-    needle: Optional[str] = None,
-    check_iterators: bool = False,
-):
-    if needle is None:
-        part_offset = randint(0, len(native) - 1)
-        part_length = randint(1, len(native) - part_offset)
-        needle = native[part_offset:part_length]
+# def is_equal_strings(native_strings, big_strings):
+#     for native_slice, big_slice in zip(native_strings, big_strings):
+#         assert native_slice == big_slice
 
-    present_in_native: bool = needle in native
-    present_in_big = needle in big
-    assert present_in_native == present_in_big
-    assert native.find(needle) == big.find(needle)
-    assert native.count(needle) == big.count(needle)
 
-    native_strings = native.split(needle)
-    big_strings: Strs = big.split(needle)
-    assert len(native_strings) == len(big_strings)
+# def check_identical(
+#     native: str,
+#     big: Union[Str, File],
+#     needle: Optional[str] = None,
+#     check_iterators: bool = False,
+# ):
+#     if needle is None:
+#         part_offset = randint(0, len(native) - 1)
+#         part_length = randint(1, len(native) - part_offset)
+#         needle = native[part_offset:part_length]
 
-    if check_iterators:
-        for i in range(len(native_strings)):
-            assert len(native_strings[i]) == len(big_strings[i])
-            assert native_strings[i] == big_strings[i]
-            assert [c for c in native_strings[i]] == [c for c in big_strings[i]]
+#     present_in_native: bool = needle in native
+#     present_in_big = needle in big
+#     assert present_in_native == present_in_big
+#     assert native.find(needle) == big.find(needle)
+#     assert native.count(needle) == big.count(needle)
 
-    is_equal_strings(native_strings, big_strings)
+#     native_strings = native.split(needle)
+#     big_strings: Strs = big.split(needle)
+#     assert len(native_strings) == len(big_strings)
 
+#     if check_iterators:
+#         for i in range(len(native_strings)):
+#             assert len(native_strings[i]) == len(big_strings[i])
+#             assert native_strings[i] == big_strings[i]
+#             assert [c for c in native_strings[i]] == [c for c in big_strings[i]]
 
-@pytest.mark.parametrize("haystack_length", range(1, 65))
-@pytest.mark.parametrize("variability", range(1, 25))
-def test_contains(haystack_length: int, variability: int):
-    native = get_random_string(variability=variability, length=haystack_length)
-    big = Str(native)
-    pattern = get_random_string(variability=variability, length=randint(1, 5))
-    assert (pattern in native) == big.contains(pattern)
-
-
-def test_count_overlap():
-    native = "aaaaa"
-    big = Str(native)
-    assert native.count("aa") == big.count("aa")
-    assert 4 == big.count("aa", allowoverlap=True)
-
-
-def test_splitlines():
-    native = "line1\nline2\nline3"
-    big = Str(native)
-    assert native.splitlines() == list(big.splitlines())
-    assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True))
-
-
-def test_split_keepseparator():
-    native = "word1 word2 word3"
-    big = Str(native)
-    assert ["word1 ", "word2 ", "word3"] == list(big.split(" ", keepseparator=True))
-
-
-def test_strs_operations():
-    native = "line1\nline2\nline3"
-    big = Str(native)
-    lines = big.splitlines()
-    lines.sort()
-    assert ["line1", "line2", "line3"] == list(lines)
+#     is_equal_strings(native_strings, big_strings)
 
-    shuffled_copy = lines.shuffled(seed=42)
-    assert set(lines) == set(shuffled_copy)
 
-    lines.append("line4")
-    assert 4 == len(lines)
-    lines.extend(["line5", "line6"])
-    assert 6 == len(lines)
+# @pytest.mark.parametrize("haystack_length", range(1, 65))
+# @pytest.mark.parametrize("variability", range(1, 25))
+# def test_contains(haystack_length: int, variability: int):
+#     native = get_random_string(variability=variability, length=haystack_length)
+#     big = Str(native)
+#     pattern = get_random_string(variability=variability, length=randint(1, 5))
+#     assert (pattern in native) == big.contains(pattern)
 
-    lines.append(lines[0])
-    assert 7 == len(lines)
-    assert lines[6] == "line1"
 
-    lines.extend(lines)
-    assert 14 == len(lines)
-    assert lines[7] == "line1"
-    assert lines[8] == "line2"
-    assert lines[12] == "line6"
+# def test_count_overlap():
+#     native = "aaaaa"
+#     big = Str(native)
+#     assert native.count("aa") == big.count("aa")
+#     assert 4 == big.count("aa", allowoverlap=True)
 
-    # Test that shuffles are reproducible with the same `seed`
-    a = [str(s) for s in lines.shuffled(seed=42)]
-    b = [str(s) for s in lines.shuffled(seed=42)]
-    assert a == b
+
+# def test_splitlines():
+#     native = "line1\nline2\nline3"
+#     big = Str(native)
+#     assert native.splitlines() == list(big.splitlines())
+#     assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True))
 
 
-@pytest.mark.parametrize("repetitions", range(1, 10))
-def test_basic(repetitions: int):
-    native = "abcd" * repetitions
-    big = Str(native)
-
-    check_identical(native, big, "a", True)
-    check_identical(native, big, "ab", True)
-    check_identical(native, big, "abc", True)
-    check_identical(native, big, "abcd", True)
-    check_identical(native, big, "abcde", True)  # Missing pattern
-
-
-@pytest.mark.parametrize("pattern_length", [1, 2, 4, 5])
-@pytest.mark.parametrize("haystack_length", range(1, 69, 3))
-@pytest.mark.parametrize("variability", range(1, 27, 3))
-def test_fuzzy(pattern_length: int, haystack_length: int, variability: int):
-    native = get_random_string(variability=variability, length=haystack_length)
-    big = Str(native)
-
-    # Start by matching the prefix and the suffix
-    check_identical(native, big, native[:pattern_length])
-    check_identical(native, big, native[-pattern_length:])
-
-    # Continue with random strs
-    for _ in range(haystack_length // pattern_length):
-        pattern = get_random_string(variability=variability, length=pattern_length)
-        check_identical(native, big, pattern)
-
-
-def test_strs():
-    native = get_random_string(length=10)
-    big = Str(native)
-
-    assert native[0:5] == big.sub(0, 5) and native[0:5] == big[0:5]
-    assert native[5:10] == big.sub(5, 10) and native[5:10] == big[5:10]
-
-    assert native[5:5] == big.sub(5, 5) and native[5:5] == big[5:5]
-    assert native[-5:-5] == big.sub(-5, -5) and native[-5:-5] == big[-5:-5]
-    assert native[2:-2] == big.sub(2, -2) and native[2:-2] == big[2:-2]
-    assert native[7:-7] == big.sub(7, -7) and native[7:-7] == big[7:-7]
-
-    assert native[5:3] == big.sub(5, 3) and native[5:3] == big[5:3]
-    assert native[5:7] == big.sub(5, 7) and native[5:7] == big[5:7]
-    assert native[5:-3] == big.sub(5, -3) and native[5:-3] == big[5:-3]
-    assert native[5:-7] == big.sub(5, -7) and native[5:-7] == big[5:-7]
-
-    assert native[-5:3] == big.sub(-5, 3) and native[-5:3] == big[-5:3]
-    assert native[-5:7] == big.sub(-5, 7) and native[-5:7] == big[-5:7]
-    assert native[-5:-3] == big.sub(-5, -3) and native[-5:-3] == big[-5:-3]
-    assert native[-5:-7] == big.sub(-5, -7) and native[-5:-7] == big[-5:-7]
-
-    assert native[2:] == big.sub(2) and native[2:] == big[2:]
-    assert native[:7] == big.sub(end=7) and native[:7] == big[:7]
-    assert native[-2:] == big.sub(-2) and native[-2:] == big[-2:]
-    assert native[:-7] == big.sub(end=-7) and native[:-7] == big[:-7]
-    assert native[:-10] == big.sub(end=-10) and native[:-10] == big[:-10]
-    assert native[:-1] == big.sub(end=-1) and native[:-1] == big[:-1]
-
-    length = 1000
-    native = get_random_string(length=length)
-    big = Str(native)
-
-    needle = native[0 : randint(2, 5)]
-    native_strings = native.split(needle)
-    big_strings: Strs = big.split(needle)
-
-    length = len(native_strings)
-    for i in range(length):
-        start = randint(1 - length, length - 1)
-        stop = randint(1 - length, length - 1)
-        step = 0
-        while step == 0:
-            step = randint(-int(math.sqrt(length)), int(math.sqrt(length)))
-
-        is_equal_strings(native_strings[start:stop:step], big_strings[start:stop:step])
-        is_equal_strings(
-            native_strings[start:stop:step],
-            big_strings.sub(start, stop, step),
-        )
-
-
-def test_levenstein():
-    # Create a new string by slicing and concatenating
-    def insert_char_at(s, char_to_insert, index):
-        return s[:index] + char_to_insert + s[index:]
-
-    for _ in range(100):
-        a = get_random_string(length=20)
-        b = a
-        for i in range(150):
-            source_offset = randint(0, len(ascii_lowercase) - 1)
-            target_offset = randint(0, len(b) - 1)
-            b = insert_char_at(b, ascii_lowercase[source_offset], target_offset)
-            assert levenstein(a, b, 200) == i + 1
+# def test_split_keepseparator():
+#     native = "word1 word2 word3"
+#     big = Str(native)
+#     assert ["word1 ", "word2 ", "word3"] == list(big.split(" ", keepseparator=True))
+
+
+# def test_strs_operations():
+#     native = "line1\nline2\nline3"
+#     big = Str(native)
+#     lines = big.splitlines()
+#     lines.sort()
+#     assert ["line1", "line2", "line3"] == list(lines)
+
+#     shuffled_copy = lines.shuffled(seed=42)
+#     assert set(lines) == set(shuffled_copy)
+
+#     lines.append("line4")
+#     assert 4 == len(lines)
+#     lines.extend(["line5", "line6"])
+#     assert 6 == len(lines)
+
+#     lines.append(lines[0])
+#     assert 7 == len(lines)
+#     assert lines[6] == "line1"
+
+#     lines.extend(lines)
+#     assert 14 == len(lines)
+#     assert lines[7] == "line1"
+#     assert lines[8] == "line2"
+#     assert lines[12] == "line6"
+
+#     # Test that shuffles are reproducible with the same `seed`
+#     a = [str(s) for s in lines.shuffled(seed=42)]
+#     b = [str(s) for s in lines.shuffled(seed=42)]
+#     assert a == b
+
+
+# @pytest.mark.parametrize("repetitions", range(1, 10))
+# def test_basic(repetitions: int):
+#     native = "abcd" * repetitions
+#     big = Str(native)
+
+#     check_identical(native, big, "a", True)
+#     check_identical(native, big, "ab", True)
+#     check_identical(native, big, "abc", True)
+#     check_identical(native, big, "abcd", True)
+#     check_identical(native, big, "abcde", True)  # Missing pattern
+
+
+# @pytest.mark.parametrize("pattern_length", [1, 2, 4, 5])
+# @pytest.mark.parametrize("haystack_length", range(1, 69, 3))
+# @pytest.mark.parametrize("variability", range(1, 27, 3))
+# def test_fuzzy(pattern_length: int, haystack_length: int, variability: int):
+#     native = get_random_string(variability=variability, length=haystack_length)
+#     big = Str(native)
+
+#     # Start by matching the prefix and the suffix
+#     check_identical(native, big, native[:pattern_length])
+#     check_identical(native, big, native[-pattern_length:])
+
+#     # Continue with random strs
+#     for _ in range(haystack_length // pattern_length):
+#         pattern = get_random_string(variability=variability, length=pattern_length)
+#         check_identical(native, big, pattern)
+
+
+# def test_strs():
+#     native = get_random_string(length=10)
+#     big = Str(native)
+
+#     assert native[0:5] == big.sub(0, 5) and native[0:5] == big[0:5]
+#     assert native[5:10] == big.sub(5, 10) and native[5:10] == big[5:10]
+
+#     assert native[5:5] == big.sub(5, 5) and native[5:5] == big[5:5]
+#     assert native[-5:-5] == big.sub(-5, -5) and native[-5:-5] == big[-5:-5]
+#     assert native[2:-2] == big.sub(2, -2) and native[2:-2] == big[2:-2]
+#     assert native[7:-7] == big.sub(7, -7) and native[7:-7] == big[7:-7]
+
+#     assert native[5:3] == big.sub(5, 3) and native[5:3] == big[5:3]
+#     assert native[5:7] == big.sub(5, 7) and native[5:7] == big[5:7]
+#     assert native[5:-3] == big.sub(5, -3) and native[5:-3] == big[5:-3]
+#     assert native[5:-7] == big.sub(5, -7) and native[5:-7] == big[5:-7]
+
+#     assert native[-5:3] == big.sub(-5, 3) and native[-5:3] == big[-5:3]
+#     assert native[-5:7] == big.sub(-5, 7) and native[-5:7] == big[-5:7]
+#     assert native[-5:-3] == big.sub(-5, -3) and native[-5:-3] == big[-5:-3]
+#     assert native[-5:-7] == big.sub(-5, -7) and native[-5:-7] == big[-5:-7]
+
+#     assert native[2:] == big.sub(2) and native[2:] == big[2:]
+#     assert native[:7] == big.sub(end=7) and native[:7] == big[:7]
+#     assert native[-2:] == big.sub(-2) and native[-2:] == big[-2:]
+#     assert native[:-7] == big.sub(end=-7) and native[:-7] == big[:-7]
+#     assert native[:-10] == big.sub(end=-10) and native[:-10] == big[:-10]
+#     assert native[:-1] == big.sub(end=-1) and native[:-1] == big[:-1]
+
+#     length = 1000
+#     native = get_random_string(length=length)
+#     big = Str(native)
+
+#     needle = native[0 : randint(2, 5)]
+#     native_strings = native.split(needle)
+#     big_strings: Strs = big.split(needle)
+
+#     length = len(native_strings)
+#     for i in range(length):
+#         start = randint(1 - length, length - 1)
+#         stop = randint(1 - length, length - 1)
+#         step = 0
+#         while step == 0:
+#             step = randint(-int(math.sqrt(length)), int(math.sqrt(length)))
+
+#         is_equal_strings(native_strings[start:stop:step], big_strings[start:stop:step])
+#         is_equal_strings(
+#             native_strings[start:stop:step],
+#             big_strings.sub(start, stop, step),
+#         )
+
+
+# def test_levenstein():
+#     # Create a new string by slicing and concatenating
+#     def insert_char_at(s, char_to_insert, index):
+#         return s[:index] + char_to_insert + s[index:]
+
+#     for _ in range(100):
+#         a = get_random_string(length=20)
+#         b = a
+#         for i in range(150):
+#             source_offset = randint(0, len(ascii_lowercase) - 1)
+#             target_offset = randint(0, len(b) - 1)
+#             b = insert_char_at(b, ascii_lowercase[source_offset], target_offset)
+#             assert levenstein(a, b, 200) == i + 1

From 9a38259fc0f120a34ee2c369443fbc0d5dffc8b3 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 16 Sep 2023 21:42:24 +0400
Subject: [PATCH 08/72] Add: Slices and rich comparisons

---
 .vscode/settings.json |  2 +
 python/lib.c          | 92 ++++++++++++++++++++++++++++++++++++++-----
 scripts/test.py       |  6 +++
 3 files changed, 90 insertions(+), 10 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 1696d8d4..6e4162b1 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -139,9 +139,11 @@
     "newfunc",
     "NOARGS",
     "NOMINMAX",
+    "NOTIMPLEMENTED",
     "pytest",
     "quadgram",
     "readlines",
+    "richcompare",
     "SIMD",
     "splitlines",
     "stringzilla",
diff --git a/python/lib.c b/python/lib.c
index 4dfc90e9..01240c44 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -366,6 +366,42 @@ static PyObject *Str_getitem(Str *self, Py_ssize_t i) {
     return PyUnicode_FromStringAndSize(self->start + i, 1);
 }
 
+static PyObject *Str_subscript(Str *self, PyObject *key) {
+    if (PySlice_Check(key)) {
+        Py_ssize_t start, stop, step;
+        if (PySlice_Unpack(key, &start, &stop, &step) < 0)
+            return NULL;
+        if (PySlice_AdjustIndices(self->length, &start, &stop, step) < 0)
+            return NULL;
+
+        if (step != 1) {
+            PyErr_SetString(PyExc_IndexError, "Efficient step is not supported");
+            return NULL;
+        }
+
+        // Create a new `Str` object
+        Str *self_slice = (Str *)StrType.tp_alloc(&StrType, 0);
+        if (self_slice == NULL && PyErr_NoMemory())
+            return NULL;
+
+        // Set its properties based on the slice
+        self_slice->start = self->start + start;
+        self_slice->length = stop - start;
+        self_slice->parent = (PyObject *)self; // Set parent to keep it alive
+
+        // Increment the reference count of the parent
+        Py_INCREF(self);
+        return (PyObject *)self_slice;
+    }
+    else if (PyLong_Check(key)) {
+        return Str_getitem(self, PyLong_AsSsize_t(key));
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError, "Str indices must be integers or slices");
+        return NULL;
+    }
+}
+
 // Will be called by the `PySequence_Contains`
 static int Str_contains(Str *self, PyObject *arg) {
 
@@ -431,17 +467,47 @@ static PyObject *Str_getslice(Str *self, PyObject *args) {
 
 static PyObject *Str_str(Str *self, PyObject *args) { return PyUnicode_FromStringAndSize(self->start, self->length); }
 
+static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) {
+
+    char const *a_start, *b_start;
+    size_t a_length, b_length;
+    if (!export_string_like(self, &a_start, &a_length) || !export_string_like(other, &b_start, &b_length))
+        Py_RETURN_NOTIMPLEMENTED;
+
+    // Perform byte-wise comparison up to the minimum length
+    size_t min_length = a_length < b_length ? a_length : b_length;
+    int cmp_result = memcmp(a_start, b_start, min_length);
+
+    // If the strings are equal up to `min_length`, then the shorter string is smaller
+    if (cmp_result == 0)
+        cmp_result = (a_length > b_length) - (a_length < b_length);
+
+    switch (op) {
+    case Py_LT: return PyBool_FromLong(cmp_result < 0);
+    case Py_LE: return PyBool_FromLong(cmp_result <= 0);
+    case Py_EQ: return PyBool_FromLong(cmp_result == 0);
+    case Py_NE: return PyBool_FromLong(cmp_result != 0);
+    case Py_GT: return PyBool_FromLong(cmp_result > 0);
+    case Py_GE: return PyBool_FromLong(cmp_result >= 0);
+    default: Py_RETURN_NOTIMPLEMENTED;
+    }
+}
+
 static PySequenceMethods Str_as_sequence = {
-    .sq_length = (lenfunc)Str_len,           //
-    .sq_item = (ssizeargfunc)Str_getitem,    //
-    .sq_contains = (objobjproc)Str_contains, //
+    .sq_length = Str_len,        //
+    .sq_item = Str_getitem,      //
+    .sq_contains = Str_contains, //
+};
+
+static PyMappingMethods Str_as_mapping = {
+    .mp_length = Str_len,          //
+    .mp_subscript = Str_subscript, // Is used to implement slices in Python
 };
 
 static PyMethodDef Str_methods[] = { //
     {"contains", (PyCFunction)Str_str, METH_NOARGS, "Convert to Python `str`"},
-    {"find", (PyCFunction)Str_len, METH_NOARGS, "Get length"},
-    {"find", (PyCFunction)Str_len, METH_NOARGS, "Get length"},
-    {"__getitem__", (PyCFunction)Str_getitem, METH_O, "Indexing"},
+    // {"find", (PyCFunction)Str_len, METH_NOARGS, "Get length"},
+    // {"__getitem__", (PyCFunction)Str_getitem, METH_O, "Indexing"},
     {NULL, NULL, 0, NULL}};
 
 static PyTypeObject StrType = {
@@ -450,11 +516,13 @@ static PyTypeObject StrType = {
     .tp_basicsize = sizeof(Str),
     .tp_flags = Py_TPFLAGS_DEFAULT,
     .tp_methods = Str_methods,
-    .tp_new = (newfunc)Str_new,
-    .tp_init = (initproc)Str_init,
-    .tp_dealloc = (destructor)Str_dealloc,
+    .tp_new = Str_new,
+    .tp_init = Str_init,
+    .tp_dealloc = Str_dealloc,
     .tp_as_sequence = &Str_as_sequence,
-    .tp_hash = (hashfunc)Str_hash, // String hashing functions
+    .tp_as_mapping = &Str_as_mapping,
+    .tp_hash = Str_hash, // String hashing functions
+    .tp_richcompare = Str_richcompare,
     // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer
 };
 
@@ -475,9 +543,13 @@ static PyModuleDef stringzilla_module = {
     NULL,
 };
 
+// String functions:
 static PyObject *vectorized_find = NULL;
 static PyObject *vectorized_count = NULL;
 static PyObject *vectorized_contains = NULL;
+static PyObject *vectorized_levenstein = NULL;
+
+// String collections:
 static PyObject *vectorized_split = NULL;
 static PyObject *vectorized_sort = NULL;
 static PyObject *vectorized_shuffle = NULL;
diff --git a/scripts/test.py b/scripts/test.py
index a846b6ae..61b121ad 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -29,6 +29,12 @@ def test_contains():
     assert "xxx" not in big
 
 
+def test_rich_comparisons():
+    assert Str("aa") == "aa"
+    assert Str("aa") < "b"
+    assert Str("abb")[1:] == "bb"
+
+
 def test_globals():
     assert sz.find("abcdef", "bcdef") == 1
     assert sz.find("abcdef", "x") == 6

From 6f9a9bdd0eb5366e695b3e20014b727d77c7779b Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 17 Sep 2023 13:20:33 +0400
Subject: [PATCH 09/72] Make: Remove PyBind11 dependency

---
 .vscode/settings.json | 1 +
 README.md             | 2 +-
 pyproject.toml        | 2 +-
 setup.py              | 6 ++----
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 6e4162b1..ae2ecb67 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -152,6 +152,7 @@
     "substr",
     "SWAR",
     "TPFLAGS",
+    "Vardanian",
     "Zilla"
   ]
 }
\ No newline at end of file
diff --git a/README.md b/README.md
index 3b31b262..1049ef63 100644
--- a/README.md
+++ b/README.md
@@ -118,7 +118,7 @@ strzl_sort(&array, &your_config);
 
 Future development plans include:
 
-- Replace PyBind11 with CPython.
+- [x] Replace PyBind11 with CPython.
 - Reverse-order operations in Python #12.
 - Bindings for JavaScript #25, Java, and Rust.
 - Faster string sorting algorithm.
diff --git a/pyproject.toml b/pyproject.toml
index 84e82bf3..fe8221c7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=42", "wheel", "cmake>=3.22", "pybind11"]
+requires = ["setuptools>=42", "wheel", "cmake>=3.22"]
 build-backend = "setuptools.build_meta"
 
 [tool.pytest.ini_options]
diff --git a/setup.py b/setup.py
index 83c22aea..546aed5a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,9 +1,7 @@
 import os
 import sys
-from setuptools import setup
 import platform
-
-from pybind11.setup_helpers import Pybind11Extension
+from setuptools import setup, Extension
 
 
 compile_args = []
@@ -49,7 +47,7 @@
 
 
 ext_modules = [
-    Pybind11Extension(
+    Extension(
         "stringzilla",
         ["python/lib.c"],
         include_dirs=["stringzilla"],

From b5689b748637f17bed07fe20a0a4311ee3868665 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 17 Sep 2023 15:27:59 +0400
Subject: [PATCH 10/72] Add: Vectorized `count`

---
 .vscode/settings.json |   3 +
 README.md             |  28 ++++--
 python/lib.c          | 228 +++++++++++++++++++++++++++++++++---------
 scripts/test.py       |  15 ++-
 4 files changed, 216 insertions(+), 58 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index ae2ecb67..7ed8de05 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -131,10 +131,12 @@
     "itemsize",
     "keeplinebreaks",
     "keepseparator",
+    "kwds",
     "kwnames",
     "levenstein",
     "maxsplit",
     "memcpy",
+    "MODINIT",
     "nargsf",
     "newfunc",
     "NOARGS",
@@ -153,6 +155,7 @@
     "SWAR",
     "TPFLAGS",
     "Vardanian",
+    "vectorcallfunc",
     "Zilla"
   ]
 }
\ No newline at end of file
diff --git a/README.md b/README.md
index 1049ef63..973deaee 100644
--- a/README.md
+++ b/README.md
@@ -36,21 +36,21 @@ Coming soon.
 ## Quick Start: Python 🐍
 
 1️. Install via pip: `pip install stringzilla`  
-2. Import classes: `from stringzilla import Str, File, Strs`  
+1. Import the classes you need: `from stringzilla import Str, Strs, MemoryMappedFile`  
 
 ### Basic Usage
 
 StringZilla offers two mostly interchangeable core classes:
 
 ```python
-from stringzilla import Str, File
+from stringzilla import Str, MemoryMappedFile
 
-text1 = Str('some-string')
-text2 = File('some-file.txt')
+text_from_str = Str('some-string')
+text_from_file = Str(MemoryMappedFile('some-file.txt'))
 ```
 
 The `Str` is designed to replace long Python `str` strings and wrap our C-level API.
-On the other hand, the `File` memory-maps a file from persistent memory without loading its copy into RAM.
+On the other hand, the `MemoryMappedFile` memory-maps a file from persistent memory without loading its copy into RAM.
 The contents of that file would remain immutable, and the mapping can be shared by multiple Python processes simultaneously.
 A standard dataset pre-processing use case would be to map a sizeable textual dataset like Common Crawl into memory, spawn child processes, and split the job between them.
 
@@ -58,11 +58,12 @@ A standard dataset pre-processing use case would be to map a sizeable textual da
 
 - Length: `len(text) -> int`
 - Indexing: `text[42] -> str`
-- Slicing: `text[42:46] -> str`
+- Slicing: `text[42:46] -> Str`
+- String conversion: `str(text) -> str`
+- Substring check: `'substring' in text -> bool`
 
 ### Advanced Operations
 
-- `'substring' in text -> bool`
 - `text.contains('substring', start=0, end=9223372036854775807) -> bool`
 - `text.find('substring', start=0, end=9223372036854775807) -> int`
 - `text.count('substring', start=0, end=9223372036854775807, allowoverlap=False) -> int`
@@ -93,6 +94,19 @@ lines.append('Pythonic string')
 lines.extend(shuffled_copy)
 ```
 
+### Low-Level Python API
+
+The StringZilla CPython bindings implement vector-call conventions for faster calls.
+
+```py
+import stringzilla as sz
+
+contains: bool = sz.contains("haystack", "needle", start=0, end=9223372036854775807)
+offset: int = sz.find("haystack", "needle", start=0, end=9223372036854775807)
+count: int = sz.count("haystack", "needle", start=0, end=9223372036854775807, allowoverlap=False)
+levenstein: int = sz.levenstein("needle", "nidl")
+```
+
 ## Quick Start: C 🛠️
 
 There is an ABI-stable C 99 interface, in case you have a database, an operating system, or a runtime you want to integrate with StringZilla.
diff --git a/python/lib.c b/python/lib.c
index 01240c44..93bbce93 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -125,41 +125,160 @@ int export_string_like(PyObject *object, char const **start, size_t *length) {
 
 #pragma region Global Functions
 
-static PyObject *str_find_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
-    // Check the number of arguments and types
+static size_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
     Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
-    if (nargs < 2 || nargs > 4) {
-        PyErr_SetString(PyExc_TypeError, "Invalid arguments");
+
+    // Initialize defaults
+    Py_ssize_t start = 0;
+    Py_ssize_t end = PY_SSIZE_T_MAX;
+
+    // Parse positional arguments: haystack and needle
+    if (nargs < 2) {
+        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
         return NULL;
     }
 
-    // Parse the haystack.
     PyObject *haystack_obj = args[0];
+    PyObject *needle_obj = args[1];
+
     struct strzl_haystack_t haystack;
-    if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len)) {
-        PyErr_SetString(PyExc_TypeError, "First argument (haystack) must be string-like");
+    struct strzl_needle_t needle;
+    if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) ||
+        !export_string_like(needle_obj, &needle.ptr, &needle.len)) {
+        PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
         return NULL;
     }
 
-    // Parse the needle.
+    // Parse additional positional arguments
+    if (nargs > 2)
+        start = PyLong_AsSsize_t(args[2]);
+    if (nargs > 3)
+        end = PyLong_AsSsize_t(args[3]);
+
+    // Parse keyword arguments
+    if (kwnames != NULL) {
+        for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) {
+            PyObject *key = PyTuple_GetItem(kwnames, i);
+            PyObject *value = args[nargs + i];
+            if (PyUnicode_CompareWithASCIIString(key, "start") == 0)
+                start = PyLong_AsSsize_t(value);
+            else if (PyUnicode_CompareWithASCIIString(key, "end") == 0)
+                end = PyLong_AsSsize_t(value);
+            else {
+                PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key);
+                return NULL;
+            }
+        }
+    }
+
+    // Limit the haystack range
+    size_t normalized_offset, normalized_length;
+    slice(haystack.len, start, end, &normalized_offset, &normalized_length);
+    haystack.ptr += normalized_offset;
+    haystack.len = normalized_length;
+
+    // Perform contains operation
+    return strzl_neon_find_substr(haystack, needle);
+}
+
+static PyObject *str_find_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
+    size_t offset = str_find_vectorcall_(NULL, args, nargsf, kwnames);
+    return PyLong_FromSize_t(offset);
+}
+
+static PyObject *str_contains_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
+    size_t offset = str_find_vectorcall_(NULL, args, nargsf, kwnames);
+    if (offset != haystack.len) {
+        Py_RETURN_TRUE;
+    }
+    else {
+        Py_RETURN_FALSE;
+    }
+}
+
+static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+
+    // Initialize defaults
+    Py_ssize_t start = 0;
+    Py_ssize_t end = PY_SSIZE_T_MAX;
+    int allow_overlap = 0;
+
+    // Parse positional arguments: haystack and needle
+    if (nargs < 2) {
+        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
+        return NULL;
+    }
+
+    PyObject *haystack_obj = args[0];
     PyObject *needle_obj = args[1];
+
+    struct strzl_haystack_t haystack;
     struct strzl_needle_t needle;
-    needle.anomaly_offset = 0;
-    if (!export_string_like(needle_obj, &needle.ptr, &needle.len)) {
-        PyErr_SetString(PyExc_TypeError, "Second argument (needle) must be string-like");
+    if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) ||
+        !export_string_like(needle_obj, &needle.ptr, &needle.len)) {
+        PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
         return NULL;
     }
 
-    // Limit the haystack range.
-    Py_ssize_t start = (nargs > 2) ? PyLong_AsSsize_t(args[2]) : 0;
-    Py_ssize_t end = (nargs > 3) ? PyLong_AsSsize_t(args[3]) : PY_SSIZE_T_MAX;
+    // Parse additional positional arguments
+    if (nargs > 2)
+        start = PyLong_AsSsize_t(args[2]);
+    if (nargs > 3)
+        end = PyLong_AsSsize_t(args[3]);
+
+    // Parse keyword arguments
+    if (kwnames != NULL) {
+        for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) {
+            PyObject *key = PyTuple_GetItem(kwnames, i);
+            PyObject *value = args[nargs + i];
+            if (PyUnicode_CompareWithASCIIString(key, "start") == 0)
+                start = PyLong_AsSsize_t(value);
+            else if (PyUnicode_CompareWithASCIIString(key, "end") == 0)
+                end = PyLong_AsSsize_t(value);
+            else if (PyUnicode_CompareWithASCIIString(key, "allowoverlap") == 0)
+                allow_overlap = PyObject_IsTrue(value);
+            else {
+                PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key);
+                return NULL;
+            }
+        }
+    }
+
+    // Limit the haystack range
     size_t normalized_offset, normalized_length;
     slice(haystack.len, start, end, &normalized_offset, &normalized_length);
-
-    haystack.ptr = haystack.ptr + normalized_offset;
+    haystack.ptr += normalized_offset;
     haystack.len = normalized_length;
-    size_t position = strzl_neon_find_substr(haystack, needle);
-    return PyLong_FromSize_t(position);
+
+    // Perform counting operation
+    size_t count = 0;
+    if (needle.len == 1) {
+        count = strzl_naive_count_char(haystack, *needle.ptr);
+    }
+    else {
+        // Your existing logic for count_substr can be embedded here
+        if (allow_overlap) {
+            while (haystack.len) {
+                size_t offset = strzl_neon_find_substr(haystack, needle);
+                int found = offset != haystack.len;
+                count += found;
+                haystack.ptr += offset + found;
+                haystack.len -= offset + found;
+            }
+        }
+        else {
+            while (haystack.len) {
+                size_t offset = strzl_neon_find_substr(haystack, needle);
+                int found = offset != haystack.len;
+                count += found;
+                haystack.ptr += offset + needle.len;
+                haystack.len -= offset + needle.len * found;
+            }
+        }
+    }
+
+    return PyLong_FromSize_t(count);
 }
 
 #pragma endregion
@@ -349,8 +468,12 @@ static void Str_dealloc(Str *self) {
     Py_TYPE(self)->tp_free((PyObject *)self);
 }
 
+static PyObject *Str_str(Str *self) { return PyUnicode_FromStringAndSize(self->start, self->length); }
+
 static Py_ssize_t Str_len(Str *self) { return self->length; }
 
+static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)strzl_hash_crc32_native(self->start, self->length); }
+
 static PyObject *Str_getitem(Str *self, Py_ssize_t i) {
 
     // Negative indexing
@@ -419,8 +542,6 @@ static int Str_contains(Str *self, PyObject *arg) {
     return position != haystack.len;
 }
 
-static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)strzl_hash_crc32_native(self->start, self->length); }
-
 static PyObject *Str_getslice(Str *self, PyObject *args) {
     PyObject *start_obj = NULL, *end_obj = NULL;
     ssize_t start = 0, end = self->length; // Default values
@@ -465,8 +586,6 @@ static PyObject *Str_getslice(Str *self, PyObject *args) {
     return (PyObject *)new_str;
 }
 
-static PyObject *Str_str(Str *self, PyObject *args) { return PyUnicode_FromStringAndSize(self->start, self->length); }
-
 static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) {
 
     char const *a_start, *b_start;
@@ -505,9 +624,9 @@ static PyMappingMethods Str_as_mapping = {
 };
 
 static PyMethodDef Str_methods[] = { //
-    {"contains", (PyCFunction)Str_str, METH_NOARGS, "Convert to Python `str`"},
-    // {"find", (PyCFunction)Str_len, METH_NOARGS, "Get length"},
-    // {"__getitem__", (PyCFunction)Str_getitem, METH_O, "Indexing"},
+    // {"contains", (PyCFunction)..., METH_NOARGS, "Convert to Python `str`"},
+    // {"find", (PyCFunction)..., METH_NOARGS, "Get length"},
+    // {"__getitem__", (PyCFunction)..., METH_O, "Indexing"},
     {NULL, NULL, 0, NULL}};
 
 static PyTypeObject StrType = {
@@ -523,6 +642,7 @@ static PyTypeObject StrType = {
     .tp_as_mapping = &Str_as_mapping,
     .tp_hash = Str_hash, // String hashing functions
     .tp_richcompare = Str_richcompare,
+    .tp_str = Str_str,
     // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer
 };
 
@@ -554,6 +674,24 @@ static PyObject *vectorized_split = NULL;
 static PyObject *vectorized_sort = NULL;
 static PyObject *vectorized_shuffle = NULL;
 
+PyObject *register_vectorcall(PyObject *module, char const *name, vectorcallfunc vectorcall) {
+
+    PyCFunctionObject *vectorcall_object = (PyCFunctionObject *)PyObject_Malloc(sizeof(PyCFunctionObject));
+    if (vectorcall_object == NULL)
+        return NULL;
+
+    PyObject_Init(vectorcall_object, &PyCFunction_Type);
+    vectorcall_object->m_ml = NULL; // No regular `PyMethodDef`
+    vectorcall_object->vectorcall = vectorcall;
+
+    // Add the 'find' function to the module
+    if (PyModule_AddObject(module, name, vectorcall_object) < 0) {
+        Py_XDECREF(vectorcall_object);
+        return NULL;
+    }
+    return vectorcall_object;
+}
+
 PyMODINIT_FUNC PyInit_stringzilla(void) {
     PyObject *m;
 
@@ -582,26 +720,19 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
         return NULL;
     }
 
-    // Create the 'find' function
-    vectorized_find = PyObject_Malloc(sizeof(PyCFunctionObject));
-    if (vectorized_find == NULL) {
-        Py_XDECREF(&MemoryMappedFileType);
-        Py_XDECREF(&StrType);
-        Py_XDECREF(m);
-        PyErr_NoMemory();
-        return NULL;
-    }
-    PyObject_Init(vectorized_find, &PyCFunction_Type);
-    ((PyCFunctionObject *)vectorized_find)->m_ml = NULL; // No regular PyMethodDef
-    ((PyCFunctionObject *)vectorized_find)->vectorcall = str_find_vectorcall;
+    // Register the vectorized functions
+    vectorized_find = register_vectorcall(m, "find", str_find_vectorcall);
+    vectorized_contains = register_vectorcall(m, "contains", str_contains_vectorcall);
+    vectorized_count = register_vectorcall(m, "count", str_count_vectorcall);
+    vectorized_levenstein = register_vectorcall(m, "levenstein", str_find_vectorcall);
 
-    // Add the 'find' function to the module
-    if (PyModule_AddObject(m, "find", vectorized_find) < 0) {
-        PyObject_Free(vectorized_find);
-        Py_XDECREF(&MemoryMappedFileType);
-        Py_XDECREF(&StrType);
-        Py_XDECREF(m);
-        return NULL;
+    vectorized_split = register_vectorcall(m, "split", str_find_vectorcall);
+    vectorized_sort = register_vectorcall(m, "sort", str_find_vectorcall);
+    vectorized_shuffle = register_vectorcall(m, "shuffle", str_find_vectorcall);
+    if (!vectorized_find || !vectorized_count ||          //
+        !vectorized_contains || !vectorized_levenstein || //
+        !vectorized_split || !vectorized_sort || !vectorized_shuffle) {
+        goto cleanup;
     }
 
     return m;
@@ -609,16 +740,21 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
 cleanup:
     if (vectorized_find)
         Py_XDECREF(vectorized_find);
-    if (vectorized_count)
-        Py_XDECREF(vectorized_count);
     if (vectorized_contains)
         Py_XDECREF(vectorized_contains);
+    if (vectorized_count)
+        Py_XDECREF(vectorized_count);
+    if (vectorized_levenstein)
+        Py_XDECREF(vectorized_levenstein);
     if (vectorized_split)
         Py_XDECREF(vectorized_split);
     if (vectorized_sort)
         Py_XDECREF(vectorized_sort);
     if (vectorized_shuffle)
         Py_XDECREF(vectorized_shuffle);
+
+    Py_XDECREF(&MemoryMappedFileType);
+    Py_XDECREF(&StrType);
     Py_XDECREF(m);
     PyErr_NoMemory();
     return NULL;
diff --git a/scripts/test.py b/scripts/test.py
index 61b121ad..21d5d7ba 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -9,6 +9,16 @@
 from stringzilla import Str
 
 
+def test_globals():
+    assert sz.find("abcdef", "bcdef") == 1
+    assert sz.find("abcdef", "x") == 6
+
+    assert sz.count("abcdef", "x") == 0
+    assert sz.count("aaaaa", "a") == 5
+    assert sz.count("aaaaa", "aa") == 2
+    assert sz.count("aaaaa", "aa", allowoverlap=True) == 4
+
+
 def test_construct():
     native = "aaaaa"
     big = Str(native)
@@ -35,11 +45,6 @@ def test_rich_comparisons():
     assert Str("abb")[1:] == "bb"
 
 
-def test_globals():
-    assert sz.find("abcdef", "bcdef") == 1
-    assert sz.find("abcdef", "x") == 6
-
-
 # def get_random_string(
 #     length: Optional[int] = None, variability: Optional[int] = None
 # ) -> str:

From 4ed2e9a83d286ac299d84f22ea0172152a40f0b0 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 17 Sep 2023 16:20:15 +0400
Subject: [PATCH 11/72] Fix: Bounded Levenstein distance

---
 .vscode/settings.json     |   4 +-
 README.md                 |   8 +-
 python/lib.c              | 183 +++++++++++++++++++++++++++-----------
 scripts/test.py           |   8 +-
 stringzilla/stringzilla.h |  35 ++++++--
 5 files changed, 175 insertions(+), 63 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 7ed8de05..2a8fa9c2 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -116,10 +116,12 @@
     "stop_token": "cpp",
     "__verbose_abort": "cpp",
     "strstream": "cpp",
-    "filesystem": "cpp"
+    "filesystem": "cpp",
+    "__memory": "c"
   },
   "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
   "cSpell.words": [
+    "abababab",
     "allowoverlap",
     "basicsize",
     "bigram",
diff --git a/README.md b/README.md
index 973deaee..8b4630de 100644
--- a/README.md
+++ b/README.md
@@ -36,21 +36,21 @@ Coming soon.
 ## Quick Start: Python 🐍
 
 1️. Install via pip: `pip install stringzilla`  
-1. Import the classes you need: `from stringzilla import Str, Strs, MemoryMappedFile`  
+1. Import the classes you need: `from stringzilla import Str, Strs, File`  
 
 ### Basic Usage
 
 StringZilla offers two mostly interchangeable core classes:
 
 ```python
-from stringzilla import Str, MemoryMappedFile
+from stringzilla import Str, File
 
 text_from_str = Str('some-string')
-text_from_file = Str(MemoryMappedFile('some-file.txt'))
+text_from_file = Str(File('some-file.txt'))
 ```
 
 The `Str` is designed to replace long Python `str` strings and wrap our C-level API.
-On the other hand, the `MemoryMappedFile` memory-maps a file from persistent memory without loading its copy into RAM.
+On the other hand, the `File` memory-maps a file from persistent memory without loading its copy into RAM.
 The contents of that file would remain immutable, and the mapping can be shared by multiple Python processes simultaneously.
 A standard dataset pre-processing use case would be to map a sizeable textual dataset like Common Crawl into memory, spawn child processes, and split the job between them.
 
diff --git a/python/lib.c b/python/lib.c
index 93bbce93..768b0d7a 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -25,9 +25,14 @@ typedef SSIZE_T ssize_t;
 
 #pragma region Forward Declarations
 
-static PyTypeObject MemoryMappedFileType;
+static PyTypeObject FileType;
 static PyTypeObject StrType;
 
+struct {
+    void *ptr;
+    size_t len;
+} temporary_memory = {NULL, 0};
+
 /**
  *  @brief  Describes an on-disk file mapped into RAM, which is different from Python's
  *          native `mmap` module, as it exposes the address of the mapping in memory.
@@ -42,11 +47,11 @@ typedef struct {
 #endif
     void *start;
     size_t length;
-} MemoryMappedFile;
+} File;
 
 /**
  *  @brief  Type-punned StringZilla-string, that points to a slice of an existing Python `str`
- *          or a `MemoryMappedFile`.
+ *          or a `File`.
  *
  *  When a slice is constructed, the `parent` object's reference count is being incremented to preserve lifetime.
  *  It usage in Python would look like:
@@ -112,8 +117,8 @@ int export_string_like(PyObject *object, char const **start, size_t *length) {
         *length = str->length;
         return 1;
     }
-    else if (PyObject_TypeCheck(object, &MemoryMappedFileType)) {
-        MemoryMappedFile *file = (MemoryMappedFile *)object;
+    else if (PyObject_TypeCheck(object, &FileType)) {
+        File *file = (File *)object;
         *start = file->start;
         *length = file->length;
         return 1;
@@ -125,7 +130,7 @@ int export_string_like(PyObject *object, char const **start, size_t *length) {
 
 #pragma region Global Functions
 
-static size_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
+static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
     Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
 
     // Initialize defaults
@@ -140,9 +145,9 @@ static size_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t na
 
     PyObject *haystack_obj = args[0];
     PyObject *needle_obj = args[1];
-
     struct strzl_haystack_t haystack;
     struct strzl_needle_t needle;
+    needle.anomaly_offset = 0;
     if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) ||
         !export_string_like(needle_obj, &needle.ptr, &needle.len)) {
         PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
@@ -178,21 +183,24 @@ static size_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t na
     haystack.len = normalized_length;
 
     // Perform contains operation
-    return strzl_neon_find_substr(haystack, needle);
+    size_t offset = strzl_neon_find_substr(haystack, needle);
+    if (offset == haystack.len)
+        return -1;
+    return (Py_ssize_t)offset;
 }
 
 static PyObject *str_find_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
-    size_t offset = str_find_vectorcall_(NULL, args, nargsf, kwnames);
-    return PyLong_FromSize_t(offset);
+    Py_ssize_t signed_offset = str_find_vectorcall_(NULL, args, nargsf, kwnames);
+    return PyLong_FromSsize_t(signed_offset);
 }
 
 static PyObject *str_contains_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
-    size_t offset = str_find_vectorcall_(NULL, args, nargsf, kwnames);
-    if (offset != haystack.len) {
-        Py_RETURN_TRUE;
+    Py_ssize_t signed_offset = str_find_vectorcall_(NULL, args, nargsf, kwnames);
+    if (signed_offset == -1) {
+        Py_RETURN_FALSE;
     }
     else {
-        Py_RETURN_FALSE;
+        Py_RETURN_TRUE;
     }
 }
 
@@ -215,6 +223,7 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t
 
     struct strzl_haystack_t haystack;
     struct strzl_needle_t needle;
+    needle.anomaly_offset = 0;
     if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) ||
         !export_string_like(needle_obj, &needle.ptr, &needle.len)) {
         PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
@@ -281,11 +290,81 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t
     return PyLong_FromSize_t(count);
 }
 
+static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+
+    // Validate the number of arguments
+    if (nargs < 2 || nargs > 3) {
+        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
+        return NULL;
+    }
+
+    PyObject *str1_obj = args[0];
+    PyObject *str2_obj = args[1];
+
+    struct strzl_haystack_t str1, str2;
+    if (!export_string_like(str1_obj, &str1.ptr, &str1.len) || !export_string_like(str2_obj, &str2.ptr, &str2.len)) {
+        PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
+        return NULL;
+    }
+
+    // Initialize bound argument
+    int bound = 255;
+
+    // Check if `bound` is given as a positional argument
+    if (nargs == 3) {
+        bound = PyLong_AsLong(args[2]);
+        if (bound > 255 || bound < 0) {
+            PyErr_SetString(PyExc_ValueError, "Bound must be an integer between 0 and 255");
+            return NULL;
+        }
+    }
+
+    // Parse keyword arguments
+    if (kwnames != NULL) {
+        for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) {
+            PyObject *key = PyTuple_GetItem(kwnames, i);
+            PyObject *value = args[nargs + i];
+            if (PyUnicode_CompareWithASCIIString(key, "bound") == 0) {
+                if (nargs == 3) {
+                    PyErr_SetString(PyExc_TypeError, "Received bound both as positional and keyword argument");
+                    return NULL;
+                }
+                bound = PyLong_AsLong(value);
+                if (bound > 255 || bound < 0) {
+                    PyErr_SetString(PyExc_ValueError, "Bound must be an integer between 0 and 255");
+                    return NULL;
+                }
+            }
+        }
+    }
+
+    // Initialize or reallocate the Levenshtein distance matrix
+    size_t memory_needed = strzl_levenstein_memory_needed(str1.len, str2.len);
+    if (temporary_memory.len < memory_needed) {
+        temporary_memory.ptr = realloc(temporary_memory.ptr, memory_needed);
+        temporary_memory.len = memory_needed;
+    }
+    if (temporary_memory.ptr == NULL) {
+        PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix");
+        return NULL;
+    }
+
+    levenstein_distance_t distance = strzl_levenstein( //
+        str1.ptr,
+        str1.len,
+        str2.ptr,
+        str2.len,
+        (levenstein_distance_t)bound,
+        temporary_memory.ptr);
+    return PyLong_FromLong(distance);
+}
+
 #pragma endregion
 
 #pragma region MemoryMappingFile
 
-static void MemoryMappedFile_dealloc(MemoryMappedFile *self) {
+static void File_dealloc(File *self) {
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
     if (self->start) {
         UnmapViewOfFile(self->start);
@@ -313,9 +392,9 @@ static void MemoryMappedFile_dealloc(MemoryMappedFile *self) {
     Py_TYPE(self)->tp_free((PyObject *)self);
 }
 
-static PyObject *MemoryMappedFile_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) {
-    MemoryMappedFile *self;
-    self = (MemoryMappedFile *)type->tp_alloc(type, 0);
+static PyObject *File_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) {
+    File *self;
+    self = (File *)type->tp_alloc(type, 0);
     if (self == NULL)
         return NULL;
 
@@ -329,7 +408,7 @@ static PyObject *MemoryMappedFile_new(PyTypeObject *type, PyObject *positional_a
     self->length = 0;
 }
 
-static int MemoryMappedFile_init(MemoryMappedFile *self, PyObject *positional_args, PyObject *named_args) {
+static int File_init(File *self, PyObject *positional_args, PyObject *named_args) {
     const char *path;
     if (!PyArg_ParseTuple(positional_args, "s", &path))
         return -1;
@@ -384,18 +463,18 @@ static int MemoryMappedFile_init(MemoryMappedFile *self, PyObject *positional_ar
     return 0;
 }
 
-static PyMethodDef MemoryMappedFile_methods[] = { //
+static PyMethodDef File_methods[] = { //
     {NULL, NULL, 0, NULL}};
 
-static PyTypeObject MemoryMappedFileType = {
-    PyObject_HEAD_INIT(NULL).tp_name = "stringzilla.MemoryMappedFile",
+static PyTypeObject FileType = {
+    PyObject_HEAD_INIT(NULL).tp_name = "stringzilla.File",
     .tp_doc = "Memory mapped file class, that exposes the memory range for low-level access",
-    .tp_basicsize = sizeof(MemoryMappedFile),
+    .tp_basicsize = sizeof(File),
     .tp_flags = Py_TPFLAGS_DEFAULT,
-    .tp_methods = MemoryMappedFile_methods,
-    .tp_new = (newfunc)MemoryMappedFile_new,
-    .tp_init = (initproc)MemoryMappedFile_init,
-    .tp_dealloc = (destructor)MemoryMappedFile_dealloc,
+    .tp_methods = File_methods,
+    .tp_new = (newfunc)File_new,
+    .tp_init = (initproc)File_init,
+    .tp_dealloc = (destructor)File_dealloc,
 
     // PyBufferProcs *tp_as_buffer;
 
@@ -663,17 +742,6 @@ static PyModuleDef stringzilla_module = {
     NULL,
 };
 
-// String functions:
-static PyObject *vectorized_find = NULL;
-static PyObject *vectorized_count = NULL;
-static PyObject *vectorized_contains = NULL;
-static PyObject *vectorized_levenstein = NULL;
-
-// String collections:
-static PyObject *vectorized_split = NULL;
-static PyObject *vectorized_sort = NULL;
-static PyObject *vectorized_shuffle = NULL;
-
 PyObject *register_vectorcall(PyObject *module, char const *name, vectorcallfunc vectorcall) {
 
     PyCFunctionObject *vectorcall_object = (PyCFunctionObject *)PyObject_Malloc(sizeof(PyCFunctionObject));
@@ -692,13 +760,19 @@ PyObject *register_vectorcall(PyObject *module, char const *name, vectorcallfunc
     return vectorcall_object;
 }
 
+void cleanup_module(void) {
+    free(temporary_memory.ptr);
+    temporary_memory.ptr = NULL;
+    temporary_memory.len = 0;
+}
+
 PyMODINIT_FUNC PyInit_stringzilla(void) {
     PyObject *m;
 
     if (PyType_Ready(&StrType) < 0)
         return NULL;
 
-    if (PyType_Ready(&MemoryMappedFileType) < 0)
+    if (PyType_Ready(&FileType) < 0)
         return NULL;
 
     m = PyModule_Create(&stringzilla_module);
@@ -712,23 +786,30 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
         return NULL;
     }
 
-    Py_INCREF(&MemoryMappedFileType);
-    if (PyModule_AddObject(m, "MemoryMappedFile", (PyObject *)&MemoryMappedFileType) < 0) {
-        Py_XDECREF(&MemoryMappedFileType);
+    Py_INCREF(&FileType);
+    if (PyModule_AddObject(m, "File", (PyObject *)&FileType) < 0) {
+        Py_XDECREF(&FileType);
         Py_XDECREF(&StrType);
         Py_XDECREF(m);
         return NULL;
     }
 
+    // Initialize temporary_memory, if needed
+    // For example, allocate an initial chunk
+    temporary_memory.ptr = malloc(4096);
+    temporary_memory.len = 4096 * (temporary_memory.ptr != NULL);
+    atexit(cleanup_module);
+
     // Register the vectorized functions
-    vectorized_find = register_vectorcall(m, "find", str_find_vectorcall);
-    vectorized_contains = register_vectorcall(m, "contains", str_contains_vectorcall);
-    vectorized_count = register_vectorcall(m, "count", str_count_vectorcall);
-    vectorized_levenstein = register_vectorcall(m, "levenstein", str_find_vectorcall);
-
-    vectorized_split = register_vectorcall(m, "split", str_find_vectorcall);
-    vectorized_sort = register_vectorcall(m, "sort", str_find_vectorcall);
-    vectorized_shuffle = register_vectorcall(m, "shuffle", str_find_vectorcall);
+    PyObject *vectorized_find = register_vectorcall(m, "find", str_find_vectorcall);
+    PyObject *vectorized_contains = register_vectorcall(m, "contains", str_contains_vectorcall);
+    PyObject *vectorized_count = register_vectorcall(m, "count", str_count_vectorcall);
+    PyObject *vectorized_levenstein = register_vectorcall(m, "levenstein", str_levenstein_vectorcall);
+
+    PyObject *vectorized_split = register_vectorcall(m, "split", str_find_vectorcall);
+    PyObject *vectorized_sort = register_vectorcall(m, "sort", str_find_vectorcall);
+    PyObject *vectorized_shuffle = register_vectorcall(m, "shuffle", str_find_vectorcall);
+
     if (!vectorized_find || !vectorized_count ||          //
         !vectorized_contains || !vectorized_levenstein || //
         !vectorized_split || !vectorized_sort || !vectorized_shuffle) {
@@ -753,7 +834,7 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
     if (vectorized_shuffle)
         Py_XDECREF(vectorized_shuffle);
 
-    Py_XDECREF(&MemoryMappedFileType);
+    Py_XDECREF(&FileType);
     Py_XDECREF(&StrType);
     Py_XDECREF(m);
     PyErr_NoMemory();
diff --git a/scripts/test.py b/scripts/test.py
index 21d5d7ba..c3f70523 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -11,13 +11,19 @@
 
 def test_globals():
     assert sz.find("abcdef", "bcdef") == 1
-    assert sz.find("abcdef", "x") == 6
+    assert sz.find("abcdef", "x") == -1
 
     assert sz.count("abcdef", "x") == 0
     assert sz.count("aaaaa", "a") == 5
     assert sz.count("aaaaa", "aa") == 2
     assert sz.count("aaaaa", "aa", allowoverlap=True) == 4
 
+    assert sz.levenstein("aaa", "aaa") == 0
+    assert sz.levenstein("aaa", "bbb") == 3
+    assert sz.levenstein("abababab", "aaaaaaaa") == 4
+    assert sz.levenstein("abababab", "aaaaaaaa", 2) == 2
+    assert sz.levenstein("abababab", "aaaaaaaa", bound=2) == 2
+
 
 def test_construct():
     native = "aaaaa"
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index ca58b1ed..dc89934c 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -412,9 +412,9 @@ inline static size_t strzl_neon_find_substr(strzl_haystack_t h, strzl_needle_t n
         uint32x4_t matches3 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 3)), masks), anomalies);
 
         // Extracting matches from matches:
-        // vmaxvq_u32 (only a64)
-        // vgetq_lane_u32 (all)
-        // vorrq_u32 (all)
+        //   vmaxvq_u32 (only a64)
+        //   vgetq_lane_u32 (all)
+        //   vorrq_u32 (all)
         uint32x4_t matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3));
         uint64x2_t matches64x2 = vreinterpretq_u64_u32(matches);
         int has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1);
@@ -686,10 +686,21 @@ inline static levenstein_distance_t strzl_levenstein( //
     levenstein_distance_t bound,
     void *buffer) {
 
+    // If one of the strings is empty - the edit distance is equal to the length of the other one
     if (a_length == 0)
-        return b_length <= bound ? b_length : bound + 1;
+        return b_length <= bound ? b_length : bound;
     if (b_length == 0)
-        return a_length <= bound ? a_length : bound + 1;
+        return a_length <= bound ? a_length : bound;
+
+    // If the difference in length is beyond the `bound`, there is no need to check at all
+    if (a_length > b_length) {
+        if (a_length - b_length > bound)
+            return bound + 1;
+    }
+    else {
+        if (b_length - a_length > bound)
+            return bound + 1;
+    }
 
     levenstein_distance_t *previous_distances = (levenstein_distance_t *)buffer;
     levenstein_distance_t *current_distances = previous_distances + b_length + 1;
@@ -700,20 +711,32 @@ inline static levenstein_distance_t strzl_levenstein( //
     for (size_t idx_a = 0; idx_a != a_length; ++idx_a) {
         current_distances[0] = idx_a + 1;
 
+        // Initialize min_distance with a value greater than bound
+        levenstein_distance_t min_distance = bound;
+
         for (size_t idx_b = 0; idx_b != b_length; ++idx_b) {
             levenstein_distance_t cost_deletion = previous_distances[idx_b + 1] + 1;
             levenstein_distance_t cost_insertion = current_distances[idx_b] + 1;
             levenstein_distance_t cost_substitution = previous_distances[idx_b] + (a[idx_a] != b[idx_b]);
             current_distances[idx_b + 1] = _strzl_levenstein_minimum(cost_deletion, cost_insertion, cost_substitution);
+
+            // Keep track of the minimum distance seen so far in this row
+            if (current_distances[idx_b + 1] < min_distance) {
+                min_distance = current_distances[idx_b + 1];
+            }
         }
 
+        // If the minimum distance in this row exceeded the bound, return early
+        if (min_distance > bound)
+            return bound;
+
         // Swap previous_distances and current_distances pointers
         levenstein_distance_t *temp = previous_distances;
         previous_distances = current_distances;
         current_distances = temp;
     }
 
-    return previous_distances[b_length];
+    return previous_distances[b_length] <= bound ? previous_distances[b_length] : bound;
 }
 
 /**

From d37b3422d99d1cfa20532ff9ff29ae6200acee07 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 17 Sep 2023 16:28:57 +0400
Subject: [PATCH 12/72] Break: Shorter function prefixes

---
 .vscode/settings.json     |   1 +
 README.md                 |  14 +--
 python/lib.c              |  30 +++---
 scripts/test.cpp          |  36 +++----
 stringzilla/stringzilla.h | 221 +++++++++++++++++++-------------------
 5 files changed, 152 insertions(+), 150 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 2a8fa9c2..48034254 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -158,6 +158,7 @@
     "TPFLAGS",
     "Vardanian",
     "vectorcallfunc",
+    "XDECREF",
     "Zilla"
   ]
 }
\ No newline at end of file
diff --git a/README.md b/README.md
index 8b4630de..1f9cea00 100644
--- a/README.md
+++ b/README.md
@@ -115,17 +115,17 @@ There is an ABI-stable C 99 interface, in case you have a database, an operating
 #include "stringzilla.h"
 
 // Initialize your haystack and needle
-strzl_haystack_t haystack = {your_text, your_text_length};
-strzl_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset};
+sz_haystack_t haystack = {your_text, your_text_length};
+sz_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset};
 
 // Perform string-level operations
-size_t character_count = strzl_naive_count_char(haystack, 'a');
-size_t character_position = strzl_naive_find_char(haystack, 'a');
-size_t substring_position = strzl_naive_find_substr(haystack, needle);
+size_t character_count = sz_naive_count_char(haystack, 'a');
+size_t character_position = sz_naive_find_char(haystack, 'a');
+size_t substring_position = sz_naive_find_substr(haystack, needle);
 
 // Perform collection level operations
-strzl_array_t array = {your_order, your_count, your_get_begin, your_get_length, your_handle};
-strzl_sort(&array, &your_config);
+sz_array_t array = {your_order, your_count, your_get_begin, your_get_length, your_handle};
+sz_sort(&array, &your_config);
 ```
 
 ## Contributing 👾
diff --git a/python/lib.c b/python/lib.c
index 768b0d7a..84582d32 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -145,8 +145,8 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_
 
     PyObject *haystack_obj = args[0];
     PyObject *needle_obj = args[1];
-    struct strzl_haystack_t haystack;
-    struct strzl_needle_t needle;
+    struct sz_haystack_t haystack;
+    struct sz_needle_t needle;
     needle.anomaly_offset = 0;
     if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) ||
         !export_string_like(needle_obj, &needle.ptr, &needle.len)) {
@@ -183,7 +183,7 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_
     haystack.len = normalized_length;
 
     // Perform contains operation
-    size_t offset = strzl_neon_find_substr(haystack, needle);
+    size_t offset = sz_neon_find_substr(haystack, needle);
     if (offset == haystack.len)
         return -1;
     return (Py_ssize_t)offset;
@@ -221,8 +221,8 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t
     PyObject *haystack_obj = args[0];
     PyObject *needle_obj = args[1];
 
-    struct strzl_haystack_t haystack;
-    struct strzl_needle_t needle;
+    struct sz_haystack_t haystack;
+    struct sz_needle_t needle;
     needle.anomaly_offset = 0;
     if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) ||
         !export_string_like(needle_obj, &needle.ptr, &needle.len)) {
@@ -263,13 +263,13 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t
     // Perform counting operation
     size_t count = 0;
     if (needle.len == 1) {
-        count = strzl_naive_count_char(haystack, *needle.ptr);
+        count = sz_naive_count_char(haystack, *needle.ptr);
     }
     else {
         // Your existing logic for count_substr can be embedded here
         if (allow_overlap) {
             while (haystack.len) {
-                size_t offset = strzl_neon_find_substr(haystack, needle);
+                size_t offset = sz_neon_find_substr(haystack, needle);
                 int found = offset != haystack.len;
                 count += found;
                 haystack.ptr += offset + found;
@@ -278,7 +278,7 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t
         }
         else {
             while (haystack.len) {
-                size_t offset = strzl_neon_find_substr(haystack, needle);
+                size_t offset = sz_neon_find_substr(haystack, needle);
                 int found = offset != haystack.len;
                 count += found;
                 haystack.ptr += offset + needle.len;
@@ -302,7 +302,7 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s
     PyObject *str1_obj = args[0];
     PyObject *str2_obj = args[1];
 
-    struct strzl_haystack_t str1, str2;
+    struct sz_haystack_t str1, str2;
     if (!export_string_like(str1_obj, &str1.ptr, &str1.len) || !export_string_like(str2_obj, &str2.ptr, &str2.len)) {
         PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
         return NULL;
@@ -340,7 +340,7 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s
     }
 
     // Initialize or reallocate the Levenshtein distance matrix
-    size_t memory_needed = strzl_levenstein_memory_needed(str1.len, str2.len);
+    size_t memory_needed = sz_levenstein_memory_needed(str1.len, str2.len);
     if (temporary_memory.len < memory_needed) {
         temporary_memory.ptr = realloc(temporary_memory.ptr, memory_needed);
         temporary_memory.len = memory_needed;
@@ -350,7 +350,7 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s
         return NULL;
     }
 
-    levenstein_distance_t distance = strzl_levenstein( //
+    levenstein_distance_t distance = sz_levenstein( //
         str1.ptr,
         str1.len,
         str2.ptr,
@@ -551,7 +551,7 @@ static PyObject *Str_str(Str *self) { return PyUnicode_FromStringAndSize(self->s
 
 static Py_ssize_t Str_len(Str *self) { return self->length; }
 
-static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)strzl_hash_crc32_native(self->start, self->length); }
+static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash_crc32_native(self->start, self->length); }
 
 static PyObject *Str_getitem(Str *self, Py_ssize_t i) {
 
@@ -607,17 +607,17 @@ static PyObject *Str_subscript(Str *self, PyObject *key) {
 // Will be called by the `PySequence_Contains`
 static int Str_contains(Str *self, PyObject *arg) {
 
-    struct strzl_needle_t needle_struct;
+    struct sz_needle_t needle_struct;
     needle_struct.anomaly_offset = 0;
     if (!export_string_like(arg, &needle_struct.ptr, &needle_struct.len)) {
         PyErr_SetString(PyExc_TypeError, "Unsupported argument type");
         return -1;
     }
 
-    struct strzl_haystack_t haystack;
+    struct sz_haystack_t haystack;
     haystack.ptr = self->start;
     haystack.len = self->length;
-    size_t position = strzl_neon_find_substr(haystack, needle_struct);
+    size_t position = sz_neon_find_substr(haystack, needle_struct);
     return position != haystack.len;
 }
 
diff --git a/scripts/test.cpp b/scripts/test.cpp
index c6fa28ab..3e76248e 100644
--- a/scripts/test.cpp
+++ b/scripts/test.cpp
@@ -195,23 +195,23 @@ int main(int, char const **) {
         bench_search("std::search", full_text, [&]() {
             return std::search(full_text.begin(), full_text.end(), needle.begin(), needle.end()) - full_text.begin();
         });
-        bench_search("strzl_naive_find_substr", full_text, [&]() {
-            strzl_haystack_t h {full_text.data(), full_text.size()};
-            strzl_needle_t n {needle.data(), needle.size()};
-            return strzl_naive_find_substr(h, n);
+        bench_search("sz_naive_find_substr", full_text, [&]() {
+            sz_haystack_t h {full_text.data(), full_text.size()};
+            sz_needle_t n {needle.data(), needle.size()};
+            return sz_naive_find_substr(h, n);
         });
 #if defined(__ARM_NEON)
-        bench_search("strzl_neon_find_substr", full_text, [&]() {
-            strzl_haystack_t h {full_text.data(), full_text.size()};
-            strzl_needle_t n {needle.data(), needle.size()};
-            return strzl_neon_find_substr(h, n);
+        bench_search("sz_neon_find_substr", full_text, [&]() {
+            sz_haystack_t h {full_text.data(), full_text.size()};
+            sz_needle_t n {needle.data(), needle.size()};
+            return sz_neon_find_substr(h, n);
         });
 #endif
 #if defined(__AVX2__)
-        bench_search("strzl_avx2_find_substr", full_text, [&]() {
-            strzl_haystack_t h {full_text.data(), full_text.size()};
-            strzl_needle_t n {needle.data(), needle.size()};
-            return strzl_avx2_find_substr(h, n);
+        bench_search("sz_avx2_find_substr", full_text, [&]() {
+            sz_haystack_t h {full_text.data(), full_text.size()};
+            sz_needle_t n {needle.data(), needle.size()};
+            return sz_avx2_find_substr(h, n);
         });
 #endif
     }
@@ -233,12 +233,12 @@ int main(int, char const **) {
         });
         expect_partitioned_by_length(strings, permute_base);
 
-        bench_permute("strzl_partition", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
-            strzl_array_t array;
+        bench_permute("sz_partition", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
+            sz_array_t array;
             array.order = permute.data();
             array.count = strings.size();
             array.handle = &strings;
-            strzl_partition(&array, &has_under_four_chars);
+            sz_partition(&array, &has_under_four_chars);
         });
         expect_partitioned_by_length(strings, permute_new);
         // TODO: expect_same(permute_base, permute_new);
@@ -252,14 +252,14 @@ int main(int, char const **) {
         });
         expect_sorted(strings, permute_base);
 
-        bench_permute("strzl_sort", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
-            strzl_array_t array;
+        bench_permute("sz_sort", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
+            sz_array_t array;
             array.order = permute.data();
             array.count = strings.size();
             array.handle = &strings;
             array.get_begin = get_begin;
             array.get_length = get_length;
-            strzl_sort(&array, nullptr);
+            sz_sort(&array, nullptr);
         });
         expect_sorted(strings, permute_new);
 
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index dc89934c..5e60adb5 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -2,7 +2,7 @@
 #define STRINGZILLA_H_
 
 #include <stdint.h> // `uint8_t`
-#include <stddef.h> // `size_t`
+#include <stddef.h> // `sz_size_t`
 #include <string.h> // `memcpy`
 #include <stdlib.h> // `qsort_r`
 #include <search.h> // `qsort_s`
@@ -29,38 +29,39 @@
 extern "C" {
 #endif
 
-typedef uint32_t strzl_anomaly_t;
+typedef uint32_t sz_anomaly_t;
+typedef uint64_t sz_size_t;
 
-inline static size_t strzl_divide_round_up(size_t x, size_t divisor) { return (x + (divisor - 1)) / divisor; }
+inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { return (x + (divisor - 1)) / divisor; }
 
 /**
  *  @brief This is a faster alternative to `strncmp(a, b, len) == 0`.
  *  @return 1 for `true`, and 0 for `false`.
  */
-inline static int strzl_equal(char const *a, char const *b, size_t len) {
+inline static int sz_equal(char const *a, char const *b, sz_size_t len) {
     char const *const a_end = a + len;
     while (a != a_end && *a == *b)
         a++, b++;
     return a_end == a;
 }
 
-typedef struct strzl_haystack_t {
+typedef struct sz_haystack_t {
     char const *ptr;
-    size_t len;
-} strzl_haystack_t;
+    sz_size_t len;
+} sz_haystack_t;
 
-typedef struct strzl_needle_t {
+typedef struct sz_needle_t {
     char const *ptr;
-    size_t len;
-    size_t anomaly_offset;
-} strzl_needle_t;
+    sz_size_t len;
+    sz_size_t anomaly_offset;
+} sz_needle_t;
 
 /**
  *  @brief  SWAR single-character counting procedure, jumping 8 bytes at a time.
  */
-inline static size_t strzl_naive_count_char(strzl_haystack_t h, char n) {
+inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) {
 
-    size_t result = 0;
+    sz_size_t result = 0;
     char const *text = h.ptr;
     char const *end = h.ptr + h.len;
 
@@ -90,7 +91,7 @@ inline static size_t strzl_naive_count_char(strzl_haystack_t h, char n) {
 /**
  *  @brief  SWAR single-character search in string, jumping 8 bytes at a time.
  */
-inline static size_t strzl_naive_find_char(strzl_haystack_t h, char n) {
+inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) {
 
     char const *text = h.ptr;
     char const *end = h.ptr + h.len;
@@ -125,7 +126,7 @@ inline static size_t strzl_naive_find_char(strzl_haystack_t h, char n) {
 /**
  *  @brief  SWAR character-bigram search in string, jumping 8 bytes at a time.
  */
-inline static size_t strzl_naive_find_2chars(strzl_haystack_t h, char const *n) {
+inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) {
 
     char const *text = h.ptr;
     char const *end = h.ptr + h.len;
@@ -167,7 +168,7 @@ inline static size_t strzl_naive_find_2chars(strzl_haystack_t h, char const *n)
 /**
  *  @brief  SWAR character-trigram search in string, jumping 8 bytes at a time.
  */
-inline static size_t strzl_naive_find_3chars(strzl_haystack_t h, char const *n) {
+inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) {
 
     char const *text = h.ptr;
     char const *end = h.ptr + h.len;
@@ -222,7 +223,7 @@ inline static size_t strzl_naive_find_3chars(strzl_haystack_t h, char const *n)
 /**
  *  @brief  SWAR character-quadgram search in string, jumping 8 bytes at a time.
  */
-inline static size_t strzl_naive_find_4chars(strzl_haystack_t h, char const *n) {
+inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) {
 
     char const *text = h.ptr;
     char const *end = h.ptr + h.len;
@@ -283,7 +284,7 @@ inline static size_t strzl_naive_find_4chars(strzl_haystack_t h, char const *n)
  *          it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper.
  *          Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core.
  */
-inline static size_t strzl_naive_find_substr(strzl_haystack_t h, strzl_needle_t n) {
+inline static sz_size_t sz_naive_find_substr(sz_haystack_t h, sz_needle_t n) {
 
     if (h.len < n.len)
         return h.len;
@@ -292,22 +293,22 @@ inline static size_t strzl_naive_find_substr(strzl_haystack_t h, strzl_needle_t
     char const *const end = h.ptr + h.len;
     switch (n.len) {
     case 0: return 0;
-    case 1: return strzl_naive_find_char(h, *n.ptr);
-    case 2: return strzl_naive_find_2chars(h, n.ptr);
-    case 3: return strzl_naive_find_3chars(h, n.ptr);
-    case 4: return strzl_naive_find_4chars(h, n.ptr);
+    case 1: return sz_naive_find_char(h, *n.ptr);
+    case 2: return sz_naive_find_2chars(h, n.ptr);
+    case 3: return sz_naive_find_3chars(h, n.ptr);
+    case 4: return sz_naive_find_4chars(h, n.ptr);
     default: {
-        strzl_anomaly_t n_anomaly, h_anomaly;
-        size_t const n_suffix_len = n.len - 4 - n.anomaly_offset;
+        sz_anomaly_t n_anomaly, h_anomaly;
+        sz_size_t const n_suffix_len = n.len - 4 - n.anomaly_offset;
         char const *n_suffix_ptr = n.ptr + 4 + n.anomaly_offset;
         memcpy(&n_anomaly, n.ptr + n.anomaly_offset, 4);
 
         text += n.anomaly_offset;
         for (; text + n.len <= end; text++) {
             memcpy(&h_anomaly, text, 4);
-            if (h_anomaly == n_anomaly)                                                // Match anomaly.
-                if (strzl_equal(text + 4, n_suffix_ptr, n_suffix_len))                 // Match suffix.
-                    if (strzl_equal(text - n.anomaly_offset, n.ptr, n.anomaly_offset)) // Match prefix.
+            if (h_anomaly == n_anomaly)                                             // Match anomaly.
+                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))                 // Match suffix.
+                    if (sz_equal(text - n.anomaly_offset, n.ptr, n.anomaly_offset)) // Match prefix.
                         return text - h.ptr - n.anomaly_offset;
         }
         return h.len;
@@ -323,7 +324,7 @@ inline static size_t strzl_naive_find_substr(strzl_haystack_t h, strzl_needle_t
  *          was practically more efficient than loading once and shifting around, as introduces
  *          less data dependencies.
  */
-size_t strzl_avx2_find_substr(strzl_haystack_t h, strzl_needle_t n) {
+sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
     char const *const end = h.ptr + h.len;
@@ -362,18 +363,18 @@ size_t strzl_avx2_find_substr(strzl_haystack_t h, strzl_needle_t n) {
         int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies));
 
         if (matches0 | matches1 | matches2 | matches3) {
-            for (size_t i = 0; i < 32; i++) {
-                if (strzl_equal(text + i, n.ptr, n.len))
+            for (sz_size_t i = 0; i < 32; i++) {
+                if (sz_equal(text + i, n.ptr, n.len))
                     return i + (text - h.ptr);
             }
         }
     }
 
     // Don't forget the last (up to 35) characters.
-    strzl_haystack_t h_remainder;
+    sz_haystack_t h_remainder;
     h_remainder.ptr = text;
     h_remainder.len = end - text;
-    size_t tail_match = strzl_naive_find_substr(h_remainder, n);
+    sz_size_t tail_match = sz_naive_find_substr(h_remainder, n);
     return text + tail_match - h.ptr;
 }
 
@@ -387,7 +388,7 @@ size_t strzl_avx2_find_substr(strzl_haystack_t h, strzl_needle_t n) {
  *          was practically more efficient than loading once and shifting around, as introduces
  *          less data dependencies.
  */
-inline static size_t strzl_neon_find_substr(strzl_haystack_t h, strzl_needle_t n) {
+inline static sz_size_t sz_neon_find_substr(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
     char const *const end = h.ptr + h.len;
@@ -420,55 +421,55 @@ inline static size_t strzl_neon_find_substr(strzl_haystack_t h, strzl_needle_t n
         int has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1);
 
         if (has_match) {
-            for (size_t i = 0; i < 16; i++) {
-                if (strzl_equal(text + i, n.ptr, n.len))
+            for (sz_size_t i = 0; i < 16; i++) {
+                if (sz_equal(text + i, n.ptr, n.len))
                     return i + (text - h.ptr);
             }
         }
     }
 
     // Don't forget the last (up to 16+3=19) characters.
-    strzl_haystack_t h_remainder;
+    sz_haystack_t h_remainder;
     h_remainder.ptr = text;
     h_remainder.len = end - text;
-    size_t tail_match = strzl_naive_find_substr(h_remainder, n);
+    sz_size_t tail_match = sz_naive_find_substr(h_remainder, n);
     return text + tail_match - h.ptr;
 }
 
 #endif // Arm Neon
 
-inline static void strzl_swap(size_t *a, size_t *b) {
-    size_t t = *a;
+inline static void sz_swap(sz_size_t *a, sz_size_t *b) {
+    sz_size_t t = *a;
     *a = *b;
     *b = t;
 }
 
-typedef char const *(*strzl_array_get_begin_t)(void const *, size_t);
-typedef size_t (*strzl_array_get_length_t)(void const *, size_t);
-typedef int (*strzl_array_predicate_t)(void const *, size_t);
-typedef int (*strzl_array_comparator_t)(void const *, size_t, size_t);
+typedef char const *(*sz_array_get_begin_t)(void const *, sz_size_t);
+typedef sz_size_t (*sz_array_get_length_t)(void const *, sz_size_t);
+typedef int (*sz_array_predicate_t)(void const *, sz_size_t);
+typedef int (*sz_array_comparator_t)(void const *, sz_size_t, sz_size_t);
 
-typedef struct strzl_array_t {
-    size_t *order;
-    size_t count;
-    strzl_array_get_begin_t get_begin;
-    strzl_array_get_length_t get_length;
+typedef struct sz_array_t {
+    sz_size_t *order;
+    sz_size_t count;
+    sz_array_get_begin_t get_begin;
+    sz_array_get_length_t get_length;
     void const *handle;
-} strzl_array_t;
+} sz_array_t;
 
 /**
  *  @brief  Similar to `std::partition`, given a predicate splits the
  *          array into two parts.
  */
-inline static size_t strzl_partition(strzl_array_t *array, strzl_array_predicate_t predicate) {
+inline static sz_size_t sz_partition(sz_array_t *array, sz_array_predicate_t predicate) {
 
-    size_t matches = 0;
+    sz_size_t matches = 0;
     while (matches != array->count && predicate(array->handle, array->order[matches]))
         ++matches;
 
-    for (size_t i = matches + 1; i < array->count; ++i)
+    for (sz_size_t i = matches + 1; i < array->count; ++i)
         if (predicate(array->handle, array->order[i]))
-            strzl_swap(array->order + i, array->order + matches), ++matches;
+            sz_swap(array->order + i, array->order + matches), ++matches;
 
     return matches;
 }
@@ -477,15 +478,15 @@ inline static size_t strzl_partition(strzl_array_t *array, strzl_array_predicate
  *  @brief  Inplace `std::set_union` for two consecutive chunks forming
  *          the same continuous array.
  */
-inline static void strzl_merge(strzl_array_t *array, size_t partition, strzl_array_comparator_t less) {
+inline static void sz_merge(sz_array_t *array, sz_size_t partition, sz_array_comparator_t less) {
 
-    size_t start_b = partition + 1;
+    sz_size_t start_b = partition + 1;
 
     // If the direct merge is already sorted
     if (!less(array->handle, array->order[start_b], array->order[partition]))
         return;
 
-    size_t start_a = 0;
+    sz_size_t start_a = 0;
     while (start_a <= partition && start_b <= array->count) {
 
         // If element 1 is in right place
@@ -493,8 +494,8 @@ inline static void strzl_merge(strzl_array_t *array, size_t partition, strzl_arr
             start_a++;
         }
         else {
-            size_t value = array->order[start_b];
-            size_t index = start_b;
+            sz_size_t value = array->order[start_b];
+            sz_size_t index = start_b;
 
             // Shift all the elements between element 1
             // element 2, right by 1.
@@ -512,10 +513,10 @@ inline static void strzl_merge(strzl_array_t *array, size_t partition, strzl_arr
     }
 }
 
-inline static void _strzl_sort_recursion( //
-    strzl_array_t *array,
-    size_t bit_idx,
-    size_t bit_max,
+inline static void _sz_sort_recursion( //
+    sz_array_t *array,
+    sz_size_t bit_idx,
+    sz_size_t bit_max,
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__
     int (*libc_comparator)(void *, void const *, void const *)
 #else
@@ -527,52 +528,52 @@ inline static void _strzl_sort_recursion( //
         return;
 
     // Partition a range of integers according to a specific bit value
-    size_t split = 0;
+    sz_size_t split = 0;
     {
-        size_t mask = (1ul << 63) >> bit_idx;
+        sz_size_t mask = (1ul << 63) >> bit_idx;
         while (split != array->count && !(array->order[split] & mask))
             ++split;
 
-        for (size_t i = split + 1; i < array->count; ++i)
+        for (sz_size_t i = split + 1; i < array->count; ++i)
             if (!(array->order[i] & mask))
-                strzl_swap(array->order + i, array->order + split), ++split;
+                sz_swap(array->order + i, array->order + split), ++split;
     }
 
     // Go down recursively
     if (bit_idx < bit_max) {
-        strzl_array_t a = *array;
+        sz_array_t a = *array;
         a.count = split;
-        _strzl_sort_recursion(&a, bit_idx + 1, bit_max, libc_comparator);
+        _sz_sort_recursion(&a, bit_idx + 1, bit_max, libc_comparator);
 
-        strzl_array_t b = *array;
+        sz_array_t b = *array;
         b.order += split;
         b.count -= split;
-        _strzl_sort_recursion(&b, bit_idx + 1, bit_max, libc_comparator);
+        _sz_sort_recursion(&b, bit_idx + 1, bit_max, libc_comparator);
     }
     // Reached the end of recursion
     else {
         // Discard the prefixes
-        for (size_t i = 0; i != array->count; ++i)
+        for (sz_size_t i = 0; i != array->count; ++i)
             memset((char *)(&array->order[i]) + 4, 0, 4ul);
 
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
         // Perform sorts on smaller chunks instead of the whole handle
         // https://stackoverflow.com/a/39561369
         // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/qsort-s?view=msvc-170
-        qsort_s(array->order, split, sizeof(size_t), libc_comparator, (void *)array);
-        qsort_s(array->order + split, array->count - split, sizeof(size_t), libc_comparator, (void *)array);
+        qsort_s(array->order, split, sizeof(sz_size_t), libc_comparator, (void *)array);
+        qsort_s(array->order + split, array->count - split, sizeof(sz_size_t), libc_comparator, (void *)array);
 #elif __APPLE__
-        qsort_r(array->order, split, sizeof(size_t), (void *)array, libc_comparator);
-        qsort_r(array->order + split, array->count - split, sizeof(size_t), (void *)array, libc_comparator);
+        qsort_r(array->order, split, sizeof(sz_size_t), (void *)array, libc_comparator);
+        qsort_r(array->order + split, array->count - split, sizeof(sz_size_t), (void *)array, libc_comparator);
 #else
         // https://linux.die.net/man/3/qsort_r
-        qsort_r(array->order, split, sizeof(size_t), libc_comparator, (void *)array);
-        qsort_r(array->order + split, array->count - split, sizeof(size_t), libc_comparator, (void *)array);
+        qsort_r(array->order, split, sizeof(sz_size_t), libc_comparator, (void *)array);
+        qsort_r(array->order + split, array->count - split, sizeof(sz_size_t), libc_comparator, (void *)array);
 #endif
     }
 }
 
-inline static int _strzl_sort_array_strncmp(
+inline static int _sz_sort_array_strncmp(
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__
     void *array_raw, void const *a_raw, void const *b_raw
 #else
@@ -581,11 +582,11 @@ inline static int _strzl_sort_array_strncmp(
 ) {
     // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1
     // https://www.man7.org/linux/man-pages/man3/strcmp.3.html
-    strzl_array_t *array = (strzl_array_t *)array_raw;
-    size_t a = *(size_t *)a_raw;
-    size_t b = *(size_t *)b_raw;
-    size_t a_len = array->get_length(array->handle, a);
-    size_t b_len = array->get_length(array->handle, b);
+    sz_array_t *array = (sz_array_t *)array_raw;
+    sz_size_t a = *(sz_size_t *)a_raw;
+    sz_size_t b = *(sz_size_t *)b_raw;
+    sz_size_t a_len = array->get_length(array->handle, a);
+    sz_size_t b_len = array->get_length(array->handle, b);
     int res = strncmp( //
         array->get_begin(array->handle, a),
         array->get_begin(array->handle, b),
@@ -593,7 +594,7 @@ inline static int _strzl_sort_array_strncmp(
     return res ? res : a_len - b_len;
 }
 
-inline static int _strzl_sort_array_strncasecmp(
+inline static int _sz_sort_array_strncasecmp(
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__
     void *array_raw, void const *a_raw, void const *b_raw
 #else
@@ -602,11 +603,11 @@ inline static int _strzl_sort_array_strncasecmp(
 ) {
     // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1
     // https://www.man7.org/linux/man-pages/man3/strcmp.3.html
-    strzl_array_t *array = (strzl_array_t *)array_raw;
-    size_t a = *(size_t *)a_raw;
-    size_t b = *(size_t *)b_raw;
-    size_t a_len = array->get_length(array->handle, a);
-    size_t b_len = array->get_length(array->handle, b);
+    sz_array_t *array = (sz_array_t *)array_raw;
+    sz_size_t a = *(sz_size_t *)a_raw;
+    sz_size_t b = *(sz_size_t *)b_raw;
+    sz_size_t a_len = array->get_length(array->handle, a);
+    sz_size_t b_len = array->get_length(array->handle, b);
     int res = strncasecmp( //
         array->get_begin(array->handle, a),
         array->get_begin(array->handle, b),
@@ -614,25 +615,25 @@ inline static int _strzl_sort_array_strncasecmp(
     return res ? res : a_len - b_len;
 }
 
-typedef struct strzl_sort_config_t {
+typedef struct sz_sort_config_t {
     int case_insensitive;
-} strzl_sort_config_t;
+} sz_sort_config_t;
 
 /**
  *  @brief  Sorting algorithm, combining Radix Sort for the first 32 bits of every word
  *          and a follow-up Quick Sort on resulting structure.
  */
-inline static void strzl_sort(strzl_array_t *array, strzl_sort_config_t const *config) {
+inline static void sz_sort(sz_array_t *array, sz_sort_config_t const *config) {
 
     int case_insensitive = config && config->case_insensitive;
 
     // Export up to 4 bytes into the `array` bits themselves
-    for (size_t i = 0; i != array->count; ++i) {
+    for (sz_size_t i = 0; i != array->count; ++i) {
         char const *begin = array->get_begin(array->handle, array->order[i]);
-        size_t length = array->get_length(array->handle, array->order[i]);
+        sz_size_t length = array->get_length(array->handle, array->order[i]);
         length = length > 4ul ? 4ul : length;
         char *prefix = (char *)&array->order[i];
-        for (size_t j = 0; j != length; ++j)
+        for (sz_size_t j = 0; j != length; ++j)
             prefix[7 - j] = begin[j];
         if (case_insensitive) {
             prefix[0] = tolower(prefix[0]);
@@ -647,12 +648,12 @@ inline static void strzl_sort(strzl_array_t *array, strzl_sort_config_t const *c
 #else
     int (*comparator)(void const *, void const *, void *);
 #endif
-    comparator = _strzl_sort_array_strncmp;
+    comparator = _sz_sort_array_strncmp;
     if (case_insensitive)
-        comparator = _strzl_sort_array_strncasecmp;
+        comparator = _sz_sort_array_strncasecmp;
 
     // Perform optionally-parallel radix sort on them
-    _strzl_sort_recursion(array, 0, 32, comparator);
+    _sz_sort_recursion(array, 0, 32, comparator);
 }
 
 typedef uint8_t levenstein_distance_t;
@@ -661,12 +662,12 @@ typedef uint8_t levenstein_distance_t;
  *  @return Amount of temporary memory (in bytes) needed to efficiently compute
  *          the Levenstein distance between two strings of given size.
  */
-inline static size_t strzl_levenstein_memory_needed(size_t _, size_t b_length) { return b_length + b_length + 2; }
+inline static sz_size_t sz_levenstein_memory_needed(sz_size_t _, sz_size_t b_length) { return b_length + b_length + 2; }
 
 /**
  *  @brief  Auxiliary function, that computes the minimum of three values.
  */
-inline static levenstein_distance_t _strzl_levenstein_minimum( //
+inline static levenstein_distance_t _sz_levenstein_minimum( //
     levenstein_distance_t a,
     levenstein_distance_t b,
     levenstein_distance_t c) {
@@ -678,11 +679,11 @@ inline static levenstein_distance_t _strzl_levenstein_minimum( //
  *  @brief  Levenshtein String Similarity function, implemented with linear memory consumption.
  *          It accepts an upper bound on the possible error. Quadratic complexity in time, linear in space.
  */
-inline static levenstein_distance_t strzl_levenstein( //
+inline static levenstein_distance_t sz_levenstein( //
     char const *a,
-    size_t a_length,
+    sz_size_t a_length,
     char const *b,
-    size_t b_length,
+    sz_size_t b_length,
     levenstein_distance_t bound,
     void *buffer) {
 
@@ -705,20 +706,20 @@ inline static levenstein_distance_t strzl_levenstein( //
     levenstein_distance_t *previous_distances = (levenstein_distance_t *)buffer;
     levenstein_distance_t *current_distances = previous_distances + b_length + 1;
 
-    for (size_t idx_b = 0; idx_b != (b_length + 1); ++idx_b)
+    for (sz_size_t idx_b = 0; idx_b != (b_length + 1); ++idx_b)
         previous_distances[idx_b] = idx_b;
 
-    for (size_t idx_a = 0; idx_a != a_length; ++idx_a) {
+    for (sz_size_t idx_a = 0; idx_a != a_length; ++idx_a) {
         current_distances[0] = idx_a + 1;
 
         // Initialize min_distance with a value greater than bound
         levenstein_distance_t min_distance = bound;
 
-        for (size_t idx_b = 0; idx_b != b_length; ++idx_b) {
+        for (sz_size_t idx_b = 0; idx_b != b_length; ++idx_b) {
             levenstein_distance_t cost_deletion = previous_distances[idx_b + 1] + 1;
             levenstein_distance_t cost_insertion = current_distances[idx_b] + 1;
             levenstein_distance_t cost_substitution = previous_distances[idx_b] + (a[idx_a] != b[idx_b]);
-            current_distances[idx_b + 1] = _strzl_levenstein_minimum(cost_deletion, cost_insertion, cost_substitution);
+            current_distances[idx_b + 1] = _sz_levenstein_minimum(cost_deletion, cost_insertion, cost_substitution);
 
             // Keep track of the minimum distance seen so far in this row
             if (current_distances[idx_b + 1] < min_distance) {
@@ -742,11 +743,11 @@ inline static levenstein_distance_t strzl_levenstein( //
 /**
  *  @brief  Hashes provided string using hardware-accelerated CRC32 instructions.
  */
-inline static uint32_t strzl_hash_crc32_native(char const *start, size_t length) { return 0; }
+inline static uint32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; }
 
-inline static uint32_t strzl_hash_crc32_neon(char const *start, size_t length) { return 0; }
+inline static uint32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; }
 
-inline static uint32_t strzl_hash_crc32_sse(char const *start, size_t length) { return 0; }
+inline static uint32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; }
 
 #ifdef __cplusplus
 }

From cf324fb0bc12cd59a4d90ca4d4462255d3d3f0ec Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 17 Sep 2023 16:40:06 +0400
Subject: [PATCH 13/72] Refactor: `start` and `length` common member names

---
 README.md                 |   2 +-
 python/lib.c              |  87 +++++------
 scripts/test.cpp          |   8 +-
 stringzilla/stringzilla.h | 312 ++++++++++++++++++++------------------
 4 files changed, 210 insertions(+), 199 deletions(-)

diff --git a/README.md b/README.md
index 1f9cea00..bd0ee0c7 100644
--- a/README.md
+++ b/README.md
@@ -124,7 +124,7 @@ size_t character_position = sz_naive_find_char(haystack, 'a');
 size_t substring_position = sz_naive_find_substr(haystack, needle);
 
 // Perform collection level operations
-sz_array_t array = {your_order, your_count, your_get_begin, your_get_length, your_handle};
+sz_sequence_t array = {your_order, your_count, your_get_start, your_get_length, your_handle};
 sz_sort(&array, &your_config);
 ```
 
diff --git a/python/lib.c b/python/lib.c
index 84582d32..c670ae81 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -29,8 +29,8 @@ static PyTypeObject FileType;
 static PyTypeObject StrType;
 
 struct {
-    void *ptr;
-    size_t len;
+    void *start;
+    size_t length;
 } temporary_memory = {NULL, 0};
 
 /**
@@ -148,8 +148,8 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_
     struct sz_haystack_t haystack;
     struct sz_needle_t needle;
     needle.anomaly_offset = 0;
-    if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) ||
-        !export_string_like(needle_obj, &needle.ptr, &needle.len)) {
+    if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
+        !export_string_like(needle_obj, &needle.start, &needle.length)) {
         PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
         return NULL;
     }
@@ -178,13 +178,13 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_
 
     // Limit the haystack range
     size_t normalized_offset, normalized_length;
-    slice(haystack.len, start, end, &normalized_offset, &normalized_length);
-    haystack.ptr += normalized_offset;
-    haystack.len = normalized_length;
+    slice(haystack.length, start, end, &normalized_offset, &normalized_length);
+    haystack.start += normalized_offset;
+    haystack.length = normalized_length;
 
     // Perform contains operation
     size_t offset = sz_neon_find_substr(haystack, needle);
-    if (offset == haystack.len)
+    if (offset == haystack.length)
         return -1;
     return (Py_ssize_t)offset;
 }
@@ -224,8 +224,8 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t
     struct sz_haystack_t haystack;
     struct sz_needle_t needle;
     needle.anomaly_offset = 0;
-    if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) ||
-        !export_string_like(needle_obj, &needle.ptr, &needle.len)) {
+    if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
+        !export_string_like(needle_obj, &needle.start, &needle.length)) {
         PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
         return NULL;
     }
@@ -256,33 +256,33 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t
 
     // Limit the haystack range
     size_t normalized_offset, normalized_length;
-    slice(haystack.len, start, end, &normalized_offset, &normalized_length);
-    haystack.ptr += normalized_offset;
-    haystack.len = normalized_length;
+    slice(haystack.length, start, end, &normalized_offset, &normalized_length);
+    haystack.start += normalized_offset;
+    haystack.length = normalized_length;
 
     // Perform counting operation
     size_t count = 0;
-    if (needle.len == 1) {
-        count = sz_naive_count_char(haystack, *needle.ptr);
+    if (needle.length == 1) {
+        count = sz_naive_count_char(haystack, *needle.start);
     }
     else {
         // Your existing logic for count_substr can be embedded here
         if (allow_overlap) {
-            while (haystack.len) {
+            while (haystack.length) {
                 size_t offset = sz_neon_find_substr(haystack, needle);
-                int found = offset != haystack.len;
+                int found = offset != haystack.length;
                 count += found;
-                haystack.ptr += offset + found;
-                haystack.len -= offset + found;
+                haystack.start += offset + found;
+                haystack.length -= offset + found;
             }
         }
         else {
-            while (haystack.len) {
+            while (haystack.length) {
                 size_t offset = sz_neon_find_substr(haystack, needle);
-                int found = offset != haystack.len;
+                int found = offset != haystack.length;
                 count += found;
-                haystack.ptr += offset + needle.len;
-                haystack.len -= offset + needle.len * found;
+                haystack.start += offset + needle.length;
+                haystack.length -= offset + needle.length * found;
             }
         }
     }
@@ -303,7 +303,8 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s
     PyObject *str2_obj = args[1];
 
     struct sz_haystack_t str1, str2;
-    if (!export_string_like(str1_obj, &str1.ptr, &str1.len) || !export_string_like(str2_obj, &str2.ptr, &str2.len)) {
+    if (!export_string_like(str1_obj, &str1.start, &str1.length) ||
+        !export_string_like(str2_obj, &str2.start, &str2.length)) {
         PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
         return NULL;
     }
@@ -340,23 +341,23 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s
     }
 
     // Initialize or reallocate the Levenshtein distance matrix
-    size_t memory_needed = sz_levenstein_memory_needed(str1.len, str2.len);
-    if (temporary_memory.len < memory_needed) {
-        temporary_memory.ptr = realloc(temporary_memory.ptr, memory_needed);
-        temporary_memory.len = memory_needed;
+    size_t memory_needed = sz_levenstein_memory_needed(str1.length, str2.length);
+    if (temporary_memory.length < memory_needed) {
+        temporary_memory.start = realloc(temporary_memory.start, memory_needed);
+        temporary_memory.length = memory_needed;
     }
-    if (temporary_memory.ptr == NULL) {
+    if (temporary_memory.start == NULL) {
         PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix");
         return NULL;
     }
 
     levenstein_distance_t distance = sz_levenstein( //
-        str1.ptr,
-        str1.len,
-        str2.ptr,
-        str2.len,
+        str1.start,
+        str1.length,
+        str2.start,
+        str2.length,
         (levenstein_distance_t)bound,
-        temporary_memory.ptr);
+        temporary_memory.start);
     return PyLong_FromLong(distance);
 }
 
@@ -609,16 +610,16 @@ static int Str_contains(Str *self, PyObject *arg) {
 
     struct sz_needle_t needle_struct;
     needle_struct.anomaly_offset = 0;
-    if (!export_string_like(arg, &needle_struct.ptr, &needle_struct.len)) {
+    if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) {
         PyErr_SetString(PyExc_TypeError, "Unsupported argument type");
         return -1;
     }
 
     struct sz_haystack_t haystack;
-    haystack.ptr = self->start;
-    haystack.len = self->length;
+    haystack.start = self->start;
+    haystack.length = self->length;
     size_t position = sz_neon_find_substr(haystack, needle_struct);
-    return position != haystack.len;
+    return position != haystack.length;
 }
 
 static PyObject *Str_getslice(Str *self, PyObject *args) {
@@ -761,9 +762,9 @@ PyObject *register_vectorcall(PyObject *module, char const *name, vectorcallfunc
 }
 
 void cleanup_module(void) {
-    free(temporary_memory.ptr);
-    temporary_memory.ptr = NULL;
-    temporary_memory.len = 0;
+    free(temporary_memory.start);
+    temporary_memory.start = NULL;
+    temporary_memory.length = 0;
 }
 
 PyMODINIT_FUNC PyInit_stringzilla(void) {
@@ -796,8 +797,8 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
 
     // Initialize temporary_memory, if needed
     // For example, allocate an initial chunk
-    temporary_memory.ptr = malloc(4096);
-    temporary_memory.len = 4096 * (temporary_memory.ptr != NULL);
+    temporary_memory.start = malloc(4096);
+    temporary_memory.length = 4096 * (temporary_memory.start != NULL);
     atexit(cleanup_module);
 
     // Register the vectorized functions
diff --git a/scripts/test.cpp b/scripts/test.cpp
index 3e76248e..5c97c452 100644
--- a/scripts/test.cpp
+++ b/scripts/test.cpp
@@ -17,7 +17,7 @@ using permute_t = std::vector<idx_t>;
 
 #pragma region - C callbacks
 
-static char const *get_begin(void const *array_c, size_t i) {
+static char const *get_start(void const *array_c, size_t i) {
     strings_t const &array = *reinterpret_cast<strings_t const *>(array_c);
     return array[i].c_str();
 }
@@ -234,7 +234,7 @@ int main(int, char const **) {
         expect_partitioned_by_length(strings, permute_base);
 
         bench_permute("sz_partition", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
-            sz_array_t array;
+            sz_sequence_t array;
             array.order = permute.data();
             array.count = strings.size();
             array.handle = &strings;
@@ -253,11 +253,11 @@ int main(int, char const **) {
         expect_sorted(strings, permute_base);
 
         bench_permute("sz_sort", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
-            sz_array_t array;
+            sz_sequence_t array;
             array.order = permute.data();
             array.count = strings.size();
             array.handle = &strings;
-            array.get_begin = get_begin;
+            array.get_start = get_start;
             array.get_length = get_length;
             sz_sort(&array, nullptr);
         });
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 5e60adb5..a60c0dea 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -35,24 +35,24 @@ typedef uint64_t sz_size_t;
 inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { return (x + (divisor - 1)) / divisor; }
 
 /**
- *  @brief This is a faster alternative to `strncmp(a, b, len) == 0`.
+ *  @brief This is a faster alternative to `strncmp(a, b, length) == 0`.
  *  @return 1 for `true`, and 0 for `false`.
  */
-inline static int sz_equal(char const *a, char const *b, sz_size_t len) {
-    char const *const a_end = a + len;
+inline static int sz_equal(char const *a, char const *b, sz_size_t length) {
+    char const *const a_end = a + length;
     while (a != a_end && *a == *b)
         a++, b++;
     return a_end == a;
 }
 
 typedef struct sz_haystack_t {
-    char const *ptr;
-    sz_size_t len;
+    char const *start;
+    sz_size_t length;
 } sz_haystack_t;
 
 typedef struct sz_needle_t {
-    char const *ptr;
-    sz_size_t len;
+    char const *start;
+    sz_size_t length;
     sz_size_t anomaly_offset;
 } sz_needle_t;
 
@@ -62,8 +62,8 @@ typedef struct sz_needle_t {
 inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) {
 
     sz_size_t result = 0;
-    char const *text = h.ptr;
-    char const *end = h.ptr + h.len;
+    char const *text = h.start;
+    char const *end = h.start + h.length;
 
     for (; (uint64_t)text % 8 != 0 && text < end; ++text)
         result += *text == n;
@@ -93,12 +93,12 @@ inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) {
  */
 inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) {
 
-    char const *text = h.ptr;
-    char const *end = h.ptr + h.len;
+    char const *text = h.start;
+    char const *end = h.start + h.length;
 
     for (; (uint64_t)text % 8 != 0 && text < end; ++text)
         if (*text == n)
-            return text - h.ptr;
+            return text - h.start;
 
     // This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
     uint64_t nnnnnnnn = n;
@@ -114,13 +114,13 @@ inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) {
         match_indicators &= 0x0101010101010101;
 
         if (match_indicators != 0)
-            return text - h.ptr + ctz64(match_indicators) / 8;
+            return text - h.start + ctz64(match_indicators) / 8;
     }
 
     for (; text < end; ++text)
         if (*text == n)
-            return text - h.ptr;
-    return h.len;
+            return text - h.start;
+    return h.length;
 }
 
 /**
@@ -128,8 +128,8 @@ inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) {
  */
 inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) {
 
-    char const *text = h.ptr;
-    char const *end = h.ptr + h.len;
+    char const *text = h.start;
+    char const *end = h.start + h.length;
 
     // This code simulates hyper-scalar execution, analyzing 7 offsets at a time.
     uint64_t nnnn = ((uint64_t)(n[0]) << 0) | ((uint64_t)(n[1]) << 8); // broadcast `n` into `nnnn`
@@ -155,14 +155,14 @@ inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) {
 
         if (even_indicators + odd_indicators) {
             uint64_t match_indicators = even_indicators | (odd_indicators >> 8);
-            return text - h.ptr + ctz64(match_indicators) / 8;
+            return text - h.start + ctz64(match_indicators) / 8;
         }
     }
 
     for (; text + 2 <= end; ++text)
         if (text[0] == n[0] && text[1] == n[1])
-            return text - h.ptr;
-    return h.len;
+            return text - h.start;
+    return h.length;
 }
 
 /**
@@ -170,8 +170,8 @@ inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) {
  */
 inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) {
 
-    char const *text = h.ptr;
-    char const *end = h.ptr + h.len;
+    char const *text = h.start;
+    char const *end = h.start + h.length;
 
     // This code simulates hyper-scalar execution, analyzing 6 offsets at a time.
     // We have two unused bytes at the end.
@@ -211,13 +211,13 @@ inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) {
 
         uint64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16);
         if (match_indicators != 0)
-            return text - h.ptr + ctz64(match_indicators) / 8;
+            return text - h.start + ctz64(match_indicators) / 8;
     }
 
     for (; text + 3 <= end; ++text)
         if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2])
-            return text - h.ptr;
-    return h.len;
+            return text - h.start;
+    return h.length;
 }
 
 /**
@@ -225,8 +225,8 @@ inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) {
  */
 inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) {
 
-    char const *text = h.ptr;
-    char const *end = h.ptr + h.len;
+    char const *text = h.start;
+    char const *end = h.start + h.length;
 
     // This code simulates hyper-scalar execution, analyzing 4 offsets at a time.
     uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16) | ((uint64_t)(n[3]) << 24);
@@ -269,14 +269,14 @@ inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) {
             uint8_t match_indicators = (uint8_t)(                      //
                 (text01_indicators >> 31) | (text01_indicators << 0) | //
                 (text23_indicators >> 29) | (text23_indicators << 2));
-            return text - h.ptr + lookup[match_indicators];
+            return text - h.start + lookup[match_indicators];
         }
     }
 
     for (; text + 4 <= end; ++text)
         if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3])
-            return text - h.ptr;
-    return h.len;
+            return text - h.start;
+    return h.length;
 }
 
 /**
@@ -286,32 +286,32 @@ inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) {
  */
 inline static sz_size_t sz_naive_find_substr(sz_haystack_t h, sz_needle_t n) {
 
-    if (h.len < n.len)
-        return h.len;
+    if (h.length < n.length)
+        return h.length;
 
-    char const *text = h.ptr;
-    char const *const end = h.ptr + h.len;
-    switch (n.len) {
+    char const *text = h.start;
+    char const *const end = h.start + h.length;
+    switch (n.length) {
     case 0: return 0;
-    case 1: return sz_naive_find_char(h, *n.ptr);
-    case 2: return sz_naive_find_2chars(h, n.ptr);
-    case 3: return sz_naive_find_3chars(h, n.ptr);
-    case 4: return sz_naive_find_4chars(h, n.ptr);
+    case 1: return sz_naive_find_char(h, *n.start);
+    case 2: return sz_naive_find_2chars(h, n.start);
+    case 3: return sz_naive_find_3chars(h, n.start);
+    case 4: return sz_naive_find_4chars(h, n.start);
     default: {
         sz_anomaly_t n_anomaly, h_anomaly;
-        sz_size_t const n_suffix_len = n.len - 4 - n.anomaly_offset;
-        char const *n_suffix_ptr = n.ptr + 4 + n.anomaly_offset;
-        memcpy(&n_anomaly, n.ptr + n.anomaly_offset, 4);
+        sz_size_t const n_suffix_len = n.length - 4 - n.anomaly_offset;
+        char const *n_suffix_ptr = n.start + 4 + n.anomaly_offset;
+        memcpy(&n_anomaly, n.start + n.anomaly_offset, 4);
 
         text += n.anomaly_offset;
-        for (; text + n.len <= end; text++) {
+        for (; text + n.length <= end; text++) {
             memcpy(&h_anomaly, text, 4);
-            if (h_anomaly == n_anomaly)                                             // Match anomaly.
-                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))                 // Match suffix.
-                    if (sz_equal(text - n.anomaly_offset, n.ptr, n.anomaly_offset)) // Match prefix.
-                        return text - h.ptr - n.anomaly_offset;
+            if (h_anomaly == n_anomaly)                                               // Match anomaly.
+                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))                   // Match suffix.
+                    if (sz_equal(text - n.anomaly_offset, n.start, n.anomaly_offset)) // Match prefix.
+                        return text - h.start - n.anomaly_offset;
         }
-        return h.len;
+        return h.length;
     }
     }
 }
@@ -327,14 +327,14 @@ inline static sz_size_t sz_naive_find_substr(sz_haystack_t h, sz_needle_t n) {
 sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
-    char const *const end = h.ptr + h.len;
+    char const *const end = h.start + h.length;
     uint32_t anomaly = 0;
     uint32_t mask = 0;
-    switch (n.len) {
-    case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.ptr, 1); break;
-    case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.ptr, 2); break;
-    case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.ptr, 3); break;
-    default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.ptr, 4); break;
+    switch (n.length) {
+    case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.start, 1); break;
+    case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.start, 2); break;
+    case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.start, 3); break;
+    default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.start, 4); break;
     }
 
     __m256i const anomalies = _mm256_set1_epi32(*(uint32_t const *)&anomaly);
@@ -349,8 +349,8 @@ sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) {
     //  + 4 movemasks.
     //  + 3 bitwise ANDs.
     //  + 1 heavy (but very unlikely) branch.
-    char const *text = h.ptr;
-    for (; (text + n.len + 32) <= end; text += 32) {
+    char const *text = h.start;
+    for (; (text + n.length + 32) <= end; text += 32) {
 
         // Performing many unaligned loads ends up being faster than loading once and shuffling around.
         __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks);
@@ -364,18 +364,18 @@ sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) {
 
         if (matches0 | matches1 | matches2 | matches3) {
             for (sz_size_t i = 0; i < 32; i++) {
-                if (sz_equal(text + i, n.ptr, n.len))
-                    return i + (text - h.ptr);
+                if (sz_equal(text + i, n.start, n.length))
+                    return i + (text - h.start);
             }
         }
     }
 
     // Don't forget the last (up to 35) characters.
     sz_haystack_t h_remainder;
-    h_remainder.ptr = text;
-    h_remainder.len = end - text;
+    h_remainder.start = text;
+    h_remainder.length = end - text;
     sz_size_t tail_match = sz_naive_find_substr(h_remainder, n);
-    return text + tail_match - h.ptr;
+    return text + tail_match - h.start;
 }
 
 #endif // x86 AVX2
@@ -391,21 +391,21 @@ sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) {
 inline static sz_size_t sz_neon_find_substr(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
-    char const *const end = h.ptr + h.len;
+    char const *const end = h.start + h.length;
     uint32_t anomaly = 0;
     uint32_t mask = 0;
-    switch (n.len) {
-    case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.ptr, 1); break;
-    case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.ptr, 2); break;
-    case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.ptr, 3); break;
-    default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.ptr, 4); break;
+    switch (n.length) {
+    case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.start, 1); break;
+    case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.start, 2); break;
+    case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.start, 3); break;
+    default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.start, 4); break;
     }
 
     uint32x4_t const anomalies = vld1q_dup_u32(&anomaly);
     uint32x4_t const masks = vld1q_dup_u32(&mask);
 
-    char const *text = h.ptr;
-    for (; (text + n.len + 16) <= end; text += 16) {
+    char const *text = h.start;
+    for (; (text + n.length + 16) <= end; text += 16) {
 
         uint32x4_t matches0 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 0)), masks), anomalies);
         uint32x4_t matches1 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 1)), masks), anomalies);
@@ -422,18 +422,18 @@ inline static sz_size_t sz_neon_find_substr(sz_haystack_t h, sz_needle_t n) {
 
         if (has_match) {
             for (sz_size_t i = 0; i < 16; i++) {
-                if (sz_equal(text + i, n.ptr, n.len))
-                    return i + (text - h.ptr);
+                if (sz_equal(text + i, n.start, n.length))
+                    return i + (text - h.start);
             }
         }
     }
 
     // Don't forget the last (up to 16+3=19) characters.
     sz_haystack_t h_remainder;
-    h_remainder.ptr = text;
-    h_remainder.len = end - text;
+    h_remainder.start = text;
+    h_remainder.length = end - text;
     sz_size_t tail_match = sz_naive_find_substr(h_remainder, n);
-    return text + tail_match - h.ptr;
+    return text + tail_match - h.start;
 }
 
 #endif // Arm Neon
@@ -444,66 +444,73 @@ inline static void sz_swap(sz_size_t *a, sz_size_t *b) {
     *b = t;
 }
 
-typedef char const *(*sz_array_get_begin_t)(void const *, sz_size_t);
-typedef sz_size_t (*sz_array_get_length_t)(void const *, sz_size_t);
-typedef int (*sz_array_predicate_t)(void const *, sz_size_t);
-typedef int (*sz_array_comparator_t)(void const *, sz_size_t, sz_size_t);
+typedef char const *(*sz_sequence_get_start_t)(void const *, sz_size_t);
+typedef sz_size_t (*sz_sequence_get_length_t)(void const *, sz_size_t);
+typedef int (*sz_sequence_predicate_t)(void const *, sz_size_t);
+typedef int (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t);
 
-typedef struct sz_array_t {
+// Define a type for the comparison function, depending on the platform.
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || defined(__APPLE__)
+typedef int (*sz_qsort_comparison_func_t)(void *, void const *, void const *);
+#else
+typedef int (*sz_qsort_comparison_func_t)(void const *, void const *, void *);
+#endif
+
+typedef struct sz_sequence_t {
     sz_size_t *order;
     sz_size_t count;
-    sz_array_get_begin_t get_begin;
-    sz_array_get_length_t get_length;
+    sz_sequence_get_start_t get_start;
+    sz_sequence_get_length_t get_length;
     void const *handle;
-} sz_array_t;
+} sz_sequence_t;
 
 /**
  *  @brief  Similar to `std::partition`, given a predicate splits the
- *          array into two parts.
+ *          sequence into two parts.
  */
-inline static sz_size_t sz_partition(sz_array_t *array, sz_array_predicate_t predicate) {
+inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) {
 
     sz_size_t matches = 0;
-    while (matches != array->count && predicate(array->handle, array->order[matches]))
+    while (matches != sequence->count && predicate(sequence->handle, sequence->order[matches]))
         ++matches;
 
-    for (sz_size_t i = matches + 1; i < array->count; ++i)
-        if (predicate(array->handle, array->order[i]))
-            sz_swap(array->order + i, array->order + matches), ++matches;
+    for (sz_size_t i = matches + 1; i < sequence->count; ++i)
+        if (predicate(sequence->handle, sequence->order[i]))
+            sz_swap(sequence->order + i, sequence->order + matches), ++matches;
 
     return matches;
 }
 
 /**
  *  @brief  Inplace `std::set_union` for two consecutive chunks forming
- *          the same continuous array.
+ *          the same continuous sequence.
  */
-inline static void sz_merge(sz_array_t *array, sz_size_t partition, sz_array_comparator_t less) {
+inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) {
 
     sz_size_t start_b = partition + 1;
 
     // If the direct merge is already sorted
-    if (!less(array->handle, array->order[start_b], array->order[partition]))
+    if (!less(sequence->handle, sequence->order[start_b], sequence->order[partition]))
         return;
 
     sz_size_t start_a = 0;
-    while (start_a <= partition && start_b <= array->count) {
+    while (start_a <= partition && start_b <= sequence->count) {
 
         // If element 1 is in right place
-        if (!less(array->handle, array->order[start_b], array->order[start_a])) {
+        if (!less(sequence->handle, sequence->order[start_b], sequence->order[start_a])) {
             start_a++;
         }
         else {
-            sz_size_t value = array->order[start_b];
+            sz_size_t value = sequence->order[start_b];
             sz_size_t index = start_b;
 
             // Shift all the elements between element 1
             // element 2, right by 1.
             while (index != start_a) {
-                array->order[index] = array->order[index - 1];
+                sequence->order[index] = sequence->order[index - 1];
                 index--;
             }
-            array->order[start_a] = value;
+            sequence->order[start_a] = value;
 
             // Update all the pointers
             start_a++;
@@ -514,103 +521,111 @@ inline static void sz_merge(sz_array_t *array, sz_size_t partition, sz_array_com
 }
 
 inline static void _sz_sort_recursion( //
-    sz_array_t *array,
+    sz_sequence_t *sequence,
     sz_size_t bit_idx,
     sz_size_t bit_max,
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__
-    int (*libc_comparator)(void *, void const *, void const *)
-#else
-    int (*libc_comparator)(void const *, void const *, void *)
-#endif
-) {
+    sz_qsort_comparison_func_t qsort_comparator) {
 
-    if (!array->count)
+    if (!sequence->count)
         return;
 
     // Partition a range of integers according to a specific bit value
     sz_size_t split = 0;
     {
         sz_size_t mask = (1ul << 63) >> bit_idx;
-        while (split != array->count && !(array->order[split] & mask))
+        while (split != sequence->count && !(sequence->order[split] & mask))
             ++split;
 
-        for (sz_size_t i = split + 1; i < array->count; ++i)
-            if (!(array->order[i] & mask))
-                sz_swap(array->order + i, array->order + split), ++split;
+        for (sz_size_t i = split + 1; i < sequence->count; ++i)
+            if (!(sequence->order[i] & mask))
+                sz_swap(sequence->order + i, sequence->order + split), ++split;
     }
 
     // Go down recursively
     if (bit_idx < bit_max) {
-        sz_array_t a = *array;
+        sz_sequence_t a = *sequence;
         a.count = split;
-        _sz_sort_recursion(&a, bit_idx + 1, bit_max, libc_comparator);
+        _sz_sort_recursion(&a, bit_idx + 1, bit_max, qsort_comparator);
 
-        sz_array_t b = *array;
+        sz_sequence_t b = *sequence;
         b.order += split;
         b.count -= split;
-        _sz_sort_recursion(&b, bit_idx + 1, bit_max, libc_comparator);
+        _sz_sort_recursion(&b, bit_idx + 1, bit_max, qsort_comparator);
     }
     // Reached the end of recursion
     else {
         // Discard the prefixes
-        for (sz_size_t i = 0; i != array->count; ++i)
-            memset((char *)(&array->order[i]) + 4, 0, 4ul);
+        for (sz_size_t i = 0; i != sequence->count; ++i) {
+            memset((char *)(&sequence->order[i]) + 4, 0, 4ul);
+        }
 
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
         // Perform sorts on smaller chunks instead of the whole handle
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
         // https://stackoverflow.com/a/39561369
         // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/qsort-s?view=msvc-170
-        qsort_s(array->order, split, sizeof(sz_size_t), libc_comparator, (void *)array);
-        qsort_s(array->order + split, array->count - split, sizeof(sz_size_t), libc_comparator, (void *)array);
+        qsort_s(sequence->order, split, sizeof(sz_size_t), qsort_comparator, (void *)sequence);
+        qsort_s(sequence->order + split,
+                sequence->count - split,
+                sizeof(sz_size_t),
+                qsort_comparator,
+                (void *)sequence);
 #elif __APPLE__
-        qsort_r(array->order, split, sizeof(sz_size_t), (void *)array, libc_comparator);
-        qsort_r(array->order + split, array->count - split, sizeof(sz_size_t), (void *)array, libc_comparator);
+        qsort_r(sequence->order, split, sizeof(sz_size_t), (void *)sequence, qsort_comparator);
+        qsort_r(sequence->order + split,
+                sequence->count - split,
+                sizeof(sz_size_t),
+                (void *)sequence,
+                qsort_comparator);
 #else
         // https://linux.die.net/man/3/qsort_r
-        qsort_r(array->order, split, sizeof(sz_size_t), libc_comparator, (void *)array);
-        qsort_r(array->order + split, array->count - split, sizeof(sz_size_t), libc_comparator, (void *)array);
+        qsort_r(sequence->order, split, sizeof(sz_size_t), qsort_comparator, (void *)sequence);
+        qsort_r(sequence->order + split,
+                sequence->count - split,
+                sizeof(sz_size_t),
+                qsort_comparator,
+                (void *)sequence);
 #endif
     }
 }
 
-inline static int _sz_sort_array_strncmp(
+inline static int _sz_sort_sequence_strncmp(
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__
-    void *array_raw, void const *a_raw, void const *b_raw
+    void *sequence_raw, void const *a_raw, void const *b_raw
 #else
-    void const *a_raw, void const *b_raw, void *array_raw
+    void const *a_raw, void const *b_raw, void *sequence_raw
 #endif
 ) {
     // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1
     // https://www.man7.org/linux/man-pages/man3/strcmp.3.html
-    sz_array_t *array = (sz_array_t *)array_raw;
+    sz_sequence_t *sequence = (sz_sequence_t *)sequence_raw;
     sz_size_t a = *(sz_size_t *)a_raw;
     sz_size_t b = *(sz_size_t *)b_raw;
-    sz_size_t a_len = array->get_length(array->handle, a);
-    sz_size_t b_len = array->get_length(array->handle, b);
+    sz_size_t a_len = sequence->get_length(sequence->handle, a);
+    sz_size_t b_len = sequence->get_length(sequence->handle, b);
     int res = strncmp( //
-        array->get_begin(array->handle, a),
-        array->get_begin(array->handle, b),
+        sequence->get_start(sequence->handle, a),
+        sequence->get_start(sequence->handle, b),
         a_len > b_len ? b_len : a_len);
     return res ? res : a_len - b_len;
 }
 
-inline static int _sz_sort_array_strncasecmp(
+inline static int _sz_sort_sequence_strncasecmp(
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__
-    void *array_raw, void const *a_raw, void const *b_raw
+    void *sequence_raw, void const *a_raw, void const *b_raw
 #else
-    void const *a_raw, void const *b_raw, void *array_raw
+    void const *a_raw, void const *b_raw, void *sequence_raw
 #endif
 ) {
     // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1
     // https://www.man7.org/linux/man-pages/man3/strcmp.3.html
-    sz_array_t *array = (sz_array_t *)array_raw;
+    sz_sequence_t *sequence = (sz_sequence_t *)sequence_raw;
     sz_size_t a = *(sz_size_t *)a_raw;
     sz_size_t b = *(sz_size_t *)b_raw;
-    sz_size_t a_len = array->get_length(array->handle, a);
-    sz_size_t b_len = array->get_length(array->handle, b);
+    sz_size_t a_len = sequence->get_length(sequence->handle, a);
+    sz_size_t b_len = sequence->get_length(sequence->handle, b);
     int res = strncasecmp( //
-        array->get_begin(array->handle, a),
-        array->get_begin(array->handle, b),
+        sequence->get_start(sequence->handle, a),
+        sequence->get_start(sequence->handle, b),
         a_len > b_len ? b_len : a_len);
     return res ? res : a_len - b_len;
 }
@@ -623,16 +638,16 @@ typedef struct sz_sort_config_t {
  *  @brief  Sorting algorithm, combining Radix Sort for the first 32 bits of every word
  *          and a follow-up Quick Sort on resulting structure.
  */
-inline static void sz_sort(sz_array_t *array, sz_sort_config_t const *config) {
+inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *config) {
 
     int case_insensitive = config && config->case_insensitive;
 
-    // Export up to 4 bytes into the `array` bits themselves
-    for (sz_size_t i = 0; i != array->count; ++i) {
-        char const *begin = array->get_begin(array->handle, array->order[i]);
-        sz_size_t length = array->get_length(array->handle, array->order[i]);
+    // Export up to 4 bytes into the `sequence` bits themselves
+    for (sz_size_t i = 0; i != sequence->count; ++i) {
+        char const *begin = sequence->get_start(sequence->handle, sequence->order[i]);
+        sz_size_t length = sequence->get_length(sequence->handle, sequence->order[i]);
         length = length > 4ul ? 4ul : length;
-        char *prefix = (char *)&array->order[i];
+        char *prefix = (char *)&sequence->order[i];
         for (sz_size_t j = 0; j != length; ++j)
             prefix[7 - j] = begin[j];
         if (case_insensitive) {
@@ -643,17 +658,12 @@ inline static void sz_sort(sz_array_t *array, sz_sort_config_t const *config) {
         }
     }
 
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__
-    int (*comparator)(void *, void const *, void const *);
-#else
-    int (*comparator)(void const *, void const *, void *);
-#endif
-    comparator = _sz_sort_array_strncmp;
+    sz_qsort_comparison_func_t comparator = _sz_sort_sequence_strncmp;
     if (case_insensitive)
-        comparator = _sz_sort_array_strncasecmp;
+        comparator = _sz_sort_sequence_strncasecmp;
 
     // Perform optionally-parallel radix sort on them
-    _sz_sort_recursion(array, 0, 32, comparator);
+    _sz_sort_recursion(sequence, 0, 32, comparator);
 }
 
 typedef uint8_t levenstein_distance_t;

From 21200c8311f4af37e8f5c55af508222cd8e29aa9 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 17 Sep 2023 16:48:02 +0400
Subject: [PATCH 14/72] Fix: Benchmarks compilation

---
 scripts/test.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/test.cpp b/scripts/test.cpp
index 5c97c452..c1462c6d 100644
--- a/scripts/test.cpp
+++ b/scripts/test.cpp
@@ -12,27 +12,27 @@
 #include <stringzilla.h>
 
 using strings_t = std::vector<std::string>;
-using idx_t = std::size_t;
+using idx_t = sz_size_t;
 using permute_t = std::vector<idx_t>;
 
 #pragma region - C callbacks
 
-static char const *get_start(void const *array_c, size_t i) {
+static char const *get_start(void const *array_c, sz_size_t i) {
     strings_t const &array = *reinterpret_cast<strings_t const *>(array_c);
     return array[i].c_str();
 }
 
-static size_t get_length(void const *array_c, size_t i) {
+static sz_size_t get_length(void const *array_c, sz_size_t i) {
     strings_t const &array = *reinterpret_cast<strings_t const *>(array_c);
     return array[i].size();
 }
 
-static bool is_less(void const *array_c, size_t i, size_t j) {
+static int is_less(void const *array_c, sz_size_t i, sz_size_t j) {
     strings_t const &array = *reinterpret_cast<strings_t const *>(array_c);
     return array[i] < array[j];
 }
 
-static bool has_under_four_chars(void const *array_c, size_t i) {
+static int has_under_four_chars(void const *array_c, sz_size_t i) {
     strings_t const &array = *reinterpret_cast<strings_t const *>(array_c);
     return array[i].size() < 4;
 }

From 12b6d0b844cd0506bbfc75885fbe61691a6068d7 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 17 Sep 2023 17:29:56 +0400
Subject: [PATCH 15/72] Add: `Strs` structure in CPython

---
 python/lib.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/python/lib.c b/python/lib.c
index c670ae81..a74f9816 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -27,6 +27,7 @@ typedef SSIZE_T ssize_t;
 
 static PyTypeObject FileType;
 static PyTypeObject StrType;
+static PyTypeObject StrsType;
 
 struct {
     void *start;
@@ -68,6 +69,69 @@ typedef struct {
     size_t length;
 } Str;
 
+/**
+ *  @brief  Variable length Python object similar to `Tuple[Union[Str, str]]`,
+ *          for faster sorting, shuffling, joins, and lookups.
+ */
+typedef struct {
+    PyObject_HEAD;
+
+    enum {
+        STRS_CONSECUTIVE_32,
+        STRS_CONSECUTIVE_64,
+        STRS_REORDERED,
+        STRS_MULTI_SOURCE,
+    } type;
+
+    union {
+        /**
+         *  Simple structure resembling Apache Arrow arrays of variable length strings.
+         *  When you split a `Str`, that is under 4 GB in size, this is used for space-efficiency.
+         */
+        struct consecutive_slices_32bit_t {
+            size_t count;
+            PyObject *parent;
+            char const *start;
+            uint32_t *offsets;
+        } consecutive_32bit;
+
+        /**
+         *  Simple structure resembling Apache Arrow arrays of variable length strings.
+         *  When you split a `Str`, over 4 GB long, this structure is used to indicate chunk offsets.
+         */
+        struct consecutive_slices_64bit_t {
+            size_t count;
+            PyObject *parent;
+            char const *start;
+            uint64_t *offsets;
+        } consecutive_64bit;
+
+        /**
+         *  Once you sort, shuffle, or reorganize slices making up a larger string, this structure
+         *  cn be used for space-efficient lookups.
+         */
+        struct reordered_slices_t {
+            size_t count;
+            PyObject *parent;
+            sz_haystack_t *parts;
+        } reordered;
+
+        /**
+         *  Complex structure with two variable length chunks inside - for the parents and their slices.
+         *  The parents are sorted in ascending order of their memory ranges, to let us rapidly locate the source
+         *  with a binary search. The slices are preserved
+         */
+        struct multi_source_strings_t {
+            size_t count;
+            size_t parents_count;
+
+            PyObject **parents;
+            sz_haystack_t *parts;
+        } multi_source;
+    } data;
+
+} Strs;
+
 #pragma endregion
 
 #pragma region Helpers
@@ -726,6 +790,15 @@ static PyTypeObject StrType = {
     // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer
 };
 
+static PyTypeObject StrsType = {
+    PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.Strs",
+    .tp_doc = "Space-efficient container for large collections of strings and their slices",
+    .tp_basicsize = sizeof(Strs),
+    .tp_itemsize = 0,
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_new = PyType_GenericNew,
+};
+
 #pragma endregion
 
 static PyMethodDef stringzilla_methods[] = { //
@@ -776,6 +849,9 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
     if (PyType_Ready(&FileType) < 0)
         return NULL;
 
+    if (PyType_Ready(&StrsType) < 0)
+        return NULL;
+
     m = PyModule_Create(&stringzilla_module);
     if (m == NULL)
         return NULL;
@@ -795,6 +871,15 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
         return NULL;
     }
 
+    Py_INCREF(&StrsType);
+    if (PyModule_AddObject(m, "Strs", (PyObject *)&StrsType) < 0) {
+        Py_XDECREF(&StrsType);
+        Py_XDECREF(&FileType);
+        Py_XDECREF(&StrType);
+        Py_XDECREF(m);
+        return NULL;
+    }
+
     // Initialize temporary_memory, if needed
     // For example, allocate an initial chunk
     temporary_memory.start = malloc(4096);

From 8f76c291241c955df1845ac8be9f39e29e9c6de9 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 17 Sep 2023 17:30:13 +0400
Subject: [PATCH 16/72] Add: Purely `qsort`-based hybrid sort benchmark

---
 scripts/test.cpp | 74 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 65 insertions(+), 9 deletions(-)

diff --git a/scripts/test.cpp b/scripts/test.cpp
index c1462c6d..1cf34bb2 100644
--- a/scripts/test.cpp
+++ b/scripts/test.cpp
@@ -65,7 +65,7 @@ void populate_with_test(strings_t &strings) {
 
 constexpr size_t offset_in_word = 0;
 
-inline static idx_t hybrid_sort(strings_t const &strings, idx_t *order) {
+inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) {
 
     // What if we take up-to 4 first characters and the index
     for (size_t i = 0; i != strings.size(); ++i)
@@ -87,7 +87,50 @@ inline static idx_t hybrid_sort(strings_t const &strings, idx_t *order) {
     return strings.size();
 }
 
-inline static idx_t hybrid_stable_sort(strings_t const &strings, idx_t *order) {
+int hybrid_sort_c_compare_uint32_t(const void *a, const void *b) {
+    uint32_t int_a = *((uint32_t *)(((char *)a) + sizeof(sz_size_t) - 4));
+    uint32_t int_b = *((uint32_t *)(((char *)b) + sizeof(sz_size_t) - 4));
+    return (int_a < int_b) ? -1 : (int_a > int_b);
+}
+
+int hybrid_sort_c_compare_strings(void *arg, const void *a, const void *b) {
+    sz_sequence_t *seq = (sz_sequence_t *)arg;
+    sz_size_t idx_a = *(sz_size_t *)a;
+    sz_size_t idx_b = *(sz_size_t *)b;
+
+    const char *str_a = seq->get_start(seq->handle, idx_a);
+    const char *str_b = seq->get_start(seq->handle, idx_b);
+    sz_size_t len_a = seq->get_length(seq->handle, idx_a);
+    sz_size_t len_b = seq->get_length(seq->handle, idx_b);
+
+    int res = strncmp(str_a, str_b, len_a < len_b ? len_a : len_b);
+    return res ? res : (int)(len_a - len_b);
+}
+
+sz_size_t hybrid_sort_c(sz_sequence_t *sequence) {
+    // Copy up to 4 first characters into the 'order' array.
+    for (sz_size_t i = 0; i < sequence->count; ++i) {
+        const char *str = sequence->get_start(sequence->handle, sequence->order[i]);
+        sz_size_t len = sequence->get_length(sequence->handle, sequence->order[i]);
+        len = len > 4 ? 4 : len;
+        memcpy((char *)&sequence->order[i] + sizeof(sz_size_t) - 4, str, len);
+    }
+
+    // Sort based on the first 4 bytes.
+    qsort(sequence->order, sequence->count, sizeof(sz_size_t), hybrid_sort_c_compare_uint32_t);
+
+    // Clear the 4 bytes used for the initial sort.
+    for (sz_size_t i = 0; i < sequence->count; ++i) {
+        memset((char *)&sequence->order[i] + sizeof(sz_size_t) - 4, 0, 4);
+    }
+
+    // Sort the full strings.
+    qsort_r(sequence->order, sequence->count, sizeof(sz_size_t), sequence, hybrid_sort_c_compare_strings);
+
+    return sequence->count;
+}
+
+inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *order) {
 
     // What if we take up-to 4 first characters and the index
     for (size_t i = 0; i != strings.size(); ++i)
@@ -189,7 +232,7 @@ int main(int, char const **) {
     };
 
     // Search substring
-    for (std::size_t needle_len = 1; needle_len <= 5; ++needle_len) {
+    for (std::size_t needle_len = 1; needle_len <= 0; ++needle_len) {
         std::string needle(needle_len, '\4');
         std::printf("---- Needle length: %zu\n", needle_len);
         bench_search("std::search", full_text, [&]() {
@@ -221,7 +264,7 @@ int main(int, char const **) {
     permute_new.resize(strings.size());
 
     // Partitioning
-    if (true) {
+    if (false) {
         std::printf("---- Partitioning:\n");
         bench_permute("std::partition", strings, permute_base, [](strings_t const &strings, permute_t &permute) {
             std::partition(permute.begin(), permute.end(), [&](size_t i) { return strings[i].size() < 4; });
@@ -263,8 +306,19 @@ int main(int, char const **) {
         });
         expect_sorted(strings, permute_new);
 
-        bench_permute("hybrid_sort", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
-            hybrid_sort(strings, permute.data());
+        bench_permute("hybrid_sort_c", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
+            sz_sequence_t array;
+            array.order = permute.data();
+            array.count = strings.size();
+            array.handle = &strings;
+            array.get_start = get_start;
+            array.get_length = get_length;
+            hybrid_sort_c(&array);
+        });
+        expect_sorted(strings, permute_new);
+
+        bench_permute("hybrid_sort_cpp", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
+            hybrid_sort_cpp(strings, permute.data());
         });
         expect_sorted(strings, permute_new);
 
@@ -274,9 +328,11 @@ int main(int, char const **) {
         });
         expect_sorted(strings, permute_base);
 
-        bench_permute("hybrid_stable_sort", strings, permute_base, [](strings_t const &strings, permute_t &permute) {
-            hybrid_stable_sort(strings, permute.data());
-        });
+        bench_permute(
+            "hybrid_stable_sort_cpp",
+            strings,
+            permute_base,
+            [](strings_t const &strings, permute_t &permute) { hybrid_stable_sort_cpp(strings, permute.data()); });
         expect_sorted(strings, permute_new);
         expect_same(permute_base, permute_new);
     }

From e53e1b9b9a272c40c1f5f26c2cbac12a4ad9dde2 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 17 Sep 2023 21:10:06 +0400
Subject: [PATCH 17/72] Add: Vectorized `split` for Python

---
 python/lib.c    | 124 +++++++++++++++++++++++++++++++++++++++++++++++-
 scripts/test.py |  11 +++--
 2 files changed, 129 insertions(+), 6 deletions(-)

diff --git a/python/lib.c b/python/lib.c
index a74f9816..bd16cb23 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -90,6 +90,7 @@ typedef struct {
          */
         struct consecutive_slices_32bit_t {
             size_t count;
+            size_t separator_length;
             PyObject *parent;
             char const *start;
             uint32_t *offsets;
@@ -101,6 +102,7 @@ typedef struct {
          */
         struct consecutive_slices_64bit_t {
             size_t count;
+            size_t separator_length;
             PyObject *parent;
             char const *start;
             uint64_t *offsets;
@@ -425,6 +427,125 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s
     return PyLong_FromLong(distance);
 }
 
+static PyObject *strs_split_vectorcall(PyObject *self, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
+    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+
+    // Validate the number of arguments
+    if (nargs < 1) {
+        PyErr_SetString(PyExc_TypeError, "sz.split() requires at least 1 argument");
+        return NULL;
+    }
+
+    PyObject *text_obj = args[0];
+    struct sz_haystack_t text;
+    if (!export_string_like(text_obj, &text.start, &text.length)) {
+        PyErr_SetString(PyExc_TypeError, "First argument must be string-like");
+        return NULL;
+    }
+
+    struct sz_needle_t separator;
+    separator.start = " ";
+    separator.length = 1;
+    separator.anomaly_offset = 0;
+    int keepseparator = 0;
+    Py_ssize_t maxsplit = PY_SSIZE_T_MAX;
+
+    // Parse additional positional arguments and keyword arguments
+    if (kwnames != NULL) {
+        for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) {
+            PyObject *key = PyTuple_GetItem(kwnames, i);
+            PyObject *value = args[nargs + i];
+            if (PyUnicode_CompareWithASCIIString(key, "separator") == 0) {
+                // Assume separator is passed as a Python Unicode object
+                Py_ssize_t len;
+                separator.start = PyUnicode_AsUTF8AndSize(value, &len);
+                separator.length = (size_t)len;
+            }
+            else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0)
+                maxsplit = PyLong_AsSsize_t(value);
+            else if (PyUnicode_CompareWithASCIIString(key, "keepseparator") == 0)
+                keepseparator = PyObject_IsTrue(value);
+            else {
+                PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key);
+                return NULL;
+            }
+        }
+    }
+
+    // Create Strs object
+    Strs *result = (Strs *)PyObject_New(Strs, &StrsType);
+    if (!result)
+        return NULL;
+
+    // Initialize Strs object based on the splitting logic
+    void *offsets = NULL;
+    size_t offsets_capacity = 0;
+    size_t offsets_count = 0;
+    size_t bytes_per_offset;
+    if (text.length >= UINT32_MAX) {
+        bytes_per_offset = 8;
+        result->type = STRS_CONSECUTIVE_64;
+        result->data.consecutive_64bit.start = text.start;
+        result->data.consecutive_64bit.parent = text_obj;
+        result->data.consecutive_64bit.separator_length = keepseparator * separator.length;
+    }
+    else {
+        bytes_per_offset = 4;
+        result->type = STRS_CONSECUTIVE_32;
+        result->data.consecutive_32bit.start = text.start;
+        result->data.consecutive_32bit.parent = text_obj;
+        result->data.consecutive_32bit.separator_length = keepseparator * separator.length;
+    }
+
+    // Iterate through string, keeping track of the
+    sz_size_t last_start = 0;
+    while (last_start < text.length && offsets_count < maxsplit) {
+        sz_haystack_t text_remaining;
+        text_remaining.start = text.start + last_start;
+        text_remaining.length = text.length - last_start;
+        sz_size_t offset_in_remaining = sz_neon_find_substr(text_remaining, separator);
+
+        // Reallocate offsets array if needed
+        if (offsets_count >= offsets_capacity) {
+            offsets_capacity = (offsets_capacity + 1) * 2;
+            void *new_offsets = realloc(offsets, offsets_capacity * bytes_per_offset);
+            if (!new_offsets) {
+                if (offsets)
+                    free(offsets);
+            }
+            offsets = new_offsets;
+        }
+
+        // If the memory allocation has failed - discard the response
+        if (!offsets) {
+            Py_XDECREF(result);
+            PyErr_NoMemory();
+            return NULL;
+        }
+
+        // Export the offset
+        if (text.length >= UINT32_MAX)
+            ((uint64_t *)offsets)[offsets_count++] = (uint64_t)(last_start + offset_in_remaining);
+        else
+            ((uint32_t *)offsets)[offsets_count++] = (uint32_t)(last_start + offset_in_remaining);
+
+        // Next time we want to start
+        last_start = last_start + offset_in_remaining + separator.length;
+    }
+
+    // Populate the Strs object with the offsets
+    if (text.length >= UINT32_MAX) {
+        result->data.consecutive_64bit.offsets = offsets;
+        result->data.consecutive_64bit.count = offsets_count;
+    }
+    else {
+        result->data.consecutive_32bit.offsets = offsets;
+        result->data.consecutive_32bit.count = offsets_count;
+    }
+
+    return (PyObject *)result;
+}
+
 #pragma endregion
 
 #pragma region MemoryMappingFile
@@ -881,7 +1002,6 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
     }
 
     // Initialize temporary_memory, if needed
-    // For example, allocate an initial chunk
     temporary_memory.start = malloc(4096);
     temporary_memory.length = 4096 * (temporary_memory.start != NULL);
     atexit(cleanup_module);
@@ -892,7 +1012,7 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
     PyObject *vectorized_count = register_vectorcall(m, "count", str_count_vectorcall);
     PyObject *vectorized_levenstein = register_vectorcall(m, "levenstein", str_levenstein_vectorcall);
 
-    PyObject *vectorized_split = register_vectorcall(m, "split", str_find_vectorcall);
+    PyObject *vectorized_split = register_vectorcall(m, "split", strs_split_vectorcall);
     PyObject *vectorized_sort = register_vectorcall(m, "sort", str_find_vectorcall);
     PyObject *vectorized_shuffle = register_vectorcall(m, "shuffle", str_find_vectorcall);
 
diff --git a/scripts/test.py b/scripts/test.py
index c3f70523..0c5d095f 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -119,10 +119,13 @@ def test_rich_comparisons():
 #     assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True))
 
 
-# def test_split_keepseparator():
-#     native = "word1 word2 word3"
-#     big = Str(native)
-#     assert ["word1 ", "word2 ", "word3"] == list(big.split(" ", keepseparator=True))
+def test_split_keepseparator():
+    native = "word1 word2 word3"
+    big = Str(native)
+    words = sz.split(big, " ")
+    parts = sz.split(big, " ", keepseparator=True)
+    # assert words[0] == "word1"
+    # assert parts[0] == "word1 "
 
 
 # def test_strs_operations():

From 6f6b389c363cfc4a1a1ef89a560796430b23b709 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 18 Sep 2023 18:53:32 +0400
Subject: [PATCH 18/72] Add: Split into consecutive slices

---
 python/lib.c    | 404 +++++++++++++++++++++++++++++++++---------------
 scripts/test.py |  35 +++--
 2 files changed, 300 insertions(+), 139 deletions(-)

diff --git a/python/lib.c b/python/lib.c
index bd16cb23..38fd38af 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -87,25 +87,35 @@ typedef struct {
         /**
          *  Simple structure resembling Apache Arrow arrays of variable length strings.
          *  When you split a `Str`, that is under 4 GB in size, this is used for space-efficiency.
+         *  The `end_offsets` contains `count`-many integers marking the end offset of part at a given
+         *  index. The length of consecutive elements can be determined as the difference in consecutive
+         *  offsets. The starting offset of the first element is zero bytes after the `start`.
+         *  Every chunk will include a separator of length `separator_length` at the end, except for the
+         *  last one.
          */
         struct consecutive_slices_32bit_t {
             size_t count;
             size_t separator_length;
             PyObject *parent;
             char const *start;
-            uint32_t *offsets;
+            uint32_t *end_offsets;
         } consecutive_32bit;
 
         /**
          *  Simple structure resembling Apache Arrow arrays of variable length strings.
          *  When you split a `Str`, over 4 GB long, this structure is used to indicate chunk offsets.
+         *  The `end_offsets` contains `count`-many integers marking the end offset of part at a given
+         *  index. The length of consecutive elements can be determined as the difference in consecutive
+         *  offsets. The starting offset of the first element is zero bytes after the `start`.
+         *  Every chunk will include a separator of length `separator_length` at the end, except for the
+         *  last one.
          */
         struct consecutive_slices_64bit_t {
             size_t count;
             size_t separator_length;
             PyObject *parent;
             char const *start;
-            uint64_t *offsets;
+            uint64_t *end_offsets;
         } consecutive_64bit;
 
         /**
@@ -199,50 +209,72 @@ int export_string_like(PyObject *object, char const **start, size_t *length) {
 static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
     Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
 
-    // Initialize defaults
-    Py_ssize_t start = 0;
-    Py_ssize_t end = PY_SSIZE_T_MAX;
-
-    // Parse positional arguments: haystack and needle
     if (nargs < 2) {
         PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
         return NULL;
     }
 
+    // Initialize with default values or positional arguments
     PyObject *haystack_obj = args[0];
     PyObject *needle_obj = args[1];
-    struct sz_haystack_t haystack;
-    struct sz_needle_t needle;
-    needle.anomaly_offset = 0;
-    if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
-        !export_string_like(needle_obj, &needle.start, &needle.length)) {
-        PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
-        return NULL;
-    }
-
-    // Parse additional positional arguments
-    if (nargs > 2)
-        start = PyLong_AsSsize_t(args[2]);
-    if (nargs > 3)
-        end = PyLong_AsSsize_t(args[3]);
+    PyObject *start_obj = (nargs > 2) ? args[2] : NULL;
+    PyObject *end_obj = (nargs > 3) ? args[3] : NULL;
 
-    // Parse keyword arguments
+    // Parse keyword arguments to overwrite positional ones
     if (kwnames != NULL) {
         for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) {
             PyObject *key = PyTuple_GetItem(kwnames, i);
             PyObject *value = args[nargs + i];
             if (PyUnicode_CompareWithASCIIString(key, "start") == 0)
-                start = PyLong_AsSsize_t(value);
+                start_obj = value;
             else if (PyUnicode_CompareWithASCIIString(key, "end") == 0)
-                end = PyLong_AsSsize_t(value);
+                end_obj = value;
             else {
                 PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key);
                 return NULL;
             }
+            if (PyErr_Occurred())
+                return NULL;
         }
     }
 
-    // Limit the haystack range
+    struct sz_haystack_t haystack;
+    struct sz_needle_t needle;
+    Py_ssize_t start, end;
+
+    // Validate and convert `haystack` and `needle`
+    needle.anomaly_offset = 0;
+    if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
+        !export_string_like(needle_obj, &needle.start, &needle.length)) {
+        PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
+        return NULL;
+    }
+
+    // Validate and convert `start`
+    if (start_obj) {
+        start = PyLong_AsSsize_t(start_obj);
+        if (start == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The start argument must be an integer");
+            return NULL;
+        }
+    }
+    else {
+        start = 0;
+    }
+
+    // Validate and convert `end`
+    if (end_obj) {
+        end = PyLong_AsSsize_t(end_obj);
+        if (end == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The end argument must be an integer");
+            return NULL;
+        }
+    }
+    else {
+        end = PY_SSIZE_T_MAX;
+    }
+
+    // Limit the `haystack` range
     size_t normalized_offset, normalized_length;
     slice(haystack.length, start, end, &normalized_offset, &normalized_length);
     haystack.start += normalized_offset;
@@ -273,12 +305,7 @@ static PyObject *str_contains_vectorcall(PyObject *_, PyObject *const *args, siz
 static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
     Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
 
-    // Initialize defaults
-    Py_ssize_t start = 0;
-    Py_ssize_t end = PY_SSIZE_T_MAX;
-    int allow_overlap = 0;
-
-    // Parse positional arguments: haystack and needle
+    // Initialize with default values or positional arguments
     if (nargs < 2) {
         PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
         return NULL;
@@ -286,40 +313,79 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t
 
     PyObject *haystack_obj = args[0];
     PyObject *needle_obj = args[1];
+    PyObject *start_obj = (nargs > 2) ? args[2] : NULL;
+    PyObject *end_obj = (nargs > 3) ? args[3] : NULL;
+    PyObject *allowoverlap_obj = (nargs > 4) ? args[4] : NULL;
 
-    struct sz_haystack_t haystack;
-    struct sz_needle_t needle;
-    needle.anomaly_offset = 0;
-    if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
-        !export_string_like(needle_obj, &needle.start, &needle.length)) {
-        PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
-        return NULL;
-    }
-
-    // Parse additional positional arguments
-    if (nargs > 2)
-        start = PyLong_AsSsize_t(args[2]);
-    if (nargs > 3)
-        end = PyLong_AsSsize_t(args[3]);
-
-    // Parse keyword arguments
+    // Parse keyword arguments to overwrite positional ones
     if (kwnames != NULL) {
         for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) {
             PyObject *key = PyTuple_GetItem(kwnames, i);
             PyObject *value = args[nargs + i];
             if (PyUnicode_CompareWithASCIIString(key, "start") == 0)
-                start = PyLong_AsSsize_t(value);
+                start_obj = value;
             else if (PyUnicode_CompareWithASCIIString(key, "end") == 0)
-                end = PyLong_AsSsize_t(value);
+                end_obj = value;
             else if (PyUnicode_CompareWithASCIIString(key, "allowoverlap") == 0)
-                allow_overlap = PyObject_IsTrue(value);
+                allowoverlap_obj = value;
             else {
                 PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key);
                 return NULL;
             }
+            if (PyErr_Occurred())
+                return NULL;
         }
     }
 
+    struct sz_haystack_t haystack;
+    struct sz_needle_t needle;
+    int allowoverlap;
+    Py_ssize_t start, end;
+
+    // Validate and convert `haystack` and `needle`
+    needle.anomaly_offset = 0;
+    if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
+        !export_string_like(needle_obj, &needle.start, &needle.length)) {
+        PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
+        return NULL;
+    }
+
+    // Validate and convert `start`
+    if (start_obj) {
+        start = PyLong_AsSsize_t(start_obj);
+        if (start == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The start argument must be an integer");
+            return NULL;
+        }
+    }
+    else {
+        start = 0;
+    }
+
+    // Validate and convert `end`
+    if (end_obj) {
+        end = PyLong_AsSsize_t(end_obj);
+        if (end == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The end argument must be an integer");
+            return NULL;
+        }
+    }
+    else {
+        end = PY_SSIZE_T_MAX;
+    }
+
+    // Validate and convert `allowoverlap`
+    if (allowoverlap_obj) {
+        allowoverlap = PyObject_IsTrue(allowoverlap_obj);
+        if (allowoverlap == -1) {
+            PyErr_SetString(PyExc_TypeError, "The allowoverlap argument must be a boolean");
+            return NULL;
+        }
+    }
+    else {
+        allowoverlap = 0;
+    }
+
     // Limit the haystack range
     size_t normalized_offset, normalized_length;
     slice(haystack.length, start, end, &normalized_offset, &normalized_length);
@@ -333,7 +399,7 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t
     }
     else {
         // Your existing logic for count_substr can be embedded here
-        if (allow_overlap) {
+        if (allowoverlap) {
             while (haystack.length) {
                 size_t offset = sz_neon_find_substr(haystack, needle);
                 int found = offset != haystack.length;
@@ -427,50 +493,92 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s
     return PyLong_FromLong(distance);
 }
 
-static PyObject *strs_split_vectorcall(PyObject *self, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
+static PyObject *strs_split_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
     Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
 
-    // Validate the number of arguments
     if (nargs < 1) {
         PyErr_SetString(PyExc_TypeError, "sz.split() requires at least 1 argument");
         return NULL;
     }
 
+    // Initialize with default values or positional arguments
     PyObject *text_obj = args[0];
-    struct sz_haystack_t text;
-    if (!export_string_like(text_obj, &text.start, &text.length)) {
-        PyErr_SetString(PyExc_TypeError, "First argument must be string-like");
-        return NULL;
-    }
+    PyObject *separator_obj = (nargs > 1) ? args[1] : NULL;
+    PyObject *maxsplit_obj = (nargs > 2) ? args[2] : NULL;
+    PyObject *keepseparator_obj = (nargs > 3) ? args[3] : NULL;
 
-    struct sz_needle_t separator;
-    separator.start = " ";
-    separator.length = 1;
-    separator.anomaly_offset = 0;
-    int keepseparator = 0;
-    Py_ssize_t maxsplit = PY_SSIZE_T_MAX;
-
-    // Parse additional positional arguments and keyword arguments
+    // Parse keyword arguments to overwrite positional ones
     if (kwnames != NULL) {
         for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) {
             PyObject *key = PyTuple_GetItem(kwnames, i);
             PyObject *value = args[nargs + i];
-            if (PyUnicode_CompareWithASCIIString(key, "separator") == 0) {
-                // Assume separator is passed as a Python Unicode object
-                Py_ssize_t len;
-                separator.start = PyUnicode_AsUTF8AndSize(value, &len);
-                separator.length = (size_t)len;
-            }
+
+            if (PyUnicode_CompareWithASCIIString(key, "separator") == 0)
+                separator_obj = value;
             else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0)
-                maxsplit = PyLong_AsSsize_t(value);
+                maxsplit_obj = value;
             else if (PyUnicode_CompareWithASCIIString(key, "keepseparator") == 0)
-                keepseparator = PyObject_IsTrue(value);
+                keepseparator_obj = value;
             else {
                 PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key);
                 return NULL;
             }
+
+            // Check for errors during conversion
+            if (PyErr_Occurred())
+                return NULL;
+        }
+    }
+
+    struct sz_haystack_t text;
+    struct sz_needle_t separator;
+    int keepseparator;
+    Py_ssize_t maxsplit;
+    separator.anomaly_offset = 0;
+
+    // Validate and convert `text`
+    if (!export_string_like(text_obj, &text.start, &text.length)) {
+        PyErr_SetString(PyExc_TypeError, "The text argument must be string-like");
+        return NULL;
+    }
+
+    // Validate and convert `separator`
+    if (separator_obj) {
+        Py_ssize_t len;
+        if (!export_string_like(separator_obj, &separator.start, &len)) {
+            PyErr_SetString(PyExc_TypeError, "The separator argument must be string-like");
+            return NULL;
+        }
+        separator.length = (size_t)len;
+    }
+    else {
+        separator.start = " ";
+        separator.length = 1;
+    }
+
+    // Validate and convert `keepseparator`
+    if (keepseparator_obj) {
+        keepseparator = PyObject_IsTrue(keepseparator_obj);
+        if (keepseparator == -1) {
+            PyErr_SetString(PyExc_TypeError, "The keepseparator argument must be a boolean");
+            return NULL;
+        }
+    }
+    else {
+        keepseparator = 0;
+    }
+
+    // Validate and convert `maxsplit`
+    if (maxsplit_obj) {
+        maxsplit = PyLong_AsSsize_t(maxsplit_obj);
+        if (maxsplit == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The maxsplit argument must be an integer");
+            return NULL;
         }
     }
+    else {
+        maxsplit = PY_SSIZE_T_MAX;
+    }
 
     // Create Strs object
     Strs *result = (Strs *)PyObject_New(Strs, &StrsType);
@@ -478,7 +586,7 @@ static PyObject *strs_split_vectorcall(PyObject *self, PyObject *const *args, si
         return NULL;
 
     // Initialize Strs object based on the splitting logic
-    void *offsets = NULL;
+    void *offsets_endings = NULL;
     size_t offsets_capacity = 0;
     size_t offsets_count = 0;
     size_t bytes_per_offset;
@@ -487,14 +595,14 @@ static PyObject *strs_split_vectorcall(PyObject *self, PyObject *const *args, si
         result->type = STRS_CONSECUTIVE_64;
         result->data.consecutive_64bit.start = text.start;
         result->data.consecutive_64bit.parent = text_obj;
-        result->data.consecutive_64bit.separator_length = keepseparator * separator.length;
+        result->data.consecutive_64bit.separator_length = !keepseparator * separator.length;
     }
     else {
         bytes_per_offset = 4;
         result->type = STRS_CONSECUTIVE_32;
         result->data.consecutive_32bit.start = text.start;
         result->data.consecutive_32bit.parent = text_obj;
-        result->data.consecutive_32bit.separator_length = keepseparator * separator.length;
+        result->data.consecutive_32bit.separator_length = !keepseparator * separator.length;
     }
 
     // Iterate through string, keeping track of the
@@ -508,26 +616,28 @@ static PyObject *strs_split_vectorcall(PyObject *self, PyObject *const *args, si
         // Reallocate offsets array if needed
         if (offsets_count >= offsets_capacity) {
             offsets_capacity = (offsets_capacity + 1) * 2;
-            void *new_offsets = realloc(offsets, offsets_capacity * bytes_per_offset);
+            void *new_offsets = realloc(offsets_endings, offsets_capacity * bytes_per_offset);
             if (!new_offsets) {
-                if (offsets)
-                    free(offsets);
+                if (offsets_endings)
+                    free(offsets_endings);
             }
-            offsets = new_offsets;
+            offsets_endings = new_offsets;
         }
 
         // If the memory allocation has failed - discard the response
-        if (!offsets) {
+        if (!offsets_endings) {
             Py_XDECREF(result);
             PyErr_NoMemory();
             return NULL;
         }
 
         // Export the offset
+        size_t will_continue = offset_in_remaining != text_remaining.length;
+        size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue;
         if (text.length >= UINT32_MAX)
-            ((uint64_t *)offsets)[offsets_count++] = (uint64_t)(last_start + offset_in_remaining);
+            ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset;
         else
-            ((uint32_t *)offsets)[offsets_count++] = (uint32_t)(last_start + offset_in_remaining);
+            ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset;
 
         // Next time we want to start
         last_start = last_start + offset_in_remaining + separator.length;
@@ -535,14 +645,15 @@ static PyObject *strs_split_vectorcall(PyObject *self, PyObject *const *args, si
 
     // Populate the Strs object with the offsets
     if (text.length >= UINT32_MAX) {
-        result->data.consecutive_64bit.offsets = offsets;
+        result->data.consecutive_64bit.end_offsets = offsets_endings;
         result->data.consecutive_64bit.count = offsets_count;
     }
     else {
-        result->data.consecutive_32bit.offsets = offsets;
+        result->data.consecutive_32bit.end_offsets = offsets_endings;
         result->data.consecutive_32bit.count = offsets_count;
     }
 
+    Py_INCREF(text_obj);
     return (PyObject *)result;
 }
 
@@ -735,10 +846,10 @@ static void Str_dealloc(Str *self) {
 
 static PyObject *Str_str(Str *self) { return PyUnicode_FromStringAndSize(self->start, self->length); }
 
-static Py_ssize_t Str_len(Str *self) { return self->length; }
-
 static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash_crc32_native(self->start, self->length); }
 
+static Py_ssize_t Str_len(Str *self) { return self->length; }
+
 static PyObject *Str_getitem(Str *self, Py_ssize_t i) {
 
     // Negative indexing
@@ -807,50 +918,80 @@ static int Str_contains(Str *self, PyObject *arg) {
     return position != haystack.length;
 }
 
-static PyObject *Str_getslice(Str *self, PyObject *args) {
-    PyObject *start_obj = NULL, *end_obj = NULL;
-    ssize_t start = 0, end = self->length; // Default values
+static Py_ssize_t Strs_len(Strs *self) {
+    switch (self->type) {
+    case STRS_CONSECUTIVE_32: return self->data.consecutive_32bit.count;
+    case STRS_CONSECUTIVE_64: return self->data.consecutive_64bit.count;
+    case STRS_REORDERED: return self->data.reordered.count;
+    case STRS_MULTI_SOURCE: return self->data.multi_source.count;
+    default: return 0;
+    }
+}
 
-    if (!PyArg_ParseTuple(args, "|OO", &start_obj, &end_obj))
+static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) {
+    // Check for negative index and convert to positive
+    Py_ssize_t count = Strs_len(self);
+    if (i < 0)
+        i += count;
+    if (i < 0 || i >= count) {
+        PyErr_SetString(PyExc_IndexError, "Index out of range");
         return NULL;
-
-    if (start_obj != NULL && start_obj != Py_None) {
-        if (!PyLong_Check(start_obj)) {
-            PyErr_SetString(PyExc_TypeError, "Start index must be an integer or None");
-            return NULL;
-        }
-        start = PyLong_AsSsize_t(start_obj);
     }
 
-    if (end_obj != NULL && end_obj != Py_None) {
-        if (!PyLong_Check(end_obj)) {
-            PyErr_SetString(PyExc_TypeError, "End index must be an integer or None");
-            return NULL;
-        }
-        end = PyLong_AsSsize_t(end_obj);
+    PyObject *parent = NULL;
+    char const *start = NULL;
+    size_t length = 0;
+
+    // Extract a member element based on
+    switch (self->type) {
+    case STRS_CONSECUTIVE_32: {
+        uint32_t start_offset = (i == 0) ? 0 : self->data.consecutive_32bit.end_offsets[i - 1];
+        uint32_t end_offset = self->data.consecutive_32bit.end_offsets[i];
+        start = self->data.consecutive_32bit.start + start_offset;
+        length = end_offset - start_offset - self->data.consecutive_32bit.separator_length * (i + 1 != count);
+        parent = self->data.consecutive_32bit.parent;
+        break;
+    }
+    case STRS_CONSECUTIVE_64: {
+        uint64_t start_offset = (i == 0) ? 0 : self->data.consecutive_64bit.end_offsets[i - 1];
+        uint64_t end_offset = self->data.consecutive_64bit.end_offsets[i];
+        start = self->data.consecutive_64bit.start + start_offset;
+        length = end_offset - start_offset - self->data.consecutive_64bit.separator_length * (i + 1 != count);
+        parent = self->data.consecutive_64bit.parent;
+        break;
+    }
+    case STRS_REORDERED: {
+        //
+        break;
+    }
+    case STRS_MULTI_SOURCE: {
+        //
+        break;
+    }
+    default: PyErr_SetString(PyExc_TypeError, "Unknown Strs kind"); return NULL;
     }
 
-    size_t normalized_offset, normalized_length;
-    slice(self->length, start, end, &normalized_offset, &normalized_length);
-
-    if (normalized_length == 0)
-        return PyUnicode_FromString("");
-
-    // Create a new Str object
-    Str *new_str = (Str *)PyObject_New(Str, &StrType);
-    if (new_str == NULL)
+    // Create a new `Str` object
+    Str *parent_slice = (Str *)StrType.tp_alloc(&StrType, 0);
+    if (parent_slice == NULL && PyErr_NoMemory())
         return NULL;
 
-    // Set the parent to the original Str object and increment its reference count
-    new_str->parent = (PyObject *)self;
-    Py_INCREF(self);
+    parent_slice->start = start;
+    parent_slice->length = length;
+    parent_slice->parent = parent;
+    Py_INCREF(parent);
+    return parent_slice;
+}
 
-    // Set the start and length to point to the slice
-    new_str->start = self->start + normalized_offset;
-    new_str->length = normalized_length;
-    return (PyObject *)new_str;
+static PyObject *Strs_subscript(Str *self, PyObject *key) {
+    if (PyLong_Check(key))
+        return Strs_getitem(self, PyLong_AsSsize_t(key));
+    return NULL;
 }
 
+// Will be called by the `PySequence_Contains`
+static int Strs_contains(Str *self, PyObject *arg) { return 0; }
+
 static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) {
 
     char const *a_start, *b_start;
@@ -888,7 +1029,7 @@ static PyMappingMethods Str_as_mapping = {
     .mp_subscript = Str_subscript, // Is used to implement slices in Python
 };
 
-static PyMethodDef Str_methods[] = { //
+static PyMethodDef Str_methods[] = {
     // {"contains", (PyCFunction)..., METH_NOARGS, "Convert to Python `str`"},
     // {"find", (PyCFunction)..., METH_NOARGS, "Get length"},
     // {"__getitem__", (PyCFunction)..., METH_O, "Indexing"},
@@ -911,6 +1052,17 @@ static PyTypeObject StrType = {
     // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer
 };
 
+static PySequenceMethods Strs_as_sequence = {
+    .sq_length = Strs_len,        //
+    .sq_item = Strs_getitem,      //
+    .sq_contains = Strs_contains, //
+};
+
+static PyMappingMethods Strs_as_mapping = {
+    .mp_length = Strs_len,          //
+    .mp_subscript = Strs_subscript, // Is used to implement slices in Python
+};
+
 static PyTypeObject StrsType = {
     PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.Strs",
     .tp_doc = "Space-efficient container for large collections of strings and their slices",
@@ -918,6 +1070,8 @@ static PyTypeObject StrsType = {
     .tp_itemsize = 0,
     .tp_flags = Py_TPFLAGS_DEFAULT,
     .tp_new = PyType_GenericNew,
+    .tp_as_sequence = &Strs_as_sequence,
+    .tp_as_mapping = &Strs_as_mapping,
 };
 
 #pragma endregion
@@ -956,7 +1110,8 @@ PyObject *register_vectorcall(PyObject *module, char const *name, vectorcallfunc
 }
 
 void cleanup_module(void) {
-    free(temporary_memory.start);
+    if (temporary_memory.start)
+        free(temporary_memory.start);
     temporary_memory.start = NULL;
     temporary_memory.length = 0;
 }
@@ -1004,7 +1159,7 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
     // Initialize temporary_memory, if needed
     temporary_memory.start = malloc(4096);
     temporary_memory.length = 4096 * (temporary_memory.start != NULL);
-    atexit(cleanup_module);
+    // atexit(cleanup_module);
 
     // Register the vectorized functions
     PyObject *vectorized_find = register_vectorcall(m, "find", str_find_vectorcall);
@@ -1043,6 +1198,5 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
     Py_XDECREF(&FileType);
     Py_XDECREF(&StrType);
     Py_XDECREF(m);
-    PyErr_NoMemory();
     return NULL;
 }
diff --git a/scripts/test.py b/scripts/test.py
index 0c5d095f..1fc2193f 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -38,17 +38,17 @@ def test_indexing():
         assert big[i] == native[i]
 
 
-def test_contains():
-    big = Str("abcdef")
-    assert "a" in big
-    assert "ab" in big
-    assert "xxx" not in big
+# def test_contains():
+#     big = Str("abcdef")
+#     assert "a" in big
+#     assert "ab" in big
+#     assert "xxx" not in big
 
 
-def test_rich_comparisons():
-    assert Str("aa") == "aa"
-    assert Str("aa") < "b"
-    assert Str("abb")[1:] == "bb"
+# def test_rich_comparisons():
+#     assert Str("aa") == "aa"
+#     assert Str("aa") < "b"
+#     assert Str("abb")[1:] == "bb"
 
 
 # def get_random_string(
@@ -120,12 +120,19 @@ def test_rich_comparisons():
 
 
 def test_split_keepseparator():
-    native = "word1 word2 word3"
+    native = "word1_word2_word3"
     big = Str(native)
-    words = sz.split(big, " ")
-    parts = sz.split(big, " ", keepseparator=True)
-    # assert words[0] == "word1"
-    # assert parts[0] == "word1 "
+
+    words = sz.split(big, "_")
+    assert len(words) == 3
+
+    parts = sz.split(big, "_", keepseparator=True)
+    assert len(parts) == 3
+
+    assert str(words[0]) == "word1"
+    assert str(parts[0]) == "word1_"
+    assert str(words[2]) == "word3"
+    assert str(parts[2]) == "word3"
 
 
 # def test_strs_operations():

From febbdf57b0d75f2ce7a0e0e6b92f91404a8a8468 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 19 Sep 2023 18:01:07 +0400
Subject: [PATCH 19/72] Improve: Same functions as global and members

---
 .clang-format         |   6 +-
 .gitignore            |   1 +
 .vscode/settings.json |   1 +
 python/lib.c          | 494 ++++++++++++++----------------------------
 scripts/test.py       |  58 ++---
 5 files changed, 200 insertions(+), 360 deletions(-)

diff --git a/.clang-format b/.clang-format
index ab9f350a..b1adf3b0 100644
--- a/.clang-format
+++ b/.clang-format
@@ -16,12 +16,12 @@ AlignTrailingComments: true
 AllowAllArgumentsOnNextLine: false
 AllowAllConstructorInitializersOnNextLine: true
 AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: false
+AllowShortBlocksOnASingleLine: Always
 AllowShortCaseLabelsOnASingleLine: true
 AllowShortFunctionsOnASingleLine: true
-AllowShortIfStatementsOnASingleLine: Never
+AllowShortIfStatementsOnASingleLine: Always
 AllowShortLambdasOnASingleLine: true
-AllowShortLoopsOnASingleLine: false
+AllowShortLoopsOnASingleLine: true
 AlwaysBreakBeforeMultilineStrings: true
 AlwaysBreakTemplateDeclarations: Yes
 AlwaysBreakBeforeMultilineStrings: true
diff --git a/.gitignore b/.gitignore
index a96d24d0..cfbdf78a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,5 +13,6 @@ substr_search_cpp
 *.so
 *.egg-info
 *.whl
+node_modules/
 
 leipzig1M.txt
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 48034254..b75f1ba8 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -133,6 +133,7 @@
     "itemsize",
     "keeplinebreaks",
     "keepseparator",
+    "kwargs",
     "kwds",
     "kwnames",
     "levenstein",
diff --git a/python/lib.c b/python/lib.c
index 38fd38af..b0360866 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -1,6 +1,8 @@
 /**
  *  @brief  Very light-weight CPython wrapper for StringZilla, with support for memory-mapping,
  *          native Python strings, Apache Arrow collections, and more.
+ *
+ *  To minimize latency this implementation avoids `PyArg_ParseTupleAndKeywords` calls.
  */
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
 #define NOMINMAX
@@ -29,7 +31,7 @@ static PyTypeObject FileType;
 static PyTypeObject StrType;
 static PyTypeObject StrsType;
 
-struct {
+static struct {
     void *start;
     size_t length;
 } temporary_memory = {NULL, 0};
@@ -206,35 +208,30 @@ int export_string_like(PyObject *object, char const **start, size_t *length) {
 
 #pragma region Global Functions
 
-static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
-    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
-
-    if (nargs < 2) {
+static Py_ssize_t api_find_(PyObject *self, PyObject *args, PyObject *kwargs) {
+    int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType));
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs < !is_member + 1 || nargs > !is_member + 3) {
         PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
-        return NULL;
+        return 0;
     }
 
-    // Initialize with default values or positional arguments
-    PyObject *haystack_obj = args[0];
-    PyObject *needle_obj = args[1];
-    PyObject *start_obj = (nargs > 2) ? args[2] : NULL;
-    PyObject *end_obj = (nargs > 3) ? args[3] : NULL;
-
-    // Parse keyword arguments to overwrite positional ones
-    if (kwnames != NULL) {
-        for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) {
-            PyObject *key = PyTuple_GetItem(kwnames, i);
-            PyObject *value = args[nargs + i];
-            if (PyUnicode_CompareWithASCIIString(key, "start") == 0)
-                start_obj = value;
-            else if (PyUnicode_CompareWithASCIIString(key, "end") == 0)
-                end_obj = value;
+    PyObject *haystack_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *needle_obj = PyTuple_GET_ITEM(args, !is_member + 0);
+    PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
+    PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
+
+    // Parse keyword arguments
+    if (kwargs) {
+        Py_ssize_t pos = 0;
+        PyObject *key, *value;
+        while (PyDict_Next(kwargs, &pos, &key, &value)) {
+            if (PyUnicode_CompareWithASCIIString(key, "start") == 0) { start_obj = value; }
+            else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) { end_obj = value; }
             else {
                 PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key);
-                return NULL;
+                return 0;
             }
-            if (PyErr_Occurred())
-                return NULL;
         }
     }
 
@@ -247,7 +244,7 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_
     if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
         !export_string_like(needle_obj, &needle.start, &needle.length)) {
         PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
-        return NULL;
+        return 0;
     }
 
     // Validate and convert `start`
@@ -255,24 +252,20 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_
         start = PyLong_AsSsize_t(start_obj);
         if (start == -1 && PyErr_Occurred()) {
             PyErr_SetString(PyExc_TypeError, "The start argument must be an integer");
-            return NULL;
+            return 0;
         }
     }
-    else {
-        start = 0;
-    }
+    else { start = 0; }
 
     // Validate and convert `end`
     if (end_obj) {
         end = PyLong_AsSsize_t(end_obj);
         if (end == -1 && PyErr_Occurred()) {
             PyErr_SetString(PyExc_TypeError, "The end argument must be an integer");
-            return NULL;
+            return 0;
         }
     }
-    else {
-        end = PY_SSIZE_T_MAX;
-    }
+    else { end = PY_SSIZE_T_MAX; }
 
     // Limit the `haystack` range
     size_t normalized_offset, normalized_length;
@@ -282,123 +275,68 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_
 
     // Perform contains operation
     size_t offset = sz_neon_find_substr(haystack, needle);
-    if (offset == haystack.length)
-        return -1;
+    if (offset == haystack.length) return -1;
     return (Py_ssize_t)offset;
 }
 
-static PyObject *str_find_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
-    Py_ssize_t signed_offset = str_find_vectorcall_(NULL, args, nargsf, kwnames);
+static PyObject *api_find(PyObject *self, PyObject *args, PyObject *kwargs) {
+    Py_ssize_t signed_offset = api_find_(self, args, kwargs);
+    if (PyErr_Occurred()) return NULL;
     return PyLong_FromSsize_t(signed_offset);
 }
 
-static PyObject *str_contains_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
-    Py_ssize_t signed_offset = str_find_vectorcall_(NULL, args, nargsf, kwnames);
-    if (signed_offset == -1) {
-        Py_RETURN_FALSE;
-    }
-    else {
-        Py_RETURN_TRUE;
-    }
+static PyObject *api_contains(PyObject *self, PyObject *args, PyObject *kwargs) {
+    Py_ssize_t signed_offset = api_find_(self, args, kwargs);
+    if (PyErr_Occurred()) return NULL;
+    if (signed_offset == -1) { Py_RETURN_FALSE; }
+    else { Py_RETURN_TRUE; }
 }
 
-static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
-    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
-
-    // Initialize with default values or positional arguments
-    if (nargs < 2) {
-        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
+static PyObject *api_count(PyObject *self, PyObject *args, PyObject *kwargs) {
+    int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType));
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs < !is_member + 1 || nargs > !is_member + 4) {
+        PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
         return NULL;
     }
 
-    PyObject *haystack_obj = args[0];
-    PyObject *needle_obj = args[1];
-    PyObject *start_obj = (nargs > 2) ? args[2] : NULL;
-    PyObject *end_obj = (nargs > 3) ? args[3] : NULL;
-    PyObject *allowoverlap_obj = (nargs > 4) ? args[4] : NULL;
-
-    // Parse keyword arguments to overwrite positional ones
-    if (kwnames != NULL) {
-        for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) {
-            PyObject *key = PyTuple_GetItem(kwnames, i);
-            PyObject *value = args[nargs + i];
-            if (PyUnicode_CompareWithASCIIString(key, "start") == 0)
-                start_obj = value;
-            else if (PyUnicode_CompareWithASCIIString(key, "end") == 0)
-                end_obj = value;
-            else if (PyUnicode_CompareWithASCIIString(key, "allowoverlap") == 0)
-                allowoverlap_obj = value;
-            else {
-                PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key);
-                return NULL;
-            }
-            if (PyErr_Occurred())
+    PyObject *haystack_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *needle_obj = PyTuple_GET_ITEM(args, !is_member + 0);
+    PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
+    PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
+    PyObject *allowoverlap_obj = nargs > !is_member + 3 ? PyTuple_GET_ITEM(args, !is_member + 3) : NULL;
+
+    if (kwargs) {
+        Py_ssize_t pos = 0;
+        PyObject *key, *value;
+        while (PyDict_Next(kwargs, &pos, &key, &value))
+            if (PyUnicode_CompareWithASCIIString(key, "start") == 0) { start_obj = value; }
+            else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) { end_obj = value; }
+            else if (PyUnicode_CompareWithASCIIString(key, "allowoverlap") == 0) { allowoverlap_obj = value; }
+            else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key))
                 return NULL;
-        }
     }
 
     struct sz_haystack_t haystack;
     struct sz_needle_t needle;
-    int allowoverlap;
-    Py_ssize_t start, end;
+    Py_ssize_t start = start_obj ? PyLong_AsSsize_t(start_obj) : 0;
+    Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX;
+    int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0;
 
-    // Validate and convert `haystack` and `needle`
     needle.anomaly_offset = 0;
     if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
-        !export_string_like(needle_obj, &needle.start, &needle.length)) {
-        PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
-        return NULL;
-    }
-
-    // Validate and convert `start`
-    if (start_obj) {
-        start = PyLong_AsSsize_t(start_obj);
-        if (start == -1 && PyErr_Occurred()) {
-            PyErr_SetString(PyExc_TypeError, "The start argument must be an integer");
-            return NULL;
-        }
-    }
-    else {
-        start = 0;
-    }
+        !export_string_like(needle_obj, &needle.start, &needle.length))
+        return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL;
 
-    // Validate and convert `end`
-    if (end_obj) {
-        end = PyLong_AsSsize_t(end_obj);
-        if (end == -1 && PyErr_Occurred()) {
-            PyErr_SetString(PyExc_TypeError, "The end argument must be an integer");
-            return NULL;
-        }
-    }
-    else {
-        end = PY_SSIZE_T_MAX;
-    }
+    if ((start == -1 || end == -1 || allowoverlap == -1) && PyErr_Occurred()) return NULL;
 
-    // Validate and convert `allowoverlap`
-    if (allowoverlap_obj) {
-        allowoverlap = PyObject_IsTrue(allowoverlap_obj);
-        if (allowoverlap == -1) {
-            PyErr_SetString(PyExc_TypeError, "The allowoverlap argument must be a boolean");
-            return NULL;
-        }
-    }
-    else {
-        allowoverlap = 0;
-    }
-
-    // Limit the haystack range
     size_t normalized_offset, normalized_length;
     slice(haystack.length, start, end, &normalized_offset, &normalized_length);
     haystack.start += normalized_offset;
     haystack.length = normalized_length;
 
-    // Perform counting operation
-    size_t count = 0;
-    if (needle.length == 1) {
-        count = sz_naive_count_char(haystack, *needle.start);
-    }
-    else {
-        // Your existing logic for count_substr can be embedded here
+    size_t count = needle.length == 1 ? sz_naive_count_char(haystack, *needle.start) : 0;
+    if (needle.length != 1) {
         if (allowoverlap) {
             while (haystack.length) {
                 size_t offset = sz_neon_find_substr(haystack, needle);
@@ -418,114 +356,87 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t
             }
         }
     }
-
     return PyLong_FromSize_t(count);
 }
 
-static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
-    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
-
-    // Validate the number of arguments
-    if (nargs < 2 || nargs > 3) {
-        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
-        return NULL;
-    }
-
-    PyObject *str1_obj = args[0];
-    PyObject *str2_obj = args[1];
-
-    struct sz_haystack_t str1, str2;
-    if (!export_string_like(str1_obj, &str1.start, &str1.length) ||
-        !export_string_like(str2_obj, &str2.start, &str2.length)) {
-        PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
+static PyObject *api_levenstein(PyObject *self, PyObject *args, PyObject *kwargs) {
+    int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType));
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs < !is_member + 1 || nargs > !is_member + 2) {
+        PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
         return NULL;
     }
 
-    // Initialize bound argument
-    int bound = 255;
+    PyObject *str1_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *str2_obj = PyTuple_GET_ITEM(args, !is_member + 0);
+    PyObject *bound_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
 
-    // Check if `bound` is given as a positional argument
-    if (nargs == 3) {
-        bound = PyLong_AsLong(args[2]);
-        if (bound > 255 || bound < 0) {
-            PyErr_SetString(PyExc_ValueError, "Bound must be an integer between 0 and 255");
-            return NULL;
-        }
-    }
-
-    // Parse keyword arguments
-    if (kwnames != NULL) {
-        for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) {
-            PyObject *key = PyTuple_GetItem(kwnames, i);
-            PyObject *value = args[nargs + i];
+    if (kwargs) {
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        while (PyDict_Next(kwargs, &pos, &key, &value))
             if (PyUnicode_CompareWithASCIIString(key, "bound") == 0) {
-                if (nargs == 3) {
-                    PyErr_SetString(PyExc_TypeError, "Received bound both as positional and keyword argument");
-                    return NULL;
-                }
-                bound = PyLong_AsLong(value);
-                if (bound > 255 || bound < 0) {
-                    PyErr_SetString(PyExc_ValueError, "Bound must be an integer between 0 and 255");
+                if (bound_obj) {
+                    PyErr_Format(PyExc_TypeError, "Received bound both as positional and keyword argument");
                     return NULL;
                 }
+                bound_obj = value;
             }
-        }
     }
 
-    // Initialize or reallocate the Levenshtein distance matrix
+    int bound = 255; // Default value for bound
+    if (bound_obj && ((bound = PyLong_AsLong(bound_obj)) > 255 || bound < 0)) {
+        PyErr_Format(PyExc_ValueError, "Bound must be an integer between 0 and 255");
+        return NULL;
+    }
+
+    struct sz_haystack_t str1, str2;
+    if (!export_string_like(str1_obj, &str1.start, &str1.length) ||
+        !export_string_like(str2_obj, &str2.start, &str2.length)) {
+        PyErr_Format(PyExc_TypeError, "Both arguments must be string-like");
+        return NULL;
+    }
+
     size_t memory_needed = sz_levenstein_memory_needed(str1.length, str2.length);
     if (temporary_memory.length < memory_needed) {
         temporary_memory.start = realloc(temporary_memory.start, memory_needed);
         temporary_memory.length = memory_needed;
     }
-    if (temporary_memory.start == NULL) {
-        PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix");
+    if (!temporary_memory.start) {
+        PyErr_Format(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix");
         return NULL;
     }
 
-    levenstein_distance_t distance = sz_levenstein( //
-        str1.start,
-        str1.length,
-        str2.start,
-        str2.length,
-        (levenstein_distance_t)bound,
-        temporary_memory.start);
+    levenstein_distance_t small_bound = (levenstein_distance_t)bound;
+    levenstein_distance_t distance =
+        sz_levenstein(str1.start, str1.length, str2.start, str2.length, small_bound, temporary_memory.start);
+
     return PyLong_FromLong(distance);
 }
 
-static PyObject *strs_split_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
-    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
+static PyObject *api_split(PyObject *self, PyObject *args, PyObject *kwargs) {
 
-    if (nargs < 1) {
+    // Check minimum arguments
+    int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType));
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs < !is_member + 1 || nargs > !is_member + 3) {
         PyErr_SetString(PyExc_TypeError, "sz.split() requires at least 1 argument");
         return NULL;
     }
 
-    // Initialize with default values or positional arguments
-    PyObject *text_obj = args[0];
-    PyObject *separator_obj = (nargs > 1) ? args[1] : NULL;
-    PyObject *maxsplit_obj = (nargs > 2) ? args[2] : NULL;
-    PyObject *keepseparator_obj = (nargs > 3) ? args[3] : NULL;
-
-    // Parse keyword arguments to overwrite positional ones
-    if (kwnames != NULL) {
-        for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) {
-            PyObject *key = PyTuple_GetItem(kwnames, i);
-            PyObject *value = args[nargs + i];
-
-            if (PyUnicode_CompareWithASCIIString(key, "separator") == 0)
-                separator_obj = value;
-            else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0)
-                maxsplit_obj = value;
-            else if (PyUnicode_CompareWithASCIIString(key, "keepseparator") == 0)
-                keepseparator_obj = value;
-            else {
-                PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key);
-                return NULL;
-            }
+    PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *separator_obj = nargs > !is_member + 0 ? PyTuple_GET_ITEM(args, !is_member + 0) : NULL;
+    PyObject *maxsplit_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
+    PyObject *keepseparator_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
 
-            // Check for errors during conversion
-            if (PyErr_Occurred())
+    if (kwargs) {
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        while (PyDict_Next(kwargs, &pos, &key, &value)) {
+            if (PyUnicode_CompareWithASCIIString(key, "separator") == 0) { separator_obj = value; }
+            else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) { maxsplit_obj = value; }
+            else if (PyUnicode_CompareWithASCIIString(key, "keepseparator") == 0) { keepseparator_obj = value; }
+            else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key))
                 return NULL;
         }
     }
@@ -564,9 +475,7 @@ static PyObject *strs_split_vectorcall(PyObject *_, PyObject *const *args, size_
             return NULL;
         }
     }
-    else {
-        keepseparator = 0;
-    }
+    else { keepseparator = 0; }
 
     // Validate and convert `maxsplit`
     if (maxsplit_obj) {
@@ -576,14 +485,11 @@ static PyObject *strs_split_vectorcall(PyObject *_, PyObject *const *args, size_
             return NULL;
         }
     }
-    else {
-        maxsplit = PY_SSIZE_T_MAX;
-    }
+    else { maxsplit = PY_SSIZE_T_MAX; }
 
     // Create Strs object
     Strs *result = (Strs *)PyObject_New(Strs, &StrsType);
-    if (!result)
-        return NULL;
+    if (!result) return NULL;
 
     // Initialize Strs object based on the splitting logic
     void *offsets_endings = NULL;
@@ -618,8 +524,7 @@ static PyObject *strs_split_vectorcall(PyObject *_, PyObject *const *args, size_
             offsets_capacity = (offsets_capacity + 1) * 2;
             void *new_offsets = realloc(offsets_endings, offsets_capacity * bytes_per_offset);
             if (!new_offsets) {
-                if (offsets_endings)
-                    free(offsets_endings);
+                if (offsets_endings) free(offsets_endings);
             }
             offsets_endings = new_offsets;
         }
@@ -634,10 +539,8 @@ static PyObject *strs_split_vectorcall(PyObject *_, PyObject *const *args, size_
         // Export the offset
         size_t will_continue = offset_in_remaining != text_remaining.length;
         size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue;
-        if (text.length >= UINT32_MAX)
-            ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset;
-        else
-            ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset;
+        if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; }
+        else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; }
 
         // Next time we want to start
         last_start = last_start + offset_in_remaining + separator.length;
@@ -692,8 +595,7 @@ static void File_dealloc(File *self) {
 static PyObject *File_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) {
     File *self;
     self = (File *)type->tp_alloc(type, 0);
-    if (self == NULL)
-        return NULL;
+    if (self == NULL) return NULL;
 
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
     self->file_handle = NULL;
@@ -707,8 +609,7 @@ static PyObject *File_new(PyTypeObject *type, PyObject *positional_args, PyObjec
 
 static int File_init(File *self, PyObject *positional_args, PyObject *named_args) {
     const char *path;
-    if (!PyArg_ParseTuple(positional_args, "s", &path))
-        return -1;
+    if (!PyArg_ParseTuple(positional_args, "s", &path)) return -1;
 
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
     self->file_handle = CreateFile(path, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
@@ -772,17 +673,6 @@ static PyTypeObject FileType = {
     .tp_new = (newfunc)File_new,
     .tp_init = (initproc)File_init,
     .tp_dealloc = (destructor)File_dealloc,
-
-    // PyBufferProcs *tp_as_buffer;
-
-    // reprfunc tp_repr;
-    // PyNumberMethods *tp_as_number;
-    // PySequenceMethods *tp_as_sequence;
-    // PyMappingMethods *tp_as_mapping;
-    // ternaryfunc tp_call;
-    // reprfunc tp_str;
-    // getattrofunc tp_getattro;
-    // setattrofunc tp_setattro;
 };
 
 #pragma endregion
@@ -797,8 +687,7 @@ static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args)
     // The `named_args` would be `NULL`
     if (named_args) {
         static char *names[] = {"parent", "from", "to", NULL};
-        if (!PyArg_ParseTupleAndKeywords(positional_args, named_args, "|Onn", names, &parent, &from, &to))
-            return -1;
+        if (!PyArg_ParseTupleAndKeywords(positional_args, named_args, "|Onn", names, &parent, &from, &to)) return -1;
     }
     else if (!PyArg_ParseTuple(positional_args, "|Onn", &parent, &from, &to))
         return -1;
@@ -829,8 +718,7 @@ static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args)
 static PyObject *Str_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
     Str *self;
     self = (Str *)type->tp_alloc(type, 0);
-    if (!self)
-        return NULL;
+    if (!self) return NULL;
 
     self->parent = NULL;
     self->start = NULL;
@@ -839,8 +727,8 @@ static PyObject *Str_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
 }
 
 static void Str_dealloc(Str *self) {
-    if (self->parent)
-        Py_XDECREF(self->parent);
+    if (self->parent) Py_XDECREF(self->parent);
+    self->parent = NULL;
     Py_TYPE(self)->tp_free((PyObject *)self);
 }
 
@@ -853,8 +741,7 @@ static Py_ssize_t Str_len(Str *self) { return self->length; }
 static PyObject *Str_getitem(Str *self, Py_ssize_t i) {
 
     // Negative indexing
-    if (i < 0)
-        i += self->length;
+    if (i < 0) i += self->length;
 
     if (i < 0 || (size_t)i >= self->length) {
         PyErr_SetString(PyExc_IndexError, "Index out of range");
@@ -867,12 +754,10 @@ static PyObject *Str_getitem(Str *self, Py_ssize_t i) {
 
 static PyObject *Str_subscript(Str *self, PyObject *key) {
     if (PySlice_Check(key)) {
+        // Sanity checks
         Py_ssize_t start, stop, step;
-        if (PySlice_Unpack(key, &start, &stop, &step) < 0)
-            return NULL;
-        if (PySlice_AdjustIndices(self->length, &start, &stop, step) < 0)
-            return NULL;
-
+        if (PySlice_Unpack(key, &start, &stop, &step) < 0) return NULL;
+        if (PySlice_AdjustIndices(self->length, &start, &stop, step) < 0) return NULL;
         if (step != 1) {
             PyErr_SetString(PyExc_IndexError, "Efficient step is not supported");
             return NULL;
@@ -880,8 +765,7 @@ static PyObject *Str_subscript(Str *self, PyObject *key) {
 
         // Create a new `Str` object
         Str *self_slice = (Str *)StrType.tp_alloc(&StrType, 0);
-        if (self_slice == NULL && PyErr_NoMemory())
-            return NULL;
+        if (self_slice == NULL && PyErr_NoMemory()) return NULL;
 
         // Set its properties based on the slice
         self_slice->start = self->start + start;
@@ -892,9 +776,7 @@ static PyObject *Str_subscript(Str *self, PyObject *key) {
         Py_INCREF(self);
         return (PyObject *)self_slice;
     }
-    else if (PyLong_Check(key)) {
-        return Str_getitem(self, PyLong_AsSsize_t(key));
-    }
+    else if (PyLong_Check(key)) { return Str_getitem(self, PyLong_AsSsize_t(key)); }
     else {
         PyErr_SetString(PyExc_TypeError, "Str indices must be integers or slices");
         return NULL;
@@ -931,8 +813,7 @@ static Py_ssize_t Strs_len(Strs *self) {
 static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) {
     // Check for negative index and convert to positive
     Py_ssize_t count = Strs_len(self);
-    if (i < 0)
-        i += count;
+    if (i < 0) i += count;
     if (i < 0 || i >= count) {
         PyErr_SetString(PyExc_IndexError, "Index out of range");
         return NULL;
@@ -973,8 +854,7 @@ static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) {
 
     // Create a new `Str` object
     Str *parent_slice = (Str *)StrType.tp_alloc(&StrType, 0);
-    if (parent_slice == NULL && PyErr_NoMemory())
-        return NULL;
+    if (parent_slice == NULL && PyErr_NoMemory()) return NULL;
 
     parent_slice->start = start;
     parent_slice->length = length;
@@ -984,8 +864,7 @@ static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) {
 }
 
 static PyObject *Strs_subscript(Str *self, PyObject *key) {
-    if (PyLong_Check(key))
-        return Strs_getitem(self, PyLong_AsSsize_t(key));
+    if (PyLong_Check(key)) return Strs_getitem(self, PyLong_AsSsize_t(key));
     return NULL;
 }
 
@@ -1004,8 +883,7 @@ static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) {
     int cmp_result = memcmp(a_start, b_start, min_length);
 
     // If the strings are equal up to `min_length`, then the shorter string is smaller
-    if (cmp_result == 0)
-        cmp_result = (a_length > b_length) - (a_length < b_length);
+    if (cmp_result == 0) cmp_result = (a_length > b_length) - (a_length < b_length);
 
     switch (op) {
     case Py_LT: return PyBool_FromLong(cmp_result < 0);
@@ -1029,10 +907,14 @@ static PyMappingMethods Str_as_mapping = {
     .mp_subscript = Str_subscript, // Is used to implement slices in Python
 };
 
-static PyMethodDef Str_methods[] = {
-    // {"contains", (PyCFunction)..., METH_NOARGS, "Convert to Python `str`"},
-    // {"find", (PyCFunction)..., METH_NOARGS, "Get length"},
-    // {"__getitem__", (PyCFunction)..., METH_O, "Indexing"},
+#define sz_method_flags_m METH_VARARGS | METH_KEYWORDS
+
+static PyMethodDef Str_methods[] = { //
+    {"find", api_find, sz_method_flags_m, "Find the first occurrence of a substring."},
+    {"contains", api_contains, sz_method_flags_m, "Check if a string contains a substring."},
+    {"count", api_count, sz_method_flags_m, "Count the occurrences of a substring."},
+    {"levenstein", api_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
+    {"split", api_split, sz_method_flags_m, "Split a string by a separator."},
     {NULL, NULL, 0, NULL}};
 
 static PyTypeObject StrType = {
@@ -1076,8 +958,20 @@ static PyTypeObject StrsType = {
 
 #pragma endregion
 
-static PyMethodDef stringzilla_methods[] = { //
-    {NULL, NULL, 0, NULL}};
+static void stringzilla_cleanup(PyObject *m) {
+    if (temporary_memory.start) free(temporary_memory.start);
+    temporary_memory.start = NULL;
+    temporary_memory.length = 0;
+}
+
+static PyMethodDef stringzilla_methods[] = {
+    {"find", api_find, sz_method_flags_m, "Find the first occurrence of a substring."},
+    {"contains", api_contains, sz_method_flags_m, "Check if a string contains a substring."},
+    {"count", api_count, sz_method_flags_m, "Count the occurrences of a substring."},
+    {"levenstein", api_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
+    {"split", api_split, sz_method_flags_m, "Split a string by a separator."},
+    {NULL, NULL, 0, NULL} /* Sentinel */
+};
 
 static PyModuleDef stringzilla_module = {
     PyModuleDef_HEAD_INIT,
@@ -1088,49 +982,18 @@ static PyModuleDef stringzilla_module = {
     NULL,
     NULL,
     NULL,
-    NULL,
+    stringzilla_cleanup,
 };
 
-PyObject *register_vectorcall(PyObject *module, char const *name, vectorcallfunc vectorcall) {
-
-    PyCFunctionObject *vectorcall_object = (PyCFunctionObject *)PyObject_Malloc(sizeof(PyCFunctionObject));
-    if (vectorcall_object == NULL)
-        return NULL;
-
-    PyObject_Init(vectorcall_object, &PyCFunction_Type);
-    vectorcall_object->m_ml = NULL; // No regular `PyMethodDef`
-    vectorcall_object->vectorcall = vectorcall;
-
-    // Add the 'find' function to the module
-    if (PyModule_AddObject(module, name, vectorcall_object) < 0) {
-        Py_XDECREF(vectorcall_object);
-        return NULL;
-    }
-    return vectorcall_object;
-}
-
-void cleanup_module(void) {
-    if (temporary_memory.start)
-        free(temporary_memory.start);
-    temporary_memory.start = NULL;
-    temporary_memory.length = 0;
-}
-
 PyMODINIT_FUNC PyInit_stringzilla(void) {
     PyObject *m;
 
-    if (PyType_Ready(&StrType) < 0)
-        return NULL;
-
-    if (PyType_Ready(&FileType) < 0)
-        return NULL;
-
-    if (PyType_Ready(&StrsType) < 0)
-        return NULL;
+    if (PyType_Ready(&StrType) < 0) return NULL;
+    if (PyType_Ready(&FileType) < 0) return NULL;
+    if (PyType_Ready(&StrsType) < 0) return NULL;
 
     m = PyModule_Create(&stringzilla_module);
-    if (m == NULL)
-        return NULL;
+    if (m == NULL) return NULL;
 
     Py_INCREF(&StrType);
     if (PyModule_AddObject(m, "Str", (PyObject *)&StrType) < 0) {
@@ -1159,42 +1022,9 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
     // Initialize temporary_memory, if needed
     temporary_memory.start = malloc(4096);
     temporary_memory.length = 4096 * (temporary_memory.start != NULL);
-    // atexit(cleanup_module);
-
-    // Register the vectorized functions
-    PyObject *vectorized_find = register_vectorcall(m, "find", str_find_vectorcall);
-    PyObject *vectorized_contains = register_vectorcall(m, "contains", str_contains_vectorcall);
-    PyObject *vectorized_count = register_vectorcall(m, "count", str_count_vectorcall);
-    PyObject *vectorized_levenstein = register_vectorcall(m, "levenstein", str_levenstein_vectorcall);
-
-    PyObject *vectorized_split = register_vectorcall(m, "split", strs_split_vectorcall);
-    PyObject *vectorized_sort = register_vectorcall(m, "sort", str_find_vectorcall);
-    PyObject *vectorized_shuffle = register_vectorcall(m, "shuffle", str_find_vectorcall);
-
-    if (!vectorized_find || !vectorized_count ||          //
-        !vectorized_contains || !vectorized_levenstein || //
-        !vectorized_split || !vectorized_sort || !vectorized_shuffle) {
-        goto cleanup;
-    }
-
     return m;
 
 cleanup:
-    if (vectorized_find)
-        Py_XDECREF(vectorized_find);
-    if (vectorized_contains)
-        Py_XDECREF(vectorized_contains);
-    if (vectorized_count)
-        Py_XDECREF(vectorized_count);
-    if (vectorized_levenstein)
-        Py_XDECREF(vectorized_levenstein);
-    if (vectorized_split)
-        Py_XDECREF(vectorized_split);
-    if (vectorized_sort)
-        Py_XDECREF(vectorized_sort);
-    if (vectorized_shuffle)
-        Py_XDECREF(vectorized_shuffle);
-
     Py_XDECREF(&FileType);
     Py_XDECREF(&StrType);
     Py_XDECREF(m);
diff --git a/scripts/test.py b/scripts/test.py
index 1fc2193f..8163e0e5 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -25,6 +25,22 @@ def test_globals():
     assert sz.levenstein("abababab", "aaaaaaaa", bound=2) == 2
 
 
+def test_split_keepseparator():
+    native = "word1_word2_word3"
+    big = Str(native)
+
+    words = sz.split(big, "_")
+    assert len(words) == 3
+
+    parts = sz.split(big, "_", keepseparator=True)
+    assert len(parts) == 3
+
+    assert str(words[0]) == "word1"
+    assert str(parts[0]) == "word1_"
+    assert str(words[2]) == "word3"
+    assert str(parts[2]) == "word3"
+
+
 def test_construct():
     native = "aaaaa"
     big = Str(native)
@@ -38,17 +54,25 @@ def test_indexing():
         assert big[i] == native[i]
 
 
-# def test_contains():
-#     big = Str("abcdef")
-#     assert "a" in big
-#     assert "ab" in big
-#     assert "xxx" not in big
+def test_count():
+    native = "aaaaa"
+    big = Str(native)
+    assert big.count("a") == 5
+    assert big.count("aa") == 2
+    assert big.count("aa", allowoverlap=True) == 4
+
+
+def test_contains():
+    big = Str("abcdef")
+    assert "a" in big
+    assert "ab" in big
+    assert "xxx" not in big
 
 
-# def test_rich_comparisons():
-#     assert Str("aa") == "aa"
-#     assert Str("aa") < "b"
-#     assert Str("abb")[1:] == "bb"
+def test_rich_comparisons():
+    assert Str("aa") == "aa"
+    assert Str("aa") < "b"
+    assert Str("abb")[1:] == "bb"
 
 
 # def get_random_string(
@@ -119,22 +143,6 @@ def test_indexing():
 #     assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True))
 
 
-def test_split_keepseparator():
-    native = "word1_word2_word3"
-    big = Str(native)
-
-    words = sz.split(big, "_")
-    assert len(words) == 3
-
-    parts = sz.split(big, "_", keepseparator=True)
-    assert len(parts) == 3
-
-    assert str(words[0]) == "word1"
-    assert str(parts[0]) == "word1_"
-    assert str(words[2]) == "word3"
-    assert str(parts[2]) == "word3"
-
-
 # def test_strs_operations():
 #     native = "line1\nline2\nline3"
 #     big = Str(native)

From 7dc3b2821b7b5642cb3221b306ab559af31f8189 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 19 Sep 2023 18:26:36 +0400
Subject: [PATCH 20/72] Add: Buffer protocol support

---
 .vscode/settings.json |  2 ++
 python/lib.c          | 48 +++++++++++++++++++++++++++++++------
 scripts/test.py       | 56 +++++++++++++++++++++++++------------------
 3 files changed, 76 insertions(+), 30 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 2ee49d0a..6fa841e1 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -142,6 +142,7 @@
     "MODINIT",
     "napi",
     "nargsf",
+    "ndim",
     "newfunc",
     "NOARGS",
     "NOMINMAX",
@@ -149,6 +150,7 @@
     "pytest",
     "quadgram",
     "readlines",
+    "releasebuffer",
     "richcompare",
     "SIMD",
     "splitlines",
diff --git a/python/lib.c b/python/lib.c
index b0360866..8035e0ba 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -2,7 +2,8 @@
  *  @brief  Very light-weight CPython wrapper for StringZilla, with support for memory-mapping,
  *          native Python strings, Apache Arrow collections, and more.
  *
- *  To minimize latency this implementation avoids `PyArg_ParseTupleAndKeywords` calls.
+ *  - Doesn't use PyBind11, NanoBind, Boost.Python, or any other high-level libs, only CPython API.
+ *  - To minimize latency this implementation avoids `PyArg_ParseTupleAndKeywords` calls.
  */
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
 #define NOMINMAX
@@ -783,7 +784,35 @@ static PyObject *Str_subscript(Str *self, PyObject *key) {
     }
 }
 
-// Will be called by the `PySequence_Contains`
+static int Str_getbuffer(Str *self, Py_buffer *view, int flags) {
+    if (view == NULL) {
+        PyErr_SetString(PyExc_ValueError, "NULL view in getbuffer");
+        return -1;
+    }
+
+    static Py_ssize_t itemsize[1] = {1};
+    view->obj = (PyObject *)self;
+    view->buf = self->start;
+    view->len = self->length;
+    view->readonly = 1;
+    view->itemsize = sizeof(char);
+    view->format = "c"; // https://docs.python.org/3/library/struct.html#format-characters
+    view->ndim = 1;
+    view->shape = &self->length; // 1-D array, so shape is just a pointer to the length
+    view->strides = itemsize;    // strides in a 1-D array is just the item size
+    view->suboffsets = NULL;
+    view->internal = NULL;
+
+    Py_INCREF(self);
+    return 0;
+}
+
+static void Str_releasebuffer(PyObject *_, Py_buffer *view) {
+    // This function MUST NOT decrement view->obj, since that is done automatically
+    // in PyBuffer_Release() (this scheme is useful for breaking reference cycles).
+    // https://docs.python.org/3/c-api/typeobj.html#c.PyBufferProcs.bf_releasebuffer
+}
+
 static int Str_contains(Str *self, PyObject *arg) {
 
     struct sz_needle_t needle_struct;
@@ -907,6 +936,11 @@ static PyMappingMethods Str_as_mapping = {
     .mp_subscript = Str_subscript, // Is used to implement slices in Python
 };
 
+static PyBufferProcs Str_as_buffer = {
+    .bf_getbuffer = Str_getbuffer,
+    .bf_releasebuffer = Str_releasebuffer,
+};
+
 #define sz_method_flags_m METH_VARARGS | METH_KEYWORDS
 
 static PyMethodDef Str_methods[] = { //
@@ -922,16 +956,16 @@ static PyTypeObject StrType = {
     .tp_doc = "Immutable string/slice class with SIMD and SWAR-accelerated operations",
     .tp_basicsize = sizeof(Str),
     .tp_flags = Py_TPFLAGS_DEFAULT,
-    .tp_methods = Str_methods,
     .tp_new = Str_new,
     .tp_init = Str_init,
     .tp_dealloc = Str_dealloc,
-    .tp_as_sequence = &Str_as_sequence,
-    .tp_as_mapping = &Str_as_mapping,
-    .tp_hash = Str_hash, // String hashing functions
+    .tp_hash = Str_hash,
     .tp_richcompare = Str_richcompare,
     .tp_str = Str_str,
-    // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer
+    .tp_methods = Str_methods,
+    .tp_as_sequence = &Str_as_sequence,
+    .tp_as_mapping = &Str_as_mapping,
+    .tp_as_buffer = &Str_as_buffer,
 };
 
 static PySequenceMethods Strs_as_sequence = {
diff --git a/scripts/test.py b/scripts/test.py
index 8163e0e5..b9083ea6 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -75,19 +75,29 @@ def test_rich_comparisons():
     assert Str("abb")[1:] == "bb"
 
 
-# def get_random_string(
-#     length: Optional[int] = None, variability: Optional[int] = None
-# ) -> str:
-#     if length is None:
-#         length = randint(3, 300)
-#     if variability is None:
-#         variability = len(ascii_lowercase)
-#     return "".join(choice(ascii_lowercase[:variability]) for _ in range(length))
+def test_buffer_protocol():
+    import numpy as np
 
+    my_str = Str("hello")
+    arr = np.array(my_str)
+    assert arr.dtype == np.dtype("c")
+    assert arr.shape == (len("hello"),)
+    assert "".join([c.decode("utf-8") for c in arr.tolist()]) == "hello"
 
-# def is_equal_strings(native_strings, big_strings):
-#     for native_slice, big_slice in zip(native_strings, big_strings):
-#         assert native_slice == big_slice
+
+def get_random_string(
+    length: Optional[int] = None, variability: Optional[int] = None
+) -> str:
+    if length is None:
+        length = randint(3, 300)
+    if variability is None:
+        variability = len(ascii_lowercase)
+    return "".join(choice(ascii_lowercase[:variability]) for _ in range(length))
+
+
+def is_equal_strings(native_strings, big_strings):
+    for native_slice, big_slice in zip(native_strings, big_strings):
+        assert native_slice == big_slice
 
 
 # def check_identical(
@@ -255,16 +265,16 @@ def test_rich_comparisons():
 #         )
 
 
-# def test_levenstein():
-#     # Create a new string by slicing and concatenating
-#     def insert_char_at(s, char_to_insert, index):
-#         return s[:index] + char_to_insert + s[index:]
+def test_levenstein():
+    # Create a new string by slicing and concatenating
+    def insert_char_at(s, char_to_insert, index):
+        return s[:index] + char_to_insert + s[index:]
 
-#     for _ in range(100):
-#         a = get_random_string(length=20)
-#         b = a
-#         for i in range(150):
-#             source_offset = randint(0, len(ascii_lowercase) - 1)
-#             target_offset = randint(0, len(b) - 1)
-#             b = insert_char_at(b, ascii_lowercase[source_offset], target_offset)
-#             assert levenstein(a, b, 200) == i + 1
+    for _ in range(100):
+        a = get_random_string(length=20)
+        b = a
+        for i in range(150):
+            source_offset = randint(0, len(ascii_lowercase) - 1)
+            target_offset = randint(0, len(b) - 1)
+            b = insert_char_at(b, ascii_lowercase[source_offset], target_offset)
+            assert sz.levenstein(a, b, 200) == i + 1

From 177005e40201893c4eb67540959fcfba44e19e7f Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 19 Sep 2023 18:26:58 +0400
Subject: [PATCH 21/72] Format: Compact code style

---
 stringzilla/stringzilla.h | 102 +++++++++++++-------------------------
 1 file changed, 35 insertions(+), 67 deletions(-)

diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index fea8ac47..8bd32fa1 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -41,8 +41,7 @@ inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { ret
  */
 inline static int sz_equal(char const *a, char const *b, sz_size_t length) {
     char const *const a_end = a + length;
-    while (a != a_end && *a == *b)
-        a++, b++;
+    while (a != a_end && *a == *b) a++, b++;
     return a_end == a;
 }
 
@@ -66,8 +65,7 @@ inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) {
     char const *text = h.start;
     char const *end = h.start + h.length;
 
-    for (; (uint64_t)text % 8 != 0 && text < end; ++text)
-        result += *text == n;
+    for (; (uint64_t)text % 8 != 0 && text < end; ++text) result += *text == n;
 
     // This code simulates hyper-scalar execution, comparing 8 characters at a time.
     uint64_t nnnnnnnn = n;
@@ -84,8 +82,7 @@ inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) {
         result += popcount64(match_indicators);
     }
 
-    for (; text < end; ++text)
-        result += *text == n;
+    for (; text < end; ++text) result += *text == n;
     return result;
 }
 
@@ -98,8 +95,7 @@ inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) {
     char const *end = h.start + h.length;
 
     for (; (uint64_t)text % 8 != 0 && text < end; ++text)
-        if (*text == n)
-            return text - h.start;
+        if (*text == n) return text - h.start;
 
     // This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
     uint64_t nnnnnnnn = n;
@@ -114,13 +110,11 @@ inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) {
         match_indicators &= match_indicators >> 4;
         match_indicators &= 0x0101010101010101;
 
-        if (match_indicators != 0)
-            return text - h.start + ctz64(match_indicators) / 8;
+        if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8;
     }
 
     for (; text < end; ++text)
-        if (*text == n)
-            return text - h.start;
+        if (*text == n) return text - h.start;
     return h.length;
 }
 
@@ -161,8 +155,7 @@ inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) {
     }
 
     for (; text + 2 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1])
-            return text - h.start;
+        if (text[0] == n[0] && text[1] == n[1]) return text - h.start;
     return h.length;
 }
 
@@ -211,13 +204,11 @@ inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) {
             (third_indicators >> 16) & (third_indicators >> 8) & (third_indicators >> 0) & 0x0000010000010000;
 
         uint64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16);
-        if (match_indicators != 0)
-            return text - h.start + ctz64(match_indicators) / 8;
+        if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8;
     }
 
     for (; text + 3 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2])
-            return text - h.start;
+        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start;
     return h.length;
 }
 
@@ -275,8 +266,7 @@ inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) {
     }
 
     for (; text + 4 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3])
-            return text - h.start;
+        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start;
     return h.length;
 }
 
@@ -287,8 +277,7 @@ inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) {
  */
 inline static sz_size_t sz_naive_find_substr(sz_haystack_t h, sz_needle_t n) {
 
-    if (h.length < n.length)
-        return h.length;
+    if (h.length < n.length) return h.length;
 
     char const *text = h.start;
     char const *const end = h.start + h.length;
@@ -365,18 +354,17 @@ sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) {
 
         if (matches0 | matches1 | matches2 | matches3) {
             for (sz_size_t i = 0; i < 32; i++) {
-                if (sz_equal(text + i, n.start, n.length))
-                    return i + (text - h.start);
+                if (sz_equal(text + i, n.start, n.length)) return i + (text - h.start);
             }
         }
     }
 
     // Don't forget the last (up to 35) characters.
     sz_haystack_t tail;
-    tail.ptr = text;
-    tail.len = end - text;
+    tail.start = text;
+    tail.length = end - text;
     size_t tail_match = sz_naive_find_substr(tail, n);
-    return text + tail_match - h.ptr;
+    return text + tail_match - h.start;
 }
 
 #endif // x86 AVX2
@@ -423,18 +411,17 @@ inline static sz_size_t sz_neon_find_substr(sz_haystack_t h, sz_needle_t n) {
 
         if (has_match) {
             for (sz_size_t i = 0; i < 16; i++) {
-                if (sz_equal(text + i, n.start, n.length))
-                    return i + (text - h.start);
+                if (sz_equal(text + i, n.start, n.length)) return i + (text - h.start);
             }
         }
     }
 
     // Don't forget the last (up to 16+3=19) characters.
     sz_haystack_t tail;
-    tail.ptr = text;
-    tail.len = end - text;
+    tail.start = text;
+    tail.length = end - text;
     size_t tail_match = sz_naive_find_substr(tail, n);
-    return text + tail_match - h.ptr;
+    return text + tail_match - h.start;
 }
 
 #endif // Arm Neon
@@ -472,8 +459,7 @@ typedef struct sz_sequence_t {
 inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) {
 
     sz_size_t matches = 0;
-    while (matches != sequence->count && predicate(sequence->handle, sequence->order[matches]))
-        ++matches;
+    while (matches != sequence->count && predicate(sequence->handle, sequence->order[matches])) ++matches;
 
     for (sz_size_t i = matches + 1; i < sequence->count; ++i)
         if (predicate(sequence->handle, sequence->order[i]))
@@ -491,16 +477,13 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq
     sz_size_t start_b = partition + 1;
 
     // If the direct merge is already sorted
-    if (!less(sequence->handle, sequence->order[start_b], sequence->order[partition]))
-        return;
+    if (!less(sequence->handle, sequence->order[start_b], sequence->order[partition])) return;
 
     sz_size_t start_a = 0;
     while (start_a <= partition && start_b <= sequence->count) {
 
         // If element 1 is in right place
-        if (!less(sequence->handle, sequence->order[start_b], sequence->order[start_a])) {
-            start_a++;
-        }
+        if (!less(sequence->handle, sequence->order[start_b], sequence->order[start_a])) { start_a++; }
         else {
             sz_size_t value = sequence->order[start_b];
             sz_size_t index = start_b;
@@ -527,19 +510,16 @@ inline static void _sz_sort_recursion( //
     sz_size_t bit_max,
     sz_qsort_comparison_func_t qsort_comparator) {
 
-    if (!sequence->count)
-        return;
+    if (!sequence->count) return;
 
     // Partition a range of integers according to a specific bit value
     sz_size_t split = 0;
     {
         sz_size_t mask = (1ul << 63) >> bit_idx;
-        while (split != sequence->count && !(sequence->order[split] & mask))
-            ++split;
+        while (split != sequence->count && !(sequence->order[split] & mask)) ++split;
 
         for (sz_size_t i = split + 1; i < sequence->count; ++i)
-            if (!(sequence->order[i] & mask))
-                sz_swap(sequence->order + i, sequence->order + split), ++split;
+            if (!(sequence->order[i] & mask)) sz_swap(sequence->order + i, sequence->order + split), ++split;
     }
 
     // Go down recursively
@@ -556,9 +536,7 @@ inline static void _sz_sort_recursion( //
     // Reached the end of recursion
     else {
         // Discard the prefixes
-        for (sz_size_t i = 0; i != sequence->count; ++i) {
-            memset((char *)(&sequence->order[i]) + 4, 0, 4ul);
-        }
+        for (sz_size_t i = 0; i != sequence->count; ++i) { memset((char *)(&sequence->order[i]) + 4, 0, 4ul); }
 
         // Perform sorts on smaller chunks instead of the whole handle
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
@@ -649,8 +627,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
         sz_size_t length = sequence->get_length(sequence->handle, sequence->order[i]);
         length = length > 4ul ? 4ul : length;
         char *prefix = (char *)&sequence->order[i];
-        for (sz_size_t j = 0; j != length; ++j)
-            prefix[7 - j] = begin[j];
+        for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j];
         if (case_insensitive) {
             prefix[0] = tolower(prefix[0]);
             prefix[1] = tolower(prefix[1]);
@@ -660,8 +637,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
     }
 
     sz_qsort_comparison_func_t comparator = _sz_sort_sequence_strncmp;
-    if (case_insensitive)
-        comparator = _sz_sort_sequence_strncasecmp;
+    if (case_insensitive) comparator = _sz_sort_sequence_strncasecmp;
 
     // Perform optionally-parallel radix sort on them
     _sz_sort_recursion(sequence, 0, 32, comparator);
@@ -699,26 +675,21 @@ inline static levenstein_distance_t sz_levenstein( //
     void *buffer) {
 
     // If one of the strings is empty - the edit distance is equal to the length of the other one
-    if (a_length == 0)
-        return b_length <= bound ? b_length : bound;
-    if (b_length == 0)
-        return a_length <= bound ? a_length : bound;
+    if (a_length == 0) return b_length <= bound ? b_length : bound;
+    if (b_length == 0) return a_length <= bound ? a_length : bound;
 
     // If the difference in length is beyond the `bound`, there is no need to check at all
     if (a_length > b_length) {
-        if (a_length - b_length > bound)
-            return bound + 1;
+        if (a_length - b_length > bound) return bound + 1;
     }
     else {
-        if (b_length - a_length > bound)
-            return bound + 1;
+        if (b_length - a_length > bound) return bound + 1;
     }
 
     levenstein_distance_t *previous_distances = (levenstein_distance_t *)buffer;
     levenstein_distance_t *current_distances = previous_distances + b_length + 1;
 
-    for (sz_size_t idx_b = 0; idx_b != (b_length + 1); ++idx_b)
-        previous_distances[idx_b] = idx_b;
+    for (sz_size_t idx_b = 0; idx_b != (b_length + 1); ++idx_b) previous_distances[idx_b] = idx_b;
 
     for (sz_size_t idx_a = 0; idx_a != a_length; ++idx_a) {
         current_distances[0] = idx_a + 1;
@@ -733,14 +704,11 @@ inline static levenstein_distance_t sz_levenstein( //
             current_distances[idx_b + 1] = _sz_levenstein_minimum(cost_deletion, cost_insertion, cost_substitution);
 
             // Keep track of the minimum distance seen so far in this row
-            if (current_distances[idx_b + 1] < min_distance) {
-                min_distance = current_distances[idx_b + 1];
-            }
+            if (current_distances[idx_b + 1] < min_distance) { min_distance = current_distances[idx_b + 1]; }
         }
 
         // If the minimum distance in this row exceeded the bound, return early
-        if (min_distance > bound)
-            return bound;
+        if (min_distance > bound) return bound;
 
         // Swap previous_distances and current_distances pointers
         levenstein_distance_t *temp = previous_distances;

From b771739f66e42d26ab5321025d3aa098dadcbf2a Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 19 Sep 2023 22:14:25 +0400
Subject: [PATCH 22/72] Add: `startswith` & `endswith`

---
 .vscode/settings.json |  2 ++
 python/lib.c          | 61 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 6fa841e1..97c0113c 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -127,6 +127,7 @@
     "bigram",
     "cibuildwheel",
     "endregion",
+    "endswith",
     "getitem",
     "getslice",
     "initproc",
@@ -154,6 +155,7 @@
     "richcompare",
     "SIMD",
     "splitlines",
+    "startswith",
     "stringzilla",
     "Strs",
     "strzl",
diff --git a/python/lib.c b/python/lib.c
index 8035e0ba..faf14a4d 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -210,7 +210,7 @@ int export_string_like(PyObject *object, char const **start, size_t *length) {
 #pragma region Global Functions
 
 static Py_ssize_t api_find_(PyObject *self, PyObject *args, PyObject *kwargs) {
-    int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType));
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
     Py_ssize_t nargs = PyTuple_Size(args);
     if (nargs < !is_member + 1 || nargs > !is_member + 3) {
         PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
@@ -294,7 +294,7 @@ static PyObject *api_contains(PyObject *self, PyObject *args, PyObject *kwargs)
 }
 
 static PyObject *api_count(PyObject *self, PyObject *args, PyObject *kwargs) {
-    int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType));
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
     Py_ssize_t nargs = PyTuple_Size(args);
     if (nargs < !is_member + 1 || nargs > !is_member + 4) {
         PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
@@ -361,7 +361,7 @@ static PyObject *api_count(PyObject *self, PyObject *args, PyObject *kwargs) {
 }
 
 static PyObject *api_levenstein(PyObject *self, PyObject *args, PyObject *kwargs) {
-    int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType));
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
     Py_ssize_t nargs = PyTuple_Size(args);
     if (nargs < !is_member + 1 || nargs > !is_member + 2) {
         PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
@@ -415,10 +415,54 @@ static PyObject *api_levenstein(PyObject *self, PyObject *args, PyObject *kwargs
     return PyLong_FromLong(distance);
 }
 
+static PyObject *api_startswith(PyObject *self, PyObject *args, PyObject *kwargs) {
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
+    if (PyTuple_Size(args) != !is_member + 1) {
+        PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
+        return NULL;
+    }
+
+    PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *prefix_obj = PyTuple_GET_ITEM(args, !is_member);
+
+    struct sz_haystack_t str, prefix;
+    if (!export_string_like(str_obj, &str.start, &str.length) ||
+        !export_string_like(prefix_obj, &prefix.start, &prefix.length)) {
+        PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
+        return NULL;
+    }
+
+    if (str.length < prefix.length) { Py_RETURN_FALSE; }
+    else if (strncmp(str.start, prefix.start, prefix.length) == 0) { Py_RETURN_TRUE; }
+    else { Py_RETURN_FALSE; }
+}
+
+static PyObject *api_endswith(PyObject *self, PyObject *args, PyObject *kwargs) {
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
+    if (PyTuple_Size(args) != !is_member + 1) {
+        PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
+        return NULL;
+    }
+
+    PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *suffix_obj = PyTuple_GET_ITEM(args, !is_member);
+
+    struct sz_haystack_t str, suffix;
+    if (!export_string_like(str_obj, &str.start, &str.length) ||
+        !export_string_like(suffix_obj, &suffix.start, &suffix.length)) {
+        PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
+        return NULL;
+    }
+
+    if (str.length < suffix.length) { Py_RETURN_FALSE; }
+    else if (strncmp(str.start + (str.length - suffix.length), suffix.start, suffix.length) == 0) { Py_RETURN_TRUE; }
+    else { Py_RETURN_FALSE; }
+}
+
 static PyObject *api_split(PyObject *self, PyObject *args, PyObject *kwargs) {
 
     // Check minimum arguments
-    int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType));
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
     Py_ssize_t nargs = PyTuple_Size(args);
     if (nargs < !is_member + 1 || nargs > !is_member + 3) {
         PyErr_SetString(PyExc_TypeError, "sz.split() requires at least 1 argument");
@@ -691,7 +735,7 @@ static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args)
         if (!PyArg_ParseTupleAndKeywords(positional_args, named_args, "|Onn", names, &parent, &from, &to)) return -1;
     }
     else if (!PyArg_ParseTuple(positional_args, "|Onn", &parent, &from, &to))
-        return -1;
+                    return -1;
 
     // Handle empty string
     if (parent == NULL) {
@@ -949,6 +993,8 @@ static PyMethodDef Str_methods[] = { //
     {"count", api_count, sz_method_flags_m, "Count the occurrences of a substring."},
     {"levenstein", api_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
     {"split", api_split, sz_method_flags_m, "Split a string by a separator."},
+    {"startswith", api_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."},
+    {"endswith", api_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."},
     {NULL, NULL, 0, NULL}};
 
 static PyTypeObject StrType = {
@@ -1004,8 +1050,9 @@ static PyMethodDef stringzilla_methods[] = {
     {"count", api_count, sz_method_flags_m, "Count the occurrences of a substring."},
     {"levenstein", api_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
     {"split", api_split, sz_method_flags_m, "Split a string by a separator."},
-    {NULL, NULL, 0, NULL} /* Sentinel */
-};
+    {"startswith", api_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."},
+    {"endswith", api_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."},
+    {NULL, NULL, 0, NULL}};
 
 static PyModuleDef stringzilla_module = {
     PyModuleDef_HEAD_INIT,

From 173197fd4f5098ae17c79a4d8b8d673d73ee0317 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 19 Sep 2023 22:14:35 +0400
Subject: [PATCH 23/72] Improve: Faster `Str` constructor

---
 python/lib.c | 68 +++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 59 insertions(+), 9 deletions(-)

diff --git a/python/lib.c b/python/lib.c
index faf14a4d..185b7c09 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -724,24 +724,74 @@ static PyTypeObject FileType = {
 
 #pragma region Str
 
-static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args) {
-    PyObject *parent = NULL;
-    Py_ssize_t from = 0;
-    Py_ssize_t to = PY_SSIZE_T_MAX;
+static int Str_init(Str *self, PyObject *args, PyObject *kwargs) {
 
-    // The `named_args` would be `NULL`
-    if (named_args) {
-        static char *names[] = {"parent", "from", "to", NULL};
-        if (!PyArg_ParseTupleAndKeywords(positional_args, named_args, "|Onn", names, &parent, &from, &to)) return -1;
+    // Parse all arguments into PyObjects first
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs > 3) {
+        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
+        return -1;
     }
-    else if (!PyArg_ParseTuple(positional_args, "|Onn", &parent, &from, &to))
+    PyObject *parent_obj = nargs >= 1 ? PyTuple_GET_ITEM(args, 0) : NULL;
+    PyObject *from_obj = nargs >= 2 ? PyTuple_GET_ITEM(args, 1) : NULL;
+    PyObject *to_obj = nargs >= 3 ? PyTuple_GET_ITEM(args, 2) : NULL;
+
+    // Parse keyword arguments, if provided, and ensure no duplicates
+    if (kwargs) {
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        while (PyDict_Next(kwargs, &pos, &key, &value)) {
+            if (PyUnicode_CompareWithASCIIString(key, "parent") == 0) {
+                if (parent_obj) {
+                    PyErr_SetString(PyExc_TypeError, "Received `parent` both as positional and keyword argument");
                     return -1;
+                }
+                parent_obj = value;
+            }
+            else if (PyUnicode_CompareWithASCIIString(key, "from") == 0) {
+                if (from_obj) {
+                    PyErr_SetString(PyExc_TypeError, "Received `from` both as positional and keyword argument");
+                    return -1;
+                }
+                from_obj = value;
+            }
+            else if (PyUnicode_CompareWithASCIIString(key, "to") == 0) {
+                if (to_obj) {
+                    PyErr_SetString(PyExc_TypeError, "Received `to` both as positional and keyword argument");
+                    return -1;
+                }
+                to_obj = value;
+            }
+            else {
+                PyErr_SetString(PyExc_TypeError, "Invalid keyword argument");
+                return -1;
+            }
+        }
+    }
 
     // Handle empty string
     if (parent == NULL) {
         self->start = NULL;
         self->length = 0;
     }
+
+    // Now, type-check and cast each argument
+    Py_ssize_t from = 0, to = PY_SSIZE_T_MAX;
+    if (from_obj) {
+        from = PyLong_AsSsize_t(from_obj);
+        if (from == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The `from` argument must be an integer");
+            return -1;
+        }
+    }
+    if (to_obj) {
+        to = PyLong_AsSsize_t(to_obj);
+        if (to == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The `to` argument must be an integer");
+            return -1;
+        }
+    }
+
     // Increment the reference count of the parent
     else if (export_string_like(parent, &self->start, &self->length)) {
         self->parent = parent;

From a480c0bcdc3e5d219a299675b7b990d693f111ad Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 19 Sep 2023 22:53:22 +0400
Subject: [PATCH 24/72] Add: `Str` concatenation

---
 python/lib.c | 385 ++++++++++++++++++++++++++++++++++++---------------
 setup.py     |   2 +
 2 files changed, 275 insertions(+), 112 deletions(-)

diff --git a/python/lib.c b/python/lib.c
index 185b7c09..7d4c59ef 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -209,7 +209,7 @@ int export_string_like(PyObject *object, char const **start, size_t *length) {
 
 #pragma region Global Functions
 
-static Py_ssize_t api_find_(PyObject *self, PyObject *args, PyObject *kwargs) {
+static Py_ssize_t Str_find_(PyObject *self, PyObject *args, PyObject *kwargs) {
     int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
     Py_ssize_t nargs = PyTuple_Size(args);
     if (nargs < !is_member + 1 || nargs > !is_member + 3) {
@@ -236,8 +236,8 @@ static Py_ssize_t api_find_(PyObject *self, PyObject *args, PyObject *kwargs) {
         }
     }
 
-    struct sz_haystack_t haystack;
-    struct sz_needle_t needle;
+    sz_haystack_t haystack;
+    sz_needle_t needle;
     Py_ssize_t start, end;
 
     // Validate and convert `haystack` and `needle`
@@ -280,20 +280,30 @@ static Py_ssize_t api_find_(PyObject *self, PyObject *args, PyObject *kwargs) {
     return (Py_ssize_t)offset;
 }
 
-static PyObject *api_find(PyObject *self, PyObject *args, PyObject *kwargs) {
-    Py_ssize_t signed_offset = api_find_(self, args, kwargs);
+static PyObject *Str_find(PyObject *self, PyObject *args, PyObject *kwargs) {
+    Py_ssize_t signed_offset = Str_find_(self, args, kwargs);
     if (PyErr_Occurred()) return NULL;
     return PyLong_FromSsize_t(signed_offset);
 }
 
-static PyObject *api_contains(PyObject *self, PyObject *args, PyObject *kwargs) {
-    Py_ssize_t signed_offset = api_find_(self, args, kwargs);
+static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) {
+    Py_ssize_t signed_offset = Str_find_(self, args, kwargs);
+    if (PyErr_Occurred()) return NULL;
+    if (signed_offset == -1) {
+        PyErr_SetString(PyExc_ValueError, "substring not found");
+        return NULL;
+    }
+    return PyLong_FromSsize_t(signed_offset);
+}
+
+static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs) {
+    Py_ssize_t signed_offset = Str_find_(self, args, kwargs);
     if (PyErr_Occurred()) return NULL;
     if (signed_offset == -1) { Py_RETURN_FALSE; }
     else { Py_RETURN_TRUE; }
 }
 
-static PyObject *api_count(PyObject *self, PyObject *args, PyObject *kwargs) {
+static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
     int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
     Py_ssize_t nargs = PyTuple_Size(args);
     if (nargs < !is_member + 1 || nargs > !is_member + 4) {
@@ -318,8 +328,8 @@ static PyObject *api_count(PyObject *self, PyObject *args, PyObject *kwargs) {
                 return NULL;
     }
 
-    struct sz_haystack_t haystack;
-    struct sz_needle_t needle;
+    sz_haystack_t haystack;
+    sz_needle_t needle;
     Py_ssize_t start = start_obj ? PyLong_AsSsize_t(start_obj) : 0;
     Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX;
     int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0;
@@ -360,7 +370,7 @@ static PyObject *api_count(PyObject *self, PyObject *args, PyObject *kwargs) {
     return PyLong_FromSize_t(count);
 }
 
-static PyObject *api_levenstein(PyObject *self, PyObject *args, PyObject *kwargs) {
+static PyObject *Str_levenstein(PyObject *self, PyObject *args, PyObject *kwargs) {
     int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
     Py_ssize_t nargs = PyTuple_Size(args);
     if (nargs < !is_member + 1 || nargs > !is_member + 2) {
@@ -391,7 +401,7 @@ static PyObject *api_levenstein(PyObject *self, PyObject *args, PyObject *kwargs
         return NULL;
     }
 
-    struct sz_haystack_t str1, str2;
+    sz_haystack_t str1, str2;
     if (!export_string_like(str1_obj, &str1.start, &str1.length) ||
         !export_string_like(str2_obj, &str2.start, &str2.length)) {
         PyErr_Format(PyExc_TypeError, "Both arguments must be string-like");
@@ -415,51 +425,169 @@ static PyObject *api_levenstein(PyObject *self, PyObject *args, PyObject *kwargs
     return PyLong_FromLong(distance);
 }
 
-static PyObject *api_startswith(PyObject *self, PyObject *args, PyObject *kwargs) {
+static PyObject *Str_startswith(PyObject *self, PyObject *args, PyObject *kwargs) {
     int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
-    if (PyTuple_Size(args) != !is_member + 1) {
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs < !is_member + 1 || nargs > !is_member + 3) {
         PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
         return NULL;
     }
 
     PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
     PyObject *prefix_obj = PyTuple_GET_ITEM(args, !is_member);
+    PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
+    PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
+
+    // Optional start and end arguments
+    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
+
+    if (start_obj && ((start = PyLong_AsSsize_t(start_obj)) == -1 && PyErr_Occurred())) {
+        PyErr_SetString(PyExc_TypeError, "start must be an integer");
+        return NULL;
+    }
+
+    if (end_obj && ((end = PyLong_AsSsize_t(end_obj)) == -1 && PyErr_Occurred())) {
+        PyErr_SetString(PyExc_TypeError, "end must be an integer");
+        return NULL;
+    }
 
-    struct sz_haystack_t str, prefix;
+    sz_haystack_t str, prefix;
     if (!export_string_like(str_obj, &str.start, &str.length) ||
         !export_string_like(prefix_obj, &prefix.start, &prefix.length)) {
         PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
         return NULL;
     }
 
+    // Apply start and end arguments
+    str.start += start;
+    str.length -= start;
+    if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; }
+
     if (str.length < prefix.length) { Py_RETURN_FALSE; }
     else if (strncmp(str.start, prefix.start, prefix.length) == 0) { Py_RETURN_TRUE; }
     else { Py_RETURN_FALSE; }
 }
 
-static PyObject *api_endswith(PyObject *self, PyObject *args, PyObject *kwargs) {
+static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs) {
     int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
-    if (PyTuple_Size(args) != !is_member + 1) {
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs < !is_member + 1 || nargs > !is_member + 3) {
         PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
         return NULL;
     }
 
     PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
     PyObject *suffix_obj = PyTuple_GET_ITEM(args, !is_member);
+    PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
+    PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
+
+    // Optional start and end arguments
+    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
+
+    if (start_obj && ((start = PyLong_AsSsize_t(start_obj)) == -1 && PyErr_Occurred())) {
+        PyErr_SetString(PyExc_TypeError, "start must be an integer");
+        return NULL;
+    }
 
-    struct sz_haystack_t str, suffix;
+    if (end_obj && ((end = PyLong_AsSsize_t(end_obj)) == -1 && PyErr_Occurred())) {
+        PyErr_SetString(PyExc_TypeError, "end must be an integer");
+        return NULL;
+    }
+
+    sz_haystack_t str, suffix;
     if (!export_string_like(str_obj, &str.start, &str.length) ||
         !export_string_like(suffix_obj, &suffix.start, &suffix.length)) {
         PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
         return NULL;
     }
 
+    // Apply start and end arguments
+    str.start += start;
+    str.length -= start;
+    if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; }
+
     if (str.length < suffix.length) { Py_RETURN_FALSE; }
     else if (strncmp(str.start + (str.length - suffix.length), suffix.start, suffix.length) == 0) { Py_RETURN_TRUE; }
     else { Py_RETURN_FALSE; }
 }
 
-static PyObject *api_split(PyObject *self, PyObject *args, PyObject *kwargs) {
+static Strs *Str_split_(
+    PyObject *parent, sz_haystack_t text, sz_needle_t separator, int keepseparator, Py_ssize_t maxsplit) {
+
+    // Create Strs object
+    Strs *result = (Strs *)PyObject_New(Strs, &StrsType);
+    if (!result) return NULL;
+
+    // Initialize Strs object based on the splitting logic
+    void *offsets_endings = NULL;
+    size_t offsets_capacity = 0;
+    size_t offsets_count = 0;
+    size_t bytes_per_offset;
+    if (text.length >= UINT32_MAX) {
+        bytes_per_offset = 8;
+        result->type = STRS_CONSECUTIVE_64;
+        result->data.consecutive_64bit.start = text.start;
+        result->data.consecutive_64bit.parent = parent;
+        result->data.consecutive_64bit.separator_length = !keepseparator * separator.length;
+    }
+    else {
+        bytes_per_offset = 4;
+        result->type = STRS_CONSECUTIVE_32;
+        result->data.consecutive_32bit.start = text.start;
+        result->data.consecutive_32bit.parent = parent;
+        result->data.consecutive_32bit.separator_length = !keepseparator * separator.length;
+    }
+
+    // Iterate through string, keeping track of the
+    sz_size_t last_start = 0;
+    while (last_start < text.length && offsets_count < maxsplit) {
+        sz_haystack_t text_remaining;
+        text_remaining.start = text.start + last_start;
+        text_remaining.length = text.length - last_start;
+        sz_size_t offset_in_remaining = sz_neon_find_substr(text_remaining, separator);
+
+        // Reallocate offsets array if needed
+        if (offsets_count >= offsets_capacity) {
+            offsets_capacity = (offsets_capacity + 1) * 2;
+            void *new_offsets = realloc(offsets_endings, offsets_capacity * bytes_per_offset);
+            if (!new_offsets) {
+                if (offsets_endings) free(offsets_endings);
+            }
+            offsets_endings = new_offsets;
+        }
+
+        // If the memory allocation has failed - discard the response
+        if (!offsets_endings) {
+            Py_XDECREF(result);
+            PyErr_NoMemory();
+            return NULL;
+        }
+
+        // Export the offset
+        size_t will_continue = offset_in_remaining != text_remaining.length;
+        size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue;
+        if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; }
+        else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; }
+
+        // Next time we want to start
+        last_start = last_start + offset_in_remaining + separator.length;
+    }
+
+    // Populate the Strs object with the offsets
+    if (text.length >= UINT32_MAX) {
+        result->data.consecutive_64bit.end_offsets = offsets_endings;
+        result->data.consecutive_64bit.count = offsets_count;
+    }
+    else {
+        result->data.consecutive_32bit.end_offsets = offsets_endings;
+        result->data.consecutive_32bit.count = offsets_count;
+    }
+
+    Py_INCREF(parent);
+    return result;
+}
+
+static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) {
 
     // Check minimum arguments
     int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
@@ -486,8 +614,8 @@ static PyObject *api_split(PyObject *self, PyObject *args, PyObject *kwargs) {
         }
     }
 
-    struct sz_haystack_t text;
-    struct sz_needle_t separator;
+    sz_haystack_t text;
+    sz_needle_t separator;
     int keepseparator;
     Py_ssize_t maxsplit;
     separator.anomaly_offset = 0;
@@ -532,77 +660,107 @@ static PyObject *api_split(PyObject *self, PyObject *args, PyObject *kwargs) {
     }
     else { maxsplit = PY_SSIZE_T_MAX; }
 
-    // Create Strs object
-    Strs *result = (Strs *)PyObject_New(Strs, &StrsType);
-    if (!result) return NULL;
+    return Str_split_(text_obj, text, separator, keepseparator, maxsplit);
+}
 
-    // Initialize Strs object based on the splitting logic
-    void *offsets_endings = NULL;
-    size_t offsets_capacity = 0;
-    size_t offsets_count = 0;
-    size_t bytes_per_offset;
-    if (text.length >= UINT32_MAX) {
-        bytes_per_offset = 8;
-        result->type = STRS_CONSECUTIVE_64;
-        result->data.consecutive_64bit.start = text.start;
-        result->data.consecutive_64bit.parent = text_obj;
-        result->data.consecutive_64bit.separator_length = !keepseparator * separator.length;
+static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs) {
+
+    // Check minimum arguments
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs < !is_member || nargs > !is_member + 2) {
+        PyErr_SetString(PyExc_TypeError, "splitlines() requires at least 1 argument");
+        return NULL;
     }
-    else {
-        bytes_per_offset = 4;
-        result->type = STRS_CONSECUTIVE_32;
-        result->data.consecutive_32bit.start = text.start;
-        result->data.consecutive_32bit.parent = text_obj;
-        result->data.consecutive_32bit.separator_length = !keepseparator * separator.length;
+
+    PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *keeplinebreaks_obj = nargs > !is_member ? PyTuple_GET_ITEM(args, !is_member) : NULL;
+    PyObject *maxsplit_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
+
+    if (kwargs) {
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        while (PyDict_Next(kwargs, &pos, &key, &value)) {
+            if (PyUnicode_CompareWithASCIIString(key, "keeplinebreaks") == 0) { keeplinebreaks_obj = value; }
+            else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) { maxsplit_obj = value; }
+            else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) { return NULL; }
+        }
     }
 
-    // Iterate through string, keeping track of the
-    sz_size_t last_start = 0;
-    while (last_start < text.length && offsets_count < maxsplit) {
-        sz_haystack_t text_remaining;
-        text_remaining.start = text.start + last_start;
-        text_remaining.length = text.length - last_start;
-        sz_size_t offset_in_remaining = sz_neon_find_substr(text_remaining, separator);
+    sz_haystack_t text;
+    int keeplinebreaks;
+    Py_ssize_t maxsplit = PY_SSIZE_T_MAX; // Default value for maxsplit
 
-        // Reallocate offsets array if needed
-        if (offsets_count >= offsets_capacity) {
-            offsets_capacity = (offsets_capacity + 1) * 2;
-            void *new_offsets = realloc(offsets_endings, offsets_capacity * bytes_per_offset);
-            if (!new_offsets) {
-                if (offsets_endings) free(offsets_endings);
-            }
-            offsets_endings = new_offsets;
+    // Validate and convert `text`
+    if (!export_string_like(text_obj, &text.start, &text.length)) {
+        PyErr_SetString(PyExc_TypeError, "The text argument must be string-like");
+        return NULL;
+    }
+
+    // Validate and convert `keeplinebreaks`
+    if (keeplinebreaks_obj) {
+        keeplinebreaks = PyObject_IsTrue(keeplinebreaks_obj);
+        if (keeplinebreaks == -1) {
+            PyErr_SetString(PyExc_TypeError, "The keeplinebreaks argument must be a boolean");
+            return NULL;
         }
+    }
+    else { keeplinebreaks = 0; }
 
-        // If the memory allocation has failed - discard the response
-        if (!offsets_endings) {
-            Py_XDECREF(result);
-            PyErr_NoMemory();
+    // Validate and convert `maxsplit`
+    if (maxsplit_obj) {
+        maxsplit = PyLong_AsSsize_t(maxsplit_obj);
+        if (maxsplit == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The maxsplit argument must be an integer");
             return NULL;
         }
+    }
 
-        // Export the offset
-        size_t will_continue = offset_in_remaining != text_remaining.length;
-        size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue;
-        if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; }
-        else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; }
+    // TODO: Support arbitrary newline characters:
+    // https://docs.python.org/3/library/stdtypes.html#str.splitlines
+    // \n, \r, \r\n, \v or \x0b, \f or \x0c, \x1c, \x1d, \x1e, \x85, \u2028, \u2029
+    // https://github.com/ashvardanian/StringZilla/issues/29
+    sz_needle_t separator;
+    separator.start = "\n";
+    separator.length = 1;
+    return Str_split_(text_obj, text, separator, keeplinebreaks, maxsplit);
+}
 
-        // Next time we want to start
-        last_start = last_start + offset_in_remaining + separator.length;
+static PyObject *Str_concat(PyObject *self, PyObject *other) {
+    struct sz_haystack_t self_str, other_str;
+
+    // Validate and convert `self`
+    if (!export_string_like(self, &self_str.start, &self_str.length)) {
+        PyErr_SetString(PyExc_TypeError, "The self object must be string-like");
+        return NULL;
     }
 
-    // Populate the Strs object with the offsets
-    if (text.length >= UINT32_MAX) {
-        result->data.consecutive_64bit.end_offsets = offsets_endings;
-        result->data.consecutive_64bit.count = offsets_count;
+    // Validate and convert `other`
+    if (!export_string_like(other, &other_str.start, &other_str.length)) {
+        PyErr_SetString(PyExc_TypeError, "The other object must be string-like");
+        return NULL;
     }
-    else {
-        result->data.consecutive_32bit.end_offsets = offsets_endings;
-        result->data.consecutive_32bit.count = offsets_count;
+
+    // Allocate a new Str instance
+    Str *result_str = PyObject_New(Str, &StrType);
+    if (result_str == NULL) { return NULL; }
+
+    // Calculate the total length of the new string
+    result_str->parent = NULL;
+    result_str->length = self_str.length + other_str.length;
+
+    // Allocate memory for the new string
+    result_str->start = malloc(result_str->length);
+    if (result_str->start == NULL) {
+        PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for string concatenation");
+        return NULL;
     }
 
-    Py_INCREF(text_obj);
-    return (PyObject *)result;
+    // Perform the string concatenation
+    memcpy(result_str->start, self_str.start, self_str.length);
+    memcpy(result_str->start + self_str.length, other_str.start, other_str.length);
+
+    return (PyObject *)result_str;
 }
 
 #pragma endregion
@@ -769,12 +927,6 @@ static int Str_init(Str *self, PyObject *args, PyObject *kwargs) {
         }
     }
 
-    // Handle empty string
-    if (parent == NULL) {
-        self->start = NULL;
-        self->length = 0;
-    }
-
     // Now, type-check and cast each argument
     Py_ssize_t from = 0, to = PY_SSIZE_T_MAX;
     if (from_obj) {
@@ -792,10 +944,15 @@ static int Str_init(Str *self, PyObject *args, PyObject *kwargs) {
         }
     }
 
+    // Handle empty string
+    if (parent_obj == NULL) {
+        self->start = NULL;
+        self->length = 0;
+    }
     // Increment the reference count of the parent
-    else if (export_string_like(parent, &self->start, &self->length)) {
-        self->parent = parent;
-        Py_INCREF(parent);
+    else if (export_string_like(parent_obj, &self->start, &self->length)) {
+        self->parent = parent_obj;
+        Py_INCREF(parent_obj);
     }
     else {
         PyErr_SetString(PyExc_TypeError, "Unsupported parent type");
@@ -822,7 +979,8 @@ static PyObject *Str_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
 }
 
 static void Str_dealloc(Str *self) {
-    if (self->parent) Py_XDECREF(self->parent);
+    if (self->parent) { Py_XDECREF(self->parent); }
+    else if (self->start) { free(self->start); }
     self->parent = NULL;
     Py_TYPE(self)->tp_free((PyObject *)self);
 }
@@ -907,16 +1065,16 @@ static void Str_releasebuffer(PyObject *_, Py_buffer *view) {
     // https://docs.python.org/3/c-api/typeobj.html#c.PyBufferProcs.bf_releasebuffer
 }
 
-static int Str_contains(Str *self, PyObject *arg) {
+static int Str_in(Str *self, PyObject *arg) {
 
-    struct sz_needle_t needle_struct;
+    sz_needle_t needle_struct;
     needle_struct.anomaly_offset = 0;
     if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) {
         PyErr_SetString(PyExc_TypeError, "Unsupported argument type");
         return -1;
     }
 
-    struct sz_haystack_t haystack;
+    sz_haystack_t haystack;
     haystack.start = self->start;
     haystack.length = self->length;
     size_t position = sz_neon_find_substr(haystack, needle_struct);
@@ -1020,9 +1178,9 @@ static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) {
 }
 
 static PySequenceMethods Str_as_sequence = {
-    .sq_length = Str_len,        //
-    .sq_item = Str_getitem,      //
-    .sq_contains = Str_contains, //
+    .sq_length = Str_len,   //
+    .sq_item = Str_getitem, //
+    .sq_contains = Str_in,  //
 };
 
 static PyMappingMethods Str_as_mapping = {
@@ -1035,16 +1193,22 @@ static PyBufferProcs Str_as_buffer = {
     .bf_releasebuffer = Str_releasebuffer,
 };
 
+static PyNumberMethods Str_as_number = {
+    .nb_add = Str_concat,
+};
+
 #define sz_method_flags_m METH_VARARGS | METH_KEYWORDS
 
 static PyMethodDef Str_methods[] = { //
-    {"find", api_find, sz_method_flags_m, "Find the first occurrence of a substring."},
-    {"contains", api_contains, sz_method_flags_m, "Check if a string contains a substring."},
-    {"count", api_count, sz_method_flags_m, "Count the occurrences of a substring."},
-    {"levenstein", api_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
-    {"split", api_split, sz_method_flags_m, "Split a string by a separator."},
-    {"startswith", api_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."},
-    {"endswith", api_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."},
+    {"find", Str_find, sz_method_flags_m, "Find the first occurrence of a substring."},
+    {"index", Str_index, sz_method_flags_m, "Find the first occurrence of a substring or raise error if missing."},
+    {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."},
+    {"count", Str_count, sz_method_flags_m, "Count the occurrences of a substring."},
+    {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
+    {"split", Str_split, sz_method_flags_m, "Split a string by a separator."},
+    {"splitlines", Str_splitlines, sz_method_flags_m, "Split a string by line breaks."},
+    {"startswith", Str_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."},
+    {"endswith", Str_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."},
     {NULL, NULL, 0, NULL}};
 
 static PyTypeObject StrType = {
@@ -1062,6 +1226,7 @@ static PyTypeObject StrType = {
     .tp_as_sequence = &Str_as_sequence,
     .tp_as_mapping = &Str_as_mapping,
     .tp_as_buffer = &Str_as_buffer,
+    .tp_as_number = &Str_as_number,
 };
 
 static PySequenceMethods Strs_as_sequence = {
@@ -1095,13 +1260,15 @@ static void stringzilla_cleanup(PyObject *m) {
 }
 
 static PyMethodDef stringzilla_methods[] = {
-    {"find", api_find, sz_method_flags_m, "Find the first occurrence of a substring."},
-    {"contains", api_contains, sz_method_flags_m, "Check if a string contains a substring."},
-    {"count", api_count, sz_method_flags_m, "Count the occurrences of a substring."},
-    {"levenstein", api_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
-    {"split", api_split, sz_method_flags_m, "Split a string by a separator."},
-    {"startswith", api_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."},
-    {"endswith", api_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."},
+    {"find", Str_find, sz_method_flags_m, "Find the first occurrence of a substring."},
+    {"index", Str_index, sz_method_flags_m, "Find the first occurrence of a substring or raise error if missing."},
+    {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."},
+    {"count", Str_count, sz_method_flags_m, "Count the occurrences of a substring."},
+    {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
+    {"split", Str_split, sz_method_flags_m, "Split a string by a separator."},
+    {"splitlines", Str_splitlines, sz_method_flags_m, "Split a string by line breaks."},
+    {"startswith", Str_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."},
+    {"endswith", Str_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."},
     {NULL, NULL, 0, NULL}};
 
 static PyModuleDef stringzilla_module = {
@@ -1154,10 +1321,4 @@ PyMODINIT_FUNC PyInit_stringzilla(void) {
     temporary_memory.start = malloc(4096);
     temporary_memory.length = 4096 * (temporary_memory.start != NULL);
     return m;
-
-cleanup:
-    Py_XDECREF(&FileType);
-    Py_XDECREF(&StrType);
-    Py_XDECREF(m);
-    return NULL;
 }
diff --git a/setup.py b/setup.py
index 419d9915..cb136b1d 100644
--- a/setup.py
+++ b/setup.py
@@ -38,6 +38,8 @@
     compile_args.append("-O3")
     compile_args.append("-pedantic")
     compile_args.append("-Wno-unknown-pragmas")
+    compile_args.append("-Wno-incompatible-function-pointer-types")
+    compile_args.append("-Wno-incompatible-pointer-types")
     compile_args.append("-Xpreprocessor -fopenmp")
     link_args.append("-Xpreprocessor -lomp")
 

From 4f69e6623b3a0ad4eae49afa943dc1587a1f10c2 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 19 Sep 2023 23:50:28 +0400
Subject: [PATCH 25/72] Add: `partition` method in Python

---
 python/lib.c | 1683 +++++++++++++++++++++++++++-----------------------
 1 file changed, 903 insertions(+), 780 deletions(-)

diff --git a/python/lib.c b/python/lib.c
index 7d4c59ef..8459cc30 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -205,976 +205,1097 @@ int export_string_like(PyObject *object, char const **start, size_t *length) {
     return 0;
 }
 
-#pragma endregion
-
-#pragma region Global Functions
-
-static Py_ssize_t Str_find_(PyObject *self, PyObject *args, PyObject *kwargs) {
-    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
-    Py_ssize_t nargs = PyTuple_Size(args);
-    if (nargs < !is_member + 1 || nargs > !is_member + 3) {
-        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
-        return 0;
+int get_string_at_offset(
+    Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) {
+    switch (strs->type) {
+    case STRS_CONSECUTIVE_32: {
+        uint32_t start_offset = (i == 0) ? 0 : strs->data.consecutive_32bit.end_offsets[i - 1];
+        uint32_t end_offset = strs->data.consecutive_32bit.end_offsets[i];
+        *start = strs->data.consecutive_32bit.start + start_offset;
+        *length = end_offset - start_offset - strs->data.consecutive_32bit.separator_length * (i + 1 != count);
+        *parent = strs->data.consecutive_32bit.parent;
+        return 1;
     }
-
-    PyObject *haystack_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
-    PyObject *needle_obj = PyTuple_GET_ITEM(args, !is_member + 0);
-    PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
-    PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
-
-    // Parse keyword arguments
-    if (kwargs) {
-        Py_ssize_t pos = 0;
-        PyObject *key, *value;
-        while (PyDict_Next(kwargs, &pos, &key, &value)) {
-            if (PyUnicode_CompareWithASCIIString(key, "start") == 0) { start_obj = value; }
-            else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) { end_obj = value; }
-            else {
-                PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key);
-                return 0;
-            }
-        }
+    case STRS_CONSECUTIVE_64: {
+        uint64_t start_offset = (i == 0) ? 0 : strs->data.consecutive_64bit.end_offsets[i - 1];
+        uint64_t end_offset = strs->data.consecutive_64bit.end_offsets[i];
+        *start = strs->data.consecutive_64bit.start + start_offset;
+        *length = end_offset - start_offset - strs->data.consecutive_64bit.separator_length * (i + 1 != count);
+        *parent = strs->data.consecutive_64bit.parent;
+        return 1;
     }
-
-    sz_haystack_t haystack;
-    sz_needle_t needle;
-    Py_ssize_t start, end;
-
-    // Validate and convert `haystack` and `needle`
-    needle.anomaly_offset = 0;
-    if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
-        !export_string_like(needle_obj, &needle.start, &needle.length)) {
-        PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
-        return 0;
+    case STRS_REORDERED: {
+        //
+        return 1;
     }
-
-    // Validate and convert `start`
-    if (start_obj) {
-        start = PyLong_AsSsize_t(start_obj);
-        if (start == -1 && PyErr_Occurred()) {
-            PyErr_SetString(PyExc_TypeError, "The start argument must be an integer");
-            return 0;
-        }
+    case STRS_MULTI_SOURCE: {
+        //
+        return 1;
     }
-    else { start = 0; }
-
-    // Validate and convert `end`
-    if (end_obj) {
-        end = PyLong_AsSsize_t(end_obj);
-        if (end == -1 && PyErr_Occurred()) {
-            PyErr_SetString(PyExc_TypeError, "The end argument must be an integer");
-            return 0;
-        }
+    default:
+        // Unsupported type
+        PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion");
+        return -1;
     }
-    else { end = PY_SSIZE_T_MAX; }
-
-    // Limit the `haystack` range
-    size_t normalized_offset, normalized_length;
-    slice(haystack.length, start, end, &normalized_offset, &normalized_length);
-    haystack.start += normalized_offset;
-    haystack.length = normalized_length;
-
-    // Perform contains operation
-    size_t offset = sz_neon_find_substr(haystack, needle);
-    if (offset == haystack.length) return -1;
-    return (Py_ssize_t)offset;
-}
-
-static PyObject *Str_find(PyObject *self, PyObject *args, PyObject *kwargs) {
-    Py_ssize_t signed_offset = Str_find_(self, args, kwargs);
-    if (PyErr_Occurred()) return NULL;
-    return PyLong_FromSsize_t(signed_offset);
 }
 
-static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) {
-    Py_ssize_t signed_offset = Str_find_(self, args, kwargs);
-    if (PyErr_Occurred()) return NULL;
-    if (signed_offset == -1) {
-        PyErr_SetString(PyExc_ValueError, "substring not found");
-        return NULL;
+int prepare_strings_for_reordering(Strs *strs) {
+    // Already in reordered form
+    if (strs->type == STRS_REORDERED) { return 1; }
+
+    // Allocate memory for reordered slices
+    size_t count = 0;
+    switch (strs->type) {
+    case STRS_CONSECUTIVE_32: count = strs->data.consecutive_32bit.count; break;
+    case STRS_CONSECUTIVE_64: count = strs->data.consecutive_64bit.count; break;
+    case STRS_REORDERED: return 1;
+    case STRS_MULTI_SOURCE: return 1;
+    default:
+        // Unsupported type
+        PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion");
+        return -1;
     }
-    return PyLong_FromSsize_t(signed_offset);
-}
-
-static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs) {
-    Py_ssize_t signed_offset = Str_find_(self, args, kwargs);
-    if (PyErr_Occurred()) return NULL;
-    if (signed_offset == -1) { Py_RETURN_FALSE; }
-    else { Py_RETURN_TRUE; }
-}
 
-static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
-    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
-    Py_ssize_t nargs = PyTuple_Size(args);
-    if (nargs < !is_member + 1 || nargs > !is_member + 4) {
-        PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
-        return NULL;
+    sz_haystack_t *new_parts = (sz_haystack_t *)malloc(count * sizeof(sz_haystack_t));
+    if (new_parts == NULL) {
+        PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for reordered slices");
+        return -1;
     }
 
-    PyObject *haystack_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
-    PyObject *needle_obj = PyTuple_GET_ITEM(args, !is_member + 0);
-    PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
-    PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
-    PyObject *allowoverlap_obj = nargs > !is_member + 3 ? PyTuple_GET_ITEM(args, !is_member + 3) : NULL;
+    // Populate the new reordered array using get_string_at_offset
+    for (Py_ssize_t i = 0; i < count; ++i) {
+        PyObject *parent;
+        char const *start;
+        size_t length;
+        if (!get_string_at_offset(strs, i, count, &parent, &start, &length)) {
+            // Handle error
+            PyErr_SetString(PyExc_RuntimeError, "Failed to get string at offset");
+            free(new_parts);
+            return -1;
+        }
 
-    if (kwargs) {
-        Py_ssize_t pos = 0;
-        PyObject *key, *value;
-        while (PyDict_Next(kwargs, &pos, &key, &value))
-            if (PyUnicode_CompareWithASCIIString(key, "start") == 0) { start_obj = value; }
-            else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) { end_obj = value; }
-            else if (PyUnicode_CompareWithASCIIString(key, "allowoverlap") == 0) { allowoverlap_obj = value; }
-            else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key))
-                return NULL;
+        new_parts[i].start = start;
+        new_parts[i].length = length;
     }
 
-    sz_haystack_t haystack;
-    sz_needle_t needle;
-    Py_ssize_t start = start_obj ? PyLong_AsSsize_t(start_obj) : 0;
-    Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX;
-    int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0;
-
-    needle.anomaly_offset = 0;
-    if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
-        !export_string_like(needle_obj, &needle.start, &needle.length))
-        return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL;
+    // Release previous used memory.
 
-    if ((start == -1 || end == -1 || allowoverlap == -1) && PyErr_Occurred()) return NULL;
-
-    size_t normalized_offset, normalized_length;
-    slice(haystack.length, start, end, &normalized_offset, &normalized_length);
-    haystack.start += normalized_offset;
-    haystack.length = normalized_length;
+    // Update the Strs object
+    strs->type = STRS_REORDERED;
+    strs->data.reordered.count = count;
+    strs->data.reordered.parts = new_parts;
+    strs->data.reordered.parent = NULL; // Assuming the parent is no longer needed
 
-    size_t count = needle.length == 1 ? sz_naive_count_char(haystack, *needle.start) : 0;
-    if (needle.length != 1) {
-        if (allowoverlap) {
-            while (haystack.length) {
-                size_t offset = sz_neon_find_substr(haystack, needle);
-                int found = offset != haystack.length;
-                count += found;
-                haystack.start += offset + found;
-                haystack.length -= offset + found;
-            }
-        }
-        else {
-            while (haystack.length) {
-                size_t offset = sz_neon_find_substr(haystack, needle);
-                int found = offset != haystack.length;
-                count += found;
-                haystack.start += offset + needle.length;
-                haystack.length -= offset + needle.length * found;
-            }
-        }
-    }
-    return PyLong_FromSize_t(count);
+    return 0;
 }
 
-static PyObject *Str_levenstein(PyObject *self, PyObject *args, PyObject *kwargs) {
-    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
-    Py_ssize_t nargs = PyTuple_Size(args);
-    if (nargs < !is_member + 1 || nargs > !is_member + 2) {
-        PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
-        return NULL;
-    }
+#pragma endregion
 
-    PyObject *str1_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
-    PyObject *str2_obj = PyTuple_GET_ITEM(args, !is_member + 0);
-    PyObject *bound_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
+#pragma region MemoryMappingFile
 
-    if (kwargs) {
-        PyObject *key, *value;
-        Py_ssize_t pos = 0;
-        while (PyDict_Next(kwargs, &pos, &key, &value))
-            if (PyUnicode_CompareWithASCIIString(key, "bound") == 0) {
-                if (bound_obj) {
-                    PyErr_Format(PyExc_TypeError, "Received bound both as positional and keyword argument");
-                    return NULL;
-                }
-                bound_obj = value;
-            }
+static void File_dealloc(File *self) {
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
+    if (self->start) {
+        UnmapViewOfFile(self->start);
+        self->start = NULL;
     }
-
-    int bound = 255; // Default value for bound
-    if (bound_obj && ((bound = PyLong_AsLong(bound_obj)) > 255 || bound < 0)) {
-        PyErr_Format(PyExc_ValueError, "Bound must be an integer between 0 and 255");
-        return NULL;
+    if (self->mapping_handle) {
+        CloseHandle(self->mapping_handle);
+        self->mapping_handle = NULL;
     }
-
-    sz_haystack_t str1, str2;
-    if (!export_string_like(str1_obj, &str1.start, &str1.length) ||
-        !export_string_like(str2_obj, &str2.start, &str2.length)) {
-        PyErr_Format(PyExc_TypeError, "Both arguments must be string-like");
-        return NULL;
+    if (self->file_handle) {
+        CloseHandle(self->file_handle);
+        self->file_handle = NULL;
     }
-
-    size_t memory_needed = sz_levenstein_memory_needed(str1.length, str2.length);
-    if (temporary_memory.length < memory_needed) {
-        temporary_memory.start = realloc(temporary_memory.start, memory_needed);
-        temporary_memory.length = memory_needed;
+#else
+    if (self->start) {
+        munmap(self->start, self->length);
+        self->start = NULL;
+        self->length = 0;
     }
-    if (!temporary_memory.start) {
-        PyErr_Format(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix");
-        return NULL;
+    if (self->file_descriptor != 0) {
+        close(self->file_descriptor);
+        self->file_descriptor = 0;
     }
+#endif
+    Py_TYPE(self)->tp_free((PyObject *)self);
+}
 
-    levenstein_distance_t small_bound = (levenstein_distance_t)bound;
-    levenstein_distance_t distance =
-        sz_levenstein(str1.start, str1.length, str2.start, str2.length, small_bound, temporary_memory.start);
+static PyObject *File_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) {
+    File *self;
+    self = (File *)type->tp_alloc(type, 0);
+    if (self == NULL) return NULL;
 
-    return PyLong_FromLong(distance);
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
+    self->file_handle = NULL;
+    self->mapping_handle = NULL;
+#else
+    self->file_descriptor = 0;
+#endif
+    self->start = NULL;
+    self->length = 0;
 }
 
-static PyObject *Str_startswith(PyObject *self, PyObject *args, PyObject *kwargs) {
-    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
-    Py_ssize_t nargs = PyTuple_Size(args);
-    if (nargs < !is_member + 1 || nargs > !is_member + 3) {
-        PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
-        return NULL;
-    }
-
-    PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
-    PyObject *prefix_obj = PyTuple_GET_ITEM(args, !is_member);
-    PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
-    PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
+static int File_init(File *self, PyObject *positional_args, PyObject *named_args) {
+    const char *path;
+    if (!PyArg_ParseTuple(positional_args, "s", &path)) return -1;
 
-    // Optional start and end arguments
-    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
+    self->file_handle = CreateFile(path, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
+    if (self->file_handle == INVALID_HANDLE_VALUE) {
+        PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
+        return -1;
+    }
 
-    if (start_obj && ((start = PyLong_AsSsize_t(start_obj)) == -1 && PyErr_Occurred())) {
-        PyErr_SetString(PyExc_TypeError, "start must be an integer");
-        return NULL;
+    self->mapping_handle = CreateFileMapping(self->file_handle, 0, PAGE_READONLY, 0, 0, 0);
+    if (self->mapping_handle == 0) {
+        CloseHandle(self->file_handle);
+        self->file_handle = NULL;
+        PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
+        return -1;
     }
 
-    if (end_obj && ((end = PyLong_AsSsize_t(end_obj)) == -1 && PyErr_Occurred())) {
-        PyErr_SetString(PyExc_TypeError, "end must be an integer");
-        return NULL;
+    char *file = (char *)MapViewOfFile(self->mapping_handle, FILE_MAP_READ, 0, 0, 0);
+    if (file == 0) {
+        CloseHandle(self->mapping_handle);
+        self->mapping_handle = NULL;
+        CloseHandle(self->file_handle);
+        self->file_handle = NULL;
+        PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
+        return -1;
     }
+    self->start = file;
+    self->length = GetFileSize(self->file_handle, 0);
+#else
+    struct stat sb;
+    self->file_descriptor = open(path, O_RDONLY);
+    if (fstat(self->file_descriptor, &sb) != 0) {
+        close(self->file_descriptor);
+        self->file_descriptor = 0;
+        PyErr_SetString(PyExc_RuntimeError, "Can't retrieve file size!");
+        return -1;
+    }
+    size_t file_size = sb.st_size;
+    void *map = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, self->file_descriptor, 0);
+    if (map == MAP_FAILED) {
+        close(self->file_descriptor);
+        self->file_descriptor = 0;
+        PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
+        return -1;
+    }
+    self->start = map;
+    self->length = file_size;
+#endif
 
-    sz_haystack_t str, prefix;
-    if (!export_string_like(str_obj, &str.start, &str.length) ||
-        !export_string_like(prefix_obj, &prefix.start, &prefix.length)) {
-        PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
-        return NULL;
+    return 0;
+}
+
+static PyMethodDef File_methods[] = { //
+    {NULL, NULL, 0, NULL}};
+
+static PyTypeObject FileType = {
+    PyObject_HEAD_INIT(NULL).tp_name = "stringzilla.File",
+    .tp_doc = "Memory mapped file class, that exposes the memory range for low-level access",
+    .tp_basicsize = sizeof(File),
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_methods = File_methods,
+    .tp_new = (newfunc)File_new,
+    .tp_init = (initproc)File_init,
+    .tp_dealloc = (destructor)File_dealloc,
+};
+
+#pragma endregion
+
+#pragma region Str
+
+static int Str_init(Str *self, PyObject *args, PyObject *kwargs) {
+
+    // Parse all arguments into PyObjects first
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs > 3) {
+        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
+        return -1;
     }
+    PyObject *parent_obj = nargs >= 1 ? PyTuple_GET_ITEM(args, 0) : NULL;
+    PyObject *from_obj = nargs >= 2 ? PyTuple_GET_ITEM(args, 1) : NULL;
+    PyObject *to_obj = nargs >= 3 ? PyTuple_GET_ITEM(args, 2) : NULL;
 
-    // Apply start and end arguments
-    str.start += start;
-    str.length -= start;
-    if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; }
+    // Parse keyword arguments, if provided, and ensure no duplicates
+    if (kwargs) {
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        while (PyDict_Next(kwargs, &pos, &key, &value)) {
+            if (PyUnicode_CompareWithASCIIString(key, "parent") == 0) {
+                if (parent_obj) {
+                    PyErr_SetString(PyExc_TypeError, "Received `parent` both as positional and keyword argument");
+                    return -1;
+                }
+                parent_obj = value;
+            }
+            else if (PyUnicode_CompareWithASCIIString(key, "from") == 0) {
+                if (from_obj) {
+                    PyErr_SetString(PyExc_TypeError, "Received `from` both as positional and keyword argument");
+                    return -1;
+                }
+                from_obj = value;
+            }
+            else if (PyUnicode_CompareWithASCIIString(key, "to") == 0) {
+                if (to_obj) {
+                    PyErr_SetString(PyExc_TypeError, "Received `to` both as positional and keyword argument");
+                    return -1;
+                }
+                to_obj = value;
+            }
+            else {
+                PyErr_SetString(PyExc_TypeError, "Invalid keyword argument");
+                return -1;
+            }
+        }
+    }
 
-    if (str.length < prefix.length) { Py_RETURN_FALSE; }
-    else if (strncmp(str.start, prefix.start, prefix.length) == 0) { Py_RETURN_TRUE; }
-    else { Py_RETURN_FALSE; }
+    // Now, type-check and cast each argument
+    Py_ssize_t from = 0, to = PY_SSIZE_T_MAX;
+    if (from_obj) {
+        from = PyLong_AsSsize_t(from_obj);
+        if (from == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The `from` argument must be an integer");
+            return -1;
+        }
+    }
+    if (to_obj) {
+        to = PyLong_AsSsize_t(to_obj);
+        if (to == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The `to` argument must be an integer");
+            return -1;
+        }
+    }
+
+    // Handle empty string
+    if (parent_obj == NULL) {
+        self->start = NULL;
+        self->length = 0;
+    }
+    // Increment the reference count of the parent
+    else if (export_string_like(parent_obj, &self->start, &self->length)) {
+        self->parent = parent_obj;
+        Py_INCREF(parent_obj);
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError, "Unsupported parent type");
+        return -1;
+    }
+
+    // Apply slicing
+    size_t normalized_offset, normalized_length;
+    slice(self->length, from, to, &normalized_offset, &normalized_length);
+    self->start = ((char *)self->start) + normalized_offset;
+    self->length = normalized_length;
+    return 0;
 }
 
-static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs) {
-    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
-    Py_ssize_t nargs = PyTuple_Size(args);
-    if (nargs < !is_member + 1 || nargs > !is_member + 3) {
-        PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
+static PyObject *Str_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
+    Str *self;
+    self = (Str *)type->tp_alloc(type, 0);
+    if (!self) return NULL;
+
+    self->parent = NULL;
+    self->start = NULL;
+    self->length = 0;
+    return (PyObject *)self;
+}
+
+static void Str_dealloc(Str *self) {
+    if (self->parent) { Py_XDECREF(self->parent); }
+    else if (self->start) { free(self->start); }
+    self->parent = NULL;
+    Py_TYPE(self)->tp_free((PyObject *)self);
+}
+
+static PyObject *Str_str(Str *self) { return PyUnicode_FromStringAndSize(self->start, self->length); }
+
+static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash_crc32_native(self->start, self->length); }
+
+static Py_ssize_t Str_len(Str *self) { return self->length; }
+
+static PyObject *Str_getitem(Str *self, Py_ssize_t i) {
+
+    // Negative indexing
+    if (i < 0) i += self->length;
+
+    if (i < 0 || (size_t)i >= self->length) {
+        PyErr_SetString(PyExc_IndexError, "Index out of range");
         return NULL;
     }
 
-    PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
-    PyObject *suffix_obj = PyTuple_GET_ITEM(args, !is_member);
-    PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
-    PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
+    // Assuming the underlying data is UTF-8 encoded
+    return PyUnicode_FromStringAndSize(self->start + i, 1);
+}
 
-    // Optional start and end arguments
-    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
+static PyObject *Str_subscript(Str *self, PyObject *key) {
+    if (PySlice_Check(key)) {
+        // Sanity checks
+        Py_ssize_t start, stop, step;
+        if (PySlice_Unpack(key, &start, &stop, &step) < 0) return NULL;
+        if (PySlice_AdjustIndices(self->length, &start, &stop, step) < 0) return NULL;
+        if (step != 1) {
+            PyErr_SetString(PyExc_IndexError, "Efficient step is not supported");
+            return NULL;
+        }
 
-    if (start_obj && ((start = PyLong_AsSsize_t(start_obj)) == -1 && PyErr_Occurred())) {
-        PyErr_SetString(PyExc_TypeError, "start must be an integer");
+        // Create a new `Str` object
+        Str *self_slice = (Str *)StrType.tp_alloc(&StrType, 0);
+        if (self_slice == NULL && PyErr_NoMemory()) return NULL;
+
+        // Set its properties based on the slice
+        self_slice->start = self->start + start;
+        self_slice->length = stop - start;
+        self_slice->parent = (PyObject *)self; // Set parent to keep it alive
+
+        // Increment the reference count of the parent
+        Py_INCREF(self);
+        return (PyObject *)self_slice;
+    }
+    else if (PyLong_Check(key)) { return Str_getitem(self, PyLong_AsSsize_t(key)); }
+    else {
+        PyErr_SetString(PyExc_TypeError, "Str indices must be integers or slices");
         return NULL;
     }
+}
+
+static int Str_getbuffer(Str *self, Py_buffer *view, int flags) {
+    if (view == NULL) {
+        PyErr_SetString(PyExc_ValueError, "NULL view in getbuffer");
+        return -1;
+    }
+
+    static Py_ssize_t itemsize[1] = {1};
+    view->obj = (PyObject *)self;
+    view->buf = self->start;
+    view->len = self->length;
+    view->readonly = 1;
+    view->itemsize = sizeof(char);
+    view->format = "c"; // https://docs.python.org/3/library/struct.html#format-characters
+    view->ndim = 1;
+    view->shape = &self->length; // 1-D array, so shape is just a pointer to the length
+    view->strides = itemsize;    // strides in a 1-D array is just the item size
+    view->suboffsets = NULL;
+    view->internal = NULL;
+
+    Py_INCREF(self);
+    return 0;
+}
+
+static void Str_releasebuffer(PyObject *_, Py_buffer *view) {
+    // This function MUST NOT decrement view->obj, since that is done automatically
+    // in PyBuffer_Release() (this scheme is useful for breaking reference cycles).
+    // https://docs.python.org/3/c-api/typeobj.html#c.PyBufferProcs.bf_releasebuffer
+}
+
+static int Str_in(Str *self, PyObject *arg) {
+
+    sz_needle_t needle_struct;
+    needle_struct.anomaly_offset = 0;
+    if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) {
+        PyErr_SetString(PyExc_TypeError, "Unsupported argument type");
+        return -1;
+    }
+
+    sz_haystack_t haystack;
+    haystack.start = self->start;
+    haystack.length = self->length;
+    size_t position = sz_neon_find_substr(haystack, needle_struct);
+    return position != haystack.length;
+}
+
+static Py_ssize_t Strs_len(Strs *self) {
+    switch (self->type) {
+    case STRS_CONSECUTIVE_32: return self->data.consecutive_32bit.count;
+    case STRS_CONSECUTIVE_64: return self->data.consecutive_64bit.count;
+    case STRS_REORDERED: return self->data.reordered.count;
+    case STRS_MULTI_SOURCE: return self->data.multi_source.count;
+    default: return 0;
+    }
+}
 
-    if (end_obj && ((end = PyLong_AsSsize_t(end_obj)) == -1 && PyErr_Occurred())) {
-        PyErr_SetString(PyExc_TypeError, "end must be an integer");
+static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) {
+    // Check for negative index and convert to positive
+    Py_ssize_t count = Strs_len(self);
+    if (i < 0) i += count;
+    if (i < 0 || i >= count) {
+        PyErr_SetString(PyExc_IndexError, "Index out of range");
         return NULL;
     }
 
-    sz_haystack_t str, suffix;
-    if (!export_string_like(str_obj, &str.start, &str.length) ||
-        !export_string_like(suffix_obj, &suffix.start, &suffix.length)) {
-        PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
+    PyObject *parent = NULL;
+    char const *start = NULL;
+    size_t length = 0;
+    if (!get_string_at_offset(self, i, count, &parent, &start, &length)) {
+        PyErr_SetString(PyExc_TypeError, "Unknown Strs kind");
         return NULL;
     }
 
-    // Apply start and end arguments
-    str.start += start;
-    str.length -= start;
-    if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; }
+    // Create a new `Str` object
+    Str *parent_slice = (Str *)StrType.tp_alloc(&StrType, 0);
+    if (parent_slice == NULL && PyErr_NoMemory()) return NULL;
 
-    if (str.length < suffix.length) { Py_RETURN_FALSE; }
-    else if (strncmp(str.start + (str.length - suffix.length), suffix.start, suffix.length) == 0) { Py_RETURN_TRUE; }
-    else { Py_RETURN_FALSE; }
+    parent_slice->start = start;
+    parent_slice->length = length;
+    parent_slice->parent = parent;
+    Py_INCREF(parent);
+    return parent_slice;
 }
 
-static Strs *Str_split_(
-    PyObject *parent, sz_haystack_t text, sz_needle_t separator, int keepseparator, Py_ssize_t maxsplit) {
-
-    // Create Strs object
-    Strs *result = (Strs *)PyObject_New(Strs, &StrsType);
-    if (!result) return NULL;
-
-    // Initialize Strs object based on the splitting logic
-    void *offsets_endings = NULL;
-    size_t offsets_capacity = 0;
-    size_t offsets_count = 0;
-    size_t bytes_per_offset;
-    if (text.length >= UINT32_MAX) {
-        bytes_per_offset = 8;
-        result->type = STRS_CONSECUTIVE_64;
-        result->data.consecutive_64bit.start = text.start;
-        result->data.consecutive_64bit.parent = parent;
-        result->data.consecutive_64bit.separator_length = !keepseparator * separator.length;
-    }
-    else {
-        bytes_per_offset = 4;
-        result->type = STRS_CONSECUTIVE_32;
-        result->data.consecutive_32bit.start = text.start;
-        result->data.consecutive_32bit.parent = parent;
-        result->data.consecutive_32bit.separator_length = !keepseparator * separator.length;
-    }
+static PyObject *Strs_subscript(Str *self, PyObject *key) {
+    if (PyLong_Check(key)) return Strs_getitem(self, PyLong_AsSsize_t(key));
+    return NULL;
+}
 
-    // Iterate through string, keeping track of the
-    sz_size_t last_start = 0;
-    while (last_start < text.length && offsets_count < maxsplit) {
-        sz_haystack_t text_remaining;
-        text_remaining.start = text.start + last_start;
-        text_remaining.length = text.length - last_start;
-        sz_size_t offset_in_remaining = sz_neon_find_substr(text_remaining, separator);
+// Will be called by the `PySequence_Contains`
+static int Strs_contains(Str *self, PyObject *arg) { return 0; }
 
-        // Reallocate offsets array if needed
-        if (offsets_count >= offsets_capacity) {
-            offsets_capacity = (offsets_capacity + 1) * 2;
-            void *new_offsets = realloc(offsets_endings, offsets_capacity * bytes_per_offset);
-            if (!new_offsets) {
-                if (offsets_endings) free(offsets_endings);
-            }
-            offsets_endings = new_offsets;
-        }
+static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) {
 
-        // If the memory allocation has failed - discard the response
-        if (!offsets_endings) {
-            Py_XDECREF(result);
-            PyErr_NoMemory();
-            return NULL;
-        }
+    char const *a_start, *b_start;
+    size_t a_length, b_length;
+    if (!export_string_like(self, &a_start, &a_length) || !export_string_like(other, &b_start, &b_length))
+        Py_RETURN_NOTIMPLEMENTED;
 
-        // Export the offset
-        size_t will_continue = offset_in_remaining != text_remaining.length;
-        size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue;
-        if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; }
-        else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; }
+    // Perform byte-wise comparison up to the minimum length
+    size_t min_length = a_length < b_length ? a_length : b_length;
+    int cmp_result = memcmp(a_start, b_start, min_length);
 
-        // Next time we want to start
-        last_start = last_start + offset_in_remaining + separator.length;
-    }
+    // If the strings are equal up to `min_length`, then the shorter string is smaller
+    if (cmp_result == 0) cmp_result = (a_length > b_length) - (a_length < b_length);
 
-    // Populate the Strs object with the offsets
-    if (text.length >= UINT32_MAX) {
-        result->data.consecutive_64bit.end_offsets = offsets_endings;
-        result->data.consecutive_64bit.count = offsets_count;
-    }
-    else {
-        result->data.consecutive_32bit.end_offsets = offsets_endings;
-        result->data.consecutive_32bit.count = offsets_count;
+    switch (op) {
+    case Py_LT: return PyBool_FromLong(cmp_result < 0);
+    case Py_LE: return PyBool_FromLong(cmp_result <= 0);
+    case Py_EQ: return PyBool_FromLong(cmp_result == 0);
+    case Py_NE: return PyBool_FromLong(cmp_result != 0);
+    case Py_GT: return PyBool_FromLong(cmp_result > 0);
+    case Py_GE: return PyBool_FromLong(cmp_result >= 0);
+    default: Py_RETURN_NOTIMPLEMENTED;
     }
-
-    Py_INCREF(parent);
-    return result;
 }
 
-static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) {
+/**
+ *  @return 1 on success, 0 on failure.
+ */
+static int Str_find_( //
+    PyObject *self,
+    PyObject *args,
+    PyObject *kwargs,
+    Py_ssize_t *offset_out,
+    sz_haystack_t *haystack_out,
+    sz_needle_t *needle_out) {
 
-    // Check minimum arguments
     int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
     Py_ssize_t nargs = PyTuple_Size(args);
     if (nargs < !is_member + 1 || nargs > !is_member + 3) {
-        PyErr_SetString(PyExc_TypeError, "sz.split() requires at least 1 argument");
-        return NULL;
+        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
+        return 0;
     }
 
-    PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
-    PyObject *separator_obj = nargs > !is_member + 0 ? PyTuple_GET_ITEM(args, !is_member + 0) : NULL;
-    PyObject *maxsplit_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
-    PyObject *keepseparator_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
+    PyObject *haystack_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *needle_obj = PyTuple_GET_ITEM(args, !is_member + 0);
+    PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
+    PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
 
+    // Parse keyword arguments
     if (kwargs) {
-        PyObject *key, *value;
         Py_ssize_t pos = 0;
+        PyObject *key, *value;
         while (PyDict_Next(kwargs, &pos, &key, &value)) {
-            if (PyUnicode_CompareWithASCIIString(key, "separator") == 0) { separator_obj = value; }
-            else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) { maxsplit_obj = value; }
-            else if (PyUnicode_CompareWithASCIIString(key, "keepseparator") == 0) { keepseparator_obj = value; }
-            else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key))
-                return NULL;
+            if (PyUnicode_CompareWithASCIIString(key, "start") == 0) { start_obj = value; }
+            else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) { end_obj = value; }
+            else {
+                PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key);
+                return 0;
+            }
         }
     }
 
-    sz_haystack_t text;
-    sz_needle_t separator;
-    int keepseparator;
-    Py_ssize_t maxsplit;
-    separator.anomaly_offset = 0;
-
-    // Validate and convert `text`
-    if (!export_string_like(text_obj, &text.start, &text.length)) {
-        PyErr_SetString(PyExc_TypeError, "The text argument must be string-like");
-        return NULL;
-    }
+    sz_haystack_t haystack;
+    sz_needle_t needle;
+    Py_ssize_t start, end;
 
-    // Validate and convert `separator`
-    if (separator_obj) {
-        Py_ssize_t len;
-        if (!export_string_like(separator_obj, &separator.start, &len)) {
-            PyErr_SetString(PyExc_TypeError, "The separator argument must be string-like");
-            return NULL;
-        }
-        separator.length = (size_t)len;
-    }
-    else {
-        separator.start = " ";
-        separator.length = 1;
+    // Validate and convert `haystack` and `needle`
+    needle.anomaly_offset = 0;
+    if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
+        !export_string_like(needle_obj, &needle.start, &needle.length)) {
+        PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
+        return 0;
     }
 
-    // Validate and convert `keepseparator`
-    if (keepseparator_obj) {
-        keepseparator = PyObject_IsTrue(keepseparator_obj);
-        if (keepseparator == -1) {
-            PyErr_SetString(PyExc_TypeError, "The keepseparator argument must be a boolean");
-            return NULL;
+    // Validate and convert `start`
+    if (start_obj) {
+        start = PyLong_AsSsize_t(start_obj);
+        if (start == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The start argument must be an integer");
+            return 0;
         }
     }
-    else { keepseparator = 0; }
+    else { start = 0; }
 
-    // Validate and convert `maxsplit`
-    if (maxsplit_obj) {
-        maxsplit = PyLong_AsSsize_t(maxsplit_obj);
-        if (maxsplit == -1 && PyErr_Occurred()) {
-            PyErr_SetString(PyExc_TypeError, "The maxsplit argument must be an integer");
-            return NULL;
+    // Validate and convert `end`
+    if (end_obj) {
+        end = PyLong_AsSsize_t(end_obj);
+        if (end == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The end argument must be an integer");
+            return 0;
         }
     }
-    else { maxsplit = PY_SSIZE_T_MAX; }
+    else { end = PY_SSIZE_T_MAX; }
 
-    return Str_split_(text_obj, text, separator, keepseparator, maxsplit);
-}
+    // Limit the `haystack` range
+    size_t normalized_offset, normalized_length;
+    slice(haystack.length, start, end, &normalized_offset, &normalized_length);
+    haystack.start += normalized_offset;
+    haystack.length = normalized_length;
 
-static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs) {
+    // Perform contains operation
+    size_t offset = sz_neon_find_substr(haystack, needle);
+    if (offset == haystack.length) { *offset_out = -1; }
+    else { *offset_out = (Py_ssize_t)offset; }
 
-    // Check minimum arguments
-    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
-    Py_ssize_t nargs = PyTuple_Size(args);
-    if (nargs < !is_member || nargs > !is_member + 2) {
-        PyErr_SetString(PyExc_TypeError, "splitlines() requires at least 1 argument");
-        return NULL;
-    }
+    *haystack_out = haystack;
+    *needle_out = needle;
+    return 1;
+}
 
-    PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
-    PyObject *keeplinebreaks_obj = nargs > !is_member ? PyTuple_GET_ITEM(args, !is_member) : NULL;
-    PyObject *maxsplit_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
+static PyObject *Str_find(PyObject *self, PyObject *args, PyObject *kwargs) {
+    Py_ssize_t signed_offset;
+    sz_haystack_t text;
+    sz_needle_t separator;
+    if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL;
+    return PyLong_FromSsize_t(signed_offset);
+}
 
-    if (kwargs) {
-        PyObject *key, *value;
-        Py_ssize_t pos = 0;
-        while (PyDict_Next(kwargs, &pos, &key, &value)) {
-            if (PyUnicode_CompareWithASCIIString(key, "keeplinebreaks") == 0) { keeplinebreaks_obj = value; }
-            else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) { maxsplit_obj = value; }
-            else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) { return NULL; }
-        }
+static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) {
+    Py_ssize_t signed_offset;
+    sz_haystack_t text;
+    sz_needle_t separator;
+    if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL;
+    if (signed_offset == -1) {
+        PyErr_SetString(PyExc_ValueError, "substring not found");
+        return NULL;
     }
+    return PyLong_FromSsize_t(signed_offset);
+}
 
+static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs) {
+    Py_ssize_t signed_offset;
     sz_haystack_t text;
-    int keeplinebreaks;
-    Py_ssize_t maxsplit = PY_SSIZE_T_MAX; // Default value for maxsplit
+    sz_needle_t separator;
+    if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL;
+    if (signed_offset == -1) { Py_RETURN_FALSE; }
+    else { Py_RETURN_TRUE; }
+}
 
-    // Validate and convert `text`
-    if (!export_string_like(text_obj, &text.start, &text.length)) {
-        PyErr_SetString(PyExc_TypeError, "The text argument must be string-like");
-        return NULL;
-    }
+static PyObject *Str_partition(PyObject *self, PyObject *args, PyObject *kwargs) {
+    Py_ssize_t separator_index;
+    sz_haystack_t text;
+    sz_needle_t separator;
+    PyObject *result_tuple;
 
-    // Validate and convert `keeplinebreaks`
-    if (keeplinebreaks_obj) {
-        keeplinebreaks = PyObject_IsTrue(keeplinebreaks_obj);
-        if (keeplinebreaks == -1) {
-            PyErr_SetString(PyExc_TypeError, "The keeplinebreaks argument must be a boolean");
-            return NULL;
-        }
-    }
-    else { keeplinebreaks = 0; }
+    // Use Str_find_ to get the index of the separator
+    if (!Str_find_(self, args, kwargs, &separator_index, &text, &separator)) return NULL;
 
-    // Validate and convert `maxsplit`
-    if (maxsplit_obj) {
-        maxsplit = PyLong_AsSsize_t(maxsplit_obj);
-        if (maxsplit == -1 && PyErr_Occurred()) {
-            PyErr_SetString(PyExc_TypeError, "The maxsplit argument must be an integer");
-            return NULL;
-        }
+    // If separator is not found, return a tuple (self, "", "")
+    if (separator_index == -1) {
+        PyObject *empty_str1 = Str_new(&StrType, Py_None, Py_None);
+        PyObject *empty_str2 = Str_new(&StrType, Py_None, Py_None);
+
+        result_tuple = PyTuple_New(3);
+        Py_INCREF(self);
+        PyTuple_SET_ITEM(result_tuple, 0, self);
+        PyTuple_SET_ITEM(result_tuple, 1, empty_str1);
+        PyTuple_SET_ITEM(result_tuple, 2, empty_str2);
+        return result_tuple;
     }
 
-    // TODO: Support arbitrary newline characters:
-    // https://docs.python.org/3/library/stdtypes.html#str.splitlines
-    // \n, \r, \r\n, \v or \x0b, \f or \x0c, \x1c, \x1d, \x1e, \x85, \u2028, \u2029
-    // https://github.com/ashvardanian/StringZilla/issues/29
-    sz_needle_t separator;
-    separator.start = "\n";
-    separator.length = 1;
-    return Str_split_(text_obj, text, separator, keeplinebreaks, maxsplit);
-}
+    // Create the three parts manually
+    Str *before = Str_new(&StrType, NULL, NULL);
+    Str *middle = Str_new(&StrType, NULL, NULL);
+    Str *after = Str_new(&StrType, NULL, NULL);
 
-static PyObject *Str_concat(PyObject *self, PyObject *other) {
-    struct sz_haystack_t self_str, other_str;
+    before->parent = self, before->start = text.start, before->length = separator_index;
+    middle->parent = self, middle->start = text.start + separator_index, middle->length = separator.length;
+    after->parent = self, after->start = text.start + separator_index + separator.length,
+    after->length = text.length - separator_index - separator.length;
 
-    // Validate and convert `self`
-    if (!export_string_like(self, &self_str.start, &self_str.length)) {
-        PyErr_SetString(PyExc_TypeError, "The self object must be string-like");
-        return NULL;
-    }
+    // All parts reference the same parent
+    Py_INCREF(self);
+    Py_INCREF(self);
+    Py_INCREF(self);
 
-    // Validate and convert `other`
-    if (!export_string_like(other, &other_str.start, &other_str.length)) {
-        PyErr_SetString(PyExc_TypeError, "The other object must be string-like");
+    // Build the result tuple
+    result_tuple = PyTuple_New(3);
+    PyTuple_SET_ITEM(result_tuple, 0, before);
+    PyTuple_SET_ITEM(result_tuple, 1, middle);
+    PyTuple_SET_ITEM(result_tuple, 2, after);
+
+    return result_tuple;
+}
+
+static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs < !is_member + 1 || nargs > !is_member + 4) {
+        PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
         return NULL;
     }
 
-    // Allocate a new Str instance
-    Str *result_str = PyObject_New(Str, &StrType);
-    if (result_str == NULL) { return NULL; }
-
-    // Calculate the total length of the new string
-    result_str->parent = NULL;
-    result_str->length = self_str.length + other_str.length;
+    PyObject *haystack_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *needle_obj = PyTuple_GET_ITEM(args, !is_member + 0);
+    PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
+    PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
+    PyObject *allowoverlap_obj = nargs > !is_member + 3 ? PyTuple_GET_ITEM(args, !is_member + 3) : NULL;
 
-    // Allocate memory for the new string
-    result_str->start = malloc(result_str->length);
-    if (result_str->start == NULL) {
-        PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for string concatenation");
-        return NULL;
+    if (kwargs) {
+        Py_ssize_t pos = 0;
+        PyObject *key, *value;
+        while (PyDict_Next(kwargs, &pos, &key, &value))
+            if (PyUnicode_CompareWithASCIIString(key, "start") == 0) { start_obj = value; }
+            else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) { end_obj = value; }
+            else if (PyUnicode_CompareWithASCIIString(key, "allowoverlap") == 0) { allowoverlap_obj = value; }
+            else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key))
+                return NULL;
     }
 
-    // Perform the string concatenation
-    memcpy(result_str->start, self_str.start, self_str.length);
-    memcpy(result_str->start + self_str.length, other_str.start, other_str.length);
+    sz_haystack_t haystack;
+    sz_needle_t needle;
+    Py_ssize_t start = start_obj ? PyLong_AsSsize_t(start_obj) : 0;
+    Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX;
+    int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0;
 
-    return (PyObject *)result_str;
-}
+    needle.anomaly_offset = 0;
+    if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
+        !export_string_like(needle_obj, &needle.start, &needle.length))
+        return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL;
 
-#pragma endregion
+    if ((start == -1 || end == -1 || allowoverlap == -1) && PyErr_Occurred()) return NULL;
 
-#pragma region MemoryMappingFile
+    size_t normalized_offset, normalized_length;
+    slice(haystack.length, start, end, &normalized_offset, &normalized_length);
+    haystack.start += normalized_offset;
+    haystack.length = normalized_length;
 
-static void File_dealloc(File *self) {
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
-    if (self->start) {
-        UnmapViewOfFile(self->start);
-        self->start = NULL;
-    }
-    if (self->mapping_handle) {
-        CloseHandle(self->mapping_handle);
-        self->mapping_handle = NULL;
-    }
-    if (self->file_handle) {
-        CloseHandle(self->file_handle);
-        self->file_handle = NULL;
-    }
-#else
-    if (self->start) {
-        munmap(self->start, self->length);
-        self->start = NULL;
-        self->length = 0;
-    }
-    if (self->file_descriptor != 0) {
-        close(self->file_descriptor);
-        self->file_descriptor = 0;
+    size_t count = needle.length == 1 ? sz_naive_count_char(haystack, *needle.start) : 0;
+    if (needle.length != 1) {
+        if (allowoverlap) {
+            while (haystack.length) {
+                size_t offset = sz_neon_find_substr(haystack, needle);
+                int found = offset != haystack.length;
+                count += found;
+                haystack.start += offset + found;
+                haystack.length -= offset + found;
+            }
+        }
+        else {
+            while (haystack.length) {
+                size_t offset = sz_neon_find_substr(haystack, needle);
+                int found = offset != haystack.length;
+                count += found;
+                haystack.start += offset + needle.length;
+                haystack.length -= offset + needle.length * found;
+            }
+        }
     }
-#endif
-    Py_TYPE(self)->tp_free((PyObject *)self);
+    return PyLong_FromSize_t(count);
 }
 
-static PyObject *File_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) {
-    File *self;
-    self = (File *)type->tp_alloc(type, 0);
-    if (self == NULL) return NULL;
-
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
-    self->file_handle = NULL;
-    self->mapping_handle = NULL;
-#else
-    self->file_descriptor = 0;
-#endif
-    self->start = NULL;
-    self->length = 0;
-}
+static PyObject *Str_levenstein(PyObject *self, PyObject *args, PyObject *kwargs) {
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs < !is_member + 1 || nargs > !is_member + 2) {
+        PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
+        return NULL;
+    }
 
-static int File_init(File *self, PyObject *positional_args, PyObject *named_args) {
-    const char *path;
-    if (!PyArg_ParseTuple(positional_args, "s", &path)) return -1;
+    PyObject *str1_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *str2_obj = PyTuple_GET_ITEM(args, !is_member + 0);
+    PyObject *bound_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
 
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
-    self->file_handle = CreateFile(path, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
-    if (self->file_handle == INVALID_HANDLE_VALUE) {
-        PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
-        return -1;
+    if (kwargs) {
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        while (PyDict_Next(kwargs, &pos, &key, &value))
+            if (PyUnicode_CompareWithASCIIString(key, "bound") == 0) {
+                if (bound_obj) {
+                    PyErr_Format(PyExc_TypeError, "Received bound both as positional and keyword argument");
+                    return NULL;
+                }
+                bound_obj = value;
+            }
     }
 
-    self->mapping_handle = CreateFileMapping(self->file_handle, 0, PAGE_READONLY, 0, 0, 0);
-    if (self->mapping_handle == 0) {
-        CloseHandle(self->file_handle);
-        self->file_handle = NULL;
-        PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
-        return -1;
+    int bound = 255; // Default value for bound
+    if (bound_obj && ((bound = PyLong_AsLong(bound_obj)) > 255 || bound < 0)) {
+        PyErr_Format(PyExc_ValueError, "Bound must be an integer between 0 and 255");
+        return NULL;
     }
 
-    char *file = (char *)MapViewOfFile(self->mapping_handle, FILE_MAP_READ, 0, 0, 0);
-    if (file == 0) {
-        CloseHandle(self->mapping_handle);
-        self->mapping_handle = NULL;
-        CloseHandle(self->file_handle);
-        self->file_handle = NULL;
-        PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
-        return -1;
+    sz_haystack_t str1, str2;
+    if (!export_string_like(str1_obj, &str1.start, &str1.length) ||
+        !export_string_like(str2_obj, &str2.start, &str2.length)) {
+        PyErr_Format(PyExc_TypeError, "Both arguments must be string-like");
+        return NULL;
     }
-    self->start = file;
-    self->length = GetFileSize(self->file_handle, 0);
-#else
-    struct stat sb;
-    self->file_descriptor = open(path, O_RDONLY);
-    if (fstat(self->file_descriptor, &sb) != 0) {
-        close(self->file_descriptor);
-        self->file_descriptor = 0;
-        PyErr_SetString(PyExc_RuntimeError, "Can't retrieve file size!");
-        return -1;
+
+    size_t memory_needed = sz_levenstein_memory_needed(str1.length, str2.length);
+    if (temporary_memory.length < memory_needed) {
+        temporary_memory.start = realloc(temporary_memory.start, memory_needed);
+        temporary_memory.length = memory_needed;
     }
-    size_t file_size = sb.st_size;
-    void *map = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, self->file_descriptor, 0);
-    if (map == MAP_FAILED) {
-        close(self->file_descriptor);
-        self->file_descriptor = 0;
-        PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!");
-        return -1;
+    if (!temporary_memory.start) {
+        PyErr_Format(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix");
+        return NULL;
     }
-    self->start = map;
-    self->length = file_size;
-#endif
 
-    return 0;
+    levenstein_distance_t small_bound = (levenstein_distance_t)bound;
+    levenstein_distance_t distance =
+        sz_levenstein(str1.start, str1.length, str2.start, str2.length, small_bound, temporary_memory.start);
+
+    return PyLong_FromLong(distance);
 }
 
-static PyMethodDef File_methods[] = { //
-    {NULL, NULL, 0, NULL}};
+static PyObject *Str_startswith(PyObject *self, PyObject *args, PyObject *kwargs) {
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs < !is_member + 1 || nargs > !is_member + 3) {
+        PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
+        return NULL;
+    }
 
-static PyTypeObject FileType = {
-    PyObject_HEAD_INIT(NULL).tp_name = "stringzilla.File",
-    .tp_doc = "Memory mapped file class, that exposes the memory range for low-level access",
-    .tp_basicsize = sizeof(File),
-    .tp_flags = Py_TPFLAGS_DEFAULT,
-    .tp_methods = File_methods,
-    .tp_new = (newfunc)File_new,
-    .tp_init = (initproc)File_init,
-    .tp_dealloc = (destructor)File_dealloc,
-};
+    PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *prefix_obj = PyTuple_GET_ITEM(args, !is_member);
+    PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
+    PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
 
-#pragma endregion
+    // Optional start and end arguments
+    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
 
-#pragma region Str
+    if (start_obj && ((start = PyLong_AsSsize_t(start_obj)) == -1 && PyErr_Occurred())) {
+        PyErr_SetString(PyExc_TypeError, "start must be an integer");
+        return NULL;
+    }
 
-static int Str_init(Str *self, PyObject *args, PyObject *kwargs) {
+    if (end_obj && ((end = PyLong_AsSsize_t(end_obj)) == -1 && PyErr_Occurred())) {
+        PyErr_SetString(PyExc_TypeError, "end must be an integer");
+        return NULL;
+    }
 
-    // Parse all arguments into PyObjects first
-    Py_ssize_t nargs = PyTuple_Size(args);
-    if (nargs > 3) {
-        PyErr_SetString(PyExc_TypeError, "Invalid number of arguments");
-        return -1;
+    sz_haystack_t str, prefix;
+    if (!export_string_like(str_obj, &str.start, &str.length) ||
+        !export_string_like(prefix_obj, &prefix.start, &prefix.length)) {
+        PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
+        return NULL;
     }
-    PyObject *parent_obj = nargs >= 1 ? PyTuple_GET_ITEM(args, 0) : NULL;
-    PyObject *from_obj = nargs >= 2 ? PyTuple_GET_ITEM(args, 1) : NULL;
-    PyObject *to_obj = nargs >= 3 ? PyTuple_GET_ITEM(args, 2) : NULL;
 
-    // Parse keyword arguments, if provided, and ensure no duplicates
-    if (kwargs) {
-        PyObject *key, *value;
-        Py_ssize_t pos = 0;
-        while (PyDict_Next(kwargs, &pos, &key, &value)) {
-            if (PyUnicode_CompareWithASCIIString(key, "parent") == 0) {
-                if (parent_obj) {
-                    PyErr_SetString(PyExc_TypeError, "Received `parent` both as positional and keyword argument");
-                    return -1;
-                }
-                parent_obj = value;
-            }
-            else if (PyUnicode_CompareWithASCIIString(key, "from") == 0) {
-                if (from_obj) {
-                    PyErr_SetString(PyExc_TypeError, "Received `from` both as positional and keyword argument");
-                    return -1;
-                }
-                from_obj = value;
-            }
-            else if (PyUnicode_CompareWithASCIIString(key, "to") == 0) {
-                if (to_obj) {
-                    PyErr_SetString(PyExc_TypeError, "Received `to` both as positional and keyword argument");
-                    return -1;
-                }
-                to_obj = value;
-            }
-            else {
-                PyErr_SetString(PyExc_TypeError, "Invalid keyword argument");
-                return -1;
-            }
-        }
+    // Apply start and end arguments
+    str.start += start;
+    str.length -= start;
+    if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; }
+
+    if (str.length < prefix.length) { Py_RETURN_FALSE; }
+    else if (strncmp(str.start, prefix.start, prefix.length) == 0) { Py_RETURN_TRUE; }
+    else { Py_RETURN_FALSE; }
+}
+
+static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs) {
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs < !is_member + 1 || nargs > !is_member + 3) {
+        PyErr_Format(PyExc_TypeError, "Invalid number of arguments");
+        return NULL;
     }
 
-    // Now, type-check and cast each argument
-    Py_ssize_t from = 0, to = PY_SSIZE_T_MAX;
-    if (from_obj) {
-        from = PyLong_AsSsize_t(from_obj);
-        if (from == -1 && PyErr_Occurred()) {
-            PyErr_SetString(PyExc_TypeError, "The `from` argument must be an integer");
-            return -1;
-        }
+    PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *suffix_obj = PyTuple_GET_ITEM(args, !is_member);
+    PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
+    PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
+
+    // Optional start and end arguments
+    Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
+
+    if (start_obj && ((start = PyLong_AsSsize_t(start_obj)) == -1 && PyErr_Occurred())) {
+        PyErr_SetString(PyExc_TypeError, "start must be an integer");
+        return NULL;
     }
-    if (to_obj) {
-        to = PyLong_AsSsize_t(to_obj);
-        if (to == -1 && PyErr_Occurred()) {
-            PyErr_SetString(PyExc_TypeError, "The `to` argument must be an integer");
-            return -1;
-        }
+
+    if (end_obj && ((end = PyLong_AsSsize_t(end_obj)) == -1 && PyErr_Occurred())) {
+        PyErr_SetString(PyExc_TypeError, "end must be an integer");
+        return NULL;
     }
 
-    // Handle empty string
-    if (parent_obj == NULL) {
-        self->start = NULL;
-        self->length = 0;
+    sz_haystack_t str, suffix;
+    if (!export_string_like(str_obj, &str.start, &str.length) ||
+        !export_string_like(suffix_obj, &suffix.start, &suffix.length)) {
+        PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
+        return NULL;
     }
-    // Increment the reference count of the parent
-    else if (export_string_like(parent_obj, &self->start, &self->length)) {
-        self->parent = parent_obj;
-        Py_INCREF(parent_obj);
+
+    // Apply start and end arguments
+    str.start += start;
+    str.length -= start;
+    if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; }
+
+    if (str.length < suffix.length) { Py_RETURN_FALSE; }
+    else if (strncmp(str.start + (str.length - suffix.length), suffix.start, suffix.length) == 0) { Py_RETURN_TRUE; }
+    else { Py_RETURN_FALSE; }
+}
+
+static Strs *Str_split_(
+    PyObject *parent, sz_haystack_t text, sz_needle_t separator, int keepseparator, Py_ssize_t maxsplit) {
+
+    // Create Strs object
+    Strs *result = (Strs *)PyObject_New(Strs, &StrsType);
+    if (!result) return NULL;
+
+    // Initialize Strs object based on the splitting logic
+    void *offsets_endings = NULL;
+    size_t offsets_capacity = 0;
+    size_t offsets_count = 0;
+    size_t bytes_per_offset;
+    if (text.length >= UINT32_MAX) {
+        bytes_per_offset = 8;
+        result->type = STRS_CONSECUTIVE_64;
+        result->data.consecutive_64bit.start = text.start;
+        result->data.consecutive_64bit.parent = parent;
+        result->data.consecutive_64bit.separator_length = !keepseparator * separator.length;
     }
     else {
-        PyErr_SetString(PyExc_TypeError, "Unsupported parent type");
-        return -1;
+        bytes_per_offset = 4;
+        result->type = STRS_CONSECUTIVE_32;
+        result->data.consecutive_32bit.start = text.start;
+        result->data.consecutive_32bit.parent = parent;
+        result->data.consecutive_32bit.separator_length = !keepseparator * separator.length;
     }
 
-    // Apply slicing
-    size_t normalized_offset, normalized_length;
-    slice(self->length, from, to, &normalized_offset, &normalized_length);
-    self->start = ((char *)self->start) + normalized_offset;
-    self->length = normalized_length;
-    return 0;
-}
-
-static PyObject *Str_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
-    Str *self;
-    self = (Str *)type->tp_alloc(type, 0);
-    if (!self) return NULL;
+    // Iterate through string, keeping track of the
+    sz_size_t last_start = 0;
+    while (last_start < text.length && offsets_count < maxsplit) {
+        sz_haystack_t text_remaining;
+        text_remaining.start = text.start + last_start;
+        text_remaining.length = text.length - last_start;
+        sz_size_t offset_in_remaining = sz_neon_find_substr(text_remaining, separator);
 
-    self->parent = NULL;
-    self->start = NULL;
-    self->length = 0;
-    return (PyObject *)self;
-}
+        // Reallocate offsets array if needed
+        if (offsets_count >= offsets_capacity) {
+            offsets_capacity = (offsets_capacity + 1) * 2;
+            void *new_offsets = realloc(offsets_endings, offsets_capacity * bytes_per_offset);
+            if (!new_offsets) {
+                if (offsets_endings) free(offsets_endings);
+            }
+            offsets_endings = new_offsets;
+        }
 
-static void Str_dealloc(Str *self) {
-    if (self->parent) { Py_XDECREF(self->parent); }
-    else if (self->start) { free(self->start); }
-    self->parent = NULL;
-    Py_TYPE(self)->tp_free((PyObject *)self);
-}
+        // If the memory allocation has failed - discard the response
+        if (!offsets_endings) {
+            Py_XDECREF(result);
+            PyErr_NoMemory();
+            return NULL;
+        }
 
-static PyObject *Str_str(Str *self) { return PyUnicode_FromStringAndSize(self->start, self->length); }
+        // Export the offset
+        size_t will_continue = offset_in_remaining != text_remaining.length;
+        size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue;
+        if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; }
+        else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; }
 
-static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash_crc32_native(self->start, self->length); }
+        // Next time we want to start
+        last_start = last_start + offset_in_remaining + separator.length;
+    }
 
-static Py_ssize_t Str_len(Str *self) { return self->length; }
+    // Populate the Strs object with the offsets
+    if (text.length >= UINT32_MAX) {
+        result->data.consecutive_64bit.end_offsets = offsets_endings;
+        result->data.consecutive_64bit.count = offsets_count;
+    }
+    else {
+        result->data.consecutive_32bit.end_offsets = offsets_endings;
+        result->data.consecutive_32bit.count = offsets_count;
+    }
 
-static PyObject *Str_getitem(Str *self, Py_ssize_t i) {
+    Py_INCREF(parent);
+    return result;
+}
 
-    // Negative indexing
-    if (i < 0) i += self->length;
+static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) {
 
-    if (i < 0 || (size_t)i >= self->length) {
-        PyErr_SetString(PyExc_IndexError, "Index out of range");
+    // Check minimum arguments
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs < !is_member + 1 || nargs > !is_member + 3) {
+        PyErr_SetString(PyExc_TypeError, "sz.split() requires at least 1 argument");
         return NULL;
     }
 
-    // Assuming the underlying data is UTF-8 encoded
-    return PyUnicode_FromStringAndSize(self->start + i, 1);
-}
+    PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *separator_obj = nargs > !is_member + 0 ? PyTuple_GET_ITEM(args, !is_member + 0) : NULL;
+    PyObject *maxsplit_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
+    PyObject *keepseparator_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL;
 
-static PyObject *Str_subscript(Str *self, PyObject *key) {
-    if (PySlice_Check(key)) {
-        // Sanity checks
-        Py_ssize_t start, stop, step;
-        if (PySlice_Unpack(key, &start, &stop, &step) < 0) return NULL;
-        if (PySlice_AdjustIndices(self->length, &start, &stop, step) < 0) return NULL;
-        if (step != 1) {
-            PyErr_SetString(PyExc_IndexError, "Efficient step is not supported");
-            return NULL;
+    if (kwargs) {
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        while (PyDict_Next(kwargs, &pos, &key, &value)) {
+            if (PyUnicode_CompareWithASCIIString(key, "separator") == 0) { separator_obj = value; }
+            else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) { maxsplit_obj = value; }
+            else if (PyUnicode_CompareWithASCIIString(key, "keepseparator") == 0) { keepseparator_obj = value; }
+            else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key))
+                return NULL;
         }
+    }
 
-        // Create a new `Str` object
-        Str *self_slice = (Str *)StrType.tp_alloc(&StrType, 0);
-        if (self_slice == NULL && PyErr_NoMemory()) return NULL;
+    sz_haystack_t text;
+    sz_needle_t separator;
+    int keepseparator;
+    Py_ssize_t maxsplit;
+    separator.anomaly_offset = 0;
 
-        // Set its properties based on the slice
-        self_slice->start = self->start + start;
-        self_slice->length = stop - start;
-        self_slice->parent = (PyObject *)self; // Set parent to keep it alive
+    // Validate and convert `text`
+    if (!export_string_like(text_obj, &text.start, &text.length)) {
+        PyErr_SetString(PyExc_TypeError, "The text argument must be string-like");
+        return NULL;
+    }
 
-        // Increment the reference count of the parent
-        Py_INCREF(self);
-        return (PyObject *)self_slice;
+    // Validate and convert `separator`
+    if (separator_obj) {
+        Py_ssize_t len;
+        if (!export_string_like(separator_obj, &separator.start, &len)) {
+            PyErr_SetString(PyExc_TypeError, "The separator argument must be string-like");
+            return NULL;
+        }
+        separator.length = (size_t)len;
     }
-    else if (PyLong_Check(key)) { return Str_getitem(self, PyLong_AsSsize_t(key)); }
     else {
-        PyErr_SetString(PyExc_TypeError, "Str indices must be integers or slices");
-        return NULL;
+        separator.start = " ";
+        separator.length = 1;
     }
-}
 
-static int Str_getbuffer(Str *self, Py_buffer *view, int flags) {
-    if (view == NULL) {
-        PyErr_SetString(PyExc_ValueError, "NULL view in getbuffer");
-        return -1;
+    // Validate and convert `keepseparator`
+    if (keepseparator_obj) {
+        keepseparator = PyObject_IsTrue(keepseparator_obj);
+        if (keepseparator == -1) {
+            PyErr_SetString(PyExc_TypeError, "The keepseparator argument must be a boolean");
+            return NULL;
+        }
     }
+    else { keepseparator = 0; }
 
-    static Py_ssize_t itemsize[1] = {1};
-    view->obj = (PyObject *)self;
-    view->buf = self->start;
-    view->len = self->length;
-    view->readonly = 1;
-    view->itemsize = sizeof(char);
-    view->format = "c"; // https://docs.python.org/3/library/struct.html#format-characters
-    view->ndim = 1;
-    view->shape = &self->length; // 1-D array, so shape is just a pointer to the length
-    view->strides = itemsize;    // strides in a 1-D array is just the item size
-    view->suboffsets = NULL;
-    view->internal = NULL;
-
-    Py_INCREF(self);
-    return 0;
-}
+    // Validate and convert `maxsplit`
+    if (maxsplit_obj) {
+        maxsplit = PyLong_AsSsize_t(maxsplit_obj);
+        if (maxsplit == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The maxsplit argument must be an integer");
+            return NULL;
+        }
+    }
+    else { maxsplit = PY_SSIZE_T_MAX; }
 
-static void Str_releasebuffer(PyObject *_, Py_buffer *view) {
-    // This function MUST NOT decrement view->obj, since that is done automatically
-    // in PyBuffer_Release() (this scheme is useful for breaking reference cycles).
-    // https://docs.python.org/3/c-api/typeobj.html#c.PyBufferProcs.bf_releasebuffer
+    return Str_split_(text_obj, text, separator, keepseparator, maxsplit);
 }
 
-static int Str_in(Str *self, PyObject *arg) {
+static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs) {
 
-    sz_needle_t needle_struct;
-    needle_struct.anomaly_offset = 0;
-    if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) {
-        PyErr_SetString(PyExc_TypeError, "Unsupported argument type");
-        return -1;
+    // Check minimum arguments
+    int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs < !is_member || nargs > !is_member + 2) {
+        PyErr_SetString(PyExc_TypeError, "splitlines() requires at least 1 argument");
+        return NULL;
     }
 
-    sz_haystack_t haystack;
-    haystack.start = self->start;
-    haystack.length = self->length;
-    size_t position = sz_neon_find_substr(haystack, needle_struct);
-    return position != haystack.length;
-}
+    PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0);
+    PyObject *keeplinebreaks_obj = nargs > !is_member ? PyTuple_GET_ITEM(args, !is_member) : NULL;
+    PyObject *maxsplit_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL;
 
-static Py_ssize_t Strs_len(Strs *self) {
-    switch (self->type) {
-    case STRS_CONSECUTIVE_32: return self->data.consecutive_32bit.count;
-    case STRS_CONSECUTIVE_64: return self->data.consecutive_64bit.count;
-    case STRS_REORDERED: return self->data.reordered.count;
-    case STRS_MULTI_SOURCE: return self->data.multi_source.count;
-    default: return 0;
+    if (kwargs) {
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        while (PyDict_Next(kwargs, &pos, &key, &value)) {
+            if (PyUnicode_CompareWithASCIIString(key, "keeplinebreaks") == 0) { keeplinebreaks_obj = value; }
+            else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) { maxsplit_obj = value; }
+            else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) { return NULL; }
+        }
     }
-}
 
-static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) {
-    // Check for negative index and convert to positive
-    Py_ssize_t count = Strs_len(self);
-    if (i < 0) i += count;
-    if (i < 0 || i >= count) {
-        PyErr_SetString(PyExc_IndexError, "Index out of range");
+    sz_haystack_t text;
+    int keeplinebreaks;
+    Py_ssize_t maxsplit = PY_SSIZE_T_MAX; // Default value for maxsplit
+
+    // Validate and convert `text`
+    if (!export_string_like(text_obj, &text.start, &text.length)) {
+        PyErr_SetString(PyExc_TypeError, "The text argument must be string-like");
         return NULL;
     }
 
-    PyObject *parent = NULL;
-    char const *start = NULL;
-    size_t length = 0;
-
-    // Extract a member element based on
-    switch (self->type) {
-    case STRS_CONSECUTIVE_32: {
-        uint32_t start_offset = (i == 0) ? 0 : self->data.consecutive_32bit.end_offsets[i - 1];
-        uint32_t end_offset = self->data.consecutive_32bit.end_offsets[i];
-        start = self->data.consecutive_32bit.start + start_offset;
-        length = end_offset - start_offset - self->data.consecutive_32bit.separator_length * (i + 1 != count);
-        parent = self->data.consecutive_32bit.parent;
-        break;
-    }
-    case STRS_CONSECUTIVE_64: {
-        uint64_t start_offset = (i == 0) ? 0 : self->data.consecutive_64bit.end_offsets[i - 1];
-        uint64_t end_offset = self->data.consecutive_64bit.end_offsets[i];
-        start = self->data.consecutive_64bit.start + start_offset;
-        length = end_offset - start_offset - self->data.consecutive_64bit.separator_length * (i + 1 != count);
-        parent = self->data.consecutive_64bit.parent;
-        break;
-    }
-    case STRS_REORDERED: {
-        //
-        break;
-    }
-    case STRS_MULTI_SOURCE: {
-        //
-        break;
-    }
-    default: PyErr_SetString(PyExc_TypeError, "Unknown Strs kind"); return NULL;
+    // Validate and convert `keeplinebreaks`
+    if (keeplinebreaks_obj) {
+        keeplinebreaks = PyObject_IsTrue(keeplinebreaks_obj);
+        if (keeplinebreaks == -1) {
+            PyErr_SetString(PyExc_TypeError, "The keeplinebreaks argument must be a boolean");
+            return NULL;
+        }
     }
+    else { keeplinebreaks = 0; }
 
-    // Create a new `Str` object
-    Str *parent_slice = (Str *)StrType.tp_alloc(&StrType, 0);
-    if (parent_slice == NULL && PyErr_NoMemory()) return NULL;
-
-    parent_slice->start = start;
-    parent_slice->length = length;
-    parent_slice->parent = parent;
-    Py_INCREF(parent);
-    return parent_slice;
-}
+    // Validate and convert `maxsplit`
+    if (maxsplit_obj) {
+        maxsplit = PyLong_AsSsize_t(maxsplit_obj);
+        if (maxsplit == -1 && PyErr_Occurred()) {
+            PyErr_SetString(PyExc_TypeError, "The maxsplit argument must be an integer");
+            return NULL;
+        }
+    }
 
-static PyObject *Strs_subscript(Str *self, PyObject *key) {
-    if (PyLong_Check(key)) return Strs_getitem(self, PyLong_AsSsize_t(key));
-    return NULL;
+    // TODO: Support arbitrary newline characters:
+    // https://docs.python.org/3/library/stdtypes.html#str.splitlines
+    // \n, \r, \r\n, \v or \x0b, \f or \x0c, \x1c, \x1d, \x1e, \x85, \u2028, \u2029
+    // https://github.com/ashvardanian/StringZilla/issues/29
+    sz_needle_t separator;
+    separator.start = "\n";
+    separator.length = 1;
+    return Str_split_(text_obj, text, separator, keeplinebreaks, maxsplit);
 }
 
-// Will be called by the `PySequence_Contains`
-static int Strs_contains(Str *self, PyObject *arg) { return 0; }
+static PyObject *Str_concat(PyObject *self, PyObject *other) {
+    struct sz_haystack_t self_str, other_str;
 
-static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) {
+    // Validate and convert `self`
+    if (!export_string_like(self, &self_str.start, &self_str.length)) {
+        PyErr_SetString(PyExc_TypeError, "The self object must be string-like");
+        return NULL;
+    }
 
-    char const *a_start, *b_start;
-    size_t a_length, b_length;
-    if (!export_string_like(self, &a_start, &a_length) || !export_string_like(other, &b_start, &b_length))
-        Py_RETURN_NOTIMPLEMENTED;
+    // Validate and convert `other`
+    if (!export_string_like(other, &other_str.start, &other_str.length)) {
+        PyErr_SetString(PyExc_TypeError, "The other object must be string-like");
+        return NULL;
+    }
 
-    // Perform byte-wise comparison up to the minimum length
-    size_t min_length = a_length < b_length ? a_length : b_length;
-    int cmp_result = memcmp(a_start, b_start, min_length);
+    // Allocate a new Str instance
+    Str *result_str = PyObject_New(Str, &StrType);
+    if (result_str == NULL) { return NULL; }
 
-    // If the strings are equal up to `min_length`, then the shorter string is smaller
-    if (cmp_result == 0) cmp_result = (a_length > b_length) - (a_length < b_length);
+    // Calculate the total length of the new string
+    result_str->parent = NULL;
+    result_str->length = self_str.length + other_str.length;
 
-    switch (op) {
-    case Py_LT: return PyBool_FromLong(cmp_result < 0);
-    case Py_LE: return PyBool_FromLong(cmp_result <= 0);
-    case Py_EQ: return PyBool_FromLong(cmp_result == 0);
-    case Py_NE: return PyBool_FromLong(cmp_result != 0);
-    case Py_GT: return PyBool_FromLong(cmp_result > 0);
-    case Py_GE: return PyBool_FromLong(cmp_result >= 0);
-    default: Py_RETURN_NOTIMPLEMENTED;
+    // Allocate memory for the new string
+    result_str->start = malloc(result_str->length);
+    if (result_str->start == NULL) {
+        PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for string concatenation");
+        return NULL;
     }
+
+    // Perform the string concatenation
+    memcpy(result_str->start, self_str.start, self_str.length);
+    memcpy(result_str->start + self_str.length, other_str.start, other_str.length);
+
+    return (PyObject *)result_str;
 }
 
 static PySequenceMethods Str_as_sequence = {
@@ -1203,6 +1324,7 @@ static PyMethodDef Str_methods[] = { //
     {"find", Str_find, sz_method_flags_m, "Find the first occurrence of a substring."},
     {"index", Str_index, sz_method_flags_m, "Find the first occurrence of a substring or raise error if missing."},
     {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."},
+    {"partition", Str_partition, sz_method_flags_m, "Splits string into 3-tuple: before, match, after."},
     {"count", Str_count, sz_method_flags_m, "Count the occurrences of a substring."},
     {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
     {"split", Str_split, sz_method_flags_m, "Split a string by a separator."},
@@ -1263,6 +1385,7 @@ static PyMethodDef stringzilla_methods[] = {
     {"find", Str_find, sz_method_flags_m, "Find the first occurrence of a substring."},
     {"index", Str_index, sz_method_flags_m, "Find the first occurrence of a substring or raise error if missing."},
     {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."},
+    {"partition", Str_partition, sz_method_flags_m, "Splits string into 3-tuple: before, match, after."},
     {"count", Str_count, sz_method_flags_m, "Count the occurrences of a substring."},
     {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
     {"split", Str_split, sz_method_flags_m, "Split a string by a separator."},

From d7f1f374f441d0d78750d24f555688e2e9afc93d Mon Sep 17 00:00:00 2001
From: Nairi Harutyunyan <nairi.haroutiounian@gmail.com>
Date: Thu, 21 Sep 2023 13:50:38 +0300
Subject: [PATCH 26/72] Draft verison of CountSubstrAPI

---
 javascript/lib.c            | 98 ++++++++++++++++++++++++++++++++++++-
 javascript/stringzilla.d.ts | 10 +++-
 javascript/test.js          |  8 ++-
 package-lock.json           |  4 +-
 4 files changed, 113 insertions(+), 7 deletions(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index d00bf2cd..b2e097ff 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -52,10 +52,104 @@ napi_value FindAPI(napi_env env, napi_callback_info info) {
     return js_result;
 }
 
+napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
+    size_t argc = 3;
+    napi_value args[3];
+    napi_get_cb_info(env, info, &argc, args, NULL, NULL);
+
+    // Extract the C string from the JavaScript string for haystack and needle
+    size_t str_size;
+    size_t str_len;
+
+    // For haystack
+    napi_get_value_string_utf8(env, args[0], NULL, 0, &str_size);
+    char *haystack = malloc(str_size + 1);
+    napi_get_value_string_utf8(env, args[0], haystack, str_size + 1, &str_len);
+    struct strzl_haystack_t strzl_haystack = {haystack, str_len};
+
+
+    // For needle
+    napi_get_value_string_utf8(env, args[1], NULL, 0, &str_size);
+    char *needle = malloc(str_size + 1);
+    napi_get_value_string_utf8(env, args[1], needle, str_size + 1, &str_len);
+    struct strzl_needle_t strzl_needle = {needle, str_len, 0};
+
+    bool overlap = false;
+    napi_get_value_bool(env, args[2], &overlap);
+
+    size_t haystack_l = strlen(haystack);
+    size_t needle_l = strlen(needle);
+
+    size_t result = 0;
+
+    if (haystack_l == 1)
+        result = count_char(haystack, *needle);
+    else if (haystack_l < needle_l)
+        result = 0;
+    else if (overlap) {
+        while (strlen(haystack)) {
+            #if defined(__AVX2__)
+                size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
+            #elif defined(__ARM_NEON)
+                size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle);
+            #else
+                size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle);
+            #endif
+
+
+            bool found = offset != haystack_l;
+            result += found;
+            haystack += offset + found;
+            haystack_l -= offset + found;
+        }
+    }
+
+    else {
+        while (haystack_l) {
+            #if defined(__AVX2__)
+                size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
+            #elif defined(__ARM_NEON)
+                size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle);
+            #else
+                size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle);
+            #endif
+
+            bool found = offset != haystack_l;
+            result += found;
+            haystack += offset + needle_l;
+            haystack_l -= offset + needle_l * found;
+        }
+    }
+
+    // Cleanup
+    free(haystack);
+    free(needle);
+
+    // Convert result to JavaScript BigInt and return
+    napi_value js_result;
+    napi_create_bigint_uint64(env, result, &js_result);
+
+    return js_result;
+}
+
 napi_value Init(napi_env env, napi_value exports) {
-    napi_property_descriptor desc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0};
-    napi_define_properties(env, exports, 1, &desc);
+    // Define the "find" property
+    napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0};
+    
+    // Define the "countSubstr" property
+    napi_property_descriptor countSubstrDesc = {"countSubstr", 0, CountSubstrAPI, 0, 0, 0, napi_default, 0};
+    
+    // Define an array of property descriptors
+    napi_property_descriptor properties[] = {findDesc, countSubstrDesc};
+    
+    // Define the number of properties in the array
+    size_t propertyCount = sizeof(properties) / sizeof(properties[0]);
+
+    // Define the properties on the exports object
+    napi_define_properties(env, exports, propertyCount, properties);
+
     return exports;
 }
 
 NAPI_MODULE(NODE_GYP_MODULE_NAME, Init)
+
diff --git a/javascript/stringzilla.d.ts b/javascript/stringzilla.d.ts
index 657e666f..57eff05b 100644
--- a/javascript/stringzilla.d.ts
+++ b/javascript/stringzilla.d.ts
@@ -6,4 +6,12 @@
  * @param {string} needle 
  */
 export function find(haystack: string, needle: string): bigint;
-  
\ No newline at end of file
+
+/**
+ * Searches for a substring in a larger string.
+ * 
+ * @param {string} haystack 
+ * @param {string} needle 
+ * @param {boolean} overlap 
+ */
+export function countSubstr(haystack: string, needle: string, overlap: boolean): bigint;
diff --git a/javascript/test.js b/javascript/test.js
index 084d55cd..04ea7280 100644
--- a/javascript/test.js
+++ b/javascript/test.js
@@ -1,7 +1,11 @@
 var assert = require('assert');
 var stringzilla = require('bindings')('stringzilla');
 
-const result = stringzilla.find("hello world", "world");
-console.log(result);  // Output will depend on the result of your findOperation function.
+const findResult = stringzilla.find("hello world", "world");
+console.log(findResult);  // Output will depend on the result of your findOperation function.
+
+const countResult = stringzilla.countSubstr("hello world", "world");
+console.log(countResult);  // Output will depend on the result of your countSubstr function.
+
 
 console.log('JavaScript tests passed!');
diff --git a/package-lock.json b/package-lock.json
index e577ab31..38555f5c 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "stringzilla",
-  "version": "1.2.0",
+  "version": "1.2.2",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "stringzilla",
-      "version": "1.2.0",
+      "version": "1.2.2",
       "license": "Apache 2.0",
       "dependencies": {
         "@types/node": "^20.4.5",

From ca77b0acfe0246f4a286972dfac0b0d2c71d0e17 Mon Sep 17 00:00:00 2001
From: Nairi Harutyunyan <nairi.haroutiounian@gmail.com>
Date: Thu, 21 Sep 2023 13:57:50 +0300
Subject: [PATCH 27/72] Add count_char function

---
 javascript/lib.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index b2e097ff..75b3548a 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -52,6 +52,11 @@ napi_value FindAPI(napi_env env, napi_callback_info info) {
     return js_result;
 }
 
+size_t count_char(strzl_haystack_t strzl_haystack, char needle) {
+    size_t result = strzl_naive_count_char(strzl_haystack, needle);
+    return result;
+}
+
 napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
     size_t argc = 3;
     napi_value args[3];
@@ -83,7 +88,7 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
     size_t result = 0;
 
     if (haystack_l == 1)
-        result = count_char(haystack, *needle);
+        result = count_char(strzl_haystack, needle);
     else if (haystack_l < needle_l)
         result = 0;
     else if (overlap) {

From 8abb7624ffceea013ef9e59411dfaf65748ae13c Mon Sep 17 00:00:00 2001
From: Nairi Harutyunyan <nairi.haroutiounian@gmail.com>
Date: Fri, 22 Sep 2023 00:46:15 +0300
Subject: [PATCH 28/72] Fix issues

---
 javascript/lib.c   | 63 ++++++++++++++++++++++------------------------
 javascript/test.js |  2 +-
 2 files changed, 31 insertions(+), 34 deletions(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index 75b3548a..b06e48f3 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -8,6 +8,7 @@
  *
  *  @see NodeJS docs: https://nodejs.org/api/n-api.html
  */
+#include <stdio.h>
 #include <node_api.h>
 #include <stringzilla.h>
 
@@ -17,20 +18,20 @@ napi_value FindAPI(napi_env env, napi_callback_info info) {
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    size_t str_size;
-    size_t str_len;
+    size_t haystack_l;
+    size_t needle_l;
 
     // For haystack
-    napi_get_value_string_utf8(env, args[0], NULL, 0, &str_size);
-    char *haystack = malloc(str_size + 1);
-    napi_get_value_string_utf8(env, args[0], haystack, str_size + 1, &str_len);
-    struct strzl_haystack_t strzl_haystack = {haystack, str_len};
+    napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_l);
+    char *haystack = malloc(haystack_l + 1);
+    napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &needle_l);
+    struct strzl_haystack_t strzl_haystack = {haystack, needle_l};
 
     // For needle
-    napi_get_value_string_utf8(env, args[1], NULL, 0, &str_size);
-    char *needle = malloc(str_size + 1);
-    napi_get_value_string_utf8(env, args[1], needle, str_size + 1, &str_len);
-    struct strzl_needle_t strzl_needle = {needle, str_len, 0};
+    napi_get_value_string_utf8(env, args[1], NULL, 0, &haystack_l);
+    char *needle = malloc(haystack_l + 1);
+    napi_get_value_string_utf8(env, args[1], needle, haystack_l + 1, &needle_l);
+    struct strzl_needle_t strzl_needle = {needle, needle_l, 0};
 
 // Perform the find operation
 #if defined(__AVX2__)
@@ -54,6 +55,7 @@ napi_value FindAPI(napi_env env, napi_callback_info info) {
 
 size_t count_char(strzl_haystack_t strzl_haystack, char needle) {
     size_t result = strzl_naive_count_char(strzl_haystack, needle);
+
     return result;
 }
 
@@ -63,28 +65,24 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    size_t str_size;
-    size_t str_len;
+    size_t haystack_l;
+    size_t needle_l;
 
     // For haystack
-    napi_get_value_string_utf8(env, args[0], NULL, 0, &str_size);
-    char *haystack = malloc(str_size + 1);
-    napi_get_value_string_utf8(env, args[0], haystack, str_size + 1, &str_len);
-    struct strzl_haystack_t strzl_haystack = {haystack, str_len};
-
+    napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_l);
+    char *haystack = malloc(haystack_l + 1);
+    napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &needle_l);
+    struct strzl_haystack_t strzl_haystack = {haystack, needle_l};
 
     // For needle
-    napi_get_value_string_utf8(env, args[1], NULL, 0, &str_size);
-    char *needle = malloc(str_size + 1);
-    napi_get_value_string_utf8(env, args[1], needle, str_size + 1, &str_len);
-    struct strzl_needle_t strzl_needle = {needle, str_len, 0};
+    napi_get_value_string_utf8(env, args[1], NULL, 0, &haystack_l);
+    char *needle = malloc(haystack_l + 1);
+    napi_get_value_string_utf8(env, args[1], needle, haystack_l + 1, &needle_l);
+    struct strzl_needle_t strzl_needle = {needle, needle_l, 0};
 
     bool overlap = false;
     napi_get_value_bool(env, args[2], &overlap);
 
-    size_t haystack_l = strlen(haystack);
-    size_t needle_l = strlen(needle);
-
     size_t result = 0;
 
     if (haystack_l == 1)
@@ -92,7 +90,7 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
     else if (haystack_l < needle_l)
         result = 0;
     else if (overlap) {
-        while (strlen(haystack)) {
+        while (strzl_haystack.len) {
             #if defined(__AVX2__)
                 size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
             #elif defined(__ARM_NEON)
@@ -101,16 +99,15 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
                 size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle);
             #endif
 
-
-            bool found = offset != haystack_l;
+            bool found = offset != strzl_haystack.len;
             result += found;
-            haystack += offset + found;
-            haystack_l -= offset + found;
+            strzl_haystack.ptr += offset + found;
+            strzl_haystack.len -= offset + found;
         }
     }
 
     else {
-        while (haystack_l) {
+        while (strzl_haystack.len) {
             #if defined(__AVX2__)
                 size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
             #elif defined(__ARM_NEON)
@@ -119,10 +116,10 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
                 size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle);
             #endif
 
-            bool found = offset != haystack_l;
+            bool found = offset != strzl_haystack.len;
             result += found;
-            haystack += offset + needle_l;
-            haystack_l -= offset + needle_l * found;
+            strzl_haystack.ptr += offset + needle_l;
+            strzl_haystack.len -= offset + needle_l * found;
         }
     }
 
diff --git a/javascript/test.js b/javascript/test.js
index 04ea7280..18ea11b2 100644
--- a/javascript/test.js
+++ b/javascript/test.js
@@ -4,7 +4,7 @@ var stringzilla = require('bindings')('stringzilla');
 const findResult = stringzilla.find("hello world", "world");
 console.log(findResult);  // Output will depend on the result of your findOperation function.
 
-const countResult = stringzilla.countSubstr("hello world", "world");
+const countResult = stringzilla.countSubstr("abababab", "aba", true);
 console.log(countResult);  // Output will depend on the result of your countSubstr function.
 
 

From c1953fade9563b8d6627d05662ee1025773b4558 Mon Sep 17 00:00:00 2001
From: Nairi Harutyunyan <nairi.haroutiounian@gmail.com>
Date: Fri, 22 Sep 2023 00:53:29 +0300
Subject: [PATCH 29/72] remove stdio

---
 javascript/lib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index b06e48f3..830b577e 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -8,7 +8,7 @@
  *
  *  @see NodeJS docs: https://nodejs.org/api/n-api.html
  */
-#include <stdio.h>
+ 
 #include <node_api.h>
 #include <stringzilla.h>
 

From 2758c3c15e42f89c3bfd202144690f44fd6b320c Mon Sep 17 00:00:00 2001
From: Nairi Harutyunyan <nairi.haroutiounian@gmail.com>
Date: Fri, 22 Sep 2023 01:12:57 +0300
Subject: [PATCH 30/72] In JavaScript if find unable to find the specified
 value then it should return -1

---
 javascript/lib.c | 52 +++++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index d00bf2cd..a092e242 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -8,38 +8,50 @@
  *
  *  @see NodeJS docs: https://nodejs.org/api/n-api.html
  */
+
 #include <node_api.h>
 #include <stringzilla.h>
 
 napi_value FindAPI(napi_env env, napi_callback_info info) {
-    size_t argc = 2;
+size_t argc = 2;
     napi_value args[2];
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    size_t str_size;
-    size_t str_len;
+    size_t haystack_l;
+    size_t needle_l;
 
     // For haystack
-    napi_get_value_string_utf8(env, args[0], NULL, 0, &str_size);
-    char *haystack = malloc(str_size + 1);
-    napi_get_value_string_utf8(env, args[0], haystack, str_size + 1, &str_len);
-    struct strzl_haystack_t strzl_haystack = {haystack, str_len};
+    napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_l);
+    char *haystack = malloc(haystack_l + 1);
+    napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &needle_l);
+    struct strzl_haystack_t strzl_haystack = {haystack, needle_l};
 
     // For needle
-    napi_get_value_string_utf8(env, args[1], NULL, 0, &str_size);
-    char *needle = malloc(str_size + 1);
-    napi_get_value_string_utf8(env, args[1], needle, str_size + 1, &str_len);
-    struct strzl_needle_t strzl_needle = {needle, str_len, 0};
-
-// Perform the find operation
-#if defined(__AVX2__)
-    uint64_t result = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
-#elif defined(__ARM_NEON)
-    uint64_t result = strzl_neon_find_substr(strzl_haystack, strzl_needle);
-#else
-    uint64_t result = strzl_naive_find_substr(strzl_haystack, strzl_needle);
-#endif
+    napi_get_value_string_utf8(env, args[1], NULL, 0, &haystack_l);
+    char *needle = malloc(haystack_l + 1);
+    napi_get_value_string_utf8(env, args[1], needle, haystack_l + 1, &needle_l);
+    struct strzl_needle_t strzl_needle = {needle, needle_l, 0};
+
+    // Perform the find operation
+    #if defined(__AVX2__)
+        uint64_t result = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
+    #elif defined(__ARM_NEON)
+        uint64_t result = strzl_neon_find_substr(strzl_haystack, strzl_needle);
+    #else
+        uint64_t result = strzl_naive_find_substr(strzl_haystack, strzl_needle);
+    #endif
+
+    // Restore length of haystack as it's lost
+    haystack_l = strlen(haystack);
+
+    // In JavaScript if find unable to find the specified value then it should return -1
+    if (haystack_l == (size_t)result) {
+        napi_value js_result;
+        napi_create_int32(env, -1, &js_result);
+
+        return js_result;
+    }
 
     // Cleanup
     free(haystack);

From c7eb66868603b3c0ac21a724b485d9987a01667a Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Fri, 22 Sep 2023 09:57:36 +0400
Subject: [PATCH 31/72] Add: Shuffling method in Python

---
 python/lib.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/python/lib.c b/python/lib.c
index 8459cc30..dbf41114 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -1351,6 +1351,74 @@ static PyTypeObject StrType = {
     .tp_as_number = &Str_as_number,
 };
 
+#pragma endregion
+
+#pragma regions Strs
+
+static PyObject *Strs_shuffle(Strs *self, PyObject *args, PyObject *kwargs) {
+    unsigned int seed = time(NULL); // Default seed
+
+    // Check for positional arguments
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs > 1) {
+        PyErr_SetString(PyExc_TypeError, "shuffle() takes at most 1 positional argument");
+        return NULL;
+    }
+    else if (nargs == 1) {
+        PyObject *seed_obj = PyTuple_GET_ITEM(args, 0);
+        if (!PyLong_Check(seed_obj)) {
+            PyErr_SetString(PyExc_TypeError, "The seed must be an integer");
+            return NULL;
+        }
+        seed = PyLong_AsUnsignedLong(seed_obj);
+    }
+
+    // Check for keyword arguments
+    if (kwargs) {
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        while (PyDict_Next(kwargs, &pos, &key, &value)) {
+            if (PyUnicode_CompareWithASCIIString(key, "seed") == 0) {
+                if (nargs == 1) {
+                    PyErr_SetString(PyExc_TypeError, "Received seed both as positional and keyword argument");
+                    return NULL;
+                }
+                if (!PyLong_Check(value)) {
+                    PyErr_SetString(PyExc_TypeError, "The seed must be an integer");
+                    return NULL;
+                }
+                seed = PyLong_AsUnsignedLong(value);
+            }
+            else {
+                PyErr_Format(PyExc_TypeError, "Received an unexpected keyword argument '%U'", key);
+                return NULL;
+            }
+        }
+    }
+
+    // Change the layout
+    if (!prepare_strings_for_reordering(self)) {
+        PyErr_Format(PyExc_TypeError, "Failed to prepare the sequence for shuffling");
+        return NULL;
+    }
+
+    // Get the parts and their count
+    struct reordered_slices_t *reordered = &self->data.reordered;
+    sz_haystack_t *parts = reordered->parts;
+    size_t count = reordered->count;
+
+    // Fisher-Yates Shuffle Algorithm
+    for (size_t i = count - 1; i > 0; --i) {
+        size_t j = rand() % (i + 1);
+        // Swap parts[i] and parts[j]
+        sz_haystack_t temp = parts[i];
+        parts[i] = parts[j];
+        parts[j] = temp;
+    }
+
+    Py_RETURN_NONE;
+}
+
 static PySequenceMethods Strs_as_sequence = {
     .sq_length = Strs_len,        //
     .sq_item = Strs_getitem,      //
@@ -1362,6 +1430,10 @@ static PyMappingMethods Strs_as_mapping = {
     .mp_subscript = Strs_subscript, // Is used to implement slices in Python
 };
 
+static PyMethodDef Strs_methods[] = {
+    {"shuffle", Strs_shuffle, sz_method_flags_m, "Shuffle the elements of the Strs object."}, //
+    {NULL, NULL, 0, NULL}};
+
 static PyTypeObject StrsType = {
     PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.Strs",
     .tp_doc = "Space-efficient container for large collections of strings and their slices",
@@ -1369,6 +1441,7 @@ static PyTypeObject StrsType = {
     .tp_itemsize = 0,
     .tp_flags = Py_TPFLAGS_DEFAULT,
     .tp_new = PyType_GenericNew,
+    .tp_methods = Strs_methods,
     .tp_as_sequence = &Strs_as_sequence,
     .tp_as_mapping = &Strs_as_mapping,
 };

From 5a2f72d2267433c3696863d82dd93dd3a34dd0d4 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Fri, 22 Sep 2023 09:57:48 +0400
Subject: [PATCH 32/72] Make: Colorful diagnostics

---
 .vscode/settings.json | 1 +
 setup.py              | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 97c0113c..c32a469d 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -149,6 +149,7 @@
     "NOMINMAX",
     "NOTIMPLEMENTED",
     "pytest",
+    "Pythonic",
     "quadgram",
     "readlines",
     "releasebuffer",
diff --git a/setup.py b/setup.py
index cb136b1d..f667c0f2 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,7 @@
     compile_args.append("-O3")
     compile_args.append("-pedantic")
     compile_args.append("-Wno-unknown-pragmas")
+    compile_args.append("-fdiagnostics-color=always")
     compile_args.append("-fopenmp")
     link_args.append("-lgomp")
 
@@ -40,6 +41,7 @@
     compile_args.append("-Wno-unknown-pragmas")
     compile_args.append("-Wno-incompatible-function-pointer-types")
     compile_args.append("-Wno-incompatible-pointer-types")
+    compile_args.append("-fcolor-diagnostics")
     compile_args.append("-Xpreprocessor -fopenmp")
     link_args.append("-Xpreprocessor -lomp")
 

From 577554cc818025c53c67fb2a7acacdaa031d5a3a Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Fri, 22 Sep 2023 10:07:06 +0400
Subject: [PATCH 33/72] Make: `AlwaysBreakBeforeMultilineStrings`

---
 .clang-format | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.clang-format b/.clang-format
index ab9f350a..11877ff7 100644
--- a/.clang-format
+++ b/.clang-format
@@ -24,7 +24,6 @@ AllowShortLambdasOnASingleLine: true
 AllowShortLoopsOnASingleLine: false
 AlwaysBreakBeforeMultilineStrings: true
 AlwaysBreakTemplateDeclarations: Yes
-AlwaysBreakBeforeMultilineStrings: true
 AlwaysBreakAfterReturnType: None
 PenaltyReturnTypeOnItsOwnLine: 200
 

From 021b29830911d50692500fd3fdcf77c634170514 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Fri, 22 Sep 2023 10:07:30 +0400
Subject: [PATCH 34/72] Fix: Memory leak and extra `strlen` calls

---
 javascript/lib.c | 93 ++++++++++++++++++++++--------------------------
 1 file changed, 43 insertions(+), 50 deletions(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index 282d7066..8991b37a 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -12,45 +12,34 @@
 #include <stringzilla.h>
 
 napi_value FindAPI(napi_env env, napi_callback_info info) {
-size_t argc = 2;
+    size_t argc = 2;
     napi_value args[2];
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    size_t haystack_l;
-    size_t needle_l;
+    struct strzl_haystack_t strzl_haystack = {NULL, 0};
+    struct strzl_needle_t strzl_needle = {NULL, 0, 0};
 
     // For haystack
-    napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_l);
-    char *haystack = malloc(haystack_l + 1);
-    napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &needle_l);
-    struct strzl_haystack_t strzl_haystack = {haystack, needle_l};
+    napi_get_value_string_utf8(env, args[0], NULL, 0, &strzl_haystack.len);
+    char *haystack = malloc(strzl_haystack.len);
+    napi_get_value_string_utf8(env, args[0], haystack, strzl_haystack.len, &strzl_haystack.len);
+    strzl_haystack.ptr = haystack;
 
     // For needle
-    napi_get_value_string_utf8(env, args[1], NULL, 0, &haystack_l);
-    char *needle = malloc(haystack_l + 1);
-    napi_get_value_string_utf8(env, args[1], needle, haystack_l + 1, &needle_l);
-    struct strzl_needle_t strzl_needle = {needle, needle_l, 0};
-
-    // Perform the find operation
-    #if defined(__AVX2__)
-        uint64_t result = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
-    #elif defined(__ARM_NEON)
-        uint64_t result = strzl_neon_find_substr(strzl_haystack, strzl_needle);
-    #else
-        uint64_t result = strzl_naive_find_substr(strzl_haystack, strzl_needle);
-    #endif
-
-    // Restore length of haystack as it's lost
-    haystack_l = strlen(haystack);
-
-    // In JavaScript if find unable to find the specified value then it should return -1
-    if (haystack_l == (size_t)result) {
-        napi_value js_result;
-        napi_create_int32(env, -1, &js_result);
-
-        return js_result;
-    }
+    napi_get_value_string_utf8(env, args[1], NULL, 0, &strzl_needle.len);
+    char *needle = malloc(strzl_needle.len);
+    napi_get_value_string_utf8(env, args[1], needle, strzl_needle.len, &strzl_needle.len);
+    strzl_needle.ptr = needle;
+
+// Perform the find operation
+#if defined(__AVX2__)
+    uint64_t result = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
+#elif defined(__ARM_NEON)
+    uint64_t result = strzl_neon_find_substr(strzl_haystack, strzl_needle);
+#else
+    uint64_t result = strzl_naive_find_substr(strzl_haystack, strzl_needle);
+#endif
 
     // Cleanup
     free(haystack);
@@ -58,7 +47,12 @@ size_t argc = 2;
 
     // Convert result to JavaScript BigInt and return
     napi_value js_result;
-    napi_create_bigint_uint64(env, result, &js_result);
+
+    // In JavaScript if find unable to find the specified value then it should return -1
+    if (result = strzl_haystack.len)
+        napi_create_bigint_int64(env, -1, &js_result);
+    else
+        napi_create_bigint_uint64(env, result, &js_result);
 
     return js_result;
 }
@@ -101,13 +95,13 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
         result = 0;
     else if (overlap) {
         while (strzl_haystack.len) {
-            #if defined(__AVX2__)
-                size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
-            #elif defined(__ARM_NEON)
-                size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle);
-            #else
-                size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle);
-            #endif
+#if defined(__AVX2__)
+            size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
+#elif defined(__ARM_NEON)
+            size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle);
+#else
+            size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle);
+#endif
 
             bool found = offset != strzl_haystack.len;
             result += found;
@@ -118,13 +112,13 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
 
     else {
         while (strzl_haystack.len) {
-            #if defined(__AVX2__)
-                size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
-            #elif defined(__ARM_NEON)
-                size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle);
-            #else
-                size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle);
-            #endif
+#if defined(__AVX2__)
+            size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
+#elif defined(__ARM_NEON)
+            size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle);
+#else
+            size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle);
+#endif
 
             bool found = offset != strzl_haystack.len;
             result += found;
@@ -147,13 +141,13 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
 napi_value Init(napi_env env, napi_value exports) {
     // Define the "find" property
     napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0};
-    
+
     // Define the "countSubstr" property
     napi_property_descriptor countSubstrDesc = {"countSubstr", 0, CountSubstrAPI, 0, 0, 0, napi_default, 0};
-    
+
     // Define an array of property descriptors
     napi_property_descriptor properties[] = {findDesc, countSubstrDesc};
-    
+
     // Define the number of properties in the array
     size_t propertyCount = sizeof(properties) / sizeof(properties[0]);
 
@@ -164,4 +158,3 @@ napi_value Init(napi_env env, napi_value exports) {
 }
 
 NAPI_MODULE(NODE_GYP_MODULE_NAME, Init)
-

From 880f4371fccc6c925e42b514c302696496638862 Mon Sep 17 00:00:00 2001
From: Nairi Harutyunyan <nairi.haroutiounian@gmail.com>
Date: Sat, 23 Sep 2023 00:51:45 +0300
Subject: [PATCH 35/72] Improvements, add test cases

---
 .vscode/settings.json          |  3 ++-
 javascript/lib.c               | 11 +++++----
 javascript/test.js             | 11 ---------
 javascript/test/countSubstr.js | 44 ++++++++++++++++++++++++++++++++++
 javascript/test/find.js        | 28 ++++++++++++++++++++++
 package.json                   |  3 ++-
 6 files changed, 82 insertions(+), 18 deletions(-)
 delete mode 100644 javascript/test.js
 create mode 100644 javascript/test/countSubstr.js
 create mode 100644 javascript/test/find.js

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 3ebc8b24..5bb0127a 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -116,7 +116,8 @@
     "stop_token": "cpp",
     "__verbose_abort": "cpp",
     "strstream": "cpp",
-    "filesystem": "cpp"
+    "filesystem": "cpp",
+    "stringzilla.h": "c"
   },
   "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
   "cSpell.words": [
diff --git a/javascript/lib.c b/javascript/lib.c
index 8991b37a..0a2ee655 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -8,6 +8,7 @@
  *
  *  @see NodeJS docs: https://nodejs.org/api/n-api.html
  */
+
 #include <node_api.h>
 #include <stringzilla.h>
 
@@ -49,7 +50,7 @@ napi_value FindAPI(napi_env env, napi_callback_info info) {
     napi_value js_result;
 
     // In JavaScript if find unable to find the specified value then it should return -1
-    if (result = strzl_haystack.len)
+    if (result == 0)
         napi_create_bigint_int64(env, -1, &js_result);
     else
         napi_create_bigint_uint64(env, result, &js_result);
@@ -75,8 +76,8 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
     // For haystack
     napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_l);
     char *haystack = malloc(haystack_l + 1);
-    napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &needle_l);
-    struct strzl_haystack_t strzl_haystack = {haystack, needle_l};
+    napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &haystack_l);
+    struct strzl_haystack_t strzl_haystack = {haystack, haystack_l};
 
     // For needle
     napi_get_value_string_utf8(env, args[1], NULL, 0, &haystack_l);
@@ -89,8 +90,8 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
 
     size_t result = 0;
 
-    if (haystack_l == 1)
-        result = count_char(strzl_haystack, needle);
+    if (needle_l == 1 || needle_l == 0)
+        result = count_char(strzl_haystack, needle[0]);
     else if (haystack_l < needle_l)
         result = 0;
     else if (overlap) {
diff --git a/javascript/test.js b/javascript/test.js
deleted file mode 100644
index 18ea11b2..00000000
--- a/javascript/test.js
+++ /dev/null
@@ -1,11 +0,0 @@
-var assert = require('assert');
-var stringzilla = require('bindings')('stringzilla');
-
-const findResult = stringzilla.find("hello world", "world");
-console.log(findResult);  // Output will depend on the result of your findOperation function.
-
-const countResult = stringzilla.countSubstr("abababab", "aba", true);
-console.log(countResult);  // Output will depend on the result of your countSubstr function.
-
-
-console.log('JavaScript tests passed!');
diff --git a/javascript/test/countSubstr.js b/javascript/test/countSubstr.js
new file mode 100644
index 00000000..973ba541
--- /dev/null
+++ b/javascript/test/countSubstr.js
@@ -0,0 +1,44 @@
+import test from 'node:test';
+import bindings from 'bindings';
+import assert from 'node:assert';
+
+const stringzilla = bindings('stringzilla');
+
+test('Count Words - Single Occurrence', () => {
+    const result = stringzilla.countSubstr('hello world', 'world');
+
+    assert.strictEqual(result, 1n);
+});
+
+test('Count Words - Multiple Occurrence', () => {
+    const result = stringzilla.countSubstr('hello world, hello John', 'hello');
+
+    assert.strictEqual(result, 2n);
+});
+
+test('Count Words - Multiple Occurrences with Overlap Test', () => {
+    const result_1 = stringzilla.countSubstr('abababab', 'aba');
+
+    assert.strictEqual(result_1, 2n);
+
+    const result_2 = stringzilla.countSubstr('abababab', 'aba', true);
+
+    assert.strictEqual(result_2, 3n);
+});
+
+test('Count Words - No Occurrence', () => {
+    const result = stringzilla.countSubstr('hello world', 'hi');
+
+    assert.strictEqual(result, 0n);
+});
+
+test('Count Words - Empty String Inputs', () => {
+    const result_1 = stringzilla.countSubstr('hello world', '');
+    assert.strictEqual(result_1, 0n);
+
+    const result_2 = stringzilla.countSubstr('', 'hi');
+    assert.strictEqual(result_2, 0n);
+
+    const result_3 = stringzilla.countSubstr('', '');
+    assert.strictEqual(result_3, 0n);
+});
diff --git a/javascript/test/find.js b/javascript/test/find.js
new file mode 100644
index 00000000..f0f1ea45
--- /dev/null
+++ b/javascript/test/find.js
@@ -0,0 +1,28 @@
+import test from 'node:test';
+import bindings from 'bindings';
+import assert from 'node:assert';
+
+const stringzilla = bindings('stringzilla');
+
+test('Find Word in Text - Positive Case', () => {
+    const result = stringzilla.find('hello world, hello john', 'world');
+
+    assert.strictEqual(result, 6n);
+});
+
+test('Find Word in Text - Negative Case (Word Not Found)', () => {
+    const result = stringzilla.find('hello world', 'hi');
+
+    assert.strictEqual(result, -1n);
+});
+
+test('Find Word in Text - Negative Case (Empty String Inputs)', () => {
+    const result_1 = stringzilla.find('hello world', '');
+    assert.strictEqual(result_1, -1n);
+
+    const result_2 = stringzilla.find('', 'a');
+    assert.strictEqual(result_2, -1n);
+
+    const result_3 = stringzilla.find('', '');
+    assert.strictEqual(result_2, -1n);
+});
diff --git a/package.json b/package.json
index a1bab16c..e7ff8597 100644
--- a/package.json
+++ b/package.json
@@ -5,6 +5,7 @@
   "author": "Ash Vardanian",
   "license": "Apache 2.0",
   "main": "javascript/stringzilla.js",
+  "type": "module",
   "repository": {
     "type": "git",
     "url": "https://github.com/ashvardanian/stringzilla.git"
@@ -19,7 +20,7 @@
     "node-addon-api": "^3.0.0"
   },
   "scripts": {
-    "test": "node javascript/test.js"
+    "test": "node --test ./javascript/test"
   },
   "devDependencies": {
     "@semantic-release/exec": "^6.0.3",

From 5b395968bbedbd8e2f65cf3139a4d3c7a2545027 Mon Sep 17 00:00:00 2001
From: Nairi Harutyunyan <nairi.haroutiounian@gmail.com>
Date: Sat, 23 Sep 2023 01:10:20 +0300
Subject: [PATCH 36/72] Fix condition in find function

---
 javascript/lib.c        |  2 +-
 javascript/test/find.js | 12 +++++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index 0a2ee655..616a3e81 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -50,7 +50,7 @@ napi_value FindAPI(napi_env env, napi_callback_info info) {
     napi_value js_result;
 
     // In JavaScript if find unable to find the specified value then it should return -1
-    if (result == 0)
+    if (result == strzl_haystack.len)
         napi_create_bigint_int64(env, -1, &js_result);
     else
         napi_create_bigint_uint64(env, result, &js_result);
diff --git a/javascript/test/find.js b/javascript/test/find.js
index f0f1ea45..cd2a800d 100644
--- a/javascript/test/find.js
+++ b/javascript/test/find.js
@@ -5,20 +5,22 @@ import assert from 'node:assert';
 const stringzilla = bindings('stringzilla');
 
 test('Find Word in Text - Positive Case', () => {
-    const result = stringzilla.find('hello world, hello john', 'world');
+    const result = stringzilla.find('hello world, hello john', 'hello');
 
-    assert.strictEqual(result, 6n);
+    assert.strictEqual(result, 0n);
 });
 
 test('Find Word in Text - Negative Case (Word Not Found)', () => {
-    const result = stringzilla.find('hello world', 'hi');
+    const result_1 = stringzilla.find('ha', 'aaa');
+    assert.strictEqual(result_1, -1n);
 
-    assert.strictEqual(result, -1n);
+    const result_2 = stringzilla.find('g', 'a');
+    assert.strictEqual(result_2, -1n);
 });
 
 test('Find Word in Text - Negative Case (Empty String Inputs)', () => {
     const result_1 = stringzilla.find('hello world', '');
-    assert.strictEqual(result_1, -1n);
+    assert.strictEqual(result_1, 0n);
 
     const result_2 = stringzilla.find('', 'a');
     assert.strictEqual(result_2, -1n);

From 1a6a8e496f446a0f450c5f955967b0ef64c225b2 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 23 Sep 2023 08:44:07 +0100
Subject: [PATCH 37/72] Improve: Use less temp. variables to count matches

---
 javascript/lib.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index 616a3e81..c2098a08 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -46,10 +46,10 @@ napi_value FindAPI(napi_env env, napi_callback_info info) {
     free(haystack);
     free(needle);
 
-    // Convert result to JavaScript BigInt and return
+    // Convert the result to JavaScript BigInt and return
     napi_value js_result;
 
-    // In JavaScript if find unable to find the specified value then it should return -1
+    // In JavaScript, if `find` is unable to find the specified value, then it should return -1
     if (result == strzl_haystack.len)
         napi_create_bigint_int64(env, -1, &js_result);
     else
@@ -70,30 +70,30 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    size_t haystack_l;
-    size_t needle_l;
+    struct strzl_haystack_t strzl_haystack = {NULL, 0};
+    struct strzl_needle_t strzl_needle = {NULL, 0, 0};
 
     // For haystack
-    napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_l);
-    char *haystack = malloc(haystack_l + 1);
-    napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &haystack_l);
-    struct strzl_haystack_t strzl_haystack = {haystack, haystack_l};
+    napi_get_value_string_utf8(env, args[0], NULL, 0, &strzl_haystack.len);
+    char *haystack = malloc(strzl_haystack.len);
+    napi_get_value_string_utf8(env, args[0], haystack, strzl_haystack.len, &strzl_haystack.len);
+    strzl_haystack.ptr = haystack;
 
     // For needle
-    napi_get_value_string_utf8(env, args[1], NULL, 0, &haystack_l);
-    char *needle = malloc(haystack_l + 1);
-    napi_get_value_string_utf8(env, args[1], needle, haystack_l + 1, &needle_l);
-    struct strzl_needle_t strzl_needle = {needle, needle_l, 0};
+    napi_get_value_string_utf8(env, args[1], NULL, 0, &strzl_needle.len);
+    char *needle = malloc(strzl_needle.len);
+    napi_get_value_string_utf8(env, args[1], needle, strzl_needle.len, &strzl_needle.len);
+    strzl_needle.ptr = needle;
 
     bool overlap = false;
     napi_get_value_bool(env, args[2], &overlap);
 
-    size_t result = 0;
+    size_t result;
 
-    if (needle_l == 1 || needle_l == 0)
-        result = count_char(strzl_haystack, needle[0]);
-    else if (haystack_l < needle_l)
+    if (strzl_needle.len == 0 || strzl_haystack.len == 0 || strzl_haystack.len < strzl_needle.len) {
         result = 0;
+    else if (strzl_needle.len == 1)
+        result = count_char(strzl_haystack, strzl_needle.ptr[0]);
     else if (overlap) {
         while (strzl_haystack.len) {
 #if defined(__AVX2__)
@@ -123,8 +123,8 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
 
             bool found = offset != strzl_haystack.len;
             result += found;
-            strzl_haystack.ptr += offset + needle_l;
-            strzl_haystack.len -= offset + needle_l * found;
+            strzl_haystack.ptr += offset + strzl_needle.len;
+            strzl_haystack.len -= offset + strzl_needle.len * found;
         }
     }
 
@@ -132,7 +132,7 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
     free(haystack);
     free(needle);
 
-    // Convert result to JavaScript BigInt and return
+    // Convert the result to JavaScript `BigInt` and return
     napi_value js_result;
     napi_create_bigint_uint64(env, result, &js_result);
 
@@ -152,7 +152,7 @@ napi_value Init(napi_env env, napi_value exports) {
     // Define the number of properties in the array
     size_t propertyCount = sizeof(properties) / sizeof(properties[0]);
 
-    // Define the properties on the exports object
+    // Define the properties on the `exports` object
     napi_define_properties(env, exports, propertyCount, properties);
 
     return exports;

From 98fb43b4d5074cb104e0dd55e285485b2f3ba69d Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 23 Sep 2023 23:40:35 +0400
Subject: [PATCH 38/72] Make: Deduplicate `.clang-format` settings

---
 .clang-format | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.clang-format b/.clang-format
index b1adf3b0..e0f25893 100644
--- a/.clang-format
+++ b/.clang-format
@@ -22,7 +22,6 @@ AllowShortFunctionsOnASingleLine: true
 AllowShortIfStatementsOnASingleLine: Always
 AllowShortLambdasOnASingleLine: true
 AllowShortLoopsOnASingleLine: true
-AlwaysBreakBeforeMultilineStrings: true
 AlwaysBreakTemplateDeclarations: Yes
 AlwaysBreakBeforeMultilineStrings: true
 AlwaysBreakAfterReturnType: None

From 48213a85cb595184da30ecbddccf899bc58ee625 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 23 Sep 2023 23:41:00 +0400
Subject: [PATCH 39/72] Make: Add NumPy dependency

---
 .vscode/settings.json | 2 ++
 pyproject.toml        | 2 +-
 setup.py              | 4 +++-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index c32a469d..36d0a490 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -131,6 +131,7 @@
     "getitem",
     "getslice",
     "initproc",
+    "intp",
     "itemsize",
     "keeplinebreaks",
     "keepseparator",
@@ -148,6 +149,7 @@
     "NOARGS",
     "NOMINMAX",
     "NOTIMPLEMENTED",
+    "numpy",
     "pytest",
     "Pythonic",
     "quadgram",
diff --git a/pyproject.toml b/pyproject.toml
index fe8221c7..e12df96a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=42", "wheel", "cmake>=3.22"]
+requires = ["setuptools>=42", "wheel", "cmake>=3.22", "numpy"]
 build-backend = "setuptools.build_meta"
 
 [tool.pytest.ini_options]
diff --git a/setup.py b/setup.py
index f667c0f2..1b8d83ce 100644
--- a/setup.py
+++ b/setup.py
@@ -3,6 +3,7 @@
 import platform
 from setuptools import setup, Extension
 
+import numpy as np
 
 compile_args = []
 link_args = []
@@ -54,7 +55,7 @@
     Extension(
         "stringzilla",
         ["python/lib.c"],
-        include_dirs=["stringzilla"],
+        include_dirs=["stringzilla", np.get_include()],
         extra_compile_args=compile_args,
         extra_link_args=link_args,
         define_macros=macros_args,
@@ -98,5 +99,6 @@
         "Topic :: Text Processing :: Indexing",
     ],
     include_dirs=[],
+    setup_requires=["numpy"],
     ext_modules=ext_modules,
 )

From d67a00554da060bf5dfa04048a8d3269a2e3bcfe Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 23 Sep 2023 23:41:58 +0400
Subject: [PATCH 40/72] Add: `sort()` and `order()` efficient Py methods

---
 README.md                 |   6 +-
 python/lib.c              | 360 ++++++++++++++++++++++++++++------
 scripts/test.cpp          |  12 +-
 scripts/test.py           | 395 +++++++++++++++++++-------------------
 stringzilla/stringzilla.h |  57 ++++--
 5 files changed, 543 insertions(+), 287 deletions(-)

diff --git a/README.md b/README.md
index 57197cb7..818c8879 100644
--- a/README.md
+++ b/README.md
@@ -119,9 +119,9 @@ sz_haystack_t haystack = {your_text, your_text_length};
 sz_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset};
 
 // Perform string-level operations
-size_t character_count = sz_naive_count_char(haystack, 'a');
-size_t character_position = sz_naive_find_char(haystack, 'a');
-size_t substring_position = sz_naive_find_substr(haystack, needle);
+size_t character_count = sz_count_char_swar(haystack, 'a');
+size_t character_position = sz_find_char_swar(haystack, 'a');
+size_t substring_position = sz_find_substr_swar(haystack, needle);
 
 // Perform collection level operations
 sz_sequence_t array = {your_order, your_count, your_get_start, your_get_length, your_handle};
diff --git a/python/lib.c b/python/lib.c
index dbf41114..c1e27113 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -22,7 +22,9 @@ typedef SSIZE_T ssize_t;
 #include <unistd.h> // `ssize_t`
 #endif
 
-#include <Python.h>
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#include <Python.h>            // Core CPython interfaces
+#include <numpy/arrayobject.h> // NumPy
 
 #include <stringzilla.h>
 
@@ -151,6 +153,47 @@ typedef struct {
 
 #pragma region Helpers
 
+typedef int boolean_t;
+
+inline static char const *haystacks_get_start(sz_haystack_t const *parts, sz_size_t i) { return parts[i].start; }
+inline static size_t haystacks_get_length(sz_haystack_t const *parts, sz_size_t i) { return parts[i].length; }
+
+void reverse_offsets(sz_size_t *array, size_t length) {
+    size_t i, j;
+    // Swap array[i] and array[j]
+    for (i = 0, j = length - 1; i < j; i++, j--) {
+        sz_size_t temp = array[i];
+        array[i] = array[j];
+        array[j] = temp;
+    }
+}
+
+void reverse_haystacks(sz_haystack_t *array, size_t length) {
+    size_t i, j;
+    // Swap array[i] and array[j]
+    for (i = 0, j = length - 1; i < j; i++, j--) {
+        sz_haystack_t temp = array[i];
+        array[i] = array[j];
+        array[j] = temp;
+    }
+}
+
+void apply_order(sz_haystack_t *array, sz_size_t *order, size_t length) {
+    for (size_t i = 0; i < length; ++i) {
+        while (order[i] != i) {
+            // Swap array[i] and array[order[i]]
+            sz_haystack_t temp = array[i];
+            array[i] = array[order[i]];
+            array[order[i]] = temp;
+
+            // Also update the order array to reflect the swap
+            size_t temp_idx = order[i];
+            order[i] = order[temp_idx];
+            order[temp_idx] = temp_idx;
+        }
+    }
+}
+
 void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset, size_t *normalized_length) {
 
     // clang-format off
@@ -172,7 +215,7 @@ void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset,
     *normalized_length = end - start;
 }
 
-int export_string_like(PyObject *object, char const **start, size_t *length) {
+boolean_t export_string_like(PyObject *object, char const **start, size_t *length) {
     if (PyUnicode_Check(object)) {
         // Handle Python str
         Py_ssize_t signed_length;
@@ -205,61 +248,86 @@ int export_string_like(PyObject *object, char const **start, size_t *length) {
     return 0;
 }
 
-int get_string_at_offset(
+typedef void (*get_string_at_offset_t)(Strs *, Py_ssize_t, Py_ssize_t, PyObject **, char const **, size_t *);
+
+void str_at_offset_consecutive_32bit(
     Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) {
+    uint32_t start_offset = (i == 0) ? 0 : strs->data.consecutive_32bit.end_offsets[i - 1];
+    uint32_t end_offset = strs->data.consecutive_32bit.end_offsets[i];
+    *start = strs->data.consecutive_32bit.start + start_offset;
+    *length = end_offset - start_offset - strs->data.consecutive_32bit.separator_length * (i + 1 != count);
+    *parent = strs->data.consecutive_32bit.parent;
+}
+
+void str_at_offset_consecutive_64bit(
+    Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) {
+    uint64_t start_offset = (i == 0) ? 0 : strs->data.consecutive_64bit.end_offsets[i - 1];
+    uint64_t end_offset = strs->data.consecutive_64bit.end_offsets[i];
+    *start = strs->data.consecutive_64bit.start + start_offset;
+    *length = end_offset - start_offset - strs->data.consecutive_64bit.separator_length * (i + 1 != count);
+    *parent = strs->data.consecutive_64bit.parent;
+}
+
+void str_at_offset_reordered(
+    Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) {
+    *start = strs->data.reordered.parts[i].start;
+    *length = strs->data.reordered.parts[i].length;
+    *parent = strs->data.reordered.parent;
+}
+
+void str_at_offset_multi_source(
+    Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) {
+    *start = strs->data.multi_source.parts[i].start;
+    *length = strs->data.multi_source.parts[i].length;
+    *parent = NULL; // TODO:
+}
+
+get_string_at_offset_t str_at_offset_getter(Strs *strs) {
     switch (strs->type) {
-    case STRS_CONSECUTIVE_32: {
-        uint32_t start_offset = (i == 0) ? 0 : strs->data.consecutive_32bit.end_offsets[i - 1];
-        uint32_t end_offset = strs->data.consecutive_32bit.end_offsets[i];
-        *start = strs->data.consecutive_32bit.start + start_offset;
-        *length = end_offset - start_offset - strs->data.consecutive_32bit.separator_length * (i + 1 != count);
-        *parent = strs->data.consecutive_32bit.parent;
-        return 1;
-    }
-    case STRS_CONSECUTIVE_64: {
-        uint64_t start_offset = (i == 0) ? 0 : strs->data.consecutive_64bit.end_offsets[i - 1];
-        uint64_t end_offset = strs->data.consecutive_64bit.end_offsets[i];
-        *start = strs->data.consecutive_64bit.start + start_offset;
-        *length = end_offset - start_offset - strs->data.consecutive_64bit.separator_length * (i + 1 != count);
-        *parent = strs->data.consecutive_64bit.parent;
-        return 1;
-    }
-    case STRS_REORDERED: {
-        //
-        return 1;
-    }
-    case STRS_MULTI_SOURCE: {
-        //
-        return 1;
-    }
+    case STRS_CONSECUTIVE_32: return str_at_offset_consecutive_32bit;
+    case STRS_CONSECUTIVE_64: return str_at_offset_consecutive_64bit;
+    case STRS_REORDERED: return str_at_offset_reordered;
+    case STRS_MULTI_SOURCE: return str_at_offset_multi_source;
     default:
         // Unsupported type
         PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion");
-        return -1;
+        return NULL;
     }
 }
 
-int prepare_strings_for_reordering(Strs *strs) {
-    // Already in reordered form
-    if (strs->type == STRS_REORDERED) { return 1; }
+boolean_t prepare_strings_for_reordering(Strs *strs) {
 
     // Allocate memory for reordered slices
     size_t count = 0;
+    void *old_buffer = NULL;
+    get_string_at_offset_t getter = NULL;
+    PyObject *parent = NULL;
     switch (strs->type) {
-    case STRS_CONSECUTIVE_32: count = strs->data.consecutive_32bit.count; break;
-    case STRS_CONSECUTIVE_64: count = strs->data.consecutive_64bit.count; break;
+    case STRS_CONSECUTIVE_32:
+        count = strs->data.consecutive_32bit.count;
+        old_buffer = strs->data.consecutive_32bit.end_offsets;
+        parent = strs->data.consecutive_32bit.parent;
+        getter = str_at_offset_consecutive_32bit;
+        break;
+    case STRS_CONSECUTIVE_64:
+        count = strs->data.consecutive_64bit.count;
+        old_buffer = strs->data.consecutive_64bit.end_offsets;
+        parent = strs->data.consecutive_64bit.parent;
+        getter = str_at_offset_consecutive_64bit;
+        break;
+    // Already in reordered form
     case STRS_REORDERED: return 1;
     case STRS_MULTI_SOURCE: return 1;
     default:
         // Unsupported type
         PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion");
-        return -1;
+        return 0;
     }
 
     sz_haystack_t *new_parts = (sz_haystack_t *)malloc(count * sizeof(sz_haystack_t));
     if (new_parts == NULL) {
         PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for reordered slices");
-        return -1;
+        return 0;
     }
 
     // Populate the new reordered array using get_string_at_offset
@@ -267,26 +335,20 @@ int prepare_strings_for_reordering(Strs *strs) {
         PyObject *parent;
         char const *start;
         size_t length;
-        if (!get_string_at_offset(strs, i, count, &parent, &start, &length)) {
-            // Handle error
-            PyErr_SetString(PyExc_RuntimeError, "Failed to get string at offset");
-            free(new_parts);
-            return -1;
-        }
-
+        getter(strs, i, count, &parent, &start, &length);
         new_parts[i].start = start;
         new_parts[i].length = length;
     }
 
     // Release previous used memory.
+    if (old_buffer) free(old_buffer);
 
     // Update the Strs object
     strs->type = STRS_REORDERED;
     strs->data.reordered.count = count;
     strs->data.reordered.parts = new_parts;
-    strs->data.reordered.parent = NULL; // Assuming the parent is no longer needed
-
-    return 0;
+    strs->data.reordered.parent = parent;
+    return 1;
 }
 
 #pragma endregion
@@ -603,7 +665,7 @@ static int Str_in(Str *self, PyObject *arg) {
     sz_haystack_t haystack;
     haystack.start = self->start;
     haystack.length = self->length;
-    size_t position = sz_neon_find_substr(haystack, needle_struct);
+    size_t position = sz_find_substr_auto(haystack, needle_struct);
     return position != haystack.length;
 }
 
@@ -629,20 +691,23 @@ static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) {
     PyObject *parent = NULL;
     char const *start = NULL;
     size_t length = 0;
-    if (!get_string_at_offset(self, i, count, &parent, &start, &length)) {
+    get_string_at_offset_t getter = str_at_offset_getter(self);
+    if (!getter) {
         PyErr_SetString(PyExc_TypeError, "Unknown Strs kind");
         return NULL;
     }
+    else
+        getter(self, i, count, &parent, &start, &length);
 
     // Create a new `Str` object
-    Str *parent_slice = (Str *)StrType.tp_alloc(&StrType, 0);
-    if (parent_slice == NULL && PyErr_NoMemory()) return NULL;
+    Str *view_copy = (Str *)StrType.tp_alloc(&StrType, 0);
+    if (view_copy == NULL && PyErr_NoMemory()) return NULL;
 
-    parent_slice->start = start;
-    parent_slice->length = length;
-    parent_slice->parent = parent;
+    view_copy->start = start;
+    view_copy->length = length;
+    view_copy->parent = parent;
     Py_INCREF(parent);
-    return parent_slice;
+    return view_copy;
 }
 
 static PyObject *Strs_subscript(Str *self, PyObject *key) {
@@ -754,7 +819,7 @@ static int Str_find_( //
     haystack.length = normalized_length;
 
     // Perform contains operation
-    size_t offset = sz_neon_find_substr(haystack, needle);
+    size_t offset = sz_find_substr_auto(haystack, needle);
     if (offset == haystack.length) { *offset_out = -1; }
     else { *offset_out = (Py_ssize_t)offset; }
 
@@ -881,11 +946,11 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
     haystack.start += normalized_offset;
     haystack.length = normalized_length;
 
-    size_t count = needle.length == 1 ? sz_naive_count_char(haystack, *needle.start) : 0;
+    size_t count = needle.length == 1 ? sz_count_char_swar(haystack, *needle.start) : 0;
     if (needle.length != 1) {
         if (allowoverlap) {
             while (haystack.length) {
-                size_t offset = sz_neon_find_substr(haystack, needle);
+                size_t offset = sz_find_substr_auto(haystack, needle);
                 int found = offset != haystack.length;
                 count += found;
                 haystack.start += offset + found;
@@ -894,7 +959,7 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
         }
         else {
             while (haystack.length) {
-                size_t offset = sz_neon_find_substr(haystack, needle);
+                size_t offset = sz_find_substr_auto(haystack, needle);
                 int found = offset != haystack.length;
                 count += found;
                 haystack.start += offset + needle.length;
@@ -943,6 +1008,7 @@ static PyObject *Str_levenstein(PyObject *self, PyObject *args, PyObject *kwargs
         return NULL;
     }
 
+    // Allocate memory for the Levenstein matrix
     size_t memory_needed = sz_levenstein_memory_needed(str1.length, str2.length);
     if (temporary_memory.length < memory_needed) {
         temporary_memory.start = realloc(temporary_memory.start, memory_needed);
@@ -1075,11 +1141,11 @@ static Strs *Str_split_(
 
     // Iterate through string, keeping track of the
     sz_size_t last_start = 0;
-    while (last_start < text.length && offsets_count < maxsplit) {
+    while (last_start <= text.length && offsets_count < maxsplit) {
         sz_haystack_t text_remaining;
         text_remaining.start = text.start + last_start;
         text_remaining.length = text.length - last_start;
-        sz_size_t offset_in_remaining = sz_neon_find_substr(text_remaining, separator);
+        sz_size_t offset_in_remaining = sz_find_substr_auto(text_remaining, separator);
 
         // Reallocate offsets array if needed
         if (offsets_count >= offsets_capacity) {
@@ -1419,6 +1485,176 @@ static PyObject *Strs_shuffle(Strs *self, PyObject *args, PyObject *kwargs) {
     Py_RETURN_NONE;
 }
 
+static boolean_t Strs_sort_(Strs *self,
+                            sz_haystack_t **parts_output,
+                            sz_size_t **order_output,
+                            sz_size_t *count_output) {
+
+    // Change the layout
+    if (!prepare_strings_for_reordering(self)) {
+        PyErr_Format(PyExc_TypeError, "Failed to prepare the sequence for sorting");
+        return 0;
+    }
+
+    // Get the parts and their count
+    sz_haystack_t *parts = NULL;
+    size_t count = 0;
+    switch (self->type) {
+    case STRS_REORDERED:
+        parts = self->data.reordered.parts;
+        count = self->data.reordered.count;
+        break;
+
+    case STRS_MULTI_SOURCE:
+        parts = self->data.multi_source.parts;
+        count = self->data.multi_source.count;
+        break;
+    }
+
+    // Allocate temporary memory to store the ordering offsets
+    size_t memory_needed = sizeof(sz_size_t) * count;
+    if (temporary_memory.length < memory_needed) {
+        temporary_memory.start = realloc(temporary_memory.start, memory_needed);
+        temporary_memory.length = memory_needed;
+    }
+    if (!temporary_memory.start) {
+        PyErr_Format(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix");
+        return 0;
+    }
+
+    // Call our sorting algorithm
+    sz_sequence_t sequence = {};
+    sz_sort_config_t sort_config = {};
+    sequence.order = (sz_size_t *)temporary_memory.start;
+    sequence.count = count;
+    sequence.handle = parts;
+    sequence.get_start = haystacks_get_start;
+    sequence.get_length = haystacks_get_length;
+    for (sz_size_t i = 0; i != sequence.count; ++i) sequence.order[i] = i;
+    sz_sort(&sequence, &sort_config);
+
+    // Export results
+    *parts_output = parts;
+    *order_output = sequence.order;
+    *count_output = sequence.count;
+    return 1;
+}
+
+static PyObject *Strs_sort(Strs *self, PyObject *args, PyObject *kwargs) {
+    PyObject *reverse_obj = NULL; // Default is not reversed
+
+    // Check for positional arguments
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs > 1) {
+        PyErr_SetString(PyExc_TypeError, "sort() takes at most 1 positional argument");
+        return NULL;
+    }
+    else if (nargs == 1) { reverse_obj = PyTuple_GET_ITEM(args, 0); }
+
+    // Check for keyword arguments
+    if (kwargs) {
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        while (PyDict_Next(kwargs, &pos, &key, &value)) {
+            if (PyUnicode_CompareWithASCIIString(key, "reverse") == 0) {
+                if (reverse_obj) {
+                    PyErr_SetString(PyExc_TypeError, "Received reverse both as positional and keyword argument");
+                    return NULL;
+                }
+                reverse_obj = value;
+            }
+            else {
+                PyErr_Format(PyExc_TypeError, "Received an unexpected keyword argument '%U'", key);
+                return NULL;
+            }
+        }
+    }
+
+    boolean_t reverse = 0; // Default is False
+    if (reverse_obj) {
+        if (!PyBool_Check(reverse_obj)) {
+            PyErr_SetString(PyExc_TypeError, "The reverse must be a boolean");
+            return NULL;
+        }
+        reverse = PyObject_IsTrue(reverse_obj);
+    }
+
+    sz_haystack_t *parts = NULL;
+    sz_size_t *order = NULL;
+    sz_size_t *count = NULL;
+    if (!Strs_sort_(self, &parts, &order, &count)) return NULL;
+
+    // Apply the sorting algorithm here, considering the `reverse` value
+    if (reverse) reverse_offsets(order, count);
+
+    // Apply the new order.
+    apply_order(parts, order, count);
+
+    Py_RETURN_NONE;
+}
+
+static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) {
+    PyObject *reverse_obj = NULL; // Default is not reversed
+
+    // Check for positional arguments
+    Py_ssize_t nargs = PyTuple_Size(args);
+    if (nargs > 1) {
+        PyErr_SetString(PyExc_TypeError, "order() takes at most 1 positional argument");
+        return NULL;
+    }
+    else if (nargs == 1) { reverse_obj = PyTuple_GET_ITEM(args, 0); }
+
+    // Check for keyword arguments
+    if (kwargs) {
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        while (PyDict_Next(kwargs, &pos, &key, &value)) {
+            if (PyUnicode_CompareWithASCIIString(key, "reverse") == 0) {
+                if (reverse_obj) {
+                    PyErr_SetString(PyExc_TypeError, "Received reverse both as positional and keyword argument");
+                    return NULL;
+                }
+                reverse_obj = value;
+            }
+            else {
+                PyErr_Format(PyExc_TypeError, "Received an unexpected keyword argument '%U'", key);
+                return NULL;
+            }
+        }
+    }
+
+    boolean_t reverse = 0; // Default is False
+    if (reverse_obj) {
+        if (!PyBool_Check(reverse_obj)) {
+            PyErr_SetString(PyExc_TypeError, "The reverse must be a boolean");
+            return NULL;
+        }
+        reverse = PyObject_IsTrue(reverse_obj);
+    }
+
+    sz_haystack_t *parts = NULL;
+    sz_size_t *order = NULL;
+    sz_size_t count = NULL;
+    if (!Strs_sort_(self, &parts, &order, &count)) return NULL;
+
+    // Apply the sorting algorithm here, considering the `reverse` value
+    if (reverse) reverse_offsets(order, count);
+
+    // Here, instead of applying the order, we want to return the copy of the
+    // order as a NumPy array of 64-bit unsigned integers.
+    npy_intp numpy_size = count;
+    PyObject *array = PyArray_SimpleNew(1, &numpy_size, NPY_UINT64);
+    if (!array) {
+        PyErr_SetString(PyExc_RuntimeError, "Failed to create a NumPy array");
+        return NULL;
+    }
+
+    // Copy the data from the order array to the newly created NumPy array
+    sz_size_t *numpy_data_ptr = (sz_size_t *)PyArray_DATA((PyArrayObject *)array);
+    memcpy(numpy_data_ptr, order, count * sizeof(sz_size_t));
+    return array;
+}
+
 static PySequenceMethods Strs_as_sequence = {
     .sq_length = Strs_len,        //
     .sq_item = Strs_getitem,      //
@@ -1431,7 +1667,9 @@ static PyMappingMethods Strs_as_mapping = {
 };
 
 static PyMethodDef Strs_methods[] = {
-    {"shuffle", Strs_shuffle, sz_method_flags_m, "Shuffle the elements of the Strs object."}, //
+    {"shuffle", Strs_shuffle, sz_method_flags_m, "Shuffle the elements of the Strs object."},  //
+    {"sort", Strs_sort, sz_method_flags_m, "Sort the elements of the Strs object."},           //
+    {"order", Strs_order, sz_method_flags_m, "Provides the indexes to achieve sorted order."}, //
     {NULL, NULL, 0, NULL}};
 
 static PyTypeObject StrsType = {
@@ -1482,6 +1720,8 @@ static PyModuleDef stringzilla_module = {
 PyMODINIT_FUNC PyInit_stringzilla(void) {
     PyObject *m;
 
+    import_array();
+
     if (PyType_Ready(&StrType) < 0) return NULL;
     if (PyType_Ready(&FileType) < 0) return NULL;
     if (PyType_Ready(&StrsType) < 0) return NULL;
diff --git a/scripts/test.cpp b/scripts/test.cpp
index 1cf34bb2..e2c83d1b 100644
--- a/scripts/test.cpp
+++ b/scripts/test.cpp
@@ -238,23 +238,23 @@ int main(int, char const **) {
         bench_search("std::search", full_text, [&]() {
             return std::search(full_text.begin(), full_text.end(), needle.begin(), needle.end()) - full_text.begin();
         });
-        bench_search("sz_naive_find_substr", full_text, [&]() {
+        bench_search("sz_find_substr_swar", full_text, [&]() {
             sz_haystack_t h {full_text.data(), full_text.size()};
             sz_needle_t n {needle.data(), needle.size()};
-            return sz_naive_find_substr(h, n);
+            return sz_find_substr_swar(h, n);
         });
 #if defined(__ARM_NEON)
-        bench_search("sz_neon_find_substr", full_text, [&]() {
+        bench_search("sz_find_substr_neon", full_text, [&]() {
             sz_haystack_t h {full_text.data(), full_text.size()};
             sz_needle_t n {needle.data(), needle.size()};
-            return sz_neon_find_substr(h, n);
+            return sz_find_substr_neon(h, n);
         });
 #endif
 #if defined(__AVX2__)
-        bench_search("sz_avx2_find_substr", full_text, [&]() {
+        bench_search("sz_find_substr_avx2", full_text, [&]() {
             sz_haystack_t h {full_text.data(), full_text.size()};
             sz_needle_t n {needle.data(), needle.size()};
-            return sz_avx2_find_substr(h, n);
+            return sz_find_substr_avx2(h, n);
         });
 #endif
     }
diff --git a/scripts/test.py b/scripts/test.py
index b9083ea6..14b6e9e7 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -6,55 +6,23 @@
 import pytest
 
 import stringzilla as sz
-from stringzilla import Str
+from stringzilla import Str, Strs
 
 
-def test_globals():
-    assert sz.find("abcdef", "bcdef") == 1
-    assert sz.find("abcdef", "x") == -1
-
-    assert sz.count("abcdef", "x") == 0
-    assert sz.count("aaaaa", "a") == 5
-    assert sz.count("aaaaa", "aa") == 2
-    assert sz.count("aaaaa", "aa", allowoverlap=True) == 4
-
-    assert sz.levenstein("aaa", "aaa") == 0
-    assert sz.levenstein("aaa", "bbb") == 3
-    assert sz.levenstein("abababab", "aaaaaaaa") == 4
-    assert sz.levenstein("abababab", "aaaaaaaa", 2) == 2
-    assert sz.levenstein("abababab", "aaaaaaaa", bound=2) == 2
-
-
-def test_split_keepseparator():
-    native = "word1_word2_word3"
-    big = Str(native)
-
-    words = sz.split(big, "_")
-    assert len(words) == 3
-
-    parts = sz.split(big, "_", keepseparator=True)
-    assert len(parts) == 3
-
-    assert str(words[0]) == "word1"
-    assert str(parts[0]) == "word1_"
-    assert str(words[2]) == "word3"
-    assert str(parts[2]) == "word3"
-
-
-def test_construct():
+def test_unit_construct():
     native = "aaaaa"
     big = Str(native)
     assert len(big) == len(native)
 
 
-def test_indexing():
+def test_unit_indexing():
     native = "abcdef"
     big = Str(native)
     for i in range(len(native)):
         assert big[i] == native[i]
 
 
-def test_count():
+def test_unit_count():
     native = "aaaaa"
     big = Str(native)
     assert big.count("a") == 5
@@ -62,20 +30,20 @@ def test_count():
     assert big.count("aa", allowoverlap=True) == 4
 
 
-def test_contains():
+def test_unit_contains():
     big = Str("abcdef")
     assert "a" in big
     assert "ab" in big
     assert "xxx" not in big
 
 
-def test_rich_comparisons():
+def test_unit_rich_comparisons():
     assert Str("aa") == "aa"
     assert Str("aa") < "b"
     assert Str("abb")[1:] == "bb"
 
 
-def test_buffer_protocol():
+def test_unit_buffer_protocol():
     import numpy as np
 
     my_str = Str("hello")
@@ -85,6 +53,77 @@ def test_buffer_protocol():
     assert "".join([c.decode("utf-8") for c in arr.tolist()]) == "hello"
 
 
+def test_unit_split():
+    native = "token1\ntoken2\ntoken3"
+    big = Str(native)
+    assert native.splitlines() == list(big.splitlines())
+    assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True))
+    assert native.split("token3") == list(big.split("token3"))
+
+    words = sz.split(big, "\n")
+    assert len(words) == 3
+    assert str(words[0]) == "token1"
+    assert str(words[2]) == "token3"
+
+    parts = sz.split(big, "\n", keepseparator=True)
+    assert len(parts) == 3
+    assert str(parts[0]) == "token1\n"
+    assert str(parts[2]) == "token3"
+
+
+def test_unit_sequence():
+    native = "line3\nline2\nline1"
+    big = Str(native)
+
+    lines = big.splitlines()
+    assert [2, 1, 0] == list(lines.order())
+
+    lines.sort()
+    assert [0, 1, 2] == list(lines.order())
+    assert ["line1", "line2", "line3"] == list(lines)
+
+    shuffled_copy = lines.shuffled(seed=42)
+    assert set(lines) == set(shuffled_copy)
+
+    lines.append("line4")
+    assert 4 == len(lines)
+    lines.extend(["line5", "line6"])
+    assert 6 == len(lines)
+
+    lines.append(lines[0])
+    assert 7 == len(lines)
+    assert lines[6] == "line1"
+
+    lines.extend(lines)
+    assert 14 == len(lines)
+    assert lines[7] == "line1"
+    assert lines[8] == "line2"
+    assert lines[12] == "line6"
+
+    # Test that shuffles are reproducible with the same `seed`
+    a = [str(s) for s in lines.shuffled(seed=42)]
+    b = [str(s) for s in lines.shuffled(seed=42)]
+    assert a == b
+
+
+def test_unit_globals():
+    """Validates that the previously unit-tested member methods are also visible as global functions."""
+
+    assert sz.find("abcdef", "bcdef") == 1
+    assert sz.find("abcdef", "x") == -1
+
+    assert sz.count("abcdef", "x") == 0
+    assert sz.count("aaaaa", "a") == 5
+    assert sz.count("aaaaa", "aa") == 2
+    assert sz.count("aaaaa", "aa", allowoverlap=True) == 4
+
+    assert sz.levenstein("aaa", "aaa") == 0
+    assert sz.levenstein("aaa", "bbb") == 3
+    assert sz.levenstein("abababab", "aaaaaaaa") == 4
+    assert sz.levenstein("abababab", "aaaaaaaa", 2) == 2
+    assert sz.levenstein("abababab", "aaaaaaaa", bound=2) == 2
+
+
 def get_random_string(
     length: Optional[int] = None, variability: Optional[int] = None
 ) -> str:
@@ -100,169 +139,125 @@ def is_equal_strings(native_strings, big_strings):
         assert native_slice == big_slice
 
 
-# def check_identical(
-#     native: str,
-#     big: Union[Str, File],
-#     needle: Optional[str] = None,
-#     check_iterators: bool = False,
-# ):
-#     if needle is None:
-#         part_offset = randint(0, len(native) - 1)
-#         part_length = randint(1, len(native) - part_offset)
-#         needle = native[part_offset:part_length]
-
-#     present_in_native: bool = needle in native
-#     present_in_big = needle in big
-#     assert present_in_native == present_in_big
-#     assert native.find(needle) == big.find(needle)
-#     assert native.count(needle) == big.count(needle)
+def check_identical(
+    native: str,
+    big: Str,
+    needle: Optional[str] = None,
+    check_iterators: bool = False,
+):
+    if needle is None:
+        part_offset = randint(0, len(native) - 1)
+        part_length = randint(1, len(native) - part_offset)
+        needle = native[part_offset:part_length]
+
+    present_in_native: bool = needle in native
+    present_in_big = needle in big
+    assert present_in_native == present_in_big
+    assert native.find(needle) == big.find(needle)
+    assert native.count(needle) == big.count(needle)
+
+    native_strings = native.split(needle)
+    big_strings: Strs = big.split(needle)
+    assert len(native_strings) == len(big_strings)
+
+    if check_iterators:
+        for i in range(len(native_strings)):
+            assert len(native_strings[i]) == len(big_strings[i])
+            assert native_strings[i] == big_strings[i]
+            assert [c for c in native_strings[i]] == [c for c in big_strings[i]]
+
+    is_equal_strings(native_strings, big_strings)
+
+
+@pytest.mark.parametrize("haystack_length", range(1, 65))
+@pytest.mark.parametrize("variability", range(1, 25))
+def test_fuzzy_substrings(haystack_length: int, variability: int):
+    native = get_random_string(variability=variability, length=haystack_length)
+    big = Str(native)
+    pattern = get_random_string(variability=variability, length=randint(1, 5))
+    assert (pattern in native) == big.contains(pattern)
+    assert native.find(pattern) == big.find(pattern)
+
+
+@pytest.mark.parametrize("repetitions", range(1, 10))
+def test_basic(repetitions: int):
+    native = "abcd" * repetitions
+    big = Str(native)
+
+    check_identical(native, big, "a", True)
+    check_identical(native, big, "ab", True)
+    check_identical(native, big, "abc", True)
+    check_identical(native, big, "abcd", True)
+    check_identical(native, big, "abcde", True)  # Missing pattern
+
 
-#     native_strings = native.split(needle)
-#     big_strings: Strs = big.split(needle)
-#     assert len(native_strings) == len(big_strings)
+@pytest.mark.parametrize("pattern_length", [1, 2, 4, 5])
+@pytest.mark.parametrize("haystack_length", range(1, 69, 3))
+@pytest.mark.parametrize("variability", range(1, 27, 3))
+def test_fuzzy(pattern_length: int, haystack_length: int, variability: int):
+    native = get_random_string(variability=variability, length=haystack_length)
+    big = Str(native)
+
+    # Start by matching the prefix and the suffix
+    check_identical(native, big, native[:pattern_length])
+    check_identical(native, big, native[-pattern_length:])
+
+    # Continue with random strs
+    for _ in range(haystack_length // pattern_length):
+        pattern = get_random_string(variability=variability, length=pattern_length)
+        check_identical(native, big, pattern)
+
+
+def test_strs():
+    native = get_random_string(length=10)
+    big = Str(native)
+
+    assert native[0:5] == big.sub(0, 5) and native[0:5] == big[0:5]
+    assert native[5:10] == big.sub(5, 10) and native[5:10] == big[5:10]
+
+    assert native[5:5] == big.sub(5, 5) and native[5:5] == big[5:5]
+    assert native[-5:-5] == big.sub(-5, -5) and native[-5:-5] == big[-5:-5]
+    assert native[2:-2] == big.sub(2, -2) and native[2:-2] == big[2:-2]
+    assert native[7:-7] == big.sub(7, -7) and native[7:-7] == big[7:-7]
+
+    assert native[5:3] == big.sub(5, 3) and native[5:3] == big[5:3]
+    assert native[5:7] == big.sub(5, 7) and native[5:7] == big[5:7]
+    assert native[5:-3] == big.sub(5, -3) and native[5:-3] == big[5:-3]
+    assert native[5:-7] == big.sub(5, -7) and native[5:-7] == big[5:-7]
+
+    assert native[-5:3] == big.sub(-5, 3) and native[-5:3] == big[-5:3]
+    assert native[-5:7] == big.sub(-5, 7) and native[-5:7] == big[-5:7]
+    assert native[-5:-3] == big.sub(-5, -3) and native[-5:-3] == big[-5:-3]
+    assert native[-5:-7] == big.sub(-5, -7) and native[-5:-7] == big[-5:-7]
+
+    assert native[2:] == big.sub(2) and native[2:] == big[2:]
+    assert native[:7] == big.sub(end=7) and native[:7] == big[:7]
+    assert native[-2:] == big.sub(-2) and native[-2:] == big[-2:]
+    assert native[:-7] == big.sub(end=-7) and native[:-7] == big[:-7]
+    assert native[:-10] == big.sub(end=-10) and native[:-10] == big[:-10]
+    assert native[:-1] == big.sub(end=-1) and native[:-1] == big[:-1]
+
+    length = 1000
+    native = get_random_string(length=length)
+    big = Str(native)
 
-#     if check_iterators:
-#         for i in range(len(native_strings)):
-#             assert len(native_strings[i]) == len(big_strings[i])
-#             assert native_strings[i] == big_strings[i]
-#             assert [c for c in native_strings[i]] == [c for c in big_strings[i]]
-
-#     is_equal_strings(native_strings, big_strings)
-
-
-# @pytest.mark.parametrize("haystack_length", range(1, 65))
-# @pytest.mark.parametrize("variability", range(1, 25))
-# def test_contains(haystack_length: int, variability: int):
-#     native = get_random_string(variability=variability, length=haystack_length)
-#     big = Str(native)
-#     pattern = get_random_string(variability=variability, length=randint(1, 5))
-#     assert (pattern in native) == big.contains(pattern)
-
-
-# def test_count_overlap():
-#     native = "aaaaa"
-#     big = Str(native)
-#     assert native.count("aa") == big.count("aa")
-#     assert 4 == big.count("aa", allowoverlap=True)
-
-
-# def test_splitlines():
-#     native = "line1\nline2\nline3"
-#     big = Str(native)
-#     assert native.splitlines() == list(big.splitlines())
-#     assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True))
-
-
-# def test_strs_operations():
-#     native = "line1\nline2\nline3"
-#     big = Str(native)
-#     lines = big.splitlines()
-#     lines.sort()
-#     assert ["line1", "line2", "line3"] == list(lines)
-
-#     shuffled_copy = lines.shuffled(seed=42)
-#     assert set(lines) == set(shuffled_copy)
-
-#     lines.append("line4")
-#     assert 4 == len(lines)
-#     lines.extend(["line5", "line6"])
-#     assert 6 == len(lines)
-
-#     lines.append(lines[0])
-#     assert 7 == len(lines)
-#     assert lines[6] == "line1"
-
-#     lines.extend(lines)
-#     assert 14 == len(lines)
-#     assert lines[7] == "line1"
-#     assert lines[8] == "line2"
-#     assert lines[12] == "line6"
-
-#     # Test that shuffles are reproducible with the same `seed`
-#     a = [str(s) for s in lines.shuffled(seed=42)]
-#     b = [str(s) for s in lines.shuffled(seed=42)]
-#     assert a == b
-
-
-# @pytest.mark.parametrize("repetitions", range(1, 10))
-# def test_basic(repetitions: int):
-#     native = "abcd" * repetitions
-#     big = Str(native)
-
-#     check_identical(native, big, "a", True)
-#     check_identical(native, big, "ab", True)
-#     check_identical(native, big, "abc", True)
-#     check_identical(native, big, "abcd", True)
-#     check_identical(native, big, "abcde", True)  # Missing pattern
-
-
-# @pytest.mark.parametrize("pattern_length", [1, 2, 4, 5])
-# @pytest.mark.parametrize("haystack_length", range(1, 69, 3))
-# @pytest.mark.parametrize("variability", range(1, 27, 3))
-# def test_fuzzy(pattern_length: int, haystack_length: int, variability: int):
-#     native = get_random_string(variability=variability, length=haystack_length)
-#     big = Str(native)
-
-#     # Start by matching the prefix and the suffix
-#     check_identical(native, big, native[:pattern_length])
-#     check_identical(native, big, native[-pattern_length:])
-
-#     # Continue with random strs
-#     for _ in range(haystack_length // pattern_length):
-#         pattern = get_random_string(variability=variability, length=pattern_length)
-#         check_identical(native, big, pattern)
-
-
-# def test_strs():
-#     native = get_random_string(length=10)
-#     big = Str(native)
-
-#     assert native[0:5] == big.sub(0, 5) and native[0:5] == big[0:5]
-#     assert native[5:10] == big.sub(5, 10) and native[5:10] == big[5:10]
-
-#     assert native[5:5] == big.sub(5, 5) and native[5:5] == big[5:5]
-#     assert native[-5:-5] == big.sub(-5, -5) and native[-5:-5] == big[-5:-5]
-#     assert native[2:-2] == big.sub(2, -2) and native[2:-2] == big[2:-2]
-#     assert native[7:-7] == big.sub(7, -7) and native[7:-7] == big[7:-7]
-
-#     assert native[5:3] == big.sub(5, 3) and native[5:3] == big[5:3]
-#     assert native[5:7] == big.sub(5, 7) and native[5:7] == big[5:7]
-#     assert native[5:-3] == big.sub(5, -3) and native[5:-3] == big[5:-3]
-#     assert native[5:-7] == big.sub(5, -7) and native[5:-7] == big[5:-7]
-
-#     assert native[-5:3] == big.sub(-5, 3) and native[-5:3] == big[-5:3]
-#     assert native[-5:7] == big.sub(-5, 7) and native[-5:7] == big[-5:7]
-#     assert native[-5:-3] == big.sub(-5, -3) and native[-5:-3] == big[-5:-3]
-#     assert native[-5:-7] == big.sub(-5, -7) and native[-5:-7] == big[-5:-7]
-
-#     assert native[2:] == big.sub(2) and native[2:] == big[2:]
-#     assert native[:7] == big.sub(end=7) and native[:7] == big[:7]
-#     assert native[-2:] == big.sub(-2) and native[-2:] == big[-2:]
-#     assert native[:-7] == big.sub(end=-7) and native[:-7] == big[:-7]
-#     assert native[:-10] == big.sub(end=-10) and native[:-10] == big[:-10]
-#     assert native[:-1] == big.sub(end=-1) and native[:-1] == big[:-1]
-
-#     length = 1000
-#     native = get_random_string(length=length)
-#     big = Str(native)
-
-#     needle = native[0 : randint(2, 5)]
-#     native_strings = native.split(needle)
-#     big_strings: Strs = big.split(needle)
-
-#     length = len(native_strings)
-#     for i in range(length):
-#         start = randint(1 - length, length - 1)
-#         stop = randint(1 - length, length - 1)
-#         step = 0
-#         while step == 0:
-#             step = randint(-int(math.sqrt(length)), int(math.sqrt(length)))
-
-#         is_equal_strings(native_strings[start:stop:step], big_strings[start:stop:step])
-#         is_equal_strings(
-#             native_strings[start:stop:step],
-#             big_strings.sub(start, stop, step),
-#         )
+    needle = native[0 : randint(2, 5)]
+    native_strings = native.split(needle)
+    big_strings: Strs = big.split(needle)
+
+    length = len(native_strings)
+    for i in range(length):
+        start = randint(1 - length, length - 1)
+        stop = randint(1 - length, length - 1)
+        step = 0
+        while step == 0:
+            step = randint(-int(math.sqrt(length)), int(math.sqrt(length)))
+
+        is_equal_strings(native_strings[start:stop:step], big_strings[start:stop:step])
+        is_equal_strings(
+            native_strings[start:stop:step],
+            big_strings.sub(start, stop, step),
+        )
 
 
 def test_levenstein():
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 8bd32fa1..52cc4ec6 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -59,7 +59,7 @@ typedef struct sz_needle_t {
 /**
  *  @brief  SWAR single-character counting procedure, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) {
+inline static sz_size_t sz_count_char_swar(sz_haystack_t h, char n) {
 
     sz_size_t result = 0;
     char const *text = h.start;
@@ -89,7 +89,7 @@ inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) {
 /**
  *  @brief  SWAR single-character search in string, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) {
+inline static sz_size_t sz_find_char_swar(sz_haystack_t h, char n) {
 
     char const *text = h.start;
     char const *end = h.start + h.length;
@@ -121,7 +121,7 @@ inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) {
 /**
  *  @brief  SWAR character-bigram search in string, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) {
+inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) {
 
     char const *text = h.start;
     char const *end = h.start + h.length;
@@ -162,7 +162,7 @@ inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) {
 /**
  *  @brief  SWAR character-trigram search in string, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) {
+inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) {
 
     char const *text = h.start;
     char const *end = h.start + h.length;
@@ -215,7 +215,7 @@ inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) {
 /**
  *  @brief  SWAR character-quadgram search in string, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) {
+inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) {
 
     char const *text = h.start;
     char const *end = h.start + h.length;
@@ -230,7 +230,7 @@ inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) {
     lookup[0b0100] = lookup[0b1100] = 2;
     lookup[0b1000] = 3;
 
-    // We can perform 5 comparisons per load, but it's easir to perform 4, minimizing the size of the lookup table.
+    // We can perform 5 comparisons per load, but it's easier to perform 4, minimizing the size of the lookup table.
     for (; text + 8 <= end; text += 4) {
         uint64_t text_slice;
         memcpy(&text_slice, text, 8);
@@ -275,19 +275,20 @@ inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) {
  *          it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper.
  *          Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core.
  */
-inline static sz_size_t sz_naive_find_substr(sz_haystack_t h, sz_needle_t n) {
+inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) {
 
     if (h.length < n.length) return h.length;
 
-    char const *text = h.start;
-    char const *const end = h.start + h.length;
     switch (n.length) {
     case 0: return 0;
-    case 1: return sz_naive_find_char(h, *n.start);
-    case 2: return sz_naive_find_2chars(h, n.start);
-    case 3: return sz_naive_find_3chars(h, n.start);
-    case 4: return sz_naive_find_4chars(h, n.start);
+    case 1: return sz_find_char_swar(h, *n.start);
+    case 2: return sz_find_2chars_swar(h, n.start);
+    case 3: return sz_find_3chars_swar(h, n.start);
+    case 4: return sz_find_4chars_swar(h, n.start);
     default: {
+        char const *text = h.start;
+        char const *const end = h.start + h.length;
+
         sz_anomaly_t n_anomaly, h_anomaly;
         sz_size_t const n_suffix_len = n.length - 4 - n.anomaly_offset;
         char const *n_suffix_ptr = n.start + 4 + n.anomaly_offset;
@@ -314,7 +315,7 @@ inline static sz_size_t sz_naive_find_substr(sz_haystack_t h, sz_needle_t n) {
  *          was practically more efficient than loading once and shifting around, as introduces
  *          less data dependencies.
  */
-sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) {
+inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
     char const *const end = h.start + h.length;
@@ -363,7 +364,7 @@ sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) {
     sz_haystack_t tail;
     tail.start = text;
     tail.length = end - text;
-    size_t tail_match = sz_naive_find_substr(tail, n);
+    size_t tail_match = sz_find_substr_swar(tail, n);
     return text + tail_match - h.start;
 }
 
@@ -377,7 +378,7 @@ sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) {
  *          was practically more efficient than loading once and shifting around, as introduces
  *          less data dependencies.
  */
-inline static sz_size_t sz_neon_find_substr(sz_haystack_t h, sz_needle_t n) {
+inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
     char const *const end = h.start + h.length;
@@ -420,12 +421,33 @@ inline static sz_size_t sz_neon_find_substr(sz_haystack_t h, sz_needle_t n) {
     sz_haystack_t tail;
     tail.start = text;
     tail.length = end - text;
-    size_t tail_match = sz_naive_find_substr(tail, n);
+    size_t tail_match = sz_find_substr_swar(tail, n);
     return text + tail_match - h.start;
 }
 
 #endif // Arm Neon
 
+inline static sz_size_t sz_find_substr_auto(sz_haystack_t h, sz_needle_t n) {
+    if (h.length < n.length) return h.length;
+
+    switch (n.length) {
+    case 0: return 0;
+    case 1: return sz_find_char_swar(h, *n.start);
+    case 2: return sz_find_2chars_swar(h, n.start);
+    case 3: return sz_find_3chars_swar(h, n.start);
+    case 4:
+        return sz_find_4chars_swar(h, n.start);
+        // #if defined(__ARM_NEON)
+        //     default: return sz_find_substr_neon(h, n);
+        // #elif defined(__AVX2__)
+        //     default: return sz_find_substr_avx2(h, n);
+        // #else
+    default:
+        return sz_find_substr_swar(h, n);
+        // #endif
+    }
+}
+
 inline static void sz_swap(sz_size_t *a, sz_size_t *b) {
     sz_size_t t = *a;
     *a = *b;
@@ -517,7 +539,6 @@ inline static void _sz_sort_recursion( //
     {
         sz_size_t mask = (1ul << 63) >> bit_idx;
         while (split != sequence->count && !(sequence->order[split] & mask)) ++split;
-
         for (sz_size_t i = split + 1; i < sequence->count; ++i)
             if (!(sequence->order[i] & mask)) sz_swap(sequence->order + i, sequence->order + split), ++split;
     }

From 1265fce66acf3f1b2b2a35ac2782eddbf40875cf Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 24 Sep 2023 12:29:49 +0400
Subject: [PATCH 41/72] Add: Subscript methods

---
 CMakeLists.txt   |   2 +-
 python/lib.c     | 218 +++++++++++++++++++++++++++++++++++++++++++++--
 scripts/test.c   |  60 +++++++++++++
 scripts/test.cpp |  18 ++--
 scripts/test.py  |  51 ++++++-----
 5 files changed, 300 insertions(+), 49 deletions(-)
 create mode 100644 scripts/test.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6909c838..df569329 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,7 +89,7 @@ if(STRINGZILLA_INSTALL)
 endif()
 
 if(${STRINGZILLA_BUILD_TEST} OR ${STRINGZILLA_BUILD_BENCHMARK})
-  add_executable(stringzilla_test scripts/test.cpp)
+  add_executable(stringzilla_test scripts/test.c)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
 
diff --git a/python/lib.c b/python/lib.c
index c1e27113..5b576c67 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -136,11 +136,13 @@ typedef struct {
         /**
          *  Complex structure with two variable length chunks inside - for the parents and their slices.
          *  The parents are sorted in ascending order of their memory ranges, to let us rapidly locate the source
-         *  with a binary search. The slices are preserved
+         *  with a binary search.
          */
-        struct multi_source_strings_t {
+        struct multi_source_slices_t {
             size_t count;
+            size_t capacity;
             size_t parents_count;
+            size_t parents_capacity;
 
             PyObject **parents;
             sz_haystack_t *parts;
@@ -279,7 +281,24 @@ void str_at_offset_multi_source(
     Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) {
     *start = strs->data.multi_source.parts[i].start;
     *length = strs->data.multi_source.parts[i].length;
-    *parent = NULL; // TODO:
+
+    PyObject **parents = strs->data.multi_source.parents;
+    size_t parents_count = strs->data.multi_source.parents_count;
+    for (size_t j = 0; j < parents_count; ++j) {
+        PyObject *current_parent = parents[j];
+        char *parent_start;
+        Py_ssize_t parent_length;
+        export_string_like(current_parent, &parent_start, &parent_length);
+
+        // Check if the string at offset `i` is within the range of the current parent.
+        if (*start >= parent_start && *start + *length <= parent_start + parent_length) {
+            *parent = current_parent;
+            return;
+        }
+    }
+
+    // If no parent is found, set *parent to NULL.
+    *parent = NULL;
 }
 
 get_string_at_offset_t str_at_offset_getter(Strs *strs) {
@@ -331,11 +350,11 @@ boolean_t prepare_strings_for_reordering(Strs *strs) {
     }
 
     // Populate the new reordered array using get_string_at_offset
-    for (Py_ssize_t i = 0; i < count; ++i) {
+    for (size_t i = 0; i < count; ++i) {
         PyObject *parent;
         char const *start;
         size_t length;
-        getter(strs, i, count, &parent, &start, &length);
+        getter(strs, (Py_ssize_t)i, count, &parent, &start, &length);
         new_parts[i].start = start;
         new_parts[i].length = length;
     }
@@ -351,6 +370,8 @@ boolean_t prepare_strings_for_reordering(Strs *strs) {
     return 1;
 }
 
+boolean_t prepare_strings_for_extension(Strs *strs, size_t new_parents, size_t new_parts) { return 1; }
+
 #pragma endregion
 
 #pragma region MemoryMappingFile
@@ -679,6 +700,16 @@ static Py_ssize_t Strs_len(Strs *self) {
     }
 }
 
+static Py_ssize_t Strs_parents_count(Strs *self) {
+    switch (self->type) {
+    case STRS_CONSECUTIVE_32: return 1;
+    case STRS_CONSECUTIVE_64: return 1;
+    case STRS_REORDERED: return 1;
+    case STRS_MULTI_SOURCE: return self->data.multi_source.parents_count;
+    default: return 0;
+    }
+}
+
 static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) {
     // Check for negative index and convert to positive
     Py_ssize_t count = Strs_len(self);
@@ -710,9 +741,176 @@ static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) {
     return view_copy;
 }
 
-static PyObject *Strs_subscript(Str *self, PyObject *key) {
-    if (PyLong_Check(key)) return Strs_getitem(self, PyLong_AsSsize_t(key));
-    return NULL;
+static PyObject *Strs_subscript(Strs *self, PyObject *key) {
+    if (PySlice_Check(key)) {
+        // Sanity checks
+        Py_ssize_t count = Strs_len(self);
+        Py_ssize_t start, stop, step;
+        if (PySlice_Unpack(key, &start, &stop, &step) < 0) return NULL;
+        if (PySlice_AdjustIndices(count, &start, &stop, step) < 0) return NULL;
+        if (step != 1) {
+            PyErr_SetString(PyExc_IndexError, "Efficient step is not supported");
+            return NULL;
+        }
+
+        // Create a new `Strs` object
+        Strs *self_slice = (Strs *)StrsType.tp_alloc(&StrsType, 0);
+        if (self_slice == NULL && PyErr_NoMemory()) return NULL;
+
+        // Depending on the layout, the procedure will be different.
+        self_slice->type = self->type;
+        switch (self->type) {
+        case STRS_CONSECUTIVE_32: {
+            struct consecutive_slices_32bit_t *from = &self->data.consecutive_32bit;
+            struct consecutive_slices_32bit_t *to = &self_slice->data.consecutive_32bit;
+            to->count = stop - start;
+            to->separator_length = from->sepa rator_length;
+            to->parent = from->parent;
+
+            size_t first_length;
+            str_at_offset_consecutive_32bit(self, start, count, &to->parent, &to->start, &first_length);
+            uint32_t first_offset = to->start - from->start;
+            to->end_offsets = malloc(sizeof(uint32_t) * to->count);
+            if (to->end_offsets == NULL && PyErr_NoMemory()) {
+                Py_XDECREF(self_slice);
+                return NULL;
+            }
+            for (size_t i = 0; i != to->count; ++i) to->end_offsets[i] = from->end_offsets[i] - first_offset;
+            Py_INCREF(to->parent);
+            break;
+        }
+        case STRS_CONSECUTIVE_64: {
+            struct consecutive_slices_64bit_t *from = &self->data.consecutive_64bit;
+            struct consecutive_slices_64bit_t *to = &self_slice->data.consecutive_64bit;
+            to->count = stop - start;
+            to->separator_length = from->separator_length;
+            to->parent = from->parent;
+
+            size_t first_length;
+            str_at_offset_consecutive_64bit(self, start, count, &to->parent, &to->start, &first_length);
+            uint64_t first_offset = to->start - from->start;
+            to->end_offsets = malloc(sizeof(uint64_t) * to->count);
+            if (to->end_offsets == NULL && PyErr_NoMemory()) {
+                Py_XDECREF(self_slice);
+                return NULL;
+            }
+            for (size_t i = 0; i != to->count; ++i) to->end_offsets[i] = from->end_offsets[i] - first_offset;
+            Py_INCREF(to->parent);
+            break;
+        }
+        case STRS_REORDERED: {
+            struct reordered_slices_t *from = &self->data.reordered;
+            struct reordered_slices_t *to = &self_slice->data.reordered;
+            to->count = stop - start;
+            to->parent = from->parent;
+
+            to->parts = malloc(sizeof(sz_haystack_t) * to->count);
+            if (to->parts == NULL && PyErr_NoMemory()) {
+                Py_XDECREF(self_slice);
+                return NULL;
+            }
+            memcpy(to->parts, from->parts + start, sizeof(sz_haystack_t) * to->count);
+            Py_INCREF(to->parent);
+            break;
+        }
+        case STRS_MULTI_SOURCE: {
+            struct multi_source_slices_t *from = &self->data.multi_source;
+            struct multi_source_slices_t *to = &self_slice->data.multi_source;
+            to->count = stop - start;
+            to->capacity = to->count;
+            to->parents_count = 0;
+            to->parents_capacity = from->parents_capacity;
+
+            // Allocate memory for both `parts` and `parents` references
+            to->parts = malloc(sizeof(sz_haystack_t) * to->capacity);
+            if (to->parts == NULL && PyErr_NoMemory()) {
+                Py_XDECREF(self_slice);
+                return NULL;
+            }
+            to->parents = malloc(sizeof(PyObject *) * to->parents_capacity);
+            if (to->parents == NULL && PyErr_NoMemory()) {
+                free(to->parts);
+                Py_XDECREF(self_slice);
+                return NULL;
+            }
+
+            // Iterate through the `parts` of this slice, detect the `parent`
+            // of each exported entry in `from->parents`, and upsert it into the `to->parents`
+            for (Py_ssize_t i = start; i < stop; ++i) {
+                PyObject *detected_parent;
+                char const *part_start;
+                size_t part_length;
+
+                // Find the parent of the part at the offset `i`
+                str_at_offset_multi_source(self, i, count, &detected_parent, &part_start, &part_length);
+                Py_INCREF(detected_parent);
+
+                // Upsert the detected parent into to->parents
+                // As the to->parents array is meant to be sorted,
+                // we insert in a way that maintains the sorting
+                size_t j = 0;
+                while (j < to->parents_count && to->parents[j] != detected_parent) ++j;
+
+                // If the parent is not already in to->parents, insert it.
+                if (j == to->parents_count) {
+                    to->parents[j] = detected_parent;
+                    ++to->parents_count;
+                }
+
+                // Populate the to->parts array
+                to->parts[i - start].start = part_start;
+                to->parts[i - start].length = part_length;
+            }
+
+            break;
+        }
+        default:
+            // Unsupported type
+            PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion");
+            return NULL;
+        }
+
+        return (PyObject *)self_slice;
+    }
+    else if (PyLong_Check(key)) { return Strs_getitem(self, PyLong_AsSsize_t(key)); }
+    else {
+        PyErr_SetString(PyExc_TypeError, "Strs indices must be integers or slices");
+        return NULL;
+    }
+}
+
+static PyObject *Strs_extend(Strs *self, PyObject *seq) {
+    // Check if seq is an instance of Strs
+    if (PyObject_IsInstance(seq, (PyObject *)&StrsType)) {
+        Strs *other = (Strs *)seq;
+        size_t other_parents = Strs_len(other);
+        size_t other_parts = Strs_parents_count(other);
+        if (!prepare_strings_for_extension(self, other_parents, other_parts)) {
+            PyErr_Format(PyExc_TypeError, "Failed to prepare the sequence for extension");
+            return NULL;
+        }
+
+        // TODO:
+    }
+    else if (PySequence_Check(seq)) {
+        // Check if seq is a sequence
+        Py_ssize_t length = PySequence_Size(seq);
+        // Validate that every item in the sequence is string-like with `export_string_like`
+        // TODO:
+
+        for (Py_ssize_t i = 0; i < length; i++) {
+            PyObject *item = PySequence_ITEM(seq, i);
+            if (!item) return NULL; // Error getting item from sequence
+
+            // TODO:
+        }
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError, "Parameter must be a sequence or an instance of Strs");
+        return NULL;
+    }
+
+    Py_RETURN_NONE;
 }
 
 // Will be called by the `PySequence_Contains`
@@ -1670,6 +1868,8 @@ static PyMethodDef Strs_methods[] = {
     {"shuffle", Strs_shuffle, sz_method_flags_m, "Shuffle the elements of the Strs object."},  //
     {"sort", Strs_sort, sz_method_flags_m, "Sort the elements of the Strs object."},           //
     {"order", Strs_order, sz_method_flags_m, "Provides the indexes to achieve sorted order."}, //
+    {"append", Strs_append, sz_method_flags_m, "Append the sequence with a new string."},      //
+    {"extend", Strs_extend, sz_method_flags_m, "Extend the sequence with new strings."},       //
     {NULL, NULL, 0, NULL}};
 
 static PyTypeObject StrsType = {
@@ -1698,11 +1898,11 @@ static PyMethodDef stringzilla_methods[] = {
     {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."},
     {"partition", Str_partition, sz_method_flags_m, "Splits string into 3-tuple: before, match, after."},
     {"count", Str_count, sz_method_flags_m, "Count the occurrences of a substring."},
-    {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
     {"split", Str_split, sz_method_flags_m, "Split a string by a separator."},
     {"splitlines", Str_splitlines, sz_method_flags_m, "Split a string by line breaks."},
     {"startswith", Str_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."},
     {"endswith", Str_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."},
+    {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
     {NULL, NULL, 0, NULL}};
 
 static PyModuleDef stringzilla_module = {
diff --git a/scripts/test.c b/scripts/test.c
new file mode 100644
index 00000000..f50d7e62
--- /dev/null
+++ b/scripts/test.c
@@ -0,0 +1,60 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <time.h>
+#include <assert.h>
+
+#include <stringzilla.h>
+
+#define MAX_LENGTH 300
+#define MIN_LENGTH 3
+#define ASCII_LOWERCASE "abcdefghijklmnopqrstuvwxyz"
+#define VARIABILITY 25
+
+// Utility function to populate random string in a buffer
+void populate_random_string(char *buffer, int length, int variability) {
+    for (int i = 0; i < length; i++) { buffer[i] = ASCII_LOWERCASE[rand() % variability]; }
+    buffer[length] = '\0';
+}
+
+// Test function for sz_find_substr_auto
+void test_sz_find_substr_auto() {
+    char buffer[MAX_LENGTH + 1];
+    char pattern[6]; // Maximum length of 5 + 1 for '\0'
+
+    for (int length = MIN_LENGTH; length < MAX_LENGTH; length++) {
+        for (int variability = 1; variability < VARIABILITY; variability++) {
+            populate_random_string(buffer, length, variability);
+
+            struct sz_haystack_t haystack;
+            haystack.start = buffer;
+            haystack.length = length;
+
+            int pattern_length = rand() % 5 + 1;
+            populate_random_string(pattern, pattern_length, variability);
+
+            struct sz_needle_t needle;
+            needle.start = pattern;
+            needle.length = pattern_length;
+
+            // Comparing the result of your function with the standard library function.
+            const char *result_libc = strstr(buffer, pattern);
+            uint64_t result_stringzilla = sz_find_substr_auto(haystack, needle);
+
+            assert(((result_libc && result_stringzilla == (uint64_t)(result_libc - buffer)) ||
+                    (!result_libc && result_stringzilla == (uint64_t)-1)) &&
+                   "Test failed for sz_find_substr_auto");
+        }
+    }
+}
+
+int main() {
+    srand((unsigned int)time(NULL));
+
+    test_sz_find_substr_auto();
+    // Add calls to other test functions as you implement them
+
+    printf("All tests passed!\n");
+    return 0;
+}
diff --git a/scripts/test.cpp b/scripts/test.cpp
index e2c83d1b..ddef4e82 100644
--- a/scripts/test.cpp
+++ b/scripts/test.cpp
@@ -46,8 +46,7 @@ void populate_from_file( //
 
     std::ifstream f(path, std::ios::in);
     std::string s;
-    while (strings.size() < limit && std::getline(f, s, ' '))
-        strings.push_back(s);
+    while (strings.size() < limit && std::getline(f, s, ' ')) strings.push_back(s);
 }
 
 void populate_with_test(strings_t &strings) {
@@ -79,8 +78,7 @@ inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) {
         return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word);
     });
 
-    for (size_t i = 0; i != strings.size(); ++i)
-        std::memset((char *)&order[i] + offset_in_word, 0, 4ul);
+    for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul);
 
     std::sort(order, order + strings.size(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; });
 
@@ -144,8 +142,7 @@ inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *orde
         return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word);
     });
 
-    for (size_t i = 0; i != strings.size(); ++i)
-        std::memset((char *)&order[i] + offset_in_word, 0, 4ul);
+    for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul);
 
     std::stable_sort(order, order + strings.size(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; });
 
@@ -198,8 +195,7 @@ void bench_search(char const *name, std::string_view full_text, algo_at &&algo)
 
     // Run multiple iterations
     std::size_t bytes_passed = 0;
-    for (std::size_t i = 0; i != iterations; ++i)
-        bytes_passed += algo();
+    for (std::size_t i = 0; i != iterations; ++i) bytes_passed += algo();
 
     // Measure elapsed time
     stdcc::time_point t2 = stdcc::now();
@@ -215,15 +211,13 @@ int main(int, char const **) {
     strings_t strings;
     populate_from_file("leipzig1M.txt", strings, 10000000);
     std::size_t mean_bytes = 0;
-    for (std::string const &str : strings)
-        mean_bytes += str.size();
+    for (std::string const &str : strings) mean_bytes += str.size();
     mean_bytes /= strings.size();
     std::printf("Parsed the file with %zu words of %zu mean length!\n", strings.size(), mean_bytes);
 
     std::string full_text;
     full_text.reserve(mean_bytes + strings.size() * 2);
-    for (std::string const &str : strings)
-        full_text.append(str), full_text.push_back(' ');
+    for (std::string const &str : strings) full_text.append(str), full_text.push_back(' ');
 
     auto make_random_needle = [](std::string_view full_text) {
         std::size_t length = std::rand() % 6 + 2;
diff --git a/scripts/test.py b/scripts/test.py
index 14b6e9e7..ea6aae8a 100644
--- a/scripts/test.py
+++ b/scripts/test.py
@@ -82,9 +82,6 @@ def test_unit_sequence():
     assert [0, 1, 2] == list(lines.order())
     assert ["line1", "line2", "line3"] == list(lines)
 
-    shuffled_copy = lines.shuffled(seed=42)
-    assert set(lines) == set(shuffled_copy)
-
     lines.append("line4")
     assert 4 == len(lines)
     lines.extend(["line5", "line6"])
@@ -212,30 +209,30 @@ def test_strs():
     native = get_random_string(length=10)
     big = Str(native)
 
-    assert native[0:5] == big.sub(0, 5) and native[0:5] == big[0:5]
-    assert native[5:10] == big.sub(5, 10) and native[5:10] == big[5:10]
-
-    assert native[5:5] == big.sub(5, 5) and native[5:5] == big[5:5]
-    assert native[-5:-5] == big.sub(-5, -5) and native[-5:-5] == big[-5:-5]
-    assert native[2:-2] == big.sub(2, -2) and native[2:-2] == big[2:-2]
-    assert native[7:-7] == big.sub(7, -7) and native[7:-7] == big[7:-7]
-
-    assert native[5:3] == big.sub(5, 3) and native[5:3] == big[5:3]
-    assert native[5:7] == big.sub(5, 7) and native[5:7] == big[5:7]
-    assert native[5:-3] == big.sub(5, -3) and native[5:-3] == big[5:-3]
-    assert native[5:-7] == big.sub(5, -7) and native[5:-7] == big[5:-7]
-
-    assert native[-5:3] == big.sub(-5, 3) and native[-5:3] == big[-5:3]
-    assert native[-5:7] == big.sub(-5, 7) and native[-5:7] == big[-5:7]
-    assert native[-5:-3] == big.sub(-5, -3) and native[-5:-3] == big[-5:-3]
-    assert native[-5:-7] == big.sub(-5, -7) and native[-5:-7] == big[-5:-7]
-
-    assert native[2:] == big.sub(2) and native[2:] == big[2:]
-    assert native[:7] == big.sub(end=7) and native[:7] == big[:7]
-    assert native[-2:] == big.sub(-2) and native[-2:] == big[-2:]
-    assert native[:-7] == big.sub(end=-7) and native[:-7] == big[:-7]
-    assert native[:-10] == big.sub(end=-10) and native[:-10] == big[:-10]
-    assert native[:-1] == big.sub(end=-1) and native[:-1] == big[:-1]
+    assert native[0:5] == big[0:5]
+    assert native[5:10] == big[5:10]
+
+    assert native[5:5] == big[5:5]
+    assert native[-5:-5] == big[-5:-5]
+    assert native[2:-2] == big[2:-2]
+    assert native[7:-7] == big[7:-7]
+
+    assert native[5:3] == big[5:3]
+    assert native[5:7] == big[5:7]
+    assert native[5:-3] == big[5:-3]
+    assert native[5:-7] == big[5:-7]
+
+    assert native[-5:3] == big[-5:3]
+    assert native[-5:7] == big[-5:7]
+    assert native[-5:-3] == big[-5:-3]
+    assert native[-5:-7] == big[-5:-7]
+
+    assert native[2:] == big[2:]
+    assert native[:7] == big[:7]
+    assert native[-2:] == big[-2:]
+    assert native[:-7] == big[:-7]
+    assert native[:-10] == big[:-10]
+    assert native[:-1] == big[:-1]
 
     length = 1000
     native = get_random_string(length=length)

From 876a726626dcbc4dd59dd3813241f70a33e9d665 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 24 Sep 2023 12:46:26 +0400
Subject: [PATCH 42/72] Break: Deprecate multi-source `Strs`; split tests

---
 .github/workflows/prerelease.yml |   4 +-
 README.md                        |   2 +-
 pyproject.toml                   |   2 +-
 python/lib.c                     | 146 +----------------
 scripts/test.py                  | 272 -------------------------------
 scripts/test_fuzzy.py            | 113 +++++++++++++
 scripts/test_units.py            | 104 ++++++++++++
 scripts/wc.py                    |  11 --
 8 files changed, 223 insertions(+), 431 deletions(-)
 delete mode 100644 scripts/test.py
 create mode 100644 scripts/test_fuzzy.py
 create mode 100644 scripts/test_units.py
 delete mode 100644 scripts/wc.py

diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml
index 0f00d8fc..9c6bdab9 100644
--- a/.github/workflows/prerelease.yml
+++ b/.github/workflows/prerelease.yml
@@ -39,7 +39,7 @@ jobs:
       - name: Build locally
         run: python -m pip install .
       - name: Test with PyTest
-        run: pytest scripts/test.py
+        run: pytest scripts/
 
 
   test_python_37:
@@ -68,6 +68,6 @@ jobs:
         run: python -m pip install .
 
       - name: Test with PyTest
-        run: pytest scripts/test.py
+        run: pytest scripts/
 
 
diff --git a/README.md b/README.md
index 818c8879..f774df35 100644
--- a/README.md
+++ b/README.md
@@ -149,7 +149,7 @@ CPython:
 
 ```sh
 # Clean up and install
-rm -rf build && pip install -e . && pytest scripts/test.py -s -x
+rm -rf build && pip install -e . && pytest scripts/ -s -x
 
 # Install without dependencies
 pip install -e . --no-index --no-deps
diff --git a/pyproject.toml b/pyproject.toml
index e12df96a..5260630a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ filterwarnings = ["error"]
 
 [tool.cibuildwheel]
 test-requires = ["pytest"]
-test-command = "pytest {project}/scripts/test.py -x"
+test-command = "pytest {project}/scripts/ -x"
 build-verbosity = 0
 skip = ["*musllinux*", "*i686*", "pp*"]
 
diff --git a/python/lib.c b/python/lib.c
index 5b576c67..679c4f30 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -133,20 +133,6 @@ typedef struct {
             sz_haystack_t *parts;
         } reordered;
 
-        /**
-         *  Complex structure with two variable length chunks inside - for the parents and their slices.
-         *  The parents are sorted in ascending order of their memory ranges, to let us rapidly locate the source
-         *  with a binary search.
-         */
-        struct multi_source_slices_t {
-            size_t count;
-            size_t capacity;
-            size_t parents_count;
-            size_t parents_capacity;
-
-            PyObject **parents;
-            sz_haystack_t *parts;
-        } multi_source;
     } data;
 
 } Strs;
@@ -277,36 +263,11 @@ void str_at_offset_reordered(
     *parent = strs->data.reordered.parent;
 }
 
-void str_at_offset_multi_source(
-    Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) {
-    *start = strs->data.multi_source.parts[i].start;
-    *length = strs->data.multi_source.parts[i].length;
-
-    PyObject **parents = strs->data.multi_source.parents;
-    size_t parents_count = strs->data.multi_source.parents_count;
-    for (size_t j = 0; j < parents_count; ++j) {
-        PyObject *current_parent = parents[j];
-        char *parent_start;
-        Py_ssize_t parent_length;
-        export_string_like(current_parent, &parent_start, &parent_length);
-
-        // Check if the string at offset `i` is within the range of the current parent.
-        if (*start >= parent_start && *start + *length <= parent_start + parent_length) {
-            *parent = current_parent;
-            return;
-        }
-    }
-
-    // If no parent is found, set *parent to NULL.
-    *parent = NULL;
-}
-
 get_string_at_offset_t str_at_offset_getter(Strs *strs) {
     switch (strs->type) {
     case STRS_CONSECUTIVE_32: return str_at_offset_consecutive_32bit;
     case STRS_CONSECUTIVE_64: return str_at_offset_consecutive_64bit;
     case STRS_REORDERED: return str_at_offset_reordered;
-    case STRS_MULTI_SOURCE: return str_at_offset_multi_source;
     default:
         // Unsupported type
         PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion");
@@ -695,17 +656,6 @@ static Py_ssize_t Strs_len(Strs *self) {
     case STRS_CONSECUTIVE_32: return self->data.consecutive_32bit.count;
     case STRS_CONSECUTIVE_64: return self->data.consecutive_64bit.count;
     case STRS_REORDERED: return self->data.reordered.count;
-    case STRS_MULTI_SOURCE: return self->data.multi_source.count;
-    default: return 0;
-    }
-}
-
-static Py_ssize_t Strs_parents_count(Strs *self) {
-    switch (self->type) {
-    case STRS_CONSECUTIVE_32: return 1;
-    case STRS_CONSECUTIVE_64: return 1;
-    case STRS_REORDERED: return 1;
-    case STRS_MULTI_SOURCE: return self->data.multi_source.parents_count;
     default: return 0;
     }
 }
@@ -764,7 +714,7 @@ static PyObject *Strs_subscript(Strs *self, PyObject *key) {
             struct consecutive_slices_32bit_t *from = &self->data.consecutive_32bit;
             struct consecutive_slices_32bit_t *to = &self_slice->data.consecutive_32bit;
             to->count = stop - start;
-            to->separator_length = from->sepa rator_length;
+            to->separator_length = from->separator_length;
             to->parent = from->parent;
 
             size_t first_length;
@@ -813,57 +763,6 @@ static PyObject *Strs_subscript(Strs *self, PyObject *key) {
             Py_INCREF(to->parent);
             break;
         }
-        case STRS_MULTI_SOURCE: {
-            struct multi_source_slices_t *from = &self->data.multi_source;
-            struct multi_source_slices_t *to = &self_slice->data.multi_source;
-            to->count = stop - start;
-            to->capacity = to->count;
-            to->parents_count = 0;
-            to->parents_capacity = from->parents_capacity;
-
-            // Allocate memory for both `parts` and `parents` references
-            to->parts = malloc(sizeof(sz_haystack_t) * to->capacity);
-            if (to->parts == NULL && PyErr_NoMemory()) {
-                Py_XDECREF(self_slice);
-                return NULL;
-            }
-            to->parents = malloc(sizeof(PyObject *) * to->parents_capacity);
-            if (to->parents == NULL && PyErr_NoMemory()) {
-                free(to->parts);
-                Py_XDECREF(self_slice);
-                return NULL;
-            }
-
-            // Iterate through the `parts` of this slice, detect the `parent`
-            // of each exported entry in `from->parents`, and upsert it into the `to->parents`
-            for (Py_ssize_t i = start; i < stop; ++i) {
-                PyObject *detected_parent;
-                char const *part_start;
-                size_t part_length;
-
-                // Find the parent of the part at the offset `i`
-                str_at_offset_multi_source(self, i, count, &detected_parent, &part_start, &part_length);
-                Py_INCREF(detected_parent);
-
-                // Upsert the detected parent into to->parents
-                // As the to->parents array is meant to be sorted,
-                // we insert in a way that maintains the sorting
-                size_t j = 0;
-                while (j < to->parents_count && to->parents[j] != detected_parent) ++j;
-
-                // If the parent is not already in to->parents, insert it.
-                if (j == to->parents_count) {
-                    to->parents[j] = detected_parent;
-                    ++to->parents_count;
-                }
-
-                // Populate the to->parts array
-                to->parts[i - start].start = part_start;
-                to->parts[i - start].length = part_length;
-            }
-
-            break;
-        }
         default:
             // Unsupported type
             PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion");
@@ -879,40 +778,6 @@ static PyObject *Strs_subscript(Strs *self, PyObject *key) {
     }
 }
 
-static PyObject *Strs_extend(Strs *self, PyObject *seq) {
-    // Check if seq is an instance of Strs
-    if (PyObject_IsInstance(seq, (PyObject *)&StrsType)) {
-        Strs *other = (Strs *)seq;
-        size_t other_parents = Strs_len(other);
-        size_t other_parts = Strs_parents_count(other);
-        if (!prepare_strings_for_extension(self, other_parents, other_parts)) {
-            PyErr_Format(PyExc_TypeError, "Failed to prepare the sequence for extension");
-            return NULL;
-        }
-
-        // TODO:
-    }
-    else if (PySequence_Check(seq)) {
-        // Check if seq is a sequence
-        Py_ssize_t length = PySequence_Size(seq);
-        // Validate that every item in the sequence is string-like with `export_string_like`
-        // TODO:
-
-        for (Py_ssize_t i = 0; i < length; i++) {
-            PyObject *item = PySequence_ITEM(seq, i);
-            if (!item) return NULL; // Error getting item from sequence
-
-            // TODO:
-        }
-    }
-    else {
-        PyErr_SetString(PyExc_TypeError, "Parameter must be a sequence or an instance of Strs");
-        return NULL;
-    }
-
-    Py_RETURN_NONE;
-}
-
 // Will be called by the `PySequence_Contains`
 static int Strs_contains(Str *self, PyObject *arg) { return 0; }
 
@@ -1590,11 +1455,11 @@ static PyMethodDef Str_methods[] = { //
     {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."},
     {"partition", Str_partition, sz_method_flags_m, "Splits string into 3-tuple: before, match, after."},
     {"count", Str_count, sz_method_flags_m, "Count the occurrences of a substring."},
-    {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
     {"split", Str_split, sz_method_flags_m, "Split a string by a separator."},
     {"splitlines", Str_splitlines, sz_method_flags_m, "Split a string by line breaks."},
     {"startswith", Str_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."},
     {"endswith", Str_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."},
+    {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
     {NULL, NULL, 0, NULL}};
 
 static PyTypeObject StrType = {
@@ -1702,11 +1567,6 @@ static boolean_t Strs_sort_(Strs *self,
         parts = self->data.reordered.parts;
         count = self->data.reordered.count;
         break;
-
-    case STRS_MULTI_SOURCE:
-        parts = self->data.multi_source.parts;
-        count = self->data.multi_source.count;
-        break;
     }
 
     // Allocate temporary memory to store the ordering offsets
@@ -1868,8 +1728,6 @@ static PyMethodDef Strs_methods[] = {
     {"shuffle", Strs_shuffle, sz_method_flags_m, "Shuffle the elements of the Strs object."},  //
     {"sort", Strs_sort, sz_method_flags_m, "Sort the elements of the Strs object."},           //
     {"order", Strs_order, sz_method_flags_m, "Provides the indexes to achieve sorted order."}, //
-    {"append", Strs_append, sz_method_flags_m, "Append the sequence with a new string."},      //
-    {"extend", Strs_extend, sz_method_flags_m, "Extend the sequence with new strings."},       //
     {NULL, NULL, 0, NULL}};
 
 static PyTypeObject StrsType = {
diff --git a/scripts/test.py b/scripts/test.py
deleted file mode 100644
index ea6aae8a..00000000
--- a/scripts/test.py
+++ /dev/null
@@ -1,272 +0,0 @@
-from typing import Union, Optional
-from random import choice, randint
-from string import ascii_lowercase
-import math
-
-import pytest
-
-import stringzilla as sz
-from stringzilla import Str, Strs
-
-
-def test_unit_construct():
-    native = "aaaaa"
-    big = Str(native)
-    assert len(big) == len(native)
-
-
-def test_unit_indexing():
-    native = "abcdef"
-    big = Str(native)
-    for i in range(len(native)):
-        assert big[i] == native[i]
-
-
-def test_unit_count():
-    native = "aaaaa"
-    big = Str(native)
-    assert big.count("a") == 5
-    assert big.count("aa") == 2
-    assert big.count("aa", allowoverlap=True) == 4
-
-
-def test_unit_contains():
-    big = Str("abcdef")
-    assert "a" in big
-    assert "ab" in big
-    assert "xxx" not in big
-
-
-def test_unit_rich_comparisons():
-    assert Str("aa") == "aa"
-    assert Str("aa") < "b"
-    assert Str("abb")[1:] == "bb"
-
-
-def test_unit_buffer_protocol():
-    import numpy as np
-
-    my_str = Str("hello")
-    arr = np.array(my_str)
-    assert arr.dtype == np.dtype("c")
-    assert arr.shape == (len("hello"),)
-    assert "".join([c.decode("utf-8") for c in arr.tolist()]) == "hello"
-
-
-def test_unit_split():
-    native = "token1\ntoken2\ntoken3"
-    big = Str(native)
-    assert native.splitlines() == list(big.splitlines())
-    assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True))
-    assert native.split("token3") == list(big.split("token3"))
-
-    words = sz.split(big, "\n")
-    assert len(words) == 3
-    assert str(words[0]) == "token1"
-    assert str(words[2]) == "token3"
-
-    parts = sz.split(big, "\n", keepseparator=True)
-    assert len(parts) == 3
-    assert str(parts[0]) == "token1\n"
-    assert str(parts[2]) == "token3"
-
-
-def test_unit_sequence():
-    native = "line3\nline2\nline1"
-    big = Str(native)
-
-    lines = big.splitlines()
-    assert [2, 1, 0] == list(lines.order())
-
-    lines.sort()
-    assert [0, 1, 2] == list(lines.order())
-    assert ["line1", "line2", "line3"] == list(lines)
-
-    lines.append("line4")
-    assert 4 == len(lines)
-    lines.extend(["line5", "line6"])
-    assert 6 == len(lines)
-
-    lines.append(lines[0])
-    assert 7 == len(lines)
-    assert lines[6] == "line1"
-
-    lines.extend(lines)
-    assert 14 == len(lines)
-    assert lines[7] == "line1"
-    assert lines[8] == "line2"
-    assert lines[12] == "line6"
-
-    # Test that shuffles are reproducible with the same `seed`
-    a = [str(s) for s in lines.shuffled(seed=42)]
-    b = [str(s) for s in lines.shuffled(seed=42)]
-    assert a == b
-
-
-def test_unit_globals():
-    """Validates that the previously unit-tested member methods are also visible as global functions."""
-
-    assert sz.find("abcdef", "bcdef") == 1
-    assert sz.find("abcdef", "x") == -1
-
-    assert sz.count("abcdef", "x") == 0
-    assert sz.count("aaaaa", "a") == 5
-    assert sz.count("aaaaa", "aa") == 2
-    assert sz.count("aaaaa", "aa", allowoverlap=True) == 4
-
-    assert sz.levenstein("aaa", "aaa") == 0
-    assert sz.levenstein("aaa", "bbb") == 3
-    assert sz.levenstein("abababab", "aaaaaaaa") == 4
-    assert sz.levenstein("abababab", "aaaaaaaa", 2) == 2
-    assert sz.levenstein("abababab", "aaaaaaaa", bound=2) == 2
-
-
-def get_random_string(
-    length: Optional[int] = None, variability: Optional[int] = None
-) -> str:
-    if length is None:
-        length = randint(3, 300)
-    if variability is None:
-        variability = len(ascii_lowercase)
-    return "".join(choice(ascii_lowercase[:variability]) for _ in range(length))
-
-
-def is_equal_strings(native_strings, big_strings):
-    for native_slice, big_slice in zip(native_strings, big_strings):
-        assert native_slice == big_slice
-
-
-def check_identical(
-    native: str,
-    big: Str,
-    needle: Optional[str] = None,
-    check_iterators: bool = False,
-):
-    if needle is None:
-        part_offset = randint(0, len(native) - 1)
-        part_length = randint(1, len(native) - part_offset)
-        needle = native[part_offset:part_length]
-
-    present_in_native: bool = needle in native
-    present_in_big = needle in big
-    assert present_in_native == present_in_big
-    assert native.find(needle) == big.find(needle)
-    assert native.count(needle) == big.count(needle)
-
-    native_strings = native.split(needle)
-    big_strings: Strs = big.split(needle)
-    assert len(native_strings) == len(big_strings)
-
-    if check_iterators:
-        for i in range(len(native_strings)):
-            assert len(native_strings[i]) == len(big_strings[i])
-            assert native_strings[i] == big_strings[i]
-            assert [c for c in native_strings[i]] == [c for c in big_strings[i]]
-
-    is_equal_strings(native_strings, big_strings)
-
-
-@pytest.mark.parametrize("haystack_length", range(1, 65))
-@pytest.mark.parametrize("variability", range(1, 25))
-def test_fuzzy_substrings(haystack_length: int, variability: int):
-    native = get_random_string(variability=variability, length=haystack_length)
-    big = Str(native)
-    pattern = get_random_string(variability=variability, length=randint(1, 5))
-    assert (pattern in native) == big.contains(pattern)
-    assert native.find(pattern) == big.find(pattern)
-
-
-@pytest.mark.parametrize("repetitions", range(1, 10))
-def test_basic(repetitions: int):
-    native = "abcd" * repetitions
-    big = Str(native)
-
-    check_identical(native, big, "a", True)
-    check_identical(native, big, "ab", True)
-    check_identical(native, big, "abc", True)
-    check_identical(native, big, "abcd", True)
-    check_identical(native, big, "abcde", True)  # Missing pattern
-
-
-@pytest.mark.parametrize("pattern_length", [1, 2, 4, 5])
-@pytest.mark.parametrize("haystack_length", range(1, 69, 3))
-@pytest.mark.parametrize("variability", range(1, 27, 3))
-def test_fuzzy(pattern_length: int, haystack_length: int, variability: int):
-    native = get_random_string(variability=variability, length=haystack_length)
-    big = Str(native)
-
-    # Start by matching the prefix and the suffix
-    check_identical(native, big, native[:pattern_length])
-    check_identical(native, big, native[-pattern_length:])
-
-    # Continue with random strs
-    for _ in range(haystack_length // pattern_length):
-        pattern = get_random_string(variability=variability, length=pattern_length)
-        check_identical(native, big, pattern)
-
-
-def test_strs():
-    native = get_random_string(length=10)
-    big = Str(native)
-
-    assert native[0:5] == big[0:5]
-    assert native[5:10] == big[5:10]
-
-    assert native[5:5] == big[5:5]
-    assert native[-5:-5] == big[-5:-5]
-    assert native[2:-2] == big[2:-2]
-    assert native[7:-7] == big[7:-7]
-
-    assert native[5:3] == big[5:3]
-    assert native[5:7] == big[5:7]
-    assert native[5:-3] == big[5:-3]
-    assert native[5:-7] == big[5:-7]
-
-    assert native[-5:3] == big[-5:3]
-    assert native[-5:7] == big[-5:7]
-    assert native[-5:-3] == big[-5:-3]
-    assert native[-5:-7] == big[-5:-7]
-
-    assert native[2:] == big[2:]
-    assert native[:7] == big[:7]
-    assert native[-2:] == big[-2:]
-    assert native[:-7] == big[:-7]
-    assert native[:-10] == big[:-10]
-    assert native[:-1] == big[:-1]
-
-    length = 1000
-    native = get_random_string(length=length)
-    big = Str(native)
-
-    needle = native[0 : randint(2, 5)]
-    native_strings = native.split(needle)
-    big_strings: Strs = big.split(needle)
-
-    length = len(native_strings)
-    for i in range(length):
-        start = randint(1 - length, length - 1)
-        stop = randint(1 - length, length - 1)
-        step = 0
-        while step == 0:
-            step = randint(-int(math.sqrt(length)), int(math.sqrt(length)))
-
-        is_equal_strings(native_strings[start:stop:step], big_strings[start:stop:step])
-        is_equal_strings(
-            native_strings[start:stop:step],
-            big_strings.sub(start, stop, step),
-        )
-
-
-def test_levenstein():
-    # Create a new string by slicing and concatenating
-    def insert_char_at(s, char_to_insert, index):
-        return s[:index] + char_to_insert + s[index:]
-
-    for _ in range(100):
-        a = get_random_string(length=20)
-        b = a
-        for i in range(150):
-            source_offset = randint(0, len(ascii_lowercase) - 1)
-            target_offset = randint(0, len(b) - 1)
-            b = insert_char_at(b, ascii_lowercase[source_offset], target_offset)
-            assert sz.levenstein(a, b, 200) == i + 1
diff --git a/scripts/test_fuzzy.py b/scripts/test_fuzzy.py
new file mode 100644
index 00000000..7249e93b
--- /dev/null
+++ b/scripts/test_fuzzy.py
@@ -0,0 +1,113 @@
+from typing import Union, Optional
+from random import choice, randint
+from string import ascii_lowercase
+
+import pytest
+
+import stringzilla as sz
+from stringzilla import Str, Strs
+
+
+def get_random_string(
+    length: Optional[int] = None, variability: Optional[int] = None
+) -> str:
+    if length is None:
+        length = randint(3, 300)
+    if variability is None:
+        variability = len(ascii_lowercase)
+    return "".join(choice(ascii_lowercase[:variability]) for _ in range(length))
+
+
+def is_equal_strings(native_strings, big_strings):
+    for native_slice, big_slice in zip(native_strings, big_strings):
+        assert native_slice == big_slice
+
+
+def check_identical(
+    native: str,
+    big: Str,
+    needle: Optional[str] = None,
+    check_iterators: bool = False,
+):
+    if needle is None:
+        part_offset = randint(0, len(native) - 1)
+        part_length = randint(1, len(native) - part_offset)
+        needle = native[part_offset:part_length]
+
+    present_in_native: bool = needle in native
+    present_in_big = needle in big
+    assert present_in_native == present_in_big
+    assert native.find(needle) == big.find(needle)
+    assert native.count(needle) == big.count(needle)
+
+    native_strings = native.split(needle)
+    big_strings: Strs = big.split(needle)
+    assert len(native_strings) == len(big_strings)
+
+    if check_iterators:
+        for i in range(len(native_strings)):
+            assert len(native_strings[i]) == len(big_strings[i])
+            assert native_strings[i] == big_strings[i]
+            assert [c for c in native_strings[i]] == [c for c in big_strings[i]]
+
+    is_equal_strings(native_strings, big_strings)
+
+
+@pytest.mark.parametrize("repetitions", range(1, 10))
+def test_fuzzy_repetitions(repetitions: int):
+    native = "abcd" * repetitions
+    big = Str(native)
+
+    check_identical(native, big, "a", True)
+    check_identical(native, big, "ab", True)
+    check_identical(native, big, "abc", True)
+    check_identical(native, big, "abcd", True)
+    check_identical(native, big, "abcde", True)  # Missing pattern
+
+
+@pytest.mark.parametrize("pattern_length", [1, 2, 4, 5])
+@pytest.mark.parametrize("haystack_length", range(1, 65))
+@pytest.mark.parametrize("variability", range(1, 25))
+def test_fuzzy_substrings(pattern_length: int, haystack_length: int, variability: int):
+    native = get_random_string(variability=variability, length=haystack_length)
+    big = Str(native)
+    pattern = get_random_string(variability=variability, length=pattern_length)
+    assert (pattern in native) == big.contains(pattern)
+    assert native.find(pattern) == big.find(pattern)
+
+
+@pytest.mark.parametrize("iterations", range(100))
+@pytest.mark.parametrize("max_edit_distance", [150])
+def test_levenstein(iterations: int, max_edit_distance: int):
+    # Create a new string by slicing and concatenating
+    def insert_char_at(s, char_to_insert, index):
+        return s[:index] + char_to_insert + s[index:]
+
+    for _ in range(iterations):
+        a = get_random_string(length=20)
+        b = a
+        for i in range(max_edit_distance):
+            source_offset = randint(0, len(ascii_lowercase) - 1)
+            target_offset = randint(0, len(b) - 1)
+            b = insert_char_at(b, ascii_lowercase[source_offset], target_offset)
+            assert sz.levenstein(a, b, 200) == i + 1
+
+
+@pytest.mark.parametrize("list_length", [10, 20, 30, 40, 50])
+@pytest.mark.parametrize("part_length", [5, 10])
+@pytest.mark.parametrize("variability", [2, 3])
+def test_fuzzy_sorting(list_length: int, part_length: int, variability: int):
+    native_list = [
+        get_random_string(variability=variability, length=part_length)
+        for _ in range(list_length)
+    ]
+    native_joined = ".".join(native_list)
+    big_joined = Str(native_joined)
+    big_list = big_joined.split(".")
+
+    native_list.sort()
+    big_list.sort()
+
+    assert len(native_list) == len(big_list)
+    for native_str, big_str in zip(native_list, big_list):
+        assert native_str == str(big_str)
diff --git a/scripts/test_units.py b/scripts/test_units.py
new file mode 100644
index 00000000..a7c622e3
--- /dev/null
+++ b/scripts/test_units.py
@@ -0,0 +1,104 @@
+from typing import Union, Optional
+from random import choice, randint
+from string import ascii_lowercase
+
+import pytest
+
+import stringzilla as sz
+from stringzilla import Str, Strs
+
+
+def test_unit_construct():
+    native = "aaaaa"
+    big = Str(native)
+    assert len(big) == len(native)
+
+
+def test_unit_indexing():
+    native = "abcdef"
+    big = Str(native)
+    for i in range(len(native)):
+        assert big[i] == native[i]
+
+
+def test_unit_count():
+    native = "aaaaa"
+    big = Str(native)
+    assert big.count("a") == 5
+    assert big.count("aa") == 2
+    assert big.count("aa", allowoverlap=True) == 4
+
+
+def test_unit_contains():
+    big = Str("abcdef")
+    assert "a" in big
+    assert "ab" in big
+    assert "xxx" not in big
+
+
+def test_unit_rich_comparisons():
+    assert Str("aa") == "aa"
+    assert Str("aa") < "b"
+    assert Str("abb")[1:] == "bb"
+
+
+def test_unit_buffer_protocol():
+    import numpy as np
+
+    my_str = Str("hello")
+    arr = np.array(my_str)
+    assert arr.dtype == np.dtype("c")
+    assert arr.shape == (len("hello"),)
+    assert "".join([c.decode("utf-8") for c in arr.tolist()]) == "hello"
+
+
+def test_unit_split():
+    native = "token1\ntoken2\ntoken3"
+    big = Str(native)
+    assert native.splitlines() == list(big.splitlines())
+    assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True))
+    assert native.split("token3") == list(big.split("token3"))
+
+    words = sz.split(big, "\n")
+    assert len(words) == 3
+    assert str(words[0]) == "token1"
+    assert str(words[2]) == "token3"
+
+    parts = sz.split(big, "\n", keepseparator=True)
+    assert len(parts) == 3
+    assert str(parts[0]) == "token1\n"
+    assert str(parts[2]) == "token3"
+
+
+def test_unit_sequence():
+    native = "line3\nline2\nline1"
+    big = Str(native)
+
+    lines = big.splitlines()
+    assert [2, 1, 0] == list(lines.order())
+
+    lines.sort()
+    assert [0, 1, 2] == list(lines.order())
+    assert ["line1", "line2", "line3"] == list(lines)
+
+    lines.sort(reverse=True)
+    assert [2, 1, 0] == list(lines.order())
+    assert ["line3", "line2", "line1"] == list(lines)
+
+
+def test_unit_globals():
+    """Validates that the previously unit-tested member methods are also visible as global functions."""
+
+    assert sz.find("abcdef", "bcdef") == 1
+    assert sz.find("abcdef", "x") == -1
+
+    assert sz.count("abcdef", "x") == 0
+    assert sz.count("aaaaa", "a") == 5
+    assert sz.count("aaaaa", "aa") == 2
+    assert sz.count("aaaaa", "aa", allowoverlap=True) == 4
+
+    assert sz.levenstein("aaa", "aaa") == 0
+    assert sz.levenstein("aaa", "bbb") == 3
+    assert sz.levenstein("abababab", "aaaaaaaa") == 4
+    assert sz.levenstein("abababab", "aaaaaaaa", 2) == 2
+    assert sz.levenstein("abababab", "aaaaaaaa", bound=2) == 2
diff --git a/scripts/wc.py b/scripts/wc.py
deleted file mode 100644
index 60204345..00000000
--- a/scripts/wc.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import argparse
-from stringzilla import Str, File, Strs
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-l", "--lines", nargs="*", help="Count lines in files")
-    args = parser.parse_args()
-    if args.lines:
-        for filename in args.lines:
-            print(File(filename).count("\n"))

From ccaa1d249d4091db0e19985300605d18431296e9 Mon Sep 17 00:00:00 2001
From: Nairi Harutyunyan <nairi.haroutiounian@gmail.com>
Date: Wed, 27 Sep 2023 16:14:07 +0300
Subject: [PATCH 43/72] Minor fixes

---
 javascript/lib.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index c2098a08..6330d90b 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -86,11 +86,13 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
     strzl_needle.ptr = needle;
 
     bool overlap = false;
-    napi_get_value_bool(env, args[2], &overlap);
+    if (argc > 2) {
+        napi_get_value_bool(env, args[2], &overlap);
+    }
 
     size_t result;
 
-    if (strzl_needle.len == 0 || strzl_haystack.len == 0 || strzl_haystack.len < strzl_needle.len) {
+    if (strzl_needle.len == 0 || strzl_haystack.len == 0 || strzl_haystack.len < strzl_needle.len)
         result = 0;
     else if (strzl_needle.len == 1)
         result = count_char(strzl_haystack, strzl_needle.ptr[0]);

From 730948a1f63975445e207d14ab08d16d5807be55 Mon Sep 17 00:00:00 2001
From: Nairi Harutyunyan <nairi.haroutiounian@gmail.com>
Date: Wed, 27 Sep 2023 16:19:41 +0300
Subject: [PATCH 44/72] Add CI for JavaScript

---
 .github/workflows/javascript-ci.yml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 .github/workflows/javascript-ci.yml

diff --git a/.github/workflows/javascript-ci.yml b/.github/workflows/javascript-ci.yml
new file mode 100644
index 00000000..674fa882
--- /dev/null
+++ b/.github/workflows/javascript-ci.yml
@@ -0,0 +1,21 @@
+name: CI
+on:
+  pull_request:
+    branches: '*'
+  push:
+    branches: '*'
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        node-version: [18.x]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Use Node.js
+        uses: actions/setup-node@v3
+        with:
+          node-version: '18.x'
+      - run: npm i
+      - run: npm test
\ No newline at end of file

From 35051f34164b0fb08409df89158cf875add1d81c Mon Sep 17 00:00:00 2001
From: Nairi Harutyunyan <nairi.haroutiounian@gmail.com>
Date: Fri, 29 Sep 2023 23:41:58 +0300
Subject: [PATCH 45/72] Rename countSubstr to count

---
 javascript/lib.c                             |  8 ++++----
 javascript/stringzilla.d.ts                  |  2 +-
 javascript/test/{countSubstr.js => count.js} | 16 ++++++++--------
 3 files changed, 13 insertions(+), 13 deletions(-)
 rename javascript/test/{countSubstr.js => count.js} (58%)

diff --git a/javascript/lib.c b/javascript/lib.c
index 6330d90b..2c46224c 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -64,7 +64,7 @@ size_t count_char(strzl_haystack_t strzl_haystack, char needle) {
     return result;
 }
 
-napi_value CountSubstrAPI(napi_env env, napi_callback_info info) {
+napi_value CountAPI(napi_env env, napi_callback_info info) {
     size_t argc = 3;
     napi_value args[3];
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
@@ -145,11 +145,11 @@ napi_value Init(napi_env env, napi_value exports) {
     // Define the "find" property
     napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0};
 
-    // Define the "countSubstr" property
-    napi_property_descriptor countSubstrDesc = {"countSubstr", 0, CountSubstrAPI, 0, 0, 0, napi_default, 0};
+    // Define the "count" property
+    napi_property_descriptor countDesc = {"count", 0, CountAPI, 0, 0, 0, napi_default, 0};
 
     // Define an array of property descriptors
-    napi_property_descriptor properties[] = {findDesc, countSubstrDesc};
+    napi_property_descriptor properties[] = {findDesc, countDesc};
 
     // Define the number of properties in the array
     size_t propertyCount = sizeof(properties) / sizeof(properties[0]);
diff --git a/javascript/stringzilla.d.ts b/javascript/stringzilla.d.ts
index 57eff05b..5521c152 100644
--- a/javascript/stringzilla.d.ts
+++ b/javascript/stringzilla.d.ts
@@ -14,4 +14,4 @@ export function find(haystack: string, needle: string): bigint;
  * @param {string} needle 
  * @param {boolean} overlap 
  */
-export function countSubstr(haystack: string, needle: string, overlap: boolean): bigint;
+export function count(haystack: string, needle: string, overlap: boolean): bigint;
diff --git a/javascript/test/countSubstr.js b/javascript/test/count.js
similarity index 58%
rename from javascript/test/countSubstr.js
rename to javascript/test/count.js
index 973ba541..a1c44d16 100644
--- a/javascript/test/countSubstr.js
+++ b/javascript/test/count.js
@@ -5,40 +5,40 @@ import assert from 'node:assert';
 const stringzilla = bindings('stringzilla');
 
 test('Count Words - Single Occurrence', () => {
-    const result = stringzilla.countSubstr('hello world', 'world');
+    const result = stringzilla.count('hello world', 'world');
 
     assert.strictEqual(result, 1n);
 });
 
 test('Count Words - Multiple Occurrence', () => {
-    const result = stringzilla.countSubstr('hello world, hello John', 'hello');
+    const result = stringzilla.count('hello world, hello John', 'hello');
 
     assert.strictEqual(result, 2n);
 });
 
 test('Count Words - Multiple Occurrences with Overlap Test', () => {
-    const result_1 = stringzilla.countSubstr('abababab', 'aba');
+    const result_1 = stringzilla.count('abababab', 'aba');
 
     assert.strictEqual(result_1, 2n);
 
-    const result_2 = stringzilla.countSubstr('abababab', 'aba', true);
+    const result_2 = stringzilla.count('abababab', 'aba', true);
 
     assert.strictEqual(result_2, 3n);
 });
 
 test('Count Words - No Occurrence', () => {
-    const result = stringzilla.countSubstr('hello world', 'hi');
+    const result = stringzilla.count('hello world', 'hi');
 
     assert.strictEqual(result, 0n);
 });
 
 test('Count Words - Empty String Inputs', () => {
-    const result_1 = stringzilla.countSubstr('hello world', '');
+    const result_1 = stringzilla.count('hello world', '');
     assert.strictEqual(result_1, 0n);
 
-    const result_2 = stringzilla.countSubstr('', 'hi');
+    const result_2 = stringzilla.count('', 'hi');
     assert.strictEqual(result_2, 0n);
 
-    const result_3 = stringzilla.countSubstr('', '');
+    const result_3 = stringzilla.count('', '');
     assert.strictEqual(result_3, 0n);
 });

From ff76eaf815abb2bb39f815eed042f0e52964f4be Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 5 Oct 2023 11:54:44 -0700
Subject: [PATCH 46/72] Fix: Applying sort order in Python

---
 python/lib.c          | 17 ++++++++---------
 scripts/test_fuzzy.py | 10 +++++++++-
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/python/lib.c b/python/lib.c
index 679c4f30..17324ba6 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -168,17 +168,16 @@ void reverse_haystacks(sz_haystack_t *array, size_t length) {
 
 void apply_order(sz_haystack_t *array, sz_size_t *order, size_t length) {
     for (size_t i = 0; i < length; ++i) {
-        while (order[i] != i) {
-            // Swap array[i] and array[order[i]]
+        if (i == order[i]) continue;
             sz_haystack_t temp = array[i];
-            array[i] = array[order[i]];
-            array[order[i]] = temp;
-
-            // Also update the order array to reflect the swap
-            size_t temp_idx = order[i];
-            order[i] = order[temp_idx];
-            order[temp_idx] = temp_idx;
+        size_t k = i, j;
+        while (i != (j = order[k])) {
+            array[k] = array[j];
+            order[k] = k;
+            k = j;
         }
+        array[k] = temp;
+        order[k] = k;
     }
 }
 
diff --git a/scripts/test_fuzzy.py b/scripts/test_fuzzy.py
index 7249e93b..ba4aca42 100644
--- a/scripts/test_fuzzy.py
+++ b/scripts/test_fuzzy.py
@@ -105,9 +105,17 @@ def test_fuzzy_sorting(list_length: int, part_length: int, variability: int):
     big_joined = Str(native_joined)
     big_list = big_joined.split(".")
 
+    native_ordered = sorted(native_list)
+    native_order = big_list.order()
+    for i in range(list_length):
+        assert native_ordered[i] == native_list[native_order[i]], "Order is wrong"
+        assert native_ordered[i] == str(
+            big_list[int(native_order[i])]
+        ), "Split is wrong?!"
+
     native_list.sort()
     big_list.sort()
 
     assert len(native_list) == len(big_list)
     for native_str, big_str in zip(native_list, big_list):
-        assert native_str == str(big_str)
+        assert native_str == str(big_str), "Order is wrong"

From 69636f004fdebcce368deab264741ff33c45a6f6 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 5 Oct 2023 12:49:59 -0700
Subject: [PATCH 47/72] Fix: Reverse order

---
 .clang-format         |  4 ++--
 python/lib.c          | 12 ++++++------
 scripts/bench.py      |  4 ++--
 scripts/test_units.py |  9 +++++----
 4 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/.clang-format b/.clang-format
index e0f25893..bf4937c3 100644
--- a/.clang-format
+++ b/.clang-format
@@ -17,9 +17,9 @@ AllowAllArgumentsOnNextLine: false
 AllowAllConstructorInitializersOnNextLine: true
 AllowAllParametersOfDeclarationOnNextLine: true
 AllowShortBlocksOnASingleLine: Always
+AllowShortIfStatementsOnASingleLine: Always
 AllowShortCaseLabelsOnASingleLine: true
 AllowShortFunctionsOnASingleLine: true
-AllowShortIfStatementsOnASingleLine: Always
 AllowShortLambdasOnASingleLine: true
 AllowShortLoopsOnASingleLine: true
 AlwaysBreakTemplateDeclarations: Yes
@@ -46,7 +46,7 @@ BraceWrapping:
   IndentBraces: false
 
 
-SortIncludes: false
+SortIncludes: true
 SortUsingDeclarations: true 
 
 SpaceAfterCStyleCast: false
diff --git a/python/lib.c b/python/lib.c
index 17324ba6..cef4a751 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -9,10 +9,10 @@
 #define NOMINMAX
 #include <windows.h>
 #else
-#include <sys/types.h>
-#include <sys/stat.h> // `stat`
-#include <sys/mman.h> // `mmap`
 #include <fcntl.h>    // `O_RDNLY`
+#include <sys/mman.h> // `mmap`
+#include <sys/stat.h> // `stat`
+#include <sys/types.h>
 #endif
 
 #ifdef _MSC_VER
@@ -169,7 +169,7 @@ void reverse_haystacks(sz_haystack_t *array, size_t length) {
 void apply_order(sz_haystack_t *array, sz_size_t *order, size_t length) {
     for (size_t i = 0; i < length; ++i) {
         if (i == order[i]) continue;
-            sz_haystack_t temp = array[i];
+        sz_haystack_t temp = array[i];
         size_t k = i, j;
         while (i != (j = order[k])) {
             array[k] = array[j];
@@ -1638,7 +1638,7 @@ static PyObject *Strs_sort(Strs *self, PyObject *args, PyObject *kwargs) {
 
     sz_haystack_t *parts = NULL;
     sz_size_t *order = NULL;
-    sz_size_t *count = NULL;
+    sz_size_t count = 0;
     if (!Strs_sort_(self, &parts, &order, &count)) return NULL;
 
     // Apply the sorting algorithm here, considering the `reverse` value
@@ -1691,7 +1691,7 @@ static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) {
 
     sz_haystack_t *parts = NULL;
     sz_size_t *order = NULL;
-    sz_size_t count = NULL;
+    sz_size_t count = 0;
     if (!Strs_sort_(self, &parts, &order, &count)) return NULL;
 
     // Apply the sorting algorithm here, considering the `reverse` value
diff --git a/scripts/bench.py b/scripts/bench.py
index 5522031a..a7c864fb 100644
--- a/scripts/bench.py
+++ b/scripts/bench.py
@@ -53,8 +53,8 @@ def bench(
         stringzilla_file = File(haystack_path)
     else:
         haystack_length = int(haystack_length)
-        repretitions = haystack_length // len(haystack_pattern)
-        pythonic_str: str = haystack_pattern * repretitions
+        repetitions = haystack_length // len(haystack_pattern)
+        pythonic_str: str = haystack_pattern * repetitions
         stringzilla_file = None
 
     stringzilla_str = Str(pythonic_str)
diff --git a/scripts/test_units.py b/scripts/test_units.py
index a7c622e3..a2f985a7 100644
--- a/scripts/test_units.py
+++ b/scripts/test_units.py
@@ -71,7 +71,7 @@ def test_unit_split():
 
 
 def test_unit_sequence():
-    native = "line3\nline2\nline1"
+    native = "p3\np2\np1"
     big = Str(native)
 
     lines = big.splitlines()
@@ -79,11 +79,12 @@ def test_unit_sequence():
 
     lines.sort()
     assert [0, 1, 2] == list(lines.order())
-    assert ["line1", "line2", "line3"] == list(lines)
+    assert ["p1", "p2", "p3"] == list(lines)
 
+    # Reverse order
+    assert [2, 1, 0] == list(lines.order(reverse=True))
     lines.sort(reverse=True)
-    assert [2, 1, 0] == list(lines.order())
-    assert ["line3", "line2", "line1"] == list(lines)
+    assert ["p3", "p2", "p1"] == list(lines)
 
 
 def test_unit_globals():

From b1cf5e5c29ca8a58cd39ba86aec932db6d9a2cba Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 5 Oct 2023 14:16:52 -0700
Subject: [PATCH 48/72] Fix: Buffer width for NodeJS bindings

---
 javascript/lib.c            | 148 ++++++++++++++----------------------
 javascript/stringzilla.d.ts |  17 -----
 javascript/stringzilla.js   |  24 +++++-
 python/lib.c                |  24 +++---
 scripts/test.c              |  14 ++--
 stringzilla/stringzilla.h   |  12 +--
 6 files changed, 107 insertions(+), 132 deletions(-)
 delete mode 100644 javascript/stringzilla.d.ts

diff --git a/javascript/lib.c b/javascript/lib.c
index 2c46224c..3644468a 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -1,12 +1,11 @@
 /**
- *  @file lib.c
- *  @author Ash Vardanian
- *  @brief JavaScript bindings for StringZilla.
- *  @date 2023-09-18
+ *  @file       lib.c
+ *  @brief      JavaScript bindings for StringZilla.
+ *  @author     Ash Vardanian
+ *  @date       September 18, 2023
  *
- *  @copyright Copyright (c) 2023
- *
- *  @see NodeJS docs: https://nodejs.org/api/n-api.html
+ *  @copyright  Copyright (c) 2023
+ *  @see        NodeJS docs: https://nodejs.org/api/n-api.html
  */
 
 #include <node_api.h>
@@ -18,49 +17,39 @@ napi_value FindAPI(napi_env env, napi_callback_info info) {
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    struct strzl_haystack_t strzl_haystack = {NULL, 0};
-    struct strzl_needle_t strzl_needle = {NULL, 0, 0};
+    sz_haystack_t haystack_sz = {NULL, 0};
+    sz_needle_t needle_sz = {NULL, 0, 0};
 
     // For haystack
-    napi_get_value_string_utf8(env, args[0], NULL, 0, &strzl_haystack.len);
-    char *haystack = malloc(strzl_haystack.len);
-    napi_get_value_string_utf8(env, args[0], haystack, strzl_haystack.len, &strzl_haystack.len);
-    strzl_haystack.ptr = haystack;
+    napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_sz.length);
+    haystack_sz.start = malloc(haystack_sz.length + 1);
+    napi_get_value_string_utf8(env, args[0], haystack_sz.start, haystack_sz.length + 1, &haystack_sz.length);
 
     // For needle
-    napi_get_value_string_utf8(env, args[1], NULL, 0, &strzl_needle.len);
-    char *needle = malloc(strzl_needle.len);
-    napi_get_value_string_utf8(env, args[1], needle, strzl_needle.len, &strzl_needle.len);
-    strzl_needle.ptr = needle;
-
-// Perform the find operation
-#if defined(__AVX2__)
-    uint64_t result = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
-#elif defined(__ARM_NEON)
-    uint64_t result = strzl_neon_find_substr(strzl_haystack, strzl_needle);
-#else
-    uint64_t result = strzl_naive_find_substr(strzl_haystack, strzl_needle);
-#endif
+    napi_get_value_string_utf8(env, args[1], NULL, 0, &needle_sz.length);
+    needle_sz.start = malloc(needle_sz.length + 1);
+    napi_get_value_string_utf8(env, args[1], needle_sz.start, needle_sz.length + 1, &needle_sz.length);
+
+    // Perform the find operation
+    sz_size_t result = sz_find_substr(haystack_sz, needle_sz);
 
     // Cleanup
-    free(haystack);
-    free(needle);
+    free(haystack_sz.start);
+    free(needle_sz.start);
 
     // Convert the result to JavaScript BigInt and return
     napi_value js_result;
 
     // In JavaScript, if `find` is unable to find the specified value, then it should return -1
-    if (result == strzl_haystack.len)
-        napi_create_bigint_int64(env, -1, &js_result);
+    if (result == haystack_sz.length) napi_create_bigint_int64(env, -1, &js_result);
     else
         napi_create_bigint_uint64(env, result, &js_result);
 
     return js_result;
 }
 
-size_t count_char(strzl_haystack_t strzl_haystack, char needle) {
-    size_t result = strzl_naive_count_char(strzl_haystack, needle);
-
+size_t count_char(sz_haystack_t haystack_sz, char needle) {
+    size_t result = sz_count_char(haystack_sz, needle);
     return result;
 }
 
@@ -70,91 +59,66 @@ napi_value CountAPI(napi_env env, napi_callback_info info) {
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    struct strzl_haystack_t strzl_haystack = {NULL, 0};
-    struct strzl_needle_t strzl_needle = {NULL, 0, 0};
+    sz_haystack_t haystack_sz = {NULL, 0};
+    sz_needle_t needle_sz = {NULL, 0, 0};
 
     // For haystack
-    napi_get_value_string_utf8(env, args[0], NULL, 0, &strzl_haystack.len);
-    char *haystack = malloc(strzl_haystack.len);
-    napi_get_value_string_utf8(env, args[0], haystack, strzl_haystack.len, &strzl_haystack.len);
-    strzl_haystack.ptr = haystack;
+    napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_sz.length);
+    haystack_sz.start = malloc(haystack_sz.length + 1);
+    napi_get_value_string_utf8(env, args[0], haystack_sz.start, haystack_sz.length + 1, &haystack_sz.length);
 
     // For needle
-    napi_get_value_string_utf8(env, args[1], NULL, 0, &strzl_needle.len);
-    char *needle = malloc(strzl_needle.len);
-    napi_get_value_string_utf8(env, args[1], needle, strzl_needle.len, &strzl_needle.len);
-    strzl_needle.ptr = needle;
+    napi_get_value_string_utf8(env, args[1], NULL, 0, &needle_sz.length);
+    needle_sz.start = malloc(needle_sz.length + 1);
+    napi_get_value_string_utf8(env, args[1], needle_sz.start, needle_sz.length + 1, &needle_sz.length);
 
     bool overlap = false;
-    if (argc > 2) {
-        napi_get_value_bool(env, args[2], &overlap);
-    }
+    if (argc > 2) { napi_get_value_bool(env, args[2], &overlap); }
 
-    size_t result;
+    void const *haystack_start = haystack_sz.start, *needle_start = needle_sz.start;
 
-    if (strzl_needle.len == 0 || strzl_haystack.len == 0 || strzl_haystack.len < strzl_needle.len)
-        result = 0;
-    else if (strzl_needle.len == 1)
-        result = count_char(strzl_haystack, strzl_needle.ptr[0]);
+    size_t count = 0;
+    if (needle_sz.length == 0 || haystack_sz.length == 0 || haystack_sz.length < needle_sz.length) { count = 0; }
+    else if (needle_sz.length == 1) { count = count_char(haystack_sz, needle_sz.start[0]); }
     else if (overlap) {
-        while (strzl_haystack.len) {
-#if defined(__AVX2__)
-            size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
-#elif defined(__ARM_NEON)
-            size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle);
-#else
-            size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle);
-#endif
-
-            bool found = offset != strzl_haystack.len;
-            result += found;
-            strzl_haystack.ptr += offset + found;
-            strzl_haystack.len -= offset + found;
+        while (haystack_sz.length) {
+            sz_size_t offset = sz_find_substr(haystack_sz, needle_sz);
+            int found = offset != haystack_sz.length;
+            count += found;
+            haystack_sz.start += offset + found;
+            haystack_sz.length -= offset + found;
         }
     }
-
     else {
-        while (strzl_haystack.len) {
-#if defined(__AVX2__)
-            size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle);
-#elif defined(__ARM_NEON)
-            size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle);
-#else
-            size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle);
-#endif
-
-            bool found = offset != strzl_haystack.len;
-            result += found;
-            strzl_haystack.ptr += offset + strzl_needle.len;
-            strzl_haystack.len -= offset + strzl_needle.len * found;
+        while (haystack_sz.length) {
+            sz_size_t offset = sz_find_substr(haystack_sz, needle_sz);
+            int found = offset != haystack_sz.length;
+            count += found;
+            haystack_sz.start += offset + needle_sz.length;
+            haystack_sz.length -= offset + needle_sz.length * found;
         }
     }
 
     // Cleanup
-    free(haystack);
-    free(needle);
+    free(haystack_start);
+    free(needle_start);
 
-    // Convert the result to JavaScript `BigInt` and return
-    napi_value js_result;
-    napi_create_bigint_uint64(env, result, &js_result);
+    // Convert the `count` to JavaScript `BigInt` and return
+    napi_value js_count;
+    napi_create_bigint_uint64(env, count, &js_count);
 
-    return js_result;
+    return js_count;
 }
 
 napi_value Init(napi_env env, napi_value exports) {
-    // Define the "find" property
-    napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0};
-
-    // Define the "count" property
-    napi_property_descriptor countDesc = {"count", 0, CountAPI, 0, 0, 0, napi_default, 0};
 
     // Define an array of property descriptors
+    napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0};
+    napi_property_descriptor countDesc = {"count", 0, CountAPI, 0, 0, 0, napi_default, 0};
     napi_property_descriptor properties[] = {findDesc, countDesc};
 
-    // Define the number of properties in the array
-    size_t propertyCount = sizeof(properties) / sizeof(properties[0]);
-
     // Define the properties on the `exports` object
+    size_t propertyCount = sizeof(properties) / sizeof(properties[0]);
     napi_define_properties(env, exports, propertyCount, properties);
 
     return exports;
diff --git a/javascript/stringzilla.d.ts b/javascript/stringzilla.d.ts
deleted file mode 100644
index 5521c152..00000000
--- a/javascript/stringzilla.d.ts
+++ /dev/null
@@ -1,17 +0,0 @@
-
-/**
- * Searches for a short string in a long one.
- * 
- * @param {string} haystack 
- * @param {string} needle 
- */
-export function find(haystack: string, needle: string): bigint;
-
-/**
- * Searches for a substring in a larger string.
- * 
- * @param {string} haystack 
- * @param {string} needle 
- * @param {boolean} overlap 
- */
-export function count(haystack: string, needle: string, overlap: boolean): bigint;
diff --git a/javascript/stringzilla.js b/javascript/stringzilla.js
index d163bee8..24b78e24 100644
--- a/javascript/stringzilla.js
+++ b/javascript/stringzilla.js
@@ -1,2 +1,22 @@
-const stringzilla = require('bindings')('stringzilla');
-module.exports = stringzilla;
\ No newline at end of file
+const compiled = require('bindings')('stringzilla');
+
+module.exports = {
+    /**
+     * Searches for a short string in a long one.
+     * 
+     * @param {string} haystack 
+     * @param {string} needle 
+     * @returns {bigint}
+     */
+    find: compiled.find,
+
+    /**
+     * Searches for a substring in a larger string.
+     * 
+     * @param {string} haystack 
+     * @param {string} needle 
+     * @param {boolean} overlap 
+     * @returns {bigint}
+     */
+    count: compiled.count
+};
diff --git a/python/lib.c b/python/lib.c
index cef4a751..ad10f196 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -1,6 +1,10 @@
 /**
- *  @brief  Very light-weight CPython wrapper for StringZilla, with support for memory-mapping,
- *          native Python strings, Apache Arrow collections, and more.
+ *  @file       lib.c
+ *  @brief      Very light-weight CPython wrapper for StringZilla, with support for memory-mapping,
+ *              native Python strings, Apache Arrow collections, and more.
+ *  @author     Ash Vardanian
+ *  @date       July 10, 2023
+ *  @copyright  Copyright (c) 2023
  *
  *  - Doesn't use PyBind11, NanoBind, Boost.Python, or any other high-level libs, only CPython API.
  *  - To minimize latency this implementation avoids `PyArg_ParseTupleAndKeywords` calls.
@@ -646,7 +650,7 @@ static int Str_in(Str *self, PyObject *arg) {
     sz_haystack_t haystack;
     haystack.start = self->start;
     haystack.length = self->length;
-    size_t position = sz_find_substr_auto(haystack, needle_struct);
+    size_t position = sz_find_substr(haystack, needle_struct);
     return position != haystack.length;
 }
 
@@ -881,7 +885,7 @@ static int Str_find_( //
     haystack.length = normalized_length;
 
     // Perform contains operation
-    size_t offset = sz_find_substr_auto(haystack, needle);
+    size_t offset = sz_find_substr(haystack, needle);
     if (offset == haystack.length) { *offset_out = -1; }
     else { *offset_out = (Py_ssize_t)offset; }
 
@@ -1008,11 +1012,13 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
     haystack.start += normalized_offset;
     haystack.length = normalized_length;
 
-    size_t count = needle.length == 1 ? sz_count_char_swar(haystack, *needle.start) : 0;
-    if (needle.length != 1) {
+    size_t count = 0;
+    if (needle.length == 0 || haystack.length == 0 || haystack.length < needle.length) { count = 0; }
+    else if (needle.length == 1) { count = sz_count_char(haystack, needle.start[0]); }
+    else if (needle.length != 1) {
         if (allowoverlap) {
             while (haystack.length) {
-                size_t offset = sz_find_substr_auto(haystack, needle);
+                sz_size_t offset = sz_find_substr(haystack, needle);
                 int found = offset != haystack.length;
                 count += found;
                 haystack.start += offset + found;
@@ -1021,7 +1027,7 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
         }
         else {
             while (haystack.length) {
-                size_t offset = sz_find_substr_auto(haystack, needle);
+                sz_size_t offset = sz_find_substr(haystack, needle);
                 int found = offset != haystack.length;
                 count += found;
                 haystack.start += offset + needle.length;
@@ -1207,7 +1213,7 @@ static Strs *Str_split_(
         sz_haystack_t text_remaining;
         text_remaining.start = text.start + last_start;
         text_remaining.length = text.length - last_start;
-        sz_size_t offset_in_remaining = sz_find_substr_auto(text_remaining, separator);
+        sz_size_t offset_in_remaining = sz_find_substr(text_remaining, separator);
 
         // Reallocate offsets array if needed
         if (offsets_count >= offsets_capacity) {
diff --git a/scripts/test.c b/scripts/test.c
index f50d7e62..a921e76d 100644
--- a/scripts/test.c
+++ b/scripts/test.c
@@ -1,9 +1,9 @@
+#include <assert.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <stdint.h>
 #include <string.h>
 #include <time.h>
-#include <assert.h>
 
 #include <stringzilla.h>
 
@@ -18,8 +18,8 @@ void populate_random_string(char *buffer, int length, int variability) {
     buffer[length] = '\0';
 }
 
-// Test function for sz_find_substr_auto
-void test_sz_find_substr_auto() {
+// Test function for sz_find_substr
+void test_sz_find_substr() {
     char buffer[MAX_LENGTH + 1];
     char pattern[6]; // Maximum length of 5 + 1 for '\0'
 
@@ -40,11 +40,11 @@ void test_sz_find_substr_auto() {
 
             // Comparing the result of your function with the standard library function.
             const char *result_libc = strstr(buffer, pattern);
-            uint64_t result_stringzilla = sz_find_substr_auto(haystack, needle);
+            uint64_t result_stringzilla = sz_find_substr(haystack, needle);
 
             assert(((result_libc && result_stringzilla == (uint64_t)(result_libc - buffer)) ||
                     (!result_libc && result_stringzilla == (uint64_t)-1)) &&
-                   "Test failed for sz_find_substr_auto");
+                   "Test failed for sz_find_substr");
         }
     }
 }
@@ -52,7 +52,7 @@ void test_sz_find_substr_auto() {
 int main() {
     srand((unsigned int)time(NULL));
 
-    test_sz_find_substr_auto();
+    test_sz_find_substr();
     // Add calls to other test functions as you implement them
 
     printf("All tests passed!\n");
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 52cc4ec6..7e2957a3 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -1,12 +1,12 @@
 #ifndef STRINGZILLA_H_
 #define STRINGZILLA_H_
 
-#include <stdint.h> // `uint8_t`
+#include <ctype.h>  // `tolower`
+#include <search.h> // `qsort_s`
 #include <stddef.h> // `sz_size_t`
-#include <string.h> // `memcpy`
+#include <stdint.h> // `uint8_t`
 #include <stdlib.h> // `qsort_r`
-#include <search.h> // `qsort_s`
-#include <ctype.h>  // `tolower`
+#include <string.h> // `memcpy`
 
 #if defined(__AVX2__)
 #include <x86intrin.h>
@@ -427,7 +427,9 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
 #endif // Arm Neon
 
-inline static sz_size_t sz_find_substr_auto(sz_haystack_t h, sz_needle_t n) {
+inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_char_swar(h, n); }
+
+inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) {
     if (h.length < n.length) return h.length;
 
     switch (n.length) {

From 7a317c072072ab431e38ae1f7dedad13490f7722 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 5 Oct 2023 14:23:43 -0700
Subject: [PATCH 49/72] Make: Shift JavaScript CI

---
 .github/workflows/javascript-ci.yml | 21 ---------------------
 .github/workflows/prerelease.yml    | 17 +++++++++++++++++
 .github/workflows/release.yml       |  2 +-
 README.md                           | 10 +++++-----
 stringzilla/stringzilla.h           |  1 +
 5 files changed, 24 insertions(+), 27 deletions(-)
 delete mode 100644 .github/workflows/javascript-ci.yml

diff --git a/.github/workflows/javascript-ci.yml b/.github/workflows/javascript-ci.yml
deleted file mode 100644
index 674fa882..00000000
--- a/.github/workflows/javascript-ci.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-name: CI
-on:
-  pull_request:
-    branches: '*'
-  push:
-    branches: '*'
-
-jobs:
-  tests:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        node-version: [18.x]
-    steps:
-      - uses: actions/checkout@v4
-      - name: Use Node.js
-        uses: actions/setup-node@v3
-        with:
-          node-version: '18.x'
-      - run: npm i
-      - run: npm test
\ No newline at end of file
diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml
index 9c6bdab9..9940829f 100644
--- a/.github/workflows/prerelease.yml
+++ b/.github/workflows/prerelease.yml
@@ -70,4 +70,21 @@ jobs:
       - name: Test with PyTest
         run: pytest scripts/
 
+  test_javascript:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        node-version: [18.x]
+    steps:
+    
+      - uses: actions/checkout@v4
+      - name: Set up Node.js
+        uses: actions/setup-node@v3
+        with:
+          node-version: '18.x'
+      
+      - name: Build locally
+        run: npm i
 
+      - name: Test
+        run: npm test
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index f583fc05..b96529a7 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -127,4 +127,4 @@ jobs:
       - run: npm test
       - uses: JS-DevTools/npm-publish@v2
         with:
-          token: ${{ secrets.NPM_TOKEN }}
\ No newline at end of file
+          token: ${{ secrets.NPM_TOKEN }}
diff --git a/README.md b/README.md
index f774df35..3c04c219 100644
--- a/README.md
+++ b/README.md
@@ -119,9 +119,9 @@ sz_haystack_t haystack = {your_text, your_text_length};
 sz_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset};
 
 // Perform string-level operations
-size_t character_count = sz_count_char_swar(haystack, 'a');
-size_t character_position = sz_find_char_swar(haystack, 'a');
-size_t substring_position = sz_find_substr_swar(haystack, needle);
+size_t character_count = sz_count_char(haystack, 'a');
+size_t character_position = sz_find_char(haystack, 'a');
+size_t substring_position = sz_find_substr(haystack, needle);
 
 // Perform collection level operations
 sz_sequence_t array = {your_order, your_count, your_get_start, your_get_length, your_handle};
@@ -148,7 +148,7 @@ Here's how to set up your dev environment and run some tests.
 CPython:
 
 ```sh
-# Clean up and install
+# Clean up, install, and test!
 rm -rf build && pip install -e . && pytest scripts/ -s -x
 
 # Install without dependencies
@@ -158,7 +158,7 @@ pip install -e . --no-index --no-deps
 NodeJS:
 
 ```sh
-npm install && node javascript/test.js
+npm install && npm test
 ```
 
 ### Benchmarking
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 7e2957a3..e1425729 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -428,6 +428,7 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 #endif // Arm Neon
 
 inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_char_swar(h, n); }
+inline static sz_size_t sz_find_char(sz_haystack_t h, char n) { return sz_find_char_swar(h, n); }
 
 inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) {
     if (h.length < n.length) return h.length;

From 61588cc81f21f0e3b539dcd56a08cecc04677c05 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 5 Oct 2023 14:26:40 -0700
Subject: [PATCH 50/72] Improve: Silence type-casting warnings

---
 javascript/lib.c | 40 ++++++++++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index 3644468a..fe1f5f68 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -21,21 +21,29 @@ napi_value FindAPI(napi_env env, napi_callback_info info) {
     sz_needle_t needle_sz = {NULL, 0, 0};
 
     // For haystack
-    napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_sz.length);
+    napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length);
     haystack_sz.start = malloc(haystack_sz.length + 1);
-    napi_get_value_string_utf8(env, args[0], haystack_sz.start, haystack_sz.length + 1, &haystack_sz.length);
+    napi_get_value_string_utf8(env,
+                               args[0],
+                               (char *)haystack_sz.start,
+                               haystack_sz.length + 1,
+                               (size_t *)&haystack_sz.length);
 
     // For needle
-    napi_get_value_string_utf8(env, args[1], NULL, 0, &needle_sz.length);
+    napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle_sz.length);
     needle_sz.start = malloc(needle_sz.length + 1);
-    napi_get_value_string_utf8(env, args[1], needle_sz.start, needle_sz.length + 1, &needle_sz.length);
+    napi_get_value_string_utf8(env,
+                               args[1],
+                               (char *)needle_sz.start,
+                               needle_sz.length + 1,
+                               (size_t *)&needle_sz.length);
 
     // Perform the find operation
     sz_size_t result = sz_find_substr(haystack_sz, needle_sz);
 
     // Cleanup
-    free(haystack_sz.start);
-    free(needle_sz.start);
+    free((void *)haystack_sz.start);
+    free((void *)needle_sz.start);
 
     // Convert the result to JavaScript BigInt and return
     napi_value js_result;
@@ -63,14 +71,22 @@ napi_value CountAPI(napi_env env, napi_callback_info info) {
     sz_needle_t needle_sz = {NULL, 0, 0};
 
     // For haystack
-    napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_sz.length);
+    napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length);
     haystack_sz.start = malloc(haystack_sz.length + 1);
-    napi_get_value_string_utf8(env, args[0], haystack_sz.start, haystack_sz.length + 1, &haystack_sz.length);
+    napi_get_value_string_utf8(env,
+                               args[0],
+                               (char *)haystack_sz.start,
+                               haystack_sz.length + 1,
+                               (size_t *)&haystack_sz.length);
 
     // For needle
-    napi_get_value_string_utf8(env, args[1], NULL, 0, &needle_sz.length);
+    napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle_sz.length);
     needle_sz.start = malloc(needle_sz.length + 1);
-    napi_get_value_string_utf8(env, args[1], needle_sz.start, needle_sz.length + 1, &needle_sz.length);
+    napi_get_value_string_utf8(env,
+                               args[1],
+                               (char *)needle_sz.start,
+                               needle_sz.length + 1,
+                               (size_t *)&needle_sz.length);
 
     bool overlap = false;
     if (argc > 2) { napi_get_value_bool(env, args[2], &overlap); }
@@ -100,8 +116,8 @@ napi_value CountAPI(napi_env env, napi_callback_info info) {
     }
 
     // Cleanup
-    free(haystack_start);
-    free(needle_start);
+    free((void *)haystack_start);
+    free((void *)needle_start);
 
     // Convert the `count` to JavaScript `BigInt` and return
     napi_value js_count;

From b5a0ccf3fe0e4401da06d89461e53b124fa9e034 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 5 Oct 2023 15:13:17 -0700
Subject: [PATCH 51/72] Fix: Overflow bug in Arm NEON

---
 stringzilla/stringzilla.h | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index e1425729..c1e6adc3 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -408,7 +408,7 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
         //   vorrq_u32 (all)
         uint32x4_t matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3));
         uint64x2_t matches64x2 = vreinterpretq_u64_u32(matches);
-        int has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1);
+        uint64_t has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1);
 
         if (has_match) {
             for (sz_size_t i = 0; i < 16; i++) {
@@ -439,15 +439,13 @@ inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) {
     case 2: return sz_find_2chars_swar(h, n.start);
     case 3: return sz_find_3chars_swar(h, n.start);
     case 4:
-        return sz_find_4chars_swar(h, n.start);
-        // #if defined(__ARM_NEON)
-        //     default: return sz_find_substr_neon(h, n);
-        // #elif defined(__AVX2__)
-        //     default: return sz_find_substr_avx2(h, n);
-        // #else
-    default:
-        return sz_find_substr_swar(h, n);
-        // #endif
+#if defined(__ARM_NEON)
+    default: return sz_find_substr_neon(h, n);
+#elif defined(__AVX2__)
+    default: return sz_find_substr_avx2(h, n);
+#else
+    default: return sz_find_substr_swar(h, n);
+#endif
     }
 }
 

From e278b454c05a7598b13cf05e03ed8f32ca3c6622 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 5 Oct 2023 15:15:34 -0700
Subject: [PATCH 52/72] Make: Dependencies for testing

---
 .github/workflows/prerelease.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml
index 9940829f..7fb45e98 100644
--- a/.github/workflows/prerelease.yml
+++ b/.github/workflows/prerelease.yml
@@ -61,7 +61,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          python -m pip install --no-cache-dir --upgrade pip
+          python -m pip install --no-cache-dir --upgrade pip numpy
           pip install --no-cache-dir pytest
 
       - name: Build locally
@@ -71,6 +71,7 @@ jobs:
         run: pytest scripts/
 
   test_javascript:
+    name: Test JavaScript
     runs-on: ubuntu-latest
     strategy:
       matrix:

From b281bf0302ff1b1f8092e5933d80b5663d6a9d26 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 5 Oct 2023 15:55:49 -0700
Subject: [PATCH 53/72] Improve: Avoid inner `for`-loop on Arm NEON

---
 scripts/test_fuzzy.py     | 10 ++++--
 stringzilla/stringzilla.h | 70 +++++++++++++++++++++++----------------
 2 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/scripts/test_fuzzy.py b/scripts/test_fuzzy.py
index ba4aca42..dbefd485 100644
--- a/scripts/test_fuzzy.py
+++ b/scripts/test_fuzzy.py
@@ -65,15 +65,19 @@ def test_fuzzy_repetitions(repetitions: int):
     check_identical(native, big, "abcde", True)  # Missing pattern
 
 
-@pytest.mark.parametrize("pattern_length", [1, 2, 4, 5])
+@pytest.mark.parametrize("pattern_length", [1, 2, 3, 4, 5])
 @pytest.mark.parametrize("haystack_length", range(1, 65))
 @pytest.mark.parametrize("variability", range(1, 25))
 def test_fuzzy_substrings(pattern_length: int, haystack_length: int, variability: int):
     native = get_random_string(variability=variability, length=haystack_length)
     big = Str(native)
     pattern = get_random_string(variability=variability, length=pattern_length)
-    assert (pattern in native) == big.contains(pattern)
-    assert native.find(pattern) == big.find(pattern)
+    assert (pattern in native) == big.contains(
+        pattern
+    ), f"Failed to check if {pattern} at offset {native.find(pattern)} is present in {native}"
+    assert native.find(pattern) == big.find(
+        pattern
+    ), f"Failed to locate {pattern} at offset {native.find(pattern)} in {native}"
 
 
 @pytest.mark.parametrize("iterations", range(100))
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index c1e6adc3..7b664ca6 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -393,28 +393,49 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
     uint32x4_t const anomalies = vld1q_dup_u32(&anomaly);
     uint32x4_t const masks = vld1q_dup_u32(&mask);
+    uint32x4_t matches, matches0, matches1, matches2, matches3;
 
     char const *text = h.start;
-    for (; (text + n.length + 16) <= end; text += 16) {
-
-        uint32x4_t matches0 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 0)), masks), anomalies);
-        uint32x4_t matches1 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 1)), masks), anomalies);
-        uint32x4_t matches2 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 2)), masks), anomalies);
-        uint32x4_t matches3 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 3)), masks), anomalies);
-
-        // Extracting matches from matches:
-        //   vmaxvq_u32 (only a64)
-        //   vgetq_lane_u32 (all)
-        //   vorrq_u32 (all)
-        uint32x4_t matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3));
-        uint64x2_t matches64x2 = vreinterpretq_u64_u32(matches);
-        uint64_t has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1);
-
-        if (has_match) {
-            for (sz_size_t i = 0; i < 16; i++) {
-                if (sz_equal(text + i, n.start, n.length)) return i + (text - h.start);
+    while (text + n.length + 16 <= end) {
+
+        // Each of the following `matchesX` contains only 4 relevant bits - one per word.
+        // Each signifies a match at the given offset.
+        matches0 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 0)), masks), anomalies);
+        matches1 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 1)), masks), anomalies);
+        matches2 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 2)), masks), anomalies);
+        matches3 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 3)), masks), anomalies);
+        matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3));
+
+        if (vmaxvq_u32(matches)) {
+            // Let's isolate the match from every word
+            matches0 = vandq_u32(matches0, vdupq_n_u32(0x00000001));
+            matches1 = vandq_u32(matches1, vdupq_n_u32(0x00000002));
+            matches2 = vandq_u32(matches2, vdupq_n_u32(0x00000004));
+            matches3 = vandq_u32(matches3, vdupq_n_u32(0x00000008));
+            matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3));
+
+            // By now, every 32-bit word of `matches` no more than 4 set bits.
+            // Meaning that we can narrow it down to a single 16-bit word.
+            uint16x4_t matches_u16x4 = vmovn_u32(matches);
+            uint16_t matches_u16 =                       //
+                (vget_lane_u16(matches_u16x4, 0) << 0) | //
+                (vget_lane_u16(matches_u16x4, 1) << 4) | //
+                (vget_lane_u16(matches_u16x4, 2) << 8) | //
+                (vget_lane_u16(matches_u16x4, 3) << 12);
+
+            // Find the first match
+            size_t first_match_offset = __builtin_ctz(matches_u16);
+            if (n.length > 4) {
+                if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4))
+                    return text + first_match_offset - h.start;
+                else
+                    text += first_match_offset + 1;
             }
+            else
+                return text + first_match_offset - h.start;
         }
+        else
+            text += 16;
     }
 
     // Don't forget the last (up to 16+3=19) characters.
@@ -433,20 +454,13 @@ inline static sz_size_t sz_find_char(sz_haystack_t h, char n) { return sz_find_c
 inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) {
     if (h.length < n.length) return h.length;
 
-    switch (n.length) {
-    case 0: return 0;
-    case 1: return sz_find_char_swar(h, *n.start);
-    case 2: return sz_find_2chars_swar(h, n.start);
-    case 3: return sz_find_3chars_swar(h, n.start);
-    case 4:
 #if defined(__ARM_NEON)
-    default: return sz_find_substr_neon(h, n);
+    return sz_find_substr_neon(h, n);
 #elif defined(__AVX2__)
-    default: return sz_find_substr_avx2(h, n);
+    return sz_find_substr_avx2(h, n);
 #else
-    default: return sz_find_substr_swar(h, n);
+    return sz_find_substr_swar(h, n);
 #endif
-    }
 }
 
 inline static void sz_swap(sz_size_t *a, sz_size_t *b) {

From 305cec84d79429a8e9263c6b4b0a9e0c6dadcc8d Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 5 Oct 2023 16:06:14 -0700
Subject: [PATCH 54/72] Add: Micro-benchmarking notebook

---
 scripts/bench.ipynb | 185 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 scripts/bench.ipynb

diff --git a/scripts/bench.ipynb b/scripts/bench.ipynb
new file mode 100644
index 00000000..b69d2f8f
--- /dev/null
+++ b/scripts/bench.ipynb
@@ -0,0 +1,185 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import stringzilla as sz"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "129,644,797\n"
+     ]
+    }
+   ],
+   "source": [
+    "pythonic_str: str = open(\"../leipzig1M.txt\", \"r\").read()\n",
+    "print(f\"{len(pythonic_str):,}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sz_str = sz.Str(pythonic_str)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pattern = \"the\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1,456,488\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"{pythonic_str.count(pattern):,}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1,456,488\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"{sz_str.count(pattern):,}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "150 ms ± 2.01 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit -n 1 -r 100\n",
+    "pythonic_str.count(pattern)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "37.8 ms ± 286 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit -n 1 -r 100\n",
+    "sz_str.count(pattern)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "173 ns ± 23.7 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit -n 1 -r 1000\n",
+    "pythonic_str.find(pattern)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The slowest run took 82.51 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
+      "94.3 ns ± 108 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit -n 1 -r 1000\n",
+    "sz_str.find(pattern)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From b09cb3d64644d6d007b6f118abcbaccd0f9eab3f Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 7 Oct 2023 15:43:57 -0700
Subject: [PATCH 55/72] Make: `numpy` dependency

---
 .github/workflows/prerelease.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml
index 7fb45e98..dd09096b 100644
--- a/.github/workflows/prerelease.yml
+++ b/.github/workflows/prerelease.yml
@@ -34,7 +34,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
-          python -m pip install --no-cache-dir --upgrade pip
+          python -m pip install --no-cache-dir --upgrade pip numpy
           pip install --no-cache-dir pytest
       - name: Build locally
         run: python -m pip install .

From 779dded5c653bf9be56ec270ba0d8e9ee1a26052 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 7 Oct 2023 18:12:38 -0700
Subject: [PATCH 56/72] Improve: drop `ctype`, `stddef`, `stdint` headers

---
 .vscode/settings.json     |   2 +
 README.md                 |   4 +-
 python/lib.c              |  42 +++---
 scripts/bench.ipynb       |   2 +-
 stringzilla/stringzilla.h | 274 +++++++++++++++++++++++++-------------
 5 files changed, 203 insertions(+), 121 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 886d1d22..08c5bb65 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -151,6 +151,7 @@
     "NOMINMAX",
     "NOTIMPLEMENTED",
     "numpy",
+    "octogram",
     "pytest",
     "Pythonic",
     "quadgram",
@@ -166,6 +167,7 @@
     "substr",
     "SWAR",
     "TPFLAGS",
+    "unigram",
     "Vardanian",
     "vectorcallfunc",
     "XDECREF",
diff --git a/README.md b/README.md
index 3c04c219..85032c34 100644
--- a/README.md
+++ b/README.md
@@ -116,11 +116,11 @@ There is an ABI-stable C 99 interface, in case you have a database, an operating
 
 // Initialize your haystack and needle
 sz_haystack_t haystack = {your_text, your_text_length};
-sz_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset};
+sz_needle_t needle = {your_subtext, your_subtext_length, your_quadgram_offset};
 
 // Perform string-level operations
 size_t character_count = sz_count_char(haystack, 'a');
-size_t character_position = sz_find_char(haystack, 'a');
+size_t character_position = sz_find_unigram(haystack, 'a');
 size_t substring_position = sz_find_substr(haystack, needle);
 
 // Perform collection level operations
diff --git a/python/lib.c b/python/lib.c
index ad10f196..a0f6caca 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -48,12 +48,12 @@ static struct {
  *          native `mmap` module, as it exposes the address of the mapping in memory.
  */
 typedef struct {
-    PyObject_HEAD;
+    PyObject_HEAD
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
-    HANDLE file_handle;
+        HANDLE file_handle;
     HANDLE mapping_handle;
 #else
-    int file_descriptor;
+        int file_descriptor;
 #endif
     void *start;
     size_t length;
@@ -72,8 +72,7 @@ typedef struct {
  *      - Str(File("some-path.txt"), from=0, to=sys.maxint)
  */
 typedef struct {
-    PyObject_HEAD;
-    PyObject *parent;
+    PyObject_HEAD PyObject *parent;
     char const *start;
     size_t length;
 } Str;
@@ -83,14 +82,14 @@ typedef struct {
  *          for faster sorting, shuffling, joins, and lookups.
  */
 typedef struct {
-    PyObject_HEAD;
+    PyObject_HEAD
 
-    enum {
-        STRS_CONSECUTIVE_32,
-        STRS_CONSECUTIVE_64,
-        STRS_REORDERED,
-        STRS_MULTI_SOURCE,
-    } type;
+        enum {
+            STRS_CONSECUTIVE_32,
+            STRS_CONSECUTIVE_64,
+            STRS_REORDERED,
+            STRS_MULTI_SOURCE,
+        } type;
 
     union {
         /**
@@ -641,7 +640,7 @@ static void Str_releasebuffer(PyObject *_, Py_buffer *view) {
 static int Str_in(Str *self, PyObject *arg) {
 
     sz_needle_t needle_struct;
-    needle_struct.anomaly_offset = 0;
+    needle_struct.quadgram_offset = 0;
     if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) {
         PyErr_SetString(PyExc_TypeError, "Unsupported argument type");
         return -1;
@@ -851,7 +850,7 @@ static int Str_find_( //
     Py_ssize_t start, end;
 
     // Validate and convert `haystack` and `needle`
-    needle.anomaly_offset = 0;
+    needle.quadgram_offset = 0;
     if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
         !export_string_like(needle_obj, &needle.start, &needle.length)) {
         PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
@@ -1000,7 +999,7 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
     Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX;
     int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0;
 
-    needle.anomaly_offset = 0;
+    needle.quadgram_offset = 0;
     if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
         !export_string_like(needle_obj, &needle.start, &needle.length))
         return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL;
@@ -1287,7 +1286,7 @@ static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) {
     sz_needle_t separator;
     int keepseparator;
     Py_ssize_t maxsplit;
-    separator.anomaly_offset = 0;
+    separator.quadgram_offset = 0;
 
     // Validate and convert `text`
     if (!export_string_like(text_obj, &text.start, &text.length)) {
@@ -1565,14 +1564,9 @@ static boolean_t Strs_sort_(Strs *self,
     }
 
     // Get the parts and their count
-    sz_haystack_t *parts = NULL;
-    size_t count = 0;
-    switch (self->type) {
-    case STRS_REORDERED:
-        parts = self->data.reordered.parts;
-        count = self->data.reordered.count;
-        break;
-    }
+    // The only possible `self->type` by now is the `STRS_REORDERED`
+    sz_haystack_t *parts = self->data.reordered.parts;
+    size_t count = self->data.reordered.count;
 
     // Allocate temporary memory to store the ordering offsets
     size_t memory_needed = sizeof(sz_size_t) * count;
diff --git a/scripts/bench.ipynb b/scripts/bench.ipynb
index b69d2f8f..b3bc4392 100644
--- a/scripts/bench.ipynb
+++ b/scripts/bench.ipynb
@@ -176,7 +176,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.11"
+   "version": "3.10.13"
   },
   "orig_nbformat": 4
  },
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 7b664ca6..51319f01 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -1,10 +1,7 @@
 #ifndef STRINGZILLA_H_
 #define STRINGZILLA_H_
 
-#include <ctype.h>  // `tolower`
 #include <search.h> // `qsort_s`
-#include <stddef.h> // `sz_size_t`
-#include <stdint.h> // `uint8_t`
 #include <stdlib.h> // `qsort_r`
 #include <string.h> // `memcpy`
 
@@ -30,11 +27,71 @@
 extern "C" {
 #endif
 
-typedef uint32_t sz_anomaly_t;
-typedef uint64_t sz_size_t;
+#if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64)
+typedef unsigned long sz_size_t; // 64-bit on most platforms when pointers are 64-bit
+#else
+typedef unsigned sz_size_t; // 32-bit on most platforms when pointers are 32-bit
+#endif
+
+typedef unsigned sz_u32_t;           // Always 32 bits
+typedef unsigned long long sz_u64_t; // Always 64 bits
+
+typedef union sz_quadgram_t {
+    unsigned u32;
+    unsigned char u8s[4];
+} sz_quadgram_t; // Always 32-bit unsigned integer, representing 8 bytes/characters
+
+typedef union sz_octogram_t {
+    unsigned long long u64;
+    unsigned char u8s[8];
+} sz_octogram_t; // Always 64-bit unsigned integer, representing 8 bytes/characters
 
 inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { return (x + (divisor - 1)) / divisor; }
 
+inline static sz_size_t sz_tolower_ascii(char c) {
+    static char lowered[256] = {
+        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
+        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
+        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
+        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
+        64,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
+        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91,  92,  93,  94,  95,  //
+        96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
+        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, //
+        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
+        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
+        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
+        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, //
+    };
+    return lowered[(int)c];
+}
+
+inline static sz_size_t sz_toupper_ascii(char c) {
+    static char upped[256] = {
+        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
+        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
+        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
+        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
+        64,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
+        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91,  92,  93,  94,  95,  //
+        96,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  //
+        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  123, 124, 125, 126, 127, //
+        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
+        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
+        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
+        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, //
+    };
+    return upped[(int)c];
+}
+
 /**
  *  @brief This is a faster alternative to `strncmp(a, b, length) == 0`.
  *  @return 1 for `true`, and 0 for `false`.
@@ -53,28 +110,29 @@ typedef struct sz_haystack_t {
 typedef struct sz_needle_t {
     char const *start;
     sz_size_t length;
-    sz_size_t anomaly_offset;
+    sz_size_t quadgram_offset;
 } sz_needle_t;
 
 /**
  *  @brief  SWAR single-character counting procedure, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_count_char_swar(sz_haystack_t h, char n) {
+inline static sz_size_t sz_count_unigram_swar(sz_haystack_t h, char n) {
 
     sz_size_t result = 0;
     char const *text = h.start;
     char const *end = h.start + h.length;
 
-    for (; (uint64_t)text % 8 != 0 && text < end; ++text) result += *text == n;
+    // Process the misaligned head, to void UB on unaligned 64-bit loads.
+    for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == n;
 
     // This code simulates hyper-scalar execution, comparing 8 characters at a time.
-    uint64_t nnnnnnnn = n;
+    sz_u64_t nnnnnnnn = n;
     nnnnnnnn |= nnnnnnnn << 8;
     nnnnnnnn |= nnnnnnnn << 16;
     nnnnnnnn |= nnnnnnnn << 32;
     for (; text + 8 <= end; text += 8) {
-        uint64_t text_slice = *(uint64_t const *)text;
-        uint64_t match_indicators = ~(text_slice ^ nnnnnnnn);
+        sz_u64_t text_slice = *(sz_u64_t const *)text;
+        sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn);
         match_indicators &= match_indicators >> 1;
         match_indicators &= match_indicators >> 2;
         match_indicators &= match_indicators >> 4;
@@ -89,22 +147,23 @@ inline static sz_size_t sz_count_char_swar(sz_haystack_t h, char n) {
 /**
  *  @brief  SWAR single-character search in string, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_find_char_swar(sz_haystack_t h, char n) {
+inline static sz_size_t sz_find_unigram_swar(sz_haystack_t h, char n) {
 
     char const *text = h.start;
     char const *end = h.start + h.length;
 
-    for (; (uint64_t)text % 8 != 0 && text < end; ++text)
+    // Process the misaligned head, to void UB on unaligned 64-bit loads.
+    for (; ((unsigned long)text & 7ul) && text < end; ++text)
         if (*text == n) return text - h.start;
 
     // This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
-    uint64_t nnnnnnnn = n;
+    sz_u64_t nnnnnnnn = n;
     nnnnnnnn |= nnnnnnnn << 8;  // broadcast `n` into `nnnnnnnn`
     nnnnnnnn |= nnnnnnnn << 16; // broadcast `n` into `nnnnnnnn`
     nnnnnnnn |= nnnnnnnn << 32; // broadcast `n` into `nnnnnnnn`
     for (; text + 8 <= end; text += 8) {
-        uint64_t text_slice = *(uint64_t const *)text;
-        uint64_t match_indicators = ~(text_slice ^ nnnnnnnn);
+        sz_u64_t text_slice = *(sz_u64_t const *)text;
+        sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn);
         match_indicators &= match_indicators >> 1;
         match_indicators &= match_indicators >> 2;
         match_indicators &= match_indicators >> 4;
@@ -121,26 +180,31 @@ inline static sz_size_t sz_find_char_swar(sz_haystack_t h, char n) {
 /**
  *  @brief  SWAR character-bigram search in string, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) {
+inline static sz_size_t sz_find_bigram_swar(sz_haystack_t h, char const *n) {
 
     char const *text = h.start;
     char const *end = h.start + h.length;
 
+    // Process the misaligned head, to void UB on unaligned 64-bit loads.
+    for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text)
+        if (text[0] == n[0] && text[1] == n[1]) return text - h.start;
+
     // This code simulates hyper-scalar execution, analyzing 7 offsets at a time.
-    uint64_t nnnn = ((uint64_t)(n[0]) << 0) | ((uint64_t)(n[1]) << 8); // broadcast `n` into `nnnn`
+    sz_u64_t nnnn = ((sz_u64_t)(n[0]) << 0) | ((sz_u64_t)(n[1]) << 8); // broadcast `n` into `nnnn`
     nnnn |= nnnn << 16;                                                // broadcast `n` into `nnnn`
     nnnn |= nnnn << 32;                                                // broadcast `n` into `nnnn`
-    uint64_t text_slice;
     for (; text + 8 <= end; text += 7) {
-        memcpy(&text_slice, text, 8);
-        uint64_t even_indicators = ~(text_slice ^ nnnn);
-        uint64_t odd_indicators = ~((text_slice << 8) ^ nnnn);
+        sz_u64_t text_slice = *(sz_u64_t const *)text;
+        sz_u64_t even_indicators = ~(text_slice ^ nnnn);
+        sz_u64_t odd_indicators = ~((text_slice << 8) ^ nnnn);
+
         // For every even match - 2 char (16 bits) must be identical.
         even_indicators &= even_indicators >> 1;
         even_indicators &= even_indicators >> 2;
         even_indicators &= even_indicators >> 4;
         even_indicators &= even_indicators >> 8;
         even_indicators &= 0x0001000100010001;
+
         // For every odd match - 2 char (16 bits) must be identical.
         odd_indicators &= odd_indicators >> 1;
         odd_indicators &= odd_indicators >> 2;
@@ -149,7 +213,7 @@ inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) {
         odd_indicators &= 0x0001000100010000;
 
         if (even_indicators + odd_indicators) {
-            uint64_t match_indicators = even_indicators | (odd_indicators >> 8);
+            sz_u64_t match_indicators = even_indicators | (odd_indicators >> 8);
             return text - h.start + ctz64(match_indicators) / 8;
         }
     }
@@ -162,23 +226,26 @@ inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) {
 /**
  *  @brief  SWAR character-trigram search in string, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) {
+inline static sz_size_t sz_find_trigram_swar(sz_haystack_t h, char const *n) {
 
     char const *text = h.start;
     char const *end = h.start + h.length;
 
+    // Process the misaligned head, to void UB on unaligned 64-bit loads.
+    for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text)
+        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start;
+
     // This code simulates hyper-scalar execution, analyzing 6 offsets at a time.
     // We have two unused bytes at the end.
-    uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16); // broadcast `n` into `nn`
+    sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16); // broadcast `n` into `nn`
     nn |= nn << 24;                                                                           // broadcast `n` into `nn`
     nn <<= 16;                                                                                // broadcast `n` into `nn`
 
     for (; text + 8 <= end; text += 6) {
-        uint64_t text_slice;
-        memcpy(&text_slice, text, 8);
-        uint64_t first_indicators = ~(text_slice ^ nn);
-        uint64_t second_indicators = ~((text_slice << 8) ^ nn);
-        uint64_t third_indicators = ~((text_slice << 16) ^ nn);
+        sz_u64_t text_slice = *(sz_u64_t const *)text;
+        sz_u64_t first_indicators = ~(text_slice ^ nn);
+        sz_u64_t second_indicators = ~((text_slice << 8) ^ nn);
+        sz_u64_t third_indicators = ~((text_slice << 16) ^ nn);
         // For every first match - 3 chars (24 bits) must be identical.
         // For that merge every byte state and then combine those three-way.
         first_indicators &= first_indicators >> 1;
@@ -203,7 +270,7 @@ inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) {
         third_indicators =
             (third_indicators >> 16) & (third_indicators >> 8) & (third_indicators >> 0) & 0x0000010000010000;
 
-        uint64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16);
+        sz_u64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16);
         if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8;
     }
 
@@ -215,29 +282,32 @@ inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) {
 /**
  *  @brief  SWAR character-quadgram search in string, jumping 8 bytes at a time.
  */
-inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) {
+inline static sz_size_t sz_find_quadgram_swar(sz_haystack_t h, char const *n) {
 
     char const *text = h.start;
     char const *end = h.start + h.length;
 
+    // Process the misaligned head, to void UB on unaligned 64-bit loads.
+    for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text)
+        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start;
+
     // This code simulates hyper-scalar execution, analyzing 4 offsets at a time.
-    uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16) | ((uint64_t)(n[3]) << 24);
+    sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16) | ((sz_u64_t)(n[3]) << 24);
     nn |= nn << 32;
 
     //
-    uint8_t lookup[16] = {0};
-    lookup[0b0010] = lookup[0b0110] = lookup[0b1010] = lookup[0b1110] = 1;
-    lookup[0b0100] = lookup[0b1100] = 2;
-    lookup[0b1000] = 3;
+    unsigned char lookup[16] = {0};
+    lookup[0x2] = lookup[0x6] = lookup[0xA] = lookup[0xE] = 1;
+    lookup[0x4] = lookup[0xC] = 2;
+    lookup[0x8] = 3;
 
     // We can perform 5 comparisons per load, but it's easier to perform 4, minimizing the size of the lookup table.
     for (; text + 8 <= end; text += 4) {
-        uint64_t text_slice;
-        memcpy(&text_slice, text, 8);
-        uint64_t text01 = (text_slice & 0x00000000FFFFFFFF) | ((text_slice & 0x000000FFFFFFFF00) << 24);
-        uint64_t text23 = ((text_slice & 0x0000FFFFFFFF0000) >> 16) | ((text_slice & 0x00FFFFFFFF000000) << 8);
-        uint64_t text01_indicators = ~(text01 ^ nn);
-        uint64_t text23_indicators = ~(text23 ^ nn);
+        sz_u64_t text_slice = *(sz_u64_t const *)text;
+        sz_u64_t text01 = (text_slice & 0x00000000FFFFFFFF) | ((text_slice & 0x000000FFFFFFFF00) << 24);
+        sz_u64_t text23 = ((text_slice & 0x0000FFFFFFFF0000) >> 16) | ((text_slice & 0x00FFFFFFFF000000) << 8);
+        sz_u64_t text01_indicators = ~(text01 ^ nn);
+        sz_u64_t text23_indicators = ~(text23 ^ nn);
 
         // For every first match - 4 chars (32 bits) must be identical.
         text01_indicators &= text01_indicators >> 1;
@@ -258,7 +328,7 @@ inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) {
         if (text01_indicators + text23_indicators) {
             // Assuming we have performed 4 comparisons, we can only have 2^4=16 outcomes.
             // Which is small enough for a lookup table.
-            uint8_t match_indicators = (uint8_t)(                      //
+            unsigned char match_indicators = (unsigned char)(          //
                 (text01_indicators >> 31) | (text01_indicators << 0) | //
                 (text23_indicators >> 29) | (text23_indicators << 2));
             return text - h.start + lookup[match_indicators];
@@ -272,7 +342,7 @@ inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) {
 
 /**
  *  @brief  Trivial substring search with scalar code. Instead of comparing characters one-by-one
- *          it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper.
+ *          it compares 4-byte quadgrams first, most commonly prefixes. It's computationally cheaper.
  *          Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core.
  */
 inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) {
@@ -281,26 +351,36 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) {
 
     switch (n.length) {
     case 0: return 0;
-    case 1: return sz_find_char_swar(h, *n.start);
-    case 2: return sz_find_2chars_swar(h, n.start);
-    case 3: return sz_find_3chars_swar(h, n.start);
-    case 4: return sz_find_4chars_swar(h, n.start);
+    case 1: return sz_find_unigram_swar(h, *n.start);
+    case 2: return sz_find_bigram_swar(h, n.start);
+    case 3: return sz_find_trigram_swar(h, n.start);
+    case 4: return sz_find_quadgram_swar(h, n.start);
     default: {
         char const *text = h.start;
         char const *const end = h.start + h.length;
 
-        sz_anomaly_t n_anomaly, h_anomaly;
-        sz_size_t const n_suffix_len = n.length - 4 - n.anomaly_offset;
-        char const *n_suffix_ptr = n.start + 4 + n.anomaly_offset;
-        memcpy(&n_anomaly, n.start + n.anomaly_offset, 4);
-
-        text += n.anomaly_offset;
-        for (; text + n.length <= end; text++) {
-            memcpy(&h_anomaly, text, 4);
-            if (h_anomaly == n_anomaly)                                               // Match anomaly.
-                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))                   // Match suffix.
-                    if (sz_equal(text - n.anomaly_offset, n.start, n.anomaly_offset)) // Match prefix.
-                        return text - h.start - n.anomaly_offset;
+        sz_quadgram_t n_quadgram, h_quadgram;
+        sz_size_t const n_suffix_len = n.length - 4 - n.quadgram_offset;
+        char const *n_suffix_ptr = n.start + 4 + n.quadgram_offset;
+        n_quadgram.u8s[0] = n.start[n.quadgram_offset];
+        n_quadgram.u8s[1] = n.start[n.quadgram_offset + 1];
+        n_quadgram.u8s[2] = n.start[n.quadgram_offset + 2];
+        n_quadgram.u8s[3] = n.start[n.quadgram_offset + 3];
+        h_quadgram.u8s[0] = h.start[0];
+        h_quadgram.u8s[1] = h.start[1];
+        h_quadgram.u8s[2] = h.start[2];
+        h_quadgram.u8s[3] = h.start[3];
+
+        text += n.quadgram_offset;
+        while (text + n.length <= end) {
+            if (h_quadgram.u32 == n_quadgram.u32)                                       // Match quadgram.
+                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))                     // Match suffix.
+                    if (sz_equal(text - n.quadgram_offset, n.start, n.quadgram_offset)) // Match prefix.
+                        return text - h.start - n.quadgram_offset;
+
+            h_quadgram.u32 <<= 8;
+            h_quadgram.u8s[3] = *text;
+            ++text;
         }
         return h.length;
     }
@@ -319,17 +399,17 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
     char const *const end = h.start + h.length;
-    uint32_t anomaly = 0;
-    uint32_t mask = 0;
+    sz_quadgram_t quadgram = 0;
+    sz_quadgram_t mask = 0;
     switch (n.length) {
-    case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.start, 1); break;
-    case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.start, 2); break;
-    case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.start, 3); break;
-    default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.start, 4); break;
+    case 1: memset(&mask, 0xFF, 1), memcpy(&quadgram, n.start, 1); break;
+    case 2: memset(&mask, 0xFF, 2), memcpy(&quadgram, n.start, 2); break;
+    case 3: memset(&mask, 0xFF, 3), memcpy(&quadgram, n.start, 3); break;
+    default: memset(&mask, 0xFF, 4), memcpy(&quadgram, n.start, 4); break;
     }
 
-    __m256i const anomalies = _mm256_set1_epi32(*(uint32_t const *)&anomaly);
-    __m256i const masks = _mm256_set1_epi32(*(uint32_t const *)&mask);
+    __m256i const quadgrams = _mm256_set1_epi32(quadgram.u32);
+    __m256i const masks = _mm256_set1_epi32(mask.u32);
 
     // Top level for-loop changes dramatically.
     // In sequential computing model for 32 offsets we would do:
@@ -345,13 +425,13 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
 
         // Performing many unaligned loads ends up being faster than loading once and shuffling around.
         __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks);
-        int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, anomalies));
+        int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, quadgrams));
         __m256i texts1 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 1)), masks);
-        int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, anomalies));
+        int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, quadgrams));
         __m256i text2 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 2)), masks);
-        int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, anomalies));
+        int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, quadgrams));
         __m256i texts3 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 3)), masks);
-        int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies));
+        int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, quadgrams));
 
         if (matches0 | matches1 | matches2 | matches3) {
             for (sz_size_t i = 0; i < 32; i++) {
@@ -382,16 +462,22 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
     char const *const end = h.start + h.length;
-    uint32_t anomaly = 0;
-    uint32_t mask = 0;
+    sz_quadgram_t quadgram = {};
+    sz_quadgram_t mask = {};
     switch (n.length) {
-    case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.start, 1); break;
-    case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.start, 2); break;
-    case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.start, 3); break;
-    default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.start, 4); break;
+    case 1: mask.u8s[0] = 0xFF, quadgram.u8s[0] = n.start[0]; break;
+    case 2: mask.u8s[0] = mask.u8s[1] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1]; break;
+    case 3:
+        mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1],
+        quadgram.u8s[2] = n.start[2];
+        break;
+    default:
+        mask.u32 = 0xFFFFFFFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2],
+        quadgram.u8s[3] = n.start[3];
+        break;
     }
 
-    uint32x4_t const anomalies = vld1q_dup_u32(&anomaly);
+    uint32x4_t const quadgrams = vld1q_dup_u32(&quadgram.u32);
     uint32x4_t const masks = vld1q_dup_u32(&mask);
     uint32x4_t matches, matches0, matches1, matches2, matches3;
 
@@ -400,10 +486,10 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
         // Each of the following `matchesX` contains only 4 relevant bits - one per word.
         // Each signifies a match at the given offset.
-        matches0 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 0)), masks), anomalies);
-        matches1 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 1)), masks), anomalies);
-        matches2 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 2)), masks), anomalies);
-        matches3 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 3)), masks), anomalies);
+        matches0 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 0)), masks), quadgrams);
+        matches1 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 1)), masks), quadgrams);
+        matches2 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 2)), masks), quadgrams);
+        matches3 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 3)), masks), quadgrams);
         matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3));
 
         if (vmaxvq_u32(matches)) {
@@ -448,8 +534,8 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
 #endif // Arm Neon
 
-inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_char_swar(h, n); }
-inline static sz_size_t sz_find_char(sz_haystack_t h, char n) { return sz_find_char_swar(h, n); }
+inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_unigram_swar(h, n); }
+inline static sz_size_t sz_find_unigram(sz_haystack_t h, char n) { return sz_find_unigram_swar(h, n); }
 
 inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) {
     if (h.length < n.length) return h.length;
@@ -665,10 +751,10 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
         char *prefix = (char *)&sequence->order[i];
         for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j];
         if (case_insensitive) {
-            prefix[0] = tolower(prefix[0]);
-            prefix[1] = tolower(prefix[1]);
-            prefix[2] = tolower(prefix[2]);
-            prefix[3] = tolower(prefix[3]);
+            prefix[0] = sz_tolower_ascii(prefix[0]);
+            prefix[1] = sz_tolower_ascii(prefix[1]);
+            prefix[2] = sz_tolower_ascii(prefix[2]);
+            prefix[3] = sz_tolower_ascii(prefix[3]);
         }
     }
 
@@ -679,7 +765,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
     _sz_sort_recursion(sequence, 0, 32, comparator);
 }
 
-typedef uint8_t levenstein_distance_t;
+typedef unsigned char levenstein_distance_t;
 
 /**
  *  @return Amount of temporary memory (in bytes) needed to efficiently compute
@@ -758,11 +844,11 @@ inline static levenstein_distance_t sz_levenstein( //
 /**
  *  @brief  Hashes provided string using hardware-accelerated CRC32 instructions.
  */
-inline static uint32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; }
 
-inline static uint32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; }
 
-inline static uint32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; }
 
 #ifdef __cplusplus
 }

From d102bdf87078d9a6a8b3064759db3dbf7dc4e331 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 7 Oct 2023 19:42:11 -0700
Subject: [PATCH 57/72] Fix: SWAR search bug

---
 stringzilla/stringzilla.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 51319f01..7353024a 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -373,13 +373,13 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) {
 
         text += n.quadgram_offset;
         while (text + n.length <= end) {
+            h_quadgram.u8s[3] = text[3];
             if (h_quadgram.u32 == n_quadgram.u32)                                       // Match quadgram.
                 if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))                     // Match suffix.
                     if (sz_equal(text - n.quadgram_offset, n.start, n.quadgram_offset)) // Match prefix.
                         return text - h.start - n.quadgram_offset;
 
-            h_quadgram.u32 <<= 8;
-            h_quadgram.u8s[3] = *text;
+            h_quadgram.u32 >>= 8;
             ++text;
         }
         return h.length;

From 9b3c63d951461cd0dcccc3993ab1f7e18a2589c8 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 7 Oct 2023 20:39:44 -0700
Subject: [PATCH 58/72] Improve: avoiding nested loop in AVX2

---
 stringzilla/stringzilla.h | 93 +++++++++++++++++++++++++--------------
 1 file changed, 61 insertions(+), 32 deletions(-)

diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 7353024a..6b481dda 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -387,6 +387,40 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) {
     }
 }
 
+/**
+ *  Helper function, used in substring search operations.
+ */
+inline static void _sz_find_substr_populate_quadgram( //
+    sz_haystack_t h,
+    sz_needle_t n,
+    sz_quadgram_t *quadgram_out,
+    sz_quadgram_t *mask_out) {
+
+    sz_quadgram_t quadgram;
+    sz_quadgram_t mask;
+    switch (n.length) {
+    case 1:
+        mask.u8s[0] = 0xFF, mask.u8s[1] = mask.u8s[2] = mask.u8s[3] = 0;
+        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = quadgram.u8s[2] = quadgram.u8s[3] = 0;
+        break;
+    case 2:
+        mask.u8s[0] = mask.u8s[1] = 0xFF, mask.u8s[2] = mask.u8s[3] = 0;
+        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = quadgram.u8s[3] = 0;
+        break;
+    case 3:
+        mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, mask.u8s[3] = 0;
+        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], quadgram.u8s[3] = 0;
+        break;
+    default:
+        mask.u32 = 0xFFFFFFFF;
+        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2],
+        quadgram.u8s[3] = n.start[3];
+        break;
+    }
+    *quadgram_out = quadgram;
+    *mask_out = mask;
+}
+
 #if defined(__AVX2__)
 
 /**
@@ -399,15 +433,9 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
     char const *const end = h.start + h.length;
-    sz_quadgram_t quadgram = 0;
-    sz_quadgram_t mask = 0;
-    switch (n.length) {
-    case 1: memset(&mask, 0xFF, 1), memcpy(&quadgram, n.start, 1); break;
-    case 2: memset(&mask, 0xFF, 2), memcpy(&quadgram, n.start, 2); break;
-    case 3: memset(&mask, 0xFF, 3), memcpy(&quadgram, n.start, 3); break;
-    default: memset(&mask, 0xFF, 4), memcpy(&quadgram, n.start, 4); break;
-    }
-
+    sz_quadgram_t quadgram;
+    sz_quadgram_t mask;
+    _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask);
     __m256i const quadgrams = _mm256_set1_epi32(quadgram.u32);
     __m256i const masks = _mm256_set1_epi32(mask.u32);
 
@@ -421,7 +449,7 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
     //  + 3 bitwise ANDs.
     //  + 1 heavy (but very unlikely) branch.
     char const *text = h.start;
-    for (; (text + n.length + 32) <= end; text += 32) {
+    while (text + n.length + 32 <= end) {
 
         // Performing many unaligned loads ends up being faster than loading once and shuffling around.
         __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks);
@@ -434,10 +462,23 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
         int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, quadgrams));
 
         if (matches0 | matches1 | matches2 | matches3) {
-            for (sz_size_t i = 0; i < 32; i++) {
-                if (sz_equal(text + i, n.start, n.length)) return i + (text - h.start);
+            int matches =                   //
+                (matches0 & 0x1111'1111u) | //
+                (matches1 & 0x2222'2222u) | //
+                (matches2 & 0x4444'4444u) | //
+                (matches3 & 0x8888'8888u);
+            size_t first_match_offset = _tzcnt_u32(matches);
+            if (n.length > 4) {
+                if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4))
+                    return text + first_match_offset - h.start;
+                else
+                    text += first_match_offset + 1;
             }
-        }
+            else
+                return text + first_match_offset - h.start;
+            }
+        else
+            text += 32;
     }
 
     // Don't forget the last (up to 35) characters.
@@ -462,21 +503,9 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
     // Precomputed constants
     char const *const end = h.start + h.length;
-    sz_quadgram_t quadgram = {};
-    sz_quadgram_t mask = {};
-    switch (n.length) {
-    case 1: mask.u8s[0] = 0xFF, quadgram.u8s[0] = n.start[0]; break;
-    case 2: mask.u8s[0] = mask.u8s[1] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1]; break;
-    case 3:
-        mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1],
-        quadgram.u8s[2] = n.start[2];
-        break;
-    default:
-        mask.u32 = 0xFFFFFFFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2],
-        quadgram.u8s[3] = n.start[3];
-        break;
-    }
-
+    sz_quadgram_t quadgram;
+    sz_quadgram_t mask;
+    _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask);
     uint32x4_t const quadgrams = vld1q_dup_u32(&quadgram.u32);
     uint32x4_t const masks = vld1q_dup_u32(&mask);
     uint32x4_t matches, matches0, matches1, matches2, matches3;
@@ -486,10 +515,10 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
         // Each of the following `matchesX` contains only 4 relevant bits - one per word.
         // Each signifies a match at the given offset.
-        matches0 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 0)), masks), quadgrams);
-        matches1 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 1)), masks), quadgrams);
-        matches2 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 2)), masks), quadgrams);
-        matches3 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 3)), masks), quadgrams);
+        matches0 = vceqq_u32(vandq_u32(vld1q_u32(text + 0), masks), quadgrams);
+        matches1 = vceqq_u32(vandq_u32(vld1q_u32(text + 1), masks), quadgrams);
+        matches2 = vceqq_u32(vandq_u32(vld1q_u32(text + 2), masks), quadgrams);
+        matches3 = vceqq_u32(vandq_u32(vld1q_u32(text + 3), masks), quadgrams);
         matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3));
 
         if (vmaxvq_u32(matches)) {

From fa7984a9f0b70d387cecbc4f4c4442443cf0150f Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 13:01:13 -0700
Subject: [PATCH 59/72] Break: Avoiding LibC and new API

---
 stringzilla/stringzilla.h | 789 +++++++++++++++++++++-----------------
 1 file changed, 446 insertions(+), 343 deletions(-)

diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 6b481dda..0aa8774b 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -1,13 +1,10 @@
 #ifndef STRINGZILLA_H_
 #define STRINGZILLA_H_
 
-#include <search.h> // `qsort_s`
-#include <stdlib.h> // `qsort_r`
-#include <string.h> // `memcpy`
-
 #if defined(__AVX2__)
 #include <x86intrin.h>
 #endif
+
 #if defined(__ARM_NEON)
 #include <arm_neon.h>
 #endif
@@ -16,117 +13,88 @@
 #include <intrin.h>
 #define popcount64 __popcnt64
 #define ctz64 _tzcnt_u64
+#define clz64 _lzcnt_u64
 #define strncasecmp _strnicmp
 #define strcasecmp _stricmp
 #else
 #define popcount64 __builtin_popcountll
 #define ctz64 __builtin_ctzll
+#define clz64 __builtin_clzll
+#endif
+
+/**
+ *  Generally `NULL` is coming from locale.h, stddef.h, stdio.h, stdlib.h, string.h, time.h, and wchar.h,
+ *  according to the C standard.
+ */
+#ifndef NULL
+#define NULL ((void *)0)
 #endif
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+/**
+ *  @brief  Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size.
+ *          64-bit on most platforms where pointers are 64-bit.
+ *          32-bit on platforms where pointers are 32-bit.
+ */
 #if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64)
-typedef unsigned long sz_size_t; // 64-bit on most platforms when pointers are 64-bit
+typedef unsigned long sz_size_t;
 #else
-typedef unsigned sz_size_t; // 32-bit on most platforms when pointers are 32-bit
+typedef unsigned sz_size_t;
 #endif
 
+typedef int sz_bool_t;               // Only one relevant bit
 typedef unsigned sz_u32_t;           // Always 32 bits
 typedef unsigned long long sz_u64_t; // Always 64 bits
+typedef char const *sz_string_ptr_t; // A type alias for `char const * `
+
+/**
+ *  @brief  Helper construct for higher-level bindings.
+ */
+typedef struct sz_string_view_t {
+    sz_string_ptr_t start;
+    sz_size_t length;
+} sz_string_view_t;
 
-typedef union sz_quadgram_t {
+/**
+ *  @brief  Internal data-structure, used to address "anomalies" (often prefixes),
+ *          during substring search. Always a 32-bit unsigned integer, containing 4 chars.
+ */
+typedef union _sz_anomaly_t {
     unsigned u32;
     unsigned char u8s[4];
-} sz_quadgram_t; // Always 32-bit unsigned integer, representing 8 bytes/characters
-
-typedef union sz_octogram_t {
-    unsigned long long u64;
-    unsigned char u8s[8];
-} sz_octogram_t; // Always 64-bit unsigned integer, representing 8 bytes/characters
-
-inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { return (x + (divisor - 1)) / divisor; }
-
-inline static sz_size_t sz_tolower_ascii(char c) {
-    static char lowered[256] = {
-        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
-        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
-        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
-        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
-        64,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
-        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91,  92,  93,  94,  95,  //
-        96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
-        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, //
-        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
-        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
-        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
-        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
-        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
-        240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, //
-        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
-        240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, //
-    };
-    return lowered[(int)c];
-}
-
-inline static sz_size_t sz_toupper_ascii(char c) {
-    static char upped[256] = {
-        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
-        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
-        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
-        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
-        64,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
-        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91,  92,  93,  94,  95,  //
-        96,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  //
-        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  123, 124, 125, 126, 127, //
-        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
-        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
-        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
-        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
-        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
-        240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, //
-        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
-        240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, //
-    };
-    return upped[(int)c];
-}
+} _sz_anomaly_t;
 
 /**
- *  @brief This is a faster alternative to `strncmp(a, b, length) == 0`.
+ *  @brief  This is a slightly faster alternative to `strncmp(a, b, length) == 0`.
+ *          Doesn't provide major performance improvements, but helps avoid the LibC dependency.
  *  @return 1 for `true`, and 0 for `false`.
  */
-inline static int sz_equal(char const *a, char const *b, sz_size_t length) {
-    char const *const a_end = a + length;
+inline static sz_bool_t sz_equal(sz_string_ptr_t a, sz_string_ptr_t b, sz_size_t length) {
+    sz_string_ptr_t const a_end = a + length;
     while (a != a_end && *a == *b) a++, b++;
     return a_end == a;
 }
 
-typedef struct sz_haystack_t {
-    char const *start;
-    sz_size_t length;
-} sz_haystack_t;
-
-typedef struct sz_needle_t {
-    char const *start;
-    sz_size_t length;
-    sz_size_t quadgram_offset;
-} sz_needle_t;
-
 /**
- *  @brief  SWAR single-character counting procedure, jumping 8 bytes at a time.
+ *  @brief  Count the number of occurrences of a @b single-character needle in an arbitrary length haystack.
+ *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_size_t sz_count_unigram_swar(sz_haystack_t h, char n) {
+inline static sz_size_t sz_count_char_swar(sz_string_ptr_t const haystack,
+                                           sz_size_t const haystack_length,
+                                           sz_string_ptr_t const needle) {
 
     sz_size_t result = 0;
-    char const *text = h.start;
-    char const *end = h.start + h.length;
+    sz_string_ptr_t text = haystack;
+    sz_string_ptr_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
-    for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == n;
+    for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == *needle;
 
     // This code simulates hyper-scalar execution, comparing 8 characters at a time.
-    sz_u64_t nnnnnnnn = n;
+    sz_u64_t nnnnnnnn = *needle;
     nnnnnnnn |= nnnnnnnn << 8;
     nnnnnnnn |= nnnnnnnn << 16;
     nnnnnnnn |= nnnnnnnn << 32;
@@ -140,27 +108,31 @@ inline static sz_size_t sz_count_unigram_swar(sz_haystack_t h, char n) {
         result += popcount64(match_indicators);
     }
 
-    for (; text < end; ++text) result += *text == n;
+    for (; text < end; ++text) result += *text == *needle;
     return result;
 }
 
 /**
- *  @brief  SWAR single-character search in string, jumping 8 bytes at a time.
+ *  @brief  Find the first occurrence of a @b single-character needle in an arbitrary length haystack.
+ *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
+ *          Identical to `memchr(haystack, needle[0], haystack_length)`.
  */
-inline static sz_size_t sz_find_unigram_swar(sz_haystack_t h, char n) {
+inline static sz_string_ptr_t sz_find_1char_swar(sz_string_ptr_t const haystack,
+                                                 sz_size_t const haystack_length,
+                                                 sz_string_ptr_t const needle) {
 
-    char const *text = h.start;
-    char const *end = h.start + h.length;
+    sz_string_ptr_t text = haystack;
+    sz_string_ptr_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text < end; ++text)
-        if (*text == n) return text - h.start;
+        if (*text == *needle) return text;
 
     // This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
-    sz_u64_t nnnnnnnn = n;
-    nnnnnnnn |= nnnnnnnn << 8;  // broadcast `n` into `nnnnnnnn`
-    nnnnnnnn |= nnnnnnnn << 16; // broadcast `n` into `nnnnnnnn`
-    nnnnnnnn |= nnnnnnnn << 32; // broadcast `n` into `nnnnnnnn`
+    sz_u64_t nnnnnnnn = *needle;
+    nnnnnnnn |= nnnnnnnn << 8;  // broadcast `needle` into `nnnnnnnn`
+    nnnnnnnn |= nnnnnnnn << 16; // broadcast `needle` into `nnnnnnnn`
+    nnnnnnnn |= nnnnnnnn << 32; // broadcast `needle` into `nnnnnnnn`
     for (; text + 8 <= end; text += 8) {
         sz_u64_t text_slice = *(sz_u64_t const *)text;
         sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn);
@@ -169,30 +141,70 @@ inline static sz_size_t sz_find_unigram_swar(sz_haystack_t h, char n) {
         match_indicators &= match_indicators >> 4;
         match_indicators &= 0x0101010101010101;
 
-        if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8;
+        if (match_indicators != 0) return text + ctz64(match_indicators) / 8;
     }
 
     for (; text < end; ++text)
-        if (*text == n) return text - h.start;
-    return h.length;
+        if (*text == *needle) return text;
+    return NULL;
+}
+
+/**
+ *  @brief  Find the last occurrence of a @b single-character needle in an arbitrary length haystack.
+ *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
+ *          Identical to `memrchr(haystack, needle[0], haystack_length)`.
+ */
+inline static sz_string_ptr_t sz_rfind_1char_swar(sz_string_ptr_t const haystack,
+                                                  sz_size_t const haystack_length,
+                                                  sz_string_ptr_t const needle) {
+
+    sz_string_ptr_t const end = haystack + haystack_length;
+    sz_string_ptr_t text = end - 1;
+
+    // Process the misaligned head, to void UB on unaligned 64-bit loads.
+    for (; ((unsigned long)text & 7ul) && text >= haystack; --text)
+        if (*text == *needle) return text;
+
+    // This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
+    sz_u64_t nnnnnnnn = *needle;
+    nnnnnnnn |= nnnnnnnn << 8;  // broadcast `needle` into `nnnnnnnn`
+    nnnnnnnn |= nnnnnnnn << 16; // broadcast `needle` into `nnnnnnnn`
+    nnnnnnnn |= nnnnnnnn << 32; // broadcast `needle` into `nnnnnnnn`
+    for (; text - 8 >= haystack; text -= 8) {
+        sz_u64_t text_slice = *(sz_u64_t const *)text;
+        sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn);
+        match_indicators &= match_indicators >> 1;
+        match_indicators &= match_indicators >> 2;
+        match_indicators &= match_indicators >> 4;
+        match_indicators &= 0x0101010101010101;
+
+        if (match_indicators != 0) return text - 8 + clz64(match_indicators) / 8;
+    }
+
+    for (; text >= haystack; --text)
+        if (*text == *needle) return text;
+    return NULL;
 }
 
 /**
- *  @brief  SWAR character-bigram search in string, jumping 8 bytes at a time.
+ *  @brief  Find the first occurrence of a @b two-character needle in an arbitrary length haystack.
+ *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_size_t sz_find_bigram_swar(sz_haystack_t h, char const *n) {
+inline static sz_string_ptr_t sz_find_2char_swar(sz_string_ptr_t const haystack,
+                                                 sz_size_t const haystack_length,
+                                                 sz_string_ptr_t const needle) {
 
-    char const *text = h.start;
-    char const *end = h.start + h.length;
+    sz_string_ptr_t text = haystack;
+    sz_string_ptr_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1]) return text - h.start;
+        if (text[0] == needle[0] && text[1] == needle[1]) return text;
 
     // This code simulates hyper-scalar execution, analyzing 7 offsets at a time.
-    sz_u64_t nnnn = ((sz_u64_t)(n[0]) << 0) | ((sz_u64_t)(n[1]) << 8); // broadcast `n` into `nnnn`
-    nnnn |= nnnn << 16;                                                // broadcast `n` into `nnnn`
-    nnnn |= nnnn << 32;                                                // broadcast `n` into `nnnn`
+    sz_u64_t nnnn = ((sz_u64_t)(needle[0]) << 0) | ((sz_u64_t)(needle[1]) << 8); // broadcast `needle` into `nnnn`
+    nnnn |= nnnn << 16;                                                          // broadcast `needle` into `nnnn`
+    nnnn |= nnnn << 32;                                                          // broadcast `needle` into `nnnn`
     for (; text + 8 <= end; text += 7) {
         sz_u64_t text_slice = *(sz_u64_t const *)text;
         sz_u64_t even_indicators = ~(text_slice ^ nnnn);
@@ -214,32 +226,38 @@ inline static sz_size_t sz_find_bigram_swar(sz_haystack_t h, char const *n) {
 
         if (even_indicators + odd_indicators) {
             sz_u64_t match_indicators = even_indicators | (odd_indicators >> 8);
-            return text - h.start + ctz64(match_indicators) / 8;
+            return text + ctz64(match_indicators) / 8;
         }
     }
 
     for (; text + 2 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1]) return text - h.start;
-    return h.length;
+        if (text[0] == needle[0] && text[1] == needle[1]) return text;
+    return NULL;
 }
 
 /**
- *  @brief  SWAR character-trigram search in string, jumping 8 bytes at a time.
+ *  @brief  Find the first occurrence of a three-character needle in an arbitrary length haystack.
+ *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_size_t sz_find_trigram_swar(sz_haystack_t h, char const *n) {
+inline static sz_string_ptr_t sz_find_3char_swar(sz_string_ptr_t const haystack,
+                                                 sz_size_t const haystack_length,
+                                                 sz_string_ptr_t const needle) {
 
-    char const *text = h.start;
-    char const *end = h.start + h.length;
+    sz_string_ptr_t text = haystack;
+    sz_string_ptr_t end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start;
+        if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2]) return text;
 
     // This code simulates hyper-scalar execution, analyzing 6 offsets at a time.
     // We have two unused bytes at the end.
-    sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16); // broadcast `n` into `nn`
-    nn |= nn << 24;                                                                           // broadcast `n` into `nn`
-    nn <<= 16;                                                                                // broadcast `n` into `nn`
+    sz_u64_t nn =                      // broadcast `needle` into `nn`
+        (sz_u64_t)(needle[0] << 0) |   // broadcast `needle` into `nn`
+        ((sz_u64_t)(needle[1]) << 8) | // broadcast `needle` into `nn`
+        ((sz_u64_t)(needle[2]) << 16); // broadcast `needle` into `nn`
+    nn |= nn << 24;                    // broadcast `needle` into `nn`
+    nn <<= 16;                         // broadcast `needle` into `nn`
 
     for (; text + 8 <= end; text += 6) {
         sz_u64_t text_slice = *(sz_u64_t const *)text;
@@ -271,35 +289,39 @@ inline static sz_size_t sz_find_trigram_swar(sz_haystack_t h, char const *n) {
             (third_indicators >> 16) & (third_indicators >> 8) & (third_indicators >> 0) & 0x0000010000010000;
 
         sz_u64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16);
-        if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8;
+        if (match_indicators != 0) return text + ctz64(match_indicators) / 8;
     }
 
     for (; text + 3 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start;
-    return h.length;
+        if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2]) return text;
+    return NULL;
 }
 
 /**
- *  @brief  SWAR character-quadgram search in string, jumping 8 bytes at a time.
+ *  @brief  Find the first occurrence of a @b four-character needle in an arbitrary length haystack.
+ *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_size_t sz_find_quadgram_swar(sz_haystack_t h, char const *n) {
+inline static sz_string_ptr_t sz_find_4char_swar(sz_string_ptr_t const haystack,
+                                                 sz_size_t const haystack_length,
+                                                 sz_string_ptr_t const needle) {
 
-    char const *text = h.start;
-    char const *end = h.start + h.length;
+    sz_string_ptr_t text = haystack;
+    sz_string_ptr_t end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start;
+        if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2] && text[3] == needle[3]) return text;
 
     // This code simulates hyper-scalar execution, analyzing 4 offsets at a time.
-    sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16) | ((sz_u64_t)(n[3]) << 24);
+    sz_u64_t nn = (sz_u64_t)(needle[0] << 0) | ((sz_u64_t)(needle[1]) << 8) | ((sz_u64_t)(needle[2]) << 16) |
+                  ((sz_u64_t)(needle[3]) << 24);
     nn |= nn << 32;
 
     //
-    unsigned char lookup[16] = {0};
-    lookup[0x2] = lookup[0x6] = lookup[0xA] = lookup[0xE] = 1;
-    lookup[0x4] = lookup[0xC] = 2;
-    lookup[0x8] = 3;
+    unsigned char offset_in_slice[16] = {0};
+    offset_in_slice[0x2] = offset_in_slice[0x6] = offset_in_slice[0xA] = offset_in_slice[0xE] = 1;
+    offset_in_slice[0x4] = offset_in_slice[0xC] = 2;
+    offset_in_slice[0x8] = 3;
 
     // We can perform 5 comparisons per load, but it's easier to perform 4, minimizing the size of the lookup table.
     for (; text + 8 <= end; text += 4) {
@@ -331,58 +353,63 @@ inline static sz_size_t sz_find_quadgram_swar(sz_haystack_t h, char const *n) {
             unsigned char match_indicators = (unsigned char)(          //
                 (text01_indicators >> 31) | (text01_indicators << 0) | //
                 (text23_indicators >> 29) | (text23_indicators << 2));
-            return text - h.start + lookup[match_indicators];
+            return text + offset_in_slice[match_indicators];
         }
     }
 
     for (; text + 4 <= end; ++text)
-        if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start;
-    return h.length;
+        if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2] && text[3] == needle[3]) return text;
+    return NULL;
 }
 
 /**
- *  @brief  Trivial substring search with scalar code. Instead of comparing characters one-by-one
- *          it compares 4-byte quadgrams first, most commonly prefixes. It's computationally cheaper.
+ *  @brief  Trivial substring search with scalar SWAR code. Instead of comparing characters one-by-one
+ *          it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper.
  *          Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core.
  */
-inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) {
-
-    if (h.length < n.length) return h.length;
-
-    switch (n.length) {
-    case 0: return 0;
-    case 1: return sz_find_unigram_swar(h, *n.start);
-    case 2: return sz_find_bigram_swar(h, n.start);
-    case 3: return sz_find_trigram_swar(h, n.start);
-    case 4: return sz_find_quadgram_swar(h, n.start);
+inline static sz_string_ptr_t sz_find_substr_swar( //
+    sz_string_ptr_t const haystack,
+    sz_size_t const haystack_length,
+    sz_string_ptr_t const needle,
+    sz_size_t const needle_length) {
+
+    if (haystack_length < needle_length) return NULL;
+
+    sz_size_t anomaly_offset = 0;
+    switch (needle_length) {
+    case 0: return NULL;
+    case 1: return sz_find_1char_swar(haystack, haystack_length, needle);
+    case 2: return sz_find_2char_swar(haystack, haystack_length, needle);
+    case 3: return sz_find_3char_swar(haystack, haystack_length, needle);
+    case 4: return sz_find_4char_swar(haystack, haystack_length, needle);
     default: {
-        char const *text = h.start;
-        char const *const end = h.start + h.length;
-
-        sz_quadgram_t n_quadgram, h_quadgram;
-        sz_size_t const n_suffix_len = n.length - 4 - n.quadgram_offset;
-        char const *n_suffix_ptr = n.start + 4 + n.quadgram_offset;
-        n_quadgram.u8s[0] = n.start[n.quadgram_offset];
-        n_quadgram.u8s[1] = n.start[n.quadgram_offset + 1];
-        n_quadgram.u8s[2] = n.start[n.quadgram_offset + 2];
-        n_quadgram.u8s[3] = n.start[n.quadgram_offset + 3];
-        h_quadgram.u8s[0] = h.start[0];
-        h_quadgram.u8s[1] = h.start[1];
-        h_quadgram.u8s[2] = h.start[2];
-        h_quadgram.u8s[3] = h.start[3];
-
-        text += n.quadgram_offset;
-        while (text + n.length <= end) {
-            h_quadgram.u8s[3] = text[3];
-            if (h_quadgram.u32 == n_quadgram.u32)                                       // Match quadgram.
-                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))                     // Match suffix.
-                    if (sz_equal(text - n.quadgram_offset, n.start, n.quadgram_offset)) // Match prefix.
-                        return text - h.start - n.quadgram_offset;
-
-            h_quadgram.u32 >>= 8;
+        sz_string_ptr_t text = haystack;
+        sz_string_ptr_t const end = haystack + haystack_length;
+
+        _sz_anomaly_t n_anomaly, h_anomaly;
+        sz_size_t const n_suffix_len = needle_length - 4 - anomaly_offset;
+        sz_string_ptr_t n_suffix_ptr = needle + 4 + anomaly_offset;
+        n_anomaly.u8s[0] = needle[anomaly_offset];
+        n_anomaly.u8s[1] = needle[anomaly_offset + 1];
+        n_anomaly.u8s[2] = needle[anomaly_offset + 2];
+        n_anomaly.u8s[3] = needle[anomaly_offset + 3];
+        h_anomaly.u8s[0] = haystack[0];
+        h_anomaly.u8s[1] = haystack[1];
+        h_anomaly.u8s[2] = haystack[2];
+        h_anomaly.u8s[3] = haystack[3];
+
+        text += anomaly_offset;
+        while (text + needle_length <= end) {
+            h_anomaly.u8s[3] = text[3];
+            if (h_anomaly.u32 == n_anomaly.u32)                                  // Match anomaly.
+                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))              // Match suffix.
+                    if (sz_equal(text - anomaly_offset, needle, anomaly_offset)) // Match prefix - optimized out.
+                        return text - anomaly_offset;
+
+            h_anomaly.u32 >>= 8;
             ++text;
         }
-        return h.length;
+        return NULL;
     }
     }
 }
@@ -390,34 +417,33 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) {
 /**
  *  Helper function, used in substring search operations.
  */
-inline static void _sz_find_substr_populate_quadgram( //
-    sz_haystack_t h,
-    sz_needle_t n,
-    sz_quadgram_t *quadgram_out,
-    sz_quadgram_t *mask_out) {
-
-    sz_quadgram_t quadgram;
-    sz_quadgram_t mask;
-    switch (n.length) {
+inline static void _sz_find_substr_populate_anomaly( //
+    sz_string_ptr_t const needle,
+    sz_size_t const needle_length,
+    _sz_anomaly_t *anomaly_out,
+    _sz_anomaly_t *mask_out) {
+
+    _sz_anomaly_t anomaly;
+    _sz_anomaly_t mask;
+    switch (needle_length) {
     case 1:
         mask.u8s[0] = 0xFF, mask.u8s[1] = mask.u8s[2] = mask.u8s[3] = 0;
-        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = quadgram.u8s[2] = quadgram.u8s[3] = 0;
+        anomaly.u8s[0] = needle[0], anomaly.u8s[1] = anomaly.u8s[2] = anomaly.u8s[3] = 0;
         break;
     case 2:
         mask.u8s[0] = mask.u8s[1] = 0xFF, mask.u8s[2] = mask.u8s[3] = 0;
-        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = quadgram.u8s[3] = 0;
+        anomaly.u8s[0] = needle[0], anomaly.u8s[1] = needle[1], anomaly.u8s[2] = anomaly.u8s[3] = 0;
         break;
     case 3:
         mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, mask.u8s[3] = 0;
-        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], quadgram.u8s[3] = 0;
+        anomaly.u8s[0] = needle[0], anomaly.u8s[1] = needle[1], anomaly.u8s[2] = needle[2], anomaly.u8s[3] = 0;
         break;
     default:
         mask.u32 = 0xFFFFFFFF;
-        quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2],
-        quadgram.u8s[3] = n.start[3];
+        anomaly.u8s[0] = needle[0], anomaly.u8s[1] = needle[1], anomaly.u8s[2] = needle[2], anomaly.u8s[3] = needle[3];
         break;
     }
-    *quadgram_out = quadgram;
+    *anomaly_out = anomaly;
     *mask_out = mask;
 }
 
@@ -429,14 +455,17 @@ inline static void _sz_find_substr_populate_quadgram( //
  *          was practically more efficient than loading once and shifting around, as introduces
  *          less data dependencies.
  */
-inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
+inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack,
+                                                  sz_size_t const haystack_length,
+                                                  sz_string_ptr_t const needle,
+                                                  sz_size_t const needle_length) {
 
     // Precomputed constants
-    char const *const end = h.start + h.length;
-    sz_quadgram_t quadgram;
-    sz_quadgram_t mask;
-    _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask);
-    __m256i const quadgrams = _mm256_set1_epi32(quadgram.u32);
+    sz_string_ptr_t const end = haystack + haystack_length;
+    _sz_anomaly_t anomaly;
+    _sz_anomaly_t mask;
+    _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask);
+    __m256i const anomalies = _mm256_set1_epi32(anomaly.u32);
     __m256i const masks = _mm256_set1_epi32(mask.u32);
 
     // Top level for-loop changes dramatically.
@@ -448,18 +477,18 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
     //  + 4 movemasks.
     //  + 3 bitwise ANDs.
     //  + 1 heavy (but very unlikely) branch.
-    char const *text = h.start;
-    while (text + n.length + 32 <= end) {
+    sz_string_ptr_t text = haystack;
+    while (text + needle_length + 32 <= end) {
 
         // Performing many unaligned loads ends up being faster than loading once and shuffling around.
         __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks);
-        int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, quadgrams));
+        int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, anomalies));
         __m256i texts1 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 1)), masks);
-        int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, quadgrams));
+        int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, anomalies));
         __m256i text2 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 2)), masks);
-        int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, quadgrams));
+        int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, anomalies));
         __m256i texts3 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 3)), masks);
-        int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, quadgrams));
+        int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies));
 
         if (matches0 | matches1 | matches2 | matches3) {
             int matches =                   //
@@ -468,25 +497,21 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
                 (matches2 & 0x4444'4444u) | //
                 (matches3 & 0x8888'8888u);
             size_t first_match_offset = _tzcnt_u32(matches);
-            if (n.length > 4) {
-                if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4))
-                    return text + first_match_offset - h.start;
+            if (needle_length > 4) {
+                if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4))
+                    return text + first_match_offset;
                 else
                     text += first_match_offset + 1;
             }
             else
-                return text + first_match_offset - h.start;
-            }
+                return text + first_match_offset;
+        }
         else
             text += 32;
     }
 
     // Don't forget the last (up to 35) characters.
-    sz_haystack_t tail;
-    tail.start = text;
-    tail.length = end - text;
-    size_t tail_match = sz_find_substr_swar(tail, n);
-    return text + tail_match - h.start;
+    return sz_find_substr_swar(text, end - text, needle, needle_length);
 }
 
 #endif // x86 AVX2
@@ -499,26 +524,29 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) {
  *          was practically more efficient than loading once and shifting around, as introduces
  *          less data dependencies.
  */
-inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
+inline static sz_string_ptr_t sz_find_substr_neon(sz_string_ptr_t const haystack,
+                                                  sz_size_t const haystack_length,
+                                                  sz_string_ptr_t const needle,
+                                                  sz_size_t const needle_length) {
 
     // Precomputed constants
-    char const *const end = h.start + h.length;
-    sz_quadgram_t quadgram;
-    sz_quadgram_t mask;
-    _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask);
-    uint32x4_t const quadgrams = vld1q_dup_u32(&quadgram.u32);
-    uint32x4_t const masks = vld1q_dup_u32(&mask);
+    sz_string_ptr_t const end = haystack + haystack_length;
+    _sz_anomaly_t anomaly;
+    _sz_anomaly_t mask;
+    _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask);
+    uint32x4_t const anomalies = vld1q_dup_u32(&anomaly.u32);
+    uint32x4_t const masks = vld1q_dup_u32(&mask.u32);
     uint32x4_t matches, matches0, matches1, matches2, matches3;
 
-    char const *text = h.start;
-    while (text + n.length + 16 <= end) {
+    sz_string_ptr_t text = haystack;
+    while (text + needle_length + 16 <= end) {
 
         // Each of the following `matchesX` contains only 4 relevant bits - one per word.
         // Each signifies a match at the given offset.
-        matches0 = vceqq_u32(vandq_u32(vld1q_u32(text + 0), masks), quadgrams);
-        matches1 = vceqq_u32(vandq_u32(vld1q_u32(text + 1), masks), quadgrams);
-        matches2 = vceqq_u32(vandq_u32(vld1q_u32(text + 2), masks), quadgrams);
-        matches3 = vceqq_u32(vandq_u32(vld1q_u32(text + 3), masks), quadgrams);
+        matches0 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 0)), masks), anomalies);
+        matches1 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 1)), masks), anomalies);
+        matches2 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 2)), masks), anomalies);
+        matches3 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 3)), masks), anomalies);
         matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3));
 
         if (vmaxvq_u32(matches)) {
@@ -540,73 +568,172 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) {
 
             // Find the first match
             size_t first_match_offset = __builtin_ctz(matches_u16);
-            if (n.length > 4) {
-                if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4))
-                    return text + first_match_offset - h.start;
+            if (needle_length > 4) {
+                if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4))
+                    return text + first_match_offset;
                 else
                     text += first_match_offset + 1;
             }
             else
-                return text + first_match_offset - h.start;
+                return text + first_match_offset;
         }
         else
             text += 16;
     }
 
     // Don't forget the last (up to 16+3=19) characters.
-    sz_haystack_t tail;
-    tail.start = text;
-    tail.length = end - text;
-    size_t tail_match = sz_find_substr_swar(tail, n);
-    return text + tail_match - h.start;
+    return sz_find_substr_swar(text, end - text, needle, needle_length);
 }
 
 #endif // Arm Neon
 
-inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_unigram_swar(h, n); }
-inline static sz_size_t sz_find_unigram(sz_haystack_t h, char n) { return sz_find_unigram_swar(h, n); }
+inline static sz_size_t sz_count_char(sz_string_ptr_t const haystack,
+                                      sz_size_t const haystack_length,
+                                      sz_string_ptr_t const needle) {
+    return sz_count_char_swar(haystack, haystack_length, needle);
+}
 
-inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) {
-    if (h.length < n.length) return h.length;
+inline static sz_string_ptr_t sz_find_1char(sz_string_ptr_t const haystack,
+                                            sz_size_t const haystack_length,
+                                            sz_string_ptr_t const needle) {
+    return sz_find_1char_swar(haystack, haystack_length, needle);
+}
 
+inline static sz_string_ptr_t sz_rfind_1char(sz_string_ptr_t const haystack,
+                                             sz_size_t const haystack_length,
+                                             sz_string_ptr_t const needle) {
+    return sz_rfind_1char_swar(haystack, haystack_length, needle);
+}
+
+inline static sz_string_ptr_t sz_find_substr(sz_string_ptr_t const haystack,
+                                             sz_size_t const haystack_length,
+                                             sz_string_ptr_t const needle,
+                                             sz_size_t const needle_length) {
+    if (haystack_length < needle_length) return NULL;
 #if defined(__ARM_NEON)
-    return sz_find_substr_neon(h, n);
+    return sz_find_substr_neon(haystack, haystack_length, needle, needle_length);
 #elif defined(__AVX2__)
-    return sz_find_substr_avx2(h, n);
+    return sz_find_substr_avx2(haystack, haystack_length, needle, needle_length);
 #else
-    return sz_find_substr_swar(h, n);
+    return sz_find_substr_swar(haystack, haystack_length, needle, needle_length);
 #endif
 }
 
-inline static void sz_swap(sz_size_t *a, sz_size_t *b) {
-    sz_size_t t = *a;
+/**
+ *  @brief  Maps any ASCII character to itself, or the lowercase variant, if available.
+ */
+inline static char sz_tolower_ascii(char c) {
+    static unsigned char lowered[256] = {
+        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
+        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
+        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
+        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
+        64,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
+        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91,  92,  93,  94,  95,  //
+        96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
+        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, //
+        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
+        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
+        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
+        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, //
+    };
+    return *(char *)&lowered[(int)c];
+}
+
+/**
+ *  @brief  Maps any ASCII character to itself, or the uppercase variant, if available.
+ */
+inline static char sz_toupper_ascii(char c) {
+    static unsigned char upped[256] = {
+        0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  //
+        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  //
+        32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  //
+        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  //
+        64,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, //
+        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91,  92,  93,  94,  95,  //
+        96,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  //
+        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  123, 124, 125, 126, 127, //
+        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, //
+        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, //
+        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, //
+        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, //
+        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, //
+        240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, //
+    };
+    return *(char *)&upped[(int)c];
+}
+
+/**
+ *  @brief  Char-level lexicographic comparison of two strings.
+ *          Doesn't provide major performance improvements, but helps avoid the LibC dependency.
+ */
+inline static sz_bool_t sz_is_less_ascii(sz_string_ptr_t const a,
+                                         sz_size_t const a_length,
+                                         sz_string_ptr_t const b,
+                                         sz_size_t const b_length) {
+
+    sz_size_t min_length = (a_length < b_length) ? a_length : b_length;
+    for (sz_size_t i = 0; i < min_length; ++i) {
+        if (a[i] < b[i]) return 1;
+        if (a[i] > b[i]) return 0;
+    }
+    return a_length < b_length;
+}
+
+/**
+ *  @brief  Char-level lexicographic comparison of two strings, insensitive to the case of ASCII symbols.
+ *          Doesn't provide major performance improvements, but helps avoid the LibC dependency.
+ */
+inline static sz_bool_t sz_is_less_uncased_ascii(sz_string_ptr_t const a,
+                                                 sz_size_t const a_length,
+                                                 sz_string_ptr_t const b,
+                                                 sz_size_t const b_length) {
+
+    sz_size_t min_length = (a_length < b_length) ? a_length : b_length;
+    for (sz_size_t i = 0; i < min_length; ++i) {
+        char a_lower = sz_tolower_ascii(a[i]);
+        char b_lower = sz_tolower_ascii(b[i]);
+        if (a_lower < b_lower) return 1;
+        if (a_lower > b_lower) return 0;
+    }
+    return a_length < b_length;
+}
+
+/**
+ *  @brief  Helper, that swaps two 64-bit integers representing the order of elements in the sequence.
+ */
+inline static void _sz_swap_order(sz_u64_t *a, sz_u64_t *b) {
+    sz_u64_t t = *a;
     *a = *b;
     *b = t;
 }
 
-typedef char const *(*sz_sequence_get_start_t)(void const *, sz_size_t);
-typedef sz_size_t (*sz_sequence_get_length_t)(void const *, sz_size_t);
-typedef int (*sz_sequence_predicate_t)(void const *, sz_size_t);
-typedef int (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t);
+struct sz_sequence_s;
 
-// Define a type for the comparison function, depending on the platform.
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || defined(__APPLE__)
-typedef int (*sz_qsort_comparison_func_t)(void *, void const *, void const *);
-#else
-typedef int (*sz_qsort_comparison_func_t)(void const *, void const *, void *);
-#endif
+typedef sz_string_ptr_t (*sz_sequence_member_start_t)(void const *, sz_size_t);
+typedef sz_size_t (*sz_sequence_member_length_t)(void const *, sz_size_t);
+typedef sz_bool_t (*sz_sequence_predicate_t)(void const *, sz_size_t);
+typedef sz_bool_t (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t);
+typedef sz_bool_t (*sz_string_is_less_t)(sz_string_ptr_t, sz_size_t, sz_string_ptr_t, sz_size_t);
 
-typedef struct sz_sequence_t {
-    sz_size_t *order;
+typedef struct sz_sequence_s {
+    sz_u64_t *order;
     sz_size_t count;
-    sz_sequence_get_start_t get_start;
-    sz_sequence_get_length_t get_length;
+    sz_sequence_member_start_t get_start;
+    sz_sequence_member_length_t get_length;
     void const *handle;
 } sz_sequence_t;
 
 /**
- *  @brief  Similar to `std::partition`, given a predicate splits the
- *          sequence into two parts.
+ *  @brief  Similar to `std::partition`, given a predicate splits the sequence into two parts.
+ *          The algorithm is unstable, meaning that elements may change relative order, as long
+ *          as they are in the right partition. This is the simpler algorithm for partitioning.
  */
 inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) {
 
@@ -615,14 +742,16 @@ inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predic
 
     for (sz_size_t i = matches + 1; i < sequence->count; ++i)
         if (predicate(sequence->handle, sequence->order[i]))
-            sz_swap(sequence->order + i, sequence->order + matches), ++matches;
+            _sz_swap_order(sequence->order + i, sequence->order + matches), ++matches;
 
     return matches;
 }
 
 /**
- *  @brief  Inplace `std::set_union` for two consecutive chunks forming
- *          the same continuous sequence.
+ *  @brief  Inplace `std::set_union` for two consecutive chunks forming the same continuous `sequence`.
+ *
+ *  @param partition The number of elements in the first sub-sequence in `sequence`.
+ *  @param less Comparison function, to determine the lexicographic ordering.
  */
 inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) {
 
@@ -642,10 +771,7 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq
 
             // Shift all the elements between element 1
             // element 2, right by 1.
-            while (index != start_a) {
-                sequence->order[index] = sequence->order[index - 1];
-                index--;
-            }
+            while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; }
             sequence->order[start_a] = value;
 
             // Update all the pointers
@@ -656,112 +782,86 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq
     }
 }
 
+inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t comparator) {
+    sz_u64_t *keys = sequence->order;
+    sz_size_t keys_count = sequence->count;
+    for (sz_size_t i = 1; i < keys_count; i++) {
+        sz_u64_t i_key = keys[i];
+        // Move elements of arr[0..i-1] that are greater than key to one position ahead of their current position
+        sz_size_t j = i;
+        for (; j > 0 && comparator(sequence, keys[j - 1], i) != 0; --j) keys[j] = keys[j - 1];
+        keys[j] = i_key;
+    }
+}
+
+/**
+ *  @brief  Internal Radix sorting procedure.
+ */
 inline static void _sz_sort_recursion( //
     sz_sequence_t *sequence,
     sz_size_t bit_idx,
     sz_size_t bit_max,
-    sz_qsort_comparison_func_t qsort_comparator) {
+    sz_sequence_comparator_t comparator,
+    sz_size_t partial_order_length) {
 
     if (!sequence->count) return;
 
     // Partition a range of integers according to a specific bit value
     sz_size_t split = 0;
     {
-        sz_size_t mask = (1ul << 63) >> bit_idx;
+        sz_u64_t mask = (1ul << 63) >> bit_idx;
         while (split != sequence->count && !(sequence->order[split] & mask)) ++split;
         for (sz_size_t i = split + 1; i < sequence->count; ++i)
-            if (!(sequence->order[i] & mask)) sz_swap(sequence->order + i, sequence->order + split), ++split;
+            if (!(sequence->order[i] & mask)) _sz_swap_order(sequence->order + i, sequence->order + split), ++split;
     }
 
     // Go down recursively
     if (bit_idx < bit_max) {
         sz_sequence_t a = *sequence;
         a.count = split;
-        _sz_sort_recursion(&a, bit_idx + 1, bit_max, qsort_comparator);
+        _sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length);
 
         sz_sequence_t b = *sequence;
         b.order += split;
         b.count -= split;
-        _sz_sort_recursion(&b, bit_idx + 1, bit_max, qsort_comparator);
+        _sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length);
     }
     // Reached the end of recursion
     else {
         // Discard the prefixes
-        for (sz_size_t i = 0; i != sequence->count; ++i) { memset((char *)(&sequence->order[i]) + 4, 0, 4ul); }
-
-        // Perform sorts on smaller chunks instead of the whole handle
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
-        // https://stackoverflow.com/a/39561369
-        // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/qsort-s?view=msvc-170
-        qsort_s(sequence->order, split, sizeof(sz_size_t), qsort_comparator, (void *)sequence);
-        qsort_s(sequence->order + split,
-                sequence->count - split,
-                sizeof(sz_size_t),
-                qsort_comparator,
-                (void *)sequence);
-#elif __APPLE__
-        qsort_r(sequence->order, split, sizeof(sz_size_t), (void *)sequence, qsort_comparator);
-        qsort_r(sequence->order + split,
-                sequence->count - split,
-                sizeof(sz_size_t),
-                (void *)sequence,
-                qsort_comparator);
-#else
-        // https://linux.die.net/man/3/qsort_r
-        qsort_r(sequence->order, split, sizeof(sz_size_t), qsort_comparator, (void *)sequence);
-        qsort_r(sequence->order + split,
-                sequence->count - split,
-                sizeof(sz_size_t),
-                qsort_comparator,
-                (void *)sequence);
-#endif
+        sz_u32_t *order_half_words = (sz_u32_t *)sequence->order;
+        for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; }
+
+        sz_sequence_t a = *sequence;
+        a.count = split;
+        sz_sort_insertion(&a, comparator);
+
+        sz_sequence_t b = *sequence;
+        b.order += split;
+        b.count -= split;
+        sz_sort_insertion(&b, comparator);
     }
 }
 
-inline static int _sz_sort_sequence_strncmp(
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__
-    void *sequence_raw, void const *a_raw, void const *b_raw
-#else
-    void const *a_raw, void const *b_raw, void *sequence_raw
-#endif
-) {
-    // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1
-    // https://www.man7.org/linux/man-pages/man3/strcmp.3.html
-    sz_sequence_t *sequence = (sz_sequence_t *)sequence_raw;
-    sz_size_t a = *(sz_size_t *)a_raw;
-    sz_size_t b = *(sz_size_t *)b_raw;
-    sz_size_t a_len = sequence->get_length(sequence->handle, a);
-    sz_size_t b_len = sequence->get_length(sequence->handle, b);
-    int res = strncmp( //
-        sequence->get_start(sequence->handle, a),
-        sequence->get_start(sequence->handle, b),
-        a_len > b_len ? b_len : a_len);
-    return res ? res : a_len - b_len;
+inline static sz_bool_t _sz_sort_compare_less_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
+    sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key);
+    sz_size_t i_len = sequence->get_length(sequence->handle, i_key);
+    sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key);
+    sz_size_t j_len = sequence->get_length(sequence->handle, j_key);
+    return sz_is_less_ascii(i_str, i_len, j_str, j_len);
 }
 
-inline static int _sz_sort_sequence_strncasecmp(
-#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__
-    void *sequence_raw, void const *a_raw, void const *b_raw
-#else
-    void const *a_raw, void const *b_raw, void *sequence_raw
-#endif
-) {
-    // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1
-    // https://www.man7.org/linux/man-pages/man3/strcmp.3.html
-    sz_sequence_t *sequence = (sz_sequence_t *)sequence_raw;
-    sz_size_t a = *(sz_size_t *)a_raw;
-    sz_size_t b = *(sz_size_t *)b_raw;
-    sz_size_t a_len = sequence->get_length(sequence->handle, a);
-    sz_size_t b_len = sequence->get_length(sequence->handle, b);
-    int res = strncasecmp( //
-        sequence->get_start(sequence->handle, a),
-        sequence->get_start(sequence->handle, b),
-        a_len > b_len ? b_len : a_len);
-    return res ? res : a_len - b_len;
+inline static sz_bool_t _sz_sort_compare_less_uncased_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
+    sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key);
+    sz_size_t i_len = sequence->get_length(sequence->handle, i_key);
+    sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key);
+    sz_size_t j_len = sequence->get_length(sequence->handle, j_key);
+    return sz_is_less_uncased_ascii(i_str, i_len, j_str, j_len);
 }
 
 typedef struct sz_sort_config_t {
-    int case_insensitive;
+    sz_bool_t case_insensitive;
+    sz_size_t partial_order_length;
 } sz_sort_config_t;
 
 /**
@@ -770,11 +870,13 @@ typedef struct sz_sort_config_t {
  */
 inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *config) {
 
-    int case_insensitive = config && config->case_insensitive;
+    sz_bool_t case_insensitive = config && config->case_insensitive;
+    sz_size_t partial_order_length =
+        config && config->partial_order_length ? config->partial_order_length : sequence->count;
 
     // Export up to 4 bytes into the `sequence` bits themselves
     for (sz_size_t i = 0; i != sequence->count; ++i) {
-        char const *begin = sequence->get_start(sequence->handle, sequence->order[i]);
+        sz_string_ptr_t begin = sequence->get_start(sequence->handle, sequence->order[i]);
         sz_size_t length = sequence->get_length(sequence->handle, sequence->order[i]);
         length = length > 4ul ? 4ul : length;
         char *prefix = (char *)&sequence->order[i];
@@ -787,11 +889,11 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
         }
     }
 
-    sz_qsort_comparison_func_t comparator = _sz_sort_sequence_strncmp;
-    if (case_insensitive) comparator = _sz_sort_sequence_strncasecmp;
+    sz_sequence_comparator_t comparator = (sz_sequence_comparator_t)_sz_sort_compare_less_ascii;
+    if (case_insensitive) comparator = (sz_sequence_comparator_t)_sz_sort_compare_less_uncased_ascii;
 
     // Perform optionally-parallel radix sort on them
-    _sz_sort_recursion(sequence, 0, 32, comparator);
+    _sz_sort_recursion(sequence, 0, 32, comparator, partial_order_length);
 }
 
 typedef unsigned char levenstein_distance_t;
@@ -806,9 +908,9 @@ inline static sz_size_t sz_levenstein_memory_needed(sz_size_t _, sz_size_t b_len
  *  @brief  Auxiliary function, that computes the minimum of three values.
  */
 inline static levenstein_distance_t _sz_levenstein_minimum( //
-    levenstein_distance_t a,
-    levenstein_distance_t b,
-    levenstein_distance_t c) {
+    levenstein_distance_t const a,
+    levenstein_distance_t const b,
+    levenstein_distance_t const c) {
 
     return (a < b ? (a < c ? a : c) : (b < c ? b : c));
 }
@@ -818,11 +920,11 @@ inline static levenstein_distance_t _sz_levenstein_minimum( //
  *          It accepts an upper bound on the possible error. Quadratic complexity in time, linear in space.
  */
 inline static levenstein_distance_t sz_levenstein( //
-    char const *a,
-    sz_size_t a_length,
-    char const *b,
-    sz_size_t b_length,
-    levenstein_distance_t bound,
+    sz_string_ptr_t const a,
+    sz_size_t const a_length,
+    sz_string_ptr_t const b,
+    sz_size_t const b_length,
+    levenstein_distance_t const bound,
     void *buffer) {
 
     // If one of the strings is empty - the edit distance is equal to the length of the other one
@@ -873,11 +975,11 @@ inline static levenstein_distance_t sz_levenstein( //
 /**
  *  @brief  Hashes provided string using hardware-accelerated CRC32 instructions.
  */
-inline static sz_u32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_native(sz_string_ptr_t start, sz_size_t length) { return 0; }
 
-inline static sz_u32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_neon(sz_string_ptr_t start, sz_size_t length) { return 0; }
 
-inline static sz_u32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_sse(sz_string_ptr_t start, sz_size_t length) { return 0; }
 
 #ifdef __cplusplus
 }
@@ -889,5 +991,6 @@ inline static sz_u32_t sz_hash_crc32_sse(char const *start, sz_size_t length) {
 #endif
 #undef popcount64
 #undef ctz64
+#undef clz64
 
 #endif // STRINGZILLA_H_

From a7796a13eb365ff7a0d044576840abb42001db63 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 14:38:44 -0700
Subject: [PATCH 60/72] Improve: Intro-sort

---
 stringzilla/stringzilla.h | 167 ++++++++++++++++++++++++++++++++------
 1 file changed, 142 insertions(+), 25 deletions(-)

diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 0aa8774b..84e864cf 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -714,15 +714,15 @@ inline static void _sz_swap_order(sz_u64_t *a, sz_u64_t *b) {
     *b = t;
 }
 
-struct sz_sequence_s;
+struct sz_sequence_t;
 
-typedef sz_string_ptr_t (*sz_sequence_member_start_t)(void const *, sz_size_t);
-typedef sz_size_t (*sz_sequence_member_length_t)(void const *, sz_size_t);
-typedef sz_bool_t (*sz_sequence_predicate_t)(void const *, sz_size_t);
-typedef sz_bool_t (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t);
+typedef sz_string_ptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t);
+typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t);
+typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t);
+typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t);
 typedef sz_bool_t (*sz_string_is_less_t)(sz_string_ptr_t, sz_size_t, sz_string_ptr_t, sz_size_t);
 
-typedef struct sz_sequence_s {
+typedef struct sz_sequence_t {
     sz_u64_t *order;
     sz_size_t count;
     sz_sequence_member_start_t get_start;
@@ -738,10 +738,10 @@ typedef struct sz_sequence_s {
 inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) {
 
     sz_size_t matches = 0;
-    while (matches != sequence->count && predicate(sequence->handle, sequence->order[matches])) ++matches;
+    while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches;
 
     for (sz_size_t i = matches + 1; i < sequence->count; ++i)
-        if (predicate(sequence->handle, sequence->order[i]))
+        if (predicate(sequence, sequence->order[i]))
             _sz_swap_order(sequence->order + i, sequence->order + matches), ++matches;
 
     return matches;
@@ -758,13 +758,13 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq
     sz_size_t start_b = partition + 1;
 
     // If the direct merge is already sorted
-    if (!less(sequence->handle, sequence->order[start_b], sequence->order[partition])) return;
+    if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return;
 
     sz_size_t start_a = 0;
     while (start_a <= partition && start_b <= sequence->count) {
 
         // If element 1 is in right place
-        if (!less(sequence->handle, sequence->order[start_b], sequence->order[start_a])) { start_a++; }
+        if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; }
         else {
             sz_size_t value = sequence->order[start_b];
             sz_size_t index = start_b;
@@ -782,18 +782,135 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq
     }
 }
 
-inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t comparator) {
+inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) {
     sz_u64_t *keys = sequence->order;
     sz_size_t keys_count = sequence->count;
     for (sz_size_t i = 1; i < keys_count; i++) {
         sz_u64_t i_key = keys[i];
-        // Move elements of arr[0..i-1] that are greater than key to one position ahead of their current position
         sz_size_t j = i;
-        for (; j > 0 && comparator(sequence, keys[j - 1], i) != 0; --j) keys[j] = keys[j - 1];
+        for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1];
         keys[j] = i_key;
     }
 }
 
+// Utility functions
+inline static sz_size_t _sz_log2i(sz_size_t n) {
+    sz_size_t log2 = 0;
+    while (n >>= 1) ++log2;
+    return log2;
+}
+
+inline static void _sz_sift_down(
+    sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, sz_size_t end) {
+    sz_size_t root = start;
+    while (2 * root + 1 <= end) {
+        sz_size_t child = 2 * root + 1;
+        if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; }
+        if (!less(sequence, order[root], order[child])) { return; }
+        _sz_swap_order(order + root, order + child);
+        root = child;
+    }
+}
+
+inline static void _sz_heapify(sz_sequence_t *sequence,
+                               sz_sequence_comparator_t less,
+                               sz_u64_t *order,
+                               sz_size_t count) {
+    sz_size_t start = (count - 2) / 2;
+    while (1) {
+        _sz_sift_down(sequence, less, order, start, count - 1);
+        if (start == 0) return;
+        start--;
+    }
+}
+
+inline static void _sz_heapsort(sz_sequence_t *sequence,
+                                sz_sequence_comparator_t less,
+                                sz_size_t first,
+                                sz_size_t last) {
+    sz_u64_t *order = sequence->order;
+    sz_size_t count = last - first;
+    _sz_heapify(sequence, less, order + first, count);
+    sz_size_t end = count - 1;
+    while (end > 0) {
+        _sz_swap_order(order + first, order + first + end);
+        end--;
+        _sz_sift_down(sequence, less, order + first, 0, end);
+    }
+}
+
+inline static void _sz_introsort(
+    sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last, sz_size_t depth) {
+
+    sz_size_t length = last - first;
+    switch (length) {
+    case 0:
+    case 1: return;
+    case 2:
+        if (less(sequence, sequence->order[first + 1], sequence->order[first]))
+            _sz_swap_order(&sequence->order[first], &sequence->order[first + 1]);
+        return;
+    case 3:
+        sz_u64_t a = sequence->order[first];
+        sz_u64_t b = sequence->order[first + 1];
+        sz_u64_t c = sequence->order[first + 2];
+        if (less(sequence, b, a)) _sz_swap_order(&a, &b);
+        if (less(sequence, c, b)) _sz_swap_order(&c, &b);
+        if (less(sequence, b, a)) _sz_swap_order(&a, &b);
+        sequence->order[first] = a;
+        sequence->order[first + 1] = b;
+        sequence->order[first + 2] = c;
+        return;
+    }
+    // Until a certain length, the quadratic-complexity insertion-sort is fine
+    if (length <= 16) {
+        sz_sequence_t sub_seq = *sequence;
+        sub_seq.order += first;
+        sub_seq.count = length;
+        sz_sort_insertion(&sub_seq, less);
+        return;
+    }
+
+    // Fallback to N-logN-complexity heap-sort
+    if (depth == 0) {
+        _sz_heapsort(sequence, less, first, last);
+        return;
+    }
+
+    --depth;
+
+    // Median-of-three logic to choose pivot
+    sz_size_t median = first + length / 2;
+    if (less(sequence, sequence->order[median], sequence->order[first]))
+        _sz_swap_order(&sequence->order[first], &sequence->order[median]);
+    if (less(sequence, sequence->order[last - 1], sequence->order[first]))
+        _sz_swap_order(&sequence->order[first], &sequence->order[last - 1]);
+    if (less(sequence, sequence->order[median], sequence->order[last - 1]))
+        _sz_swap_order(&sequence->order[median], &sequence->order[last - 1]);
+
+    // Partition using the median-of-three as the pivot
+    sz_u64_t pivot = sequence->order[median];
+    sz_size_t left = first;
+    sz_size_t right = last - 1;
+    while (true) {
+        while (less(sequence, sequence->order[left], pivot)) left++;
+        while (less(sequence, pivot, sequence->order[right])) right--;
+        if (left >= right) break;
+        _sz_swap_order(&sequence->order[left], &sequence->order[right]);
+        left++;
+        right--;
+    }
+
+    // Recursively sort the partitions
+    _sz_introsort(sequence, less, first, left, depth);
+    _sz_introsort(sequence, less, right + 1, last, depth);
+}
+
+inline static void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) {
+    sz_size_t depth_limit = 2 * _sz_log2i(sequence->count);
+    _sz_introsort(sequence, less, 0, sequence->count, depth_limit);
+}
+
 /**
  *  @brief  Internal Radix sorting procedure.
  */
@@ -834,28 +951,28 @@ inline static void _sz_sort_recursion( //
 
         sz_sequence_t a = *sequence;
         a.count = split;
-        sz_sort_insertion(&a, comparator);
+        sz_sort_introsort(&a, comparator);
 
         sz_sequence_t b = *sequence;
         b.order += split;
         b.count -= split;
-        sz_sort_insertion(&b, comparator);
+        sz_sort_introsort(&b, comparator);
     }
 }
 
 inline static sz_bool_t _sz_sort_compare_less_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
-    sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key);
-    sz_size_t i_len = sequence->get_length(sequence->handle, i_key);
-    sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key);
-    sz_size_t j_len = sequence->get_length(sequence->handle, j_key);
+    sz_string_ptr_t i_str = sequence->get_start(sequence, i_key);
+    sz_size_t i_len = sequence->get_length(sequence, i_key);
+    sz_string_ptr_t j_str = sequence->get_start(sequence, j_key);
+    sz_size_t j_len = sequence->get_length(sequence, j_key);
     return sz_is_less_ascii(i_str, i_len, j_str, j_len);
 }
 
 inline static sz_bool_t _sz_sort_compare_less_uncased_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
-    sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key);
-    sz_size_t i_len = sequence->get_length(sequence->handle, i_key);
-    sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key);
-    sz_size_t j_len = sequence->get_length(sequence->handle, j_key);
+    sz_string_ptr_t i_str = sequence->get_start(sequence, i_key);
+    sz_size_t i_len = sequence->get_length(sequence, i_key);
+    sz_string_ptr_t j_str = sequence->get_start(sequence, j_key);
+    sz_size_t j_len = sequence->get_length(sequence, j_key);
     return sz_is_less_uncased_ascii(i_str, i_len, j_str, j_len);
 }
 
@@ -876,8 +993,8 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
 
     // Export up to 4 bytes into the `sequence` bits themselves
     for (sz_size_t i = 0; i != sequence->count; ++i) {
-        sz_string_ptr_t begin = sequence->get_start(sequence->handle, sequence->order[i]);
-        sz_size_t length = sequence->get_length(sequence->handle, sequence->order[i]);
+        sz_string_ptr_t begin = sequence->get_start(sequence, sequence->order[i]);
+        sz_size_t length = sequence->get_length(sequence, sequence->order[i]);
         length = length > 4ul ? 4ul : length;
         char *prefix = (char *)&sequence->order[i];
         for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j];

From 51c47fdbc489340d13eb0c8a8879f4bee47d340d Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 16:30:38 -0700
Subject: [PATCH 61/72] Refactor: New C API for JS

---
 javascript/lib.c          | 62 +++++++++++++-------------
 javascript/test/find.js   | 14 +++---
 scripts/test.c            | 13 +++---
 scripts/test.cpp          | 92 +++++++++++++++++++++------------------
 stringzilla/stringzilla.h |  9 ++--
 5 files changed, 97 insertions(+), 93 deletions(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index fe1f5f68..18e36a1b 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -8,17 +8,18 @@
  *  @see        NodeJS docs: https://nodejs.org/api/n-api.html
  */
 
-#include <node_api.h>
-#include <stringzilla.h>
+#include <node_api.h>    // `napi_*` functions
+#include <stdlib.h>      // `malloc`
+#include <stringzilla.h> // `sz_*` functions
 
-napi_value FindAPI(napi_env env, napi_callback_info info) {
+napi_value indexOfAPI(napi_env env, napi_callback_info info) {
     size_t argc = 2;
     napi_value args[2];
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    sz_haystack_t haystack_sz = {NULL, 0};
-    sz_needle_t needle_sz = {NULL, 0, 0};
+    sz_string_view_t haystack_sz = {NULL, 0};
+    sz_string_view_t needle_sz = {NULL, 0};
 
     // For haystack
     napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length);
@@ -38,37 +39,32 @@ napi_value FindAPI(napi_env env, napi_callback_info info) {
                                needle_sz.length + 1,
                                (size_t *)&needle_sz.length);
 
-    // Perform the find operation
-    sz_size_t result = sz_find_substr(haystack_sz, needle_sz);
-
-    // Cleanup
-    free((void *)haystack_sz.start);
-    free((void *)needle_sz.start);
-
     // Convert the result to JavaScript BigInt and return
     napi_value js_result;
+    if (needle_sz.length == 0) { napi_create_bigint_int64(env, 0, &js_result); }
+    else {
+        sz_string_ptr_t result =
+            sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length);
 
-    // In JavaScript, if `find` is unable to find the specified value, then it should return -1
-    if (result == haystack_sz.length) napi_create_bigint_int64(env, -1, &js_result);
-    else
-        napi_create_bigint_uint64(env, result, &js_result);
+        // In JavaScript, if `indexOf` is unable to indexOf the specified value, then it should return -1
+        if (result == NULL) { napi_create_bigint_int64(env, -1, &js_result); }
+        else { napi_create_bigint_uint64(env, result - haystack_sz.start, &js_result); }
+    }
 
+    // Cleanup
+    free((void *)haystack_sz.start);
+    free((void *)needle_sz.start);
     return js_result;
 }
 
-size_t count_char(sz_haystack_t haystack_sz, char needle) {
-    size_t result = sz_count_char(haystack_sz, needle);
-    return result;
-}
-
-napi_value CountAPI(napi_env env, napi_callback_info info) {
+napi_value countAPI(napi_env env, napi_callback_info info) {
     size_t argc = 3;
     napi_value args[3];
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    sz_haystack_t haystack_sz = {NULL, 0};
-    sz_needle_t needle_sz = {NULL, 0, 0};
+    sz_string_view_t haystack_sz = {NULL, 0};
+    sz_string_view_t needle_sz = {NULL, 0};
 
     // For haystack
     napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length);
@@ -95,11 +91,13 @@ napi_value CountAPI(napi_env env, napi_callback_info info) {
 
     size_t count = 0;
     if (needle_sz.length == 0 || haystack_sz.length == 0 || haystack_sz.length < needle_sz.length) { count = 0; }
-    else if (needle_sz.length == 1) { count = count_char(haystack_sz, needle_sz.start[0]); }
+    else if (needle_sz.length == 1) { count = sz_count_char(haystack_sz.start, haystack_sz.length, needle_sz.start); }
     else if (overlap) {
         while (haystack_sz.length) {
-            sz_size_t offset = sz_find_substr(haystack_sz, needle_sz);
-            int found = offset != haystack_sz.length;
+            sz_string_ptr_t ptr =
+                sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length);
+            sz_bool_t found = ptr != NULL;
+            sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length;
             count += found;
             haystack_sz.start += offset + found;
             haystack_sz.length -= offset + found;
@@ -107,8 +105,10 @@ napi_value CountAPI(napi_env env, napi_callback_info info) {
     }
     else {
         while (haystack_sz.length) {
-            sz_size_t offset = sz_find_substr(haystack_sz, needle_sz);
-            int found = offset != haystack_sz.length;
+            sz_string_ptr_t ptr =
+                sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length);
+            sz_bool_t found = ptr != NULL;
+            sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length;
             count += found;
             haystack_sz.start += offset + needle_sz.length;
             haystack_sz.length -= offset + needle_sz.length * found;
@@ -129,8 +129,8 @@ napi_value CountAPI(napi_env env, napi_callback_info info) {
 napi_value Init(napi_env env, napi_value exports) {
 
     // Define an array of property descriptors
-    napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0};
-    napi_property_descriptor countDesc = {"count", 0, CountAPI, 0, 0, 0, napi_default, 0};
+    napi_property_descriptor findDesc = {"indexOf", 0, indexOfAPI, 0, 0, 0, napi_default, 0};
+    napi_property_descriptor countDesc = {"count", 0, countAPI, 0, 0, 0, napi_default, 0};
     napi_property_descriptor properties[] = {findDesc, countDesc};
 
     // Define the properties on the `exports` object
diff --git a/javascript/test/find.js b/javascript/test/find.js
index cd2a800d..9fe4e5b7 100644
--- a/javascript/test/find.js
+++ b/javascript/test/find.js
@@ -5,26 +5,26 @@ import assert from 'node:assert';
 const stringzilla = bindings('stringzilla');
 
 test('Find Word in Text - Positive Case', () => {
-    const result = stringzilla.find('hello world, hello john', 'hello');
+    const result = stringzilla.indexOf('hello world, hello john', 'hello');
 
     assert.strictEqual(result, 0n);
 });
 
 test('Find Word in Text - Negative Case (Word Not Found)', () => {
-    const result_1 = stringzilla.find('ha', 'aaa');
+    const result_1 = stringzilla.indexOf('ha', 'aaa');
     assert.strictEqual(result_1, -1n);
 
-    const result_2 = stringzilla.find('g', 'a');
+    const result_2 = stringzilla.indexOf('g', 'a');
     assert.strictEqual(result_2, -1n);
 });
 
 test('Find Word in Text - Negative Case (Empty String Inputs)', () => {
-    const result_1 = stringzilla.find('hello world', '');
+    const result_1 = stringzilla.indexOf('hello world', '');
     assert.strictEqual(result_1, 0n);
 
-    const result_2 = stringzilla.find('', 'a');
+    const result_2 = stringzilla.indexOf('', 'a');
     assert.strictEqual(result_2, -1n);
 
-    const result_3 = stringzilla.find('', '');
-    assert.strictEqual(result_2, -1n);
+    const result_3 = stringzilla.indexOf('', '');
+    assert.strictEqual(result_3, 0n);
 });
diff --git a/scripts/test.c b/scripts/test.c
index a921e76d..127975b0 100644
--- a/scripts/test.c
+++ b/scripts/test.c
@@ -27,24 +27,23 @@ void test_sz_find_substr() {
         for (int variability = 1; variability < VARIABILITY; variability++) {
             populate_random_string(buffer, length, variability);
 
-            struct sz_haystack_t haystack;
+            sz_string_view_t haystack;
             haystack.start = buffer;
             haystack.length = length;
 
             int pattern_length = rand() % 5 + 1;
             populate_random_string(pattern, pattern_length, variability);
 
-            struct sz_needle_t needle;
+            sz_string_view_t needle;
             needle.start = pattern;
             needle.length = pattern_length;
 
             // Comparing the result of your function with the standard library function.
-            const char *result_libc = strstr(buffer, pattern);
-            uint64_t result_stringzilla = sz_find_substr(haystack, needle);
+            sz_string_ptr_t result_libc = strstr(buffer, pattern);
+            sz_string_ptr_t result_stringzilla =
+                sz_find_substr(haystack.start, haystack.length, needle.start, needle.length);
 
-            assert(((result_libc && result_stringzilla == (uint64_t)(result_libc - buffer)) ||
-                    (!result_libc && result_stringzilla == (uint64_t)-1)) &&
-                   "Test failed for sz_find_substr");
+            assert(((result_libc == NULL) ^ (result_stringzilla == NULL)) && "Test failed for sz_find_substr");
         }
     }
 }
diff --git a/scripts/test.cpp b/scripts/test.cpp
index ddef4e82..8dc1a4d2 100644
--- a/scripts/test.cpp
+++ b/scripts/test.cpp
@@ -1,39 +1,39 @@
-#include <cstring>
+#include <algorithm>
 #include <chrono>
-#include <iostream>
+#include <cstring>
 #include <fstream>
-#include <vector>
-#include <string>
-#include <numeric>
+#include <iostream>
 #include <limits>
-#include <algorithm>
+#include <numeric>
+#include <string>
 #include <strstream>
+#include <vector>
 
 #include <stringzilla.h>
 
 using strings_t = std::vector<std::string>;
 using idx_t = sz_size_t;
-using permute_t = std::vector<idx_t>;
+using permute_t = std::vector<sz_u64_t>;
 
 #pragma region - C callbacks
 
-static char const *get_start(void const *array_c, sz_size_t i) {
-    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c);
+static char const *get_start(sz_sequence_t const *array_c, sz_size_t i) {
+    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c->handle);
     return array[i].c_str();
 }
 
-static sz_size_t get_length(void const *array_c, sz_size_t i) {
-    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c);
+static sz_size_t get_length(sz_sequence_t const *array_c, sz_size_t i) {
+    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c->handle);
     return array[i].size();
 }
 
-static int is_less(void const *array_c, sz_size_t i, sz_size_t j) {
-    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c);
+static int is_less(sz_sequence_t const *array_c, sz_size_t i, sz_size_t j) {
+    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c->handle);
     return array[i] < array[j];
 }
 
-static int has_under_four_chars(void const *array_c, sz_size_t i) {
-    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c);
+static int has_under_four_chars(sz_sequence_t const *array_c, sz_size_t i) {
+    strings_t const &array = *reinterpret_cast<strings_t const *>(array_c->handle);
     return array[i].size() < 4;
 }
 
@@ -64,7 +64,7 @@ void populate_with_test(strings_t &strings) {
 
 constexpr size_t offset_in_word = 0;
 
-inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) {
+inline static idx_t hybrid_sort_cpp(strings_t const &strings, sz_u64_t *order) {
 
     // What if we take up-to 4 first characters and the index
     for (size_t i = 0; i != strings.size(); ++i)
@@ -72,7 +72,7 @@ inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) {
                     strings[order[i]].c_str(),
                     std::min(strings[order[i]].size(), 4ul));
 
-    std::sort(order, order + strings.size(), [&](idx_t i, idx_t j) {
+    std::sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) {
         char *i_bytes = (char *)&i;
         char *j_bytes = (char *)&j;
         return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word);
@@ -80,7 +80,7 @@ inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) {
 
     for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul);
 
-    std::sort(order, order + strings.size(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; });
+    std::sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { return strings[i] < strings[j]; });
 
     return strings.size();
 }
@@ -92,14 +92,14 @@ int hybrid_sort_c_compare_uint32_t(const void *a, const void *b) {
 }
 
 int hybrid_sort_c_compare_strings(void *arg, const void *a, const void *b) {
-    sz_sequence_t *seq = (sz_sequence_t *)arg;
+    sz_sequence_t *sequence = (sz_sequence_t *)arg;
     sz_size_t idx_a = *(sz_size_t *)a;
     sz_size_t idx_b = *(sz_size_t *)b;
 
-    const char *str_a = seq->get_start(seq->handle, idx_a);
-    const char *str_b = seq->get_start(seq->handle, idx_b);
-    sz_size_t len_a = seq->get_length(seq->handle, idx_a);
-    sz_size_t len_b = seq->get_length(seq->handle, idx_b);
+    const char *str_a = sequence->get_start(sequence, idx_a);
+    const char *str_b = sequence->get_start(sequence, idx_b);
+    sz_size_t len_a = sequence->get_length(sequence, idx_a);
+    sz_size_t len_b = sequence->get_length(sequence, idx_b);
 
     int res = strncmp(str_a, str_b, len_a < len_b ? len_a : len_b);
     return res ? res : (int)(len_a - len_b);
@@ -108,8 +108,8 @@ int hybrid_sort_c_compare_strings(void *arg, const void *a, const void *b) {
 sz_size_t hybrid_sort_c(sz_sequence_t *sequence) {
     // Copy up to 4 first characters into the 'order' array.
     for (sz_size_t i = 0; i < sequence->count; ++i) {
-        const char *str = sequence->get_start(sequence->handle, sequence->order[i]);
-        sz_size_t len = sequence->get_length(sequence->handle, sequence->order[i]);
+        const char *str = sequence->get_start(sequence, sequence->order[i]);
+        sz_size_t len = sequence->get_length(sequence, sequence->order[i]);
         len = len > 4 ? 4 : len;
         memcpy((char *)&sequence->order[i] + sizeof(sz_size_t) - 4, str, len);
     }
@@ -128,7 +128,7 @@ sz_size_t hybrid_sort_c(sz_sequence_t *sequence) {
     return sequence->count;
 }
 
-inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *order) {
+inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, sz_u64_t *order) {
 
     // What if we take up-to 4 first characters and the index
     for (size_t i = 0; i != strings.size(); ++i)
@@ -136,7 +136,7 @@ inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *orde
                     strings[order[i]].c_str(),
                     std::min(strings[order[i]].size(), 4ul));
 
-    std::stable_sort(order, order + strings.size(), [&](idx_t i, idx_t j) {
+    std::stable_sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) {
         char *i_bytes = (char *)&i;
         char *j_bytes = (char *)&j;
         return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word);
@@ -144,7 +144,7 @@ inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *orde
 
     for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul);
 
-    std::stable_sort(order, order + strings.size(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; });
+    std::stable_sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { return strings[i] < strings[j]; });
 
     return strings.size();
 }
@@ -209,7 +209,7 @@ int main(int, char const **) {
     std::printf("Hey, Ash!\n");
 
     strings_t strings;
-    populate_from_file("leipzig1M.txt", strings, 10000000);
+    populate_from_file("leipzig1M.txt", strings, 1000000);
     std::size_t mean_bytes = 0;
     for (std::string const &str : strings) mean_bytes += str.size();
     mean_bytes /= strings.size();
@@ -229,26 +229,23 @@ int main(int, char const **) {
     for (std::size_t needle_len = 1; needle_len <= 0; ++needle_len) {
         std::string needle(needle_len, '\4');
         std::printf("---- Needle length: %zu\n", needle_len);
-        bench_search("std::search", full_text, [&]() {
+        bench_search("std::search", full_text, [&]() mutable {
             return std::search(full_text.begin(), full_text.end(), needle.begin(), needle.end()) - full_text.begin();
         });
-        bench_search("sz_find_substr_swar", full_text, [&]() {
-            sz_haystack_t h {full_text.data(), full_text.size()};
-            sz_needle_t n {needle.data(), needle.size()};
-            return sz_find_substr_swar(h, n);
+        bench_search("sz_find_substr_swar", full_text, [&]() mutable {
+            sz_string_ptr_t ptr = sz_find_substr_swar(full_text.data(), full_text.size(), needle.data(), needle.size());
+            return ptr ? ptr - full_text.data() : full_text.size();
         });
 #if defined(__ARM_NEON)
-        bench_search("sz_find_substr_neon", full_text, [&]() {
-            sz_haystack_t h {full_text.data(), full_text.size()};
-            sz_needle_t n {needle.data(), needle.size()};
-            return sz_find_substr_neon(h, n);
+        bench_search("sz_find_substr_neon", full_text, [&]() mutable {
+            sz_string_ptr_t ptr = sz_find_substr_neon(full_text.data(), full_text.size(), needle.data(), needle.size());
+            return ptr ? ptr - full_text.data() : full_text.size();
         });
 #endif
 #if defined(__AVX2__)
-        bench_search("sz_find_substr_avx2", full_text, [&]() {
-            sz_haystack_t h {full_text.data(), full_text.size()};
-            sz_needle_t n {needle.data(), needle.size()};
-            return sz_find_substr_avx2(h, n);
+        bench_search("sz_find_substr_avx2", full_text, [&]() mutable {
+            sz_string_ptr_t ptr = sz_find_substr_avx2(full_text.data(), full_text.size(), needle.data(), needle.size());
+            return ptr ? ptr - full_text.data() : full_text.size();
         });
 #endif
     }
@@ -300,6 +297,17 @@ int main(int, char const **) {
         });
         expect_sorted(strings, permute_new);
 
+        bench_permute("sz_sort_introsort", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
+            sz_sequence_t array;
+            array.order = permute.data();
+            array.count = strings.size();
+            array.handle = &strings;
+            array.get_start = get_start;
+            array.get_length = get_length;
+            sz_sort_introsort(&array, (sz_sequence_comparator_t)_sz_sort_compare_less_ascii);
+        });
+        expect_sorted(strings, permute_new);
+
         bench_permute("hybrid_sort_c", strings, permute_new, [](strings_t const &strings, permute_t &permute) {
             sz_sequence_t array;
             array.order = permute.data();
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 84e864cf..ba7f5f39 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -609,7 +609,7 @@ inline static sz_string_ptr_t sz_find_substr(sz_string_ptr_t const haystack,
                                              sz_size_t const haystack_length,
                                              sz_string_ptr_t const needle,
                                              sz_size_t const needle_length) {
-    if (haystack_length < needle_length) return NULL;
+    if (haystack_length < needle_length || needle_length == 0) return NULL;
 #if defined(__ARM_NEON)
     return sz_find_substr_neon(haystack, haystack_length, needle, needle_length);
 #elif defined(__AVX2__)
@@ -850,7 +850,7 @@ inline static void _sz_introsort(
         if (less(sequence, sequence->order[first + 1], sequence->order[first]))
             _sz_swap_order(&sequence->order[first], &sequence->order[first + 1]);
         return;
-    case 3:
+    case 3: {
         sz_u64_t a = sequence->order[first];
         sz_u64_t b = sequence->order[first + 1];
         sz_u64_t c = sequence->order[first + 2];
@@ -862,6 +862,7 @@ inline static void _sz_introsort(
         sequence->order[first + 2] = c;
         return;
     }
+    }
     // Until a certain length, the quadratic-complexity insertion-sort is fine
     if (length <= 16) {
         sz_sequence_t sub_seq = *sequence;
@@ -1102,10 +1103,6 @@ inline static sz_u32_t sz_hash_crc32_sse(sz_string_ptr_t start, sz_size_t length
 }
 #endif
 
-#ifdef _MSC_VER
-#undef strncasecmp
-#undef strcasecmp
-#endif
 #undef popcount64
 #undef ctz64
 #undef clz64

From eadad4ed1007f4233d54d0a767d7925bc9713382 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 16:50:13 -0700
Subject: [PATCH 62/72] Refactor: Sync up Py and JS bindings

---
 javascript/lib.c          |  87 ++++++---------
 python/lib.c              | 183 +++++++++++++++---------------
 scripts/test.cpp          |  15 ++-
 stringzilla/stringzilla.h | 226 ++++++++++++++++++++------------------
 4 files changed, 253 insertions(+), 258 deletions(-)

diff --git a/javascript/lib.c b/javascript/lib.c
index 18e36a1b..8ebe72eb 100644
--- a/javascript/lib.c
+++ b/javascript/lib.c
@@ -18,42 +18,33 @@ napi_value indexOfAPI(napi_env env, napi_callback_info info) {
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    sz_string_view_t haystack_sz = {NULL, 0};
-    sz_string_view_t needle_sz = {NULL, 0};
+    sz_string_view_t haystack = {NULL, 0};
+    sz_string_view_t needle = {NULL, 0};
 
     // For haystack
-    napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length);
-    haystack_sz.start = malloc(haystack_sz.length + 1);
-    napi_get_value_string_utf8(env,
-                               args[0],
-                               (char *)haystack_sz.start,
-                               haystack_sz.length + 1,
-                               (size_t *)&haystack_sz.length);
+    napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack.length);
+    haystack.start = malloc(haystack.length + 1);
+    napi_get_value_string_utf8(env, args[0], (char *)haystack.start, haystack.length + 1, (size_t *)&haystack.length);
 
     // For needle
-    napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle_sz.length);
-    needle_sz.start = malloc(needle_sz.length + 1);
-    napi_get_value_string_utf8(env,
-                               args[1],
-                               (char *)needle_sz.start,
-                               needle_sz.length + 1,
-                               (size_t *)&needle_sz.length);
+    napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle.length);
+    needle.start = malloc(needle.length + 1);
+    napi_get_value_string_utf8(env, args[1], (char *)needle.start, needle.length + 1, (size_t *)&needle.length);
 
     // Convert the result to JavaScript BigInt and return
     napi_value js_result;
-    if (needle_sz.length == 0) { napi_create_bigint_int64(env, 0, &js_result); }
+    if (needle.length == 0) { napi_create_bigint_int64(env, 0, &js_result); }
     else {
-        sz_string_ptr_t result =
-            sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length);
+        sz_string_start_t result = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
 
         // In JavaScript, if `indexOf` is unable to indexOf the specified value, then it should return -1
         if (result == NULL) { napi_create_bigint_int64(env, -1, &js_result); }
-        else { napi_create_bigint_uint64(env, result - haystack_sz.start, &js_result); }
+        else { napi_create_bigint_uint64(env, result - haystack.start, &js_result); }
     }
 
     // Cleanup
-    free((void *)haystack_sz.start);
-    free((void *)needle_sz.start);
+    free((void *)haystack.start);
+    free((void *)needle.start);
     return js_result;
 }
 
@@ -63,55 +54,45 @@ napi_value countAPI(napi_env env, napi_callback_info info) {
     napi_get_cb_info(env, info, &argc, args, NULL, NULL);
 
     // Extract the C string from the JavaScript string for haystack and needle
-    sz_string_view_t haystack_sz = {NULL, 0};
-    sz_string_view_t needle_sz = {NULL, 0};
+    sz_string_view_t haystack = {NULL, 0};
+    sz_string_view_t needle = {NULL, 0};
 
     // For haystack
-    napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length);
-    haystack_sz.start = malloc(haystack_sz.length + 1);
-    napi_get_value_string_utf8(env,
-                               args[0],
-                               (char *)haystack_sz.start,
-                               haystack_sz.length + 1,
-                               (size_t *)&haystack_sz.length);
+    napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack.length);
+    haystack.start = malloc(haystack.length + 1);
+    napi_get_value_string_utf8(env, args[0], (char *)haystack.start, haystack.length + 1, (size_t *)&haystack.length);
 
     // For needle
-    napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle_sz.length);
-    needle_sz.start = malloc(needle_sz.length + 1);
-    napi_get_value_string_utf8(env,
-                               args[1],
-                               (char *)needle_sz.start,
-                               needle_sz.length + 1,
-                               (size_t *)&needle_sz.length);
+    napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle.length);
+    needle.start = malloc(needle.length + 1);
+    napi_get_value_string_utf8(env, args[1], (char *)needle.start, needle.length + 1, (size_t *)&needle.length);
 
     bool overlap = false;
     if (argc > 2) { napi_get_value_bool(env, args[2], &overlap); }
 
-    void const *haystack_start = haystack_sz.start, *needle_start = needle_sz.start;
+    void const *haystack_start = haystack.start, *needle_start = needle.start;
 
     size_t count = 0;
-    if (needle_sz.length == 0 || haystack_sz.length == 0 || haystack_sz.length < needle_sz.length) { count = 0; }
-    else if (needle_sz.length == 1) { count = sz_count_char(haystack_sz.start, haystack_sz.length, needle_sz.start); }
+    if (needle.length == 0 || haystack.length == 0 || haystack.length < needle.length) { count = 0; }
+    else if (needle.length == 1) { count = sz_count_char(haystack.start, haystack.length, needle.start); }
     else if (overlap) {
-        while (haystack_sz.length) {
-            sz_string_ptr_t ptr =
-                sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length);
+        while (haystack.length) {
+            sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
             sz_bool_t found = ptr != NULL;
-            sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length;
+            sz_size_t offset = found ? ptr - haystack.start : haystack.length;
             count += found;
-            haystack_sz.start += offset + found;
-            haystack_sz.length -= offset + found;
+            haystack.start += offset + found;
+            haystack.length -= offset + found;
         }
     }
     else {
-        while (haystack_sz.length) {
-            sz_string_ptr_t ptr =
-                sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length);
+        while (haystack.length) {
+            sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
             sz_bool_t found = ptr != NULL;
-            sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length;
+            sz_size_t offset = found ? ptr - haystack.start : haystack.length;
             count += found;
-            haystack_sz.start += offset + needle_sz.length;
-            haystack_sz.length -= offset + needle_sz.length * found;
+            haystack.start += offset + needle.length;
+            haystack.length -= offset + needle.length * found;
         }
     }
 
diff --git a/python/lib.c b/python/lib.c
index a0f6caca..c0ad69d4 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -38,10 +38,7 @@ static PyTypeObject FileType;
 static PyTypeObject StrType;
 static PyTypeObject StrsType;
 
-static struct {
-    void *start;
-    size_t length;
-} temporary_memory = {NULL, 0};
+static sz_string_view_t temporary_memory = {NULL, 0};
 
 /**
  *  @brief  Describes an on-disk file mapped into RAM, which is different from Python's
@@ -55,8 +52,8 @@ typedef struct {
 #else
         int file_descriptor;
 #endif
-    void *start;
-    size_t length;
+    sz_string_start_t start;
+    sz_size_t length;
 } File;
 
 /**
@@ -73,8 +70,8 @@ typedef struct {
  */
 typedef struct {
     PyObject_HEAD PyObject *parent;
-    char const *start;
-    size_t length;
+    sz_string_start_t start;
+    sz_size_t length;
 } Str;
 
 /**
@@ -133,7 +130,7 @@ typedef struct {
         struct reordered_slices_t {
             size_t count;
             PyObject *parent;
-            sz_haystack_t *parts;
+            sz_string_view_t *parts;
         } reordered;
 
     } data;
@@ -144,10 +141,13 @@ typedef struct {
 
 #pragma region Helpers
 
-typedef int boolean_t;
+inline static sz_string_start_t haystacks_get_start(sz_sequence_t *seq, sz_size_t i) {
+    return ((sz_string_view_t const *)seq->handle)[i].start;
+}
 
-inline static char const *haystacks_get_start(sz_haystack_t const *parts, sz_size_t i) { return parts[i].start; }
-inline static size_t haystacks_get_length(sz_haystack_t const *parts, sz_size_t i) { return parts[i].length; }
+inline static sz_size_t haystacks_get_length(sz_sequence_t *seq, sz_size_t i) {
+    return ((sz_string_view_t const *)seq->handle)[i].length;
+}
 
 void reverse_offsets(sz_size_t *array, size_t length) {
     size_t i, j;
@@ -159,21 +159,21 @@ void reverse_offsets(sz_size_t *array, size_t length) {
     }
 }
 
-void reverse_haystacks(sz_haystack_t *array, size_t length) {
+void reverse_haystacks(sz_string_view_t *array, size_t length) {
     size_t i, j;
     // Swap array[i] and array[j]
     for (i = 0, j = length - 1; i < j; i++, j--) {
-        sz_haystack_t temp = array[i];
+        sz_string_view_t temp = array[i];
         array[i] = array[j];
         array[j] = temp;
     }
 }
 
-void apply_order(sz_haystack_t *array, sz_size_t *order, size_t length) {
-    for (size_t i = 0; i < length; ++i) {
+void apply_order(sz_string_view_t *array, sz_u64_t *order, size_t length) {
+    for (sz_u64_t i = 0; i < length; ++i) {
         if (i == order[i]) continue;
-        sz_haystack_t temp = array[i];
-        size_t k = i, j;
+        sz_string_view_t temp = array[i];
+        sz_u64_t k = i, j;
         while (i != (j = order[k])) {
             array[k] = array[j];
             order[k] = k;
@@ -205,7 +205,7 @@ void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset,
     *normalized_length = end - start;
 }
 
-boolean_t export_string_like(PyObject *object, char const **start, size_t *length) {
+sz_bool_t export_string_like(PyObject *object, sz_string_start_t **start, sz_size_t *length) {
     if (PyUnicode_Check(object)) {
         // Handle Python str
         Py_ssize_t signed_length;
@@ -277,7 +277,7 @@ get_string_at_offset_t str_at_offset_getter(Strs *strs) {
     }
 }
 
-boolean_t prepare_strings_for_reordering(Strs *strs) {
+sz_bool_t prepare_strings_for_reordering(Strs *strs) {
 
     // Allocate memory for reordered slices
     size_t count = 0;
@@ -306,7 +306,7 @@ boolean_t prepare_strings_for_reordering(Strs *strs) {
         return 0;
     }
 
-    sz_haystack_t *new_parts = (sz_haystack_t *)malloc(count * sizeof(sz_haystack_t));
+    sz_string_view_t *new_parts = (sz_string_view_t *)malloc(count * sizeof(sz_string_view_t));
     if (new_parts == NULL) {
         PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for reordered slices");
         return 0;
@@ -333,7 +333,7 @@ boolean_t prepare_strings_for_reordering(Strs *strs) {
     return 1;
 }
 
-boolean_t prepare_strings_for_extension(Strs *strs, size_t new_parents, size_t new_parts) { return 1; }
+sz_bool_t prepare_strings_for_extension(Strs *strs, size_t new_parents, size_t new_parts) { return 1; }
 
 #pragma endregion
 
@@ -622,8 +622,8 @@ static int Str_getbuffer(Str *self, Py_buffer *view, int flags) {
     view->itemsize = sizeof(char);
     view->format = "c"; // https://docs.python.org/3/library/struct.html#format-characters
     view->ndim = 1;
-    view->shape = &self->length; // 1-D array, so shape is just a pointer to the length
-    view->strides = itemsize;    // strides in a 1-D array is just the item size
+    view->shape = (Py_ssize_t *)&self->length; // 1-D array, so shape is just a pointer to the length
+    view->strides = itemsize;                  // strides in a 1-D array is just the item size
     view->suboffsets = NULL;
     view->internal = NULL;
 
@@ -639,18 +639,13 @@ static void Str_releasebuffer(PyObject *_, Py_buffer *view) {
 
 static int Str_in(Str *self, PyObject *arg) {
 
-    sz_needle_t needle_struct;
-    needle_struct.quadgram_offset = 0;
+    sz_string_view_t needle_struct;
     if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) {
         PyErr_SetString(PyExc_TypeError, "Unsupported argument type");
         return -1;
     }
 
-    sz_haystack_t haystack;
-    haystack.start = self->start;
-    haystack.length = self->length;
-    size_t position = sz_find_substr(haystack, needle_struct);
-    return position != haystack.length;
+    return sz_find_substring(self->start, self->length, needle_struct.start, needle_struct.length) != NULL;
 }
 
 static Py_ssize_t Strs_len(Strs *self) {
@@ -756,12 +751,12 @@ static PyObject *Strs_subscript(Strs *self, PyObject *key) {
             to->count = stop - start;
             to->parent = from->parent;
 
-            to->parts = malloc(sizeof(sz_haystack_t) * to->count);
+            to->parts = malloc(sizeof(sz_string_view_t) * to->count);
             if (to->parts == NULL && PyErr_NoMemory()) {
                 Py_XDECREF(self_slice);
                 return NULL;
             }
-            memcpy(to->parts, from->parts + start, sizeof(sz_haystack_t) * to->count);
+            memcpy(to->parts, from->parts + start, sizeof(sz_string_view_t) * to->count);
             Py_INCREF(to->parent);
             break;
         }
@@ -816,8 +811,8 @@ static int Str_find_( //
     PyObject *args,
     PyObject *kwargs,
     Py_ssize_t *offset_out,
-    sz_haystack_t *haystack_out,
-    sz_needle_t *needle_out) {
+    sz_string_view_t *haystack_out,
+    sz_string_view_t *needle_out) {
 
     int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
     Py_ssize_t nargs = PyTuple_Size(args);
@@ -845,12 +840,11 @@ static int Str_find_( //
         }
     }
 
-    sz_haystack_t haystack;
-    sz_needle_t needle;
+    sz_string_view_t haystack;
+    sz_string_view_t needle;
     Py_ssize_t start, end;
 
     // Validate and convert `haystack` and `needle`
-    needle.quadgram_offset = 0;
     if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
         !export_string_like(needle_obj, &needle.start, &needle.length)) {
         PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like");
@@ -884,9 +878,9 @@ static int Str_find_( //
     haystack.length = normalized_length;
 
     // Perform contains operation
-    size_t offset = sz_find_substr(haystack, needle);
-    if (offset == haystack.length) { *offset_out = -1; }
-    else { *offset_out = (Py_ssize_t)offset; }
+    sz_string_start_t match = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
+    if (match == NULL) { *offset_out = -1; }
+    else { *offset_out = (Py_ssize_t)(match - haystack.start); }
 
     *haystack_out = haystack;
     *needle_out = needle;
@@ -895,16 +889,16 @@ static int Str_find_( //
 
 static PyObject *Str_find(PyObject *self, PyObject *args, PyObject *kwargs) {
     Py_ssize_t signed_offset;
-    sz_haystack_t text;
-    sz_needle_t separator;
+    sz_string_view_t text;
+    sz_string_view_t separator;
     if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL;
     return PyLong_FromSsize_t(signed_offset);
 }
 
 static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) {
     Py_ssize_t signed_offset;
-    sz_haystack_t text;
-    sz_needle_t separator;
+    sz_string_view_t text;
+    sz_string_view_t separator;
     if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL;
     if (signed_offset == -1) {
         PyErr_SetString(PyExc_ValueError, "substring not found");
@@ -915,8 +909,8 @@ static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) {
 
 static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs) {
     Py_ssize_t signed_offset;
-    sz_haystack_t text;
-    sz_needle_t separator;
+    sz_string_view_t text;
+    sz_string_view_t separator;
     if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL;
     if (signed_offset == -1) { Py_RETURN_FALSE; }
     else { Py_RETURN_TRUE; }
@@ -924,8 +918,8 @@ static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs)
 
 static PyObject *Str_partition(PyObject *self, PyObject *args, PyObject *kwargs) {
     Py_ssize_t separator_index;
-    sz_haystack_t text;
-    sz_needle_t separator;
+    sz_string_view_t text;
+    sz_string_view_t separator;
     PyObject *result_tuple;
 
     // Use Str_find_ to get the index of the separator
@@ -993,13 +987,12 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
                 return NULL;
     }
 
-    sz_haystack_t haystack;
-    sz_needle_t needle;
+    sz_string_view_t haystack;
+    sz_string_view_t needle;
     Py_ssize_t start = start_obj ? PyLong_AsSsize_t(start_obj) : 0;
     Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX;
     int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0;
 
-    needle.quadgram_offset = 0;
     if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) ||
         !export_string_like(needle_obj, &needle.start, &needle.length))
         return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL;
@@ -1013,27 +1006,28 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
 
     size_t count = 0;
     if (needle.length == 0 || haystack.length == 0 || haystack.length < needle.length) { count = 0; }
-    else if (needle.length == 1) { count = sz_count_char(haystack, needle.start[0]); }
-    else if (needle.length != 1) {
-        if (allowoverlap) {
-            while (haystack.length) {
-                sz_size_t offset = sz_find_substr(haystack, needle);
-                int found = offset != haystack.length;
-                count += found;
-                haystack.start += offset + found;
-                haystack.length -= offset + found;
-            }
+    else if (needle.length == 1) { count = sz_count_char(haystack.start, haystack.length, needle.start); }
+    else if (allowoverlap) {
+        while (haystack.length) {
+            sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
+            sz_bool_t found = ptr != NULL;
+            sz_size_t offset = found ? ptr - haystack.start : haystack.length;
+            count += found;
+            haystack.start += offset + found;
+            haystack.length -= offset + found;
         }
-        else {
-            while (haystack.length) {
-                sz_size_t offset = sz_find_substr(haystack, needle);
-                int found = offset != haystack.length;
-                count += found;
-                haystack.start += offset + needle.length;
-                haystack.length -= offset + needle.length * found;
-            }
+    }
+    else {
+        while (haystack.length) {
+            sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
+            sz_bool_t found = ptr != NULL;
+            sz_size_t offset = found ? ptr - haystack.start : haystack.length;
+            count += found;
+            haystack.start += offset + needle.length;
+            haystack.length -= offset + needle.length * found;
         }
     }
+
     return PyLong_FromSize_t(count);
 }
 
@@ -1068,7 +1062,7 @@ static PyObject *Str_levenstein(PyObject *self, PyObject *args, PyObject *kwargs
         return NULL;
     }
 
-    sz_haystack_t str1, str2;
+    sz_string_view_t str1, str2;
     if (!export_string_like(str1_obj, &str1.start, &str1.length) ||
         !export_string_like(str2_obj, &str2.start, &str2.length)) {
         PyErr_Format(PyExc_TypeError, "Both arguments must be string-like");
@@ -1119,7 +1113,7 @@ static PyObject *Str_startswith(PyObject *self, PyObject *args, PyObject *kwargs
         return NULL;
     }
 
-    sz_haystack_t str, prefix;
+    sz_string_view_t str, prefix;
     if (!export_string_like(str_obj, &str.start, &str.length) ||
         !export_string_like(prefix_obj, &prefix.start, &prefix.length)) {
         PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
@@ -1162,7 +1156,7 @@ static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs)
         return NULL;
     }
 
-    sz_haystack_t str, suffix;
+    sz_string_view_t str, suffix;
     if (!export_string_like(str_obj, &str.start, &str.length) ||
         !export_string_like(suffix_obj, &suffix.start, &suffix.length)) {
         PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like");
@@ -1180,7 +1174,7 @@ static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs)
 }
 
 static Strs *Str_split_(
-    PyObject *parent, sz_haystack_t text, sz_needle_t separator, int keepseparator, Py_ssize_t maxsplit) {
+    PyObject *parent, sz_string_view_t text, sz_string_view_t separator, int keepseparator, Py_ssize_t maxsplit) {
 
     // Create Strs object
     Strs *result = (Strs *)PyObject_New(Strs, &StrsType);
@@ -1209,10 +1203,9 @@ static Strs *Str_split_(
     // Iterate through string, keeping track of the
     sz_size_t last_start = 0;
     while (last_start <= text.length && offsets_count < maxsplit) {
-        sz_haystack_t text_remaining;
-        text_remaining.start = text.start + last_start;
-        text_remaining.length = text.length - last_start;
-        sz_size_t offset_in_remaining = sz_find_substr(text_remaining, separator);
+        sz_string_start_t match =
+            sz_find_substring(text.start + last_start, text.length - last_start, separator.start, separator.length);
+        sz_size_t offset_in_remaining = match ? match - text.start - last_start : text.length - last_start;
 
         // Reallocate offsets array if needed
         if (offsets_count >= offsets_capacity) {
@@ -1232,7 +1225,7 @@ static Strs *Str_split_(
         }
 
         // Export the offset
-        size_t will_continue = offset_in_remaining != text_remaining.length;
+        size_t will_continue = match != NULL;
         size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue;
         if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; }
         else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; }
@@ -1282,11 +1275,10 @@ static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) {
         }
     }
 
-    sz_haystack_t text;
-    sz_needle_t separator;
+    sz_string_view_t text;
+    sz_string_view_t separator;
     int keepseparator;
     Py_ssize_t maxsplit;
-    separator.quadgram_offset = 0;
 
     // Validate and convert `text`
     if (!export_string_like(text_obj, &text.start, &text.length)) {
@@ -1355,7 +1347,7 @@ static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs
         }
     }
 
-    sz_haystack_t text;
+    sz_string_view_t text;
     int keeplinebreaks;
     Py_ssize_t maxsplit = PY_SSIZE_T_MAX; // Default value for maxsplit
 
@@ -1388,14 +1380,14 @@ static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs
     // https://docs.python.org/3/library/stdtypes.html#str.splitlines
     // \n, \r, \r\n, \v or \x0b, \f or \x0c, \x1c, \x1d, \x1e, \x85, \u2028, \u2029
     // https://github.com/ashvardanian/StringZilla/issues/29
-    sz_needle_t separator;
+    sz_string_view_t separator;
     separator.start = "\n";
     separator.length = 1;
     return Str_split_(text_obj, text, separator, keeplinebreaks, maxsplit);
 }
 
 static PyObject *Str_concat(PyObject *self, PyObject *other) {
-    struct sz_haystack_t self_str, other_str;
+    struct sz_string_view_t self_str, other_str;
 
     // Validate and convert `self`
     if (!export_string_like(self, &self_str.start, &self_str.length)) {
@@ -1453,7 +1445,8 @@ static PyNumberMethods Str_as_number = {
 
 #define sz_method_flags_m METH_VARARGS | METH_KEYWORDS
 
-static PyMethodDef Str_methods[] = { //
+static PyMethodDef Str_methods[] = {
+    //
     {"find", Str_find, sz_method_flags_m, "Find the first occurrence of a substring."},
     {"index", Str_index, sz_method_flags_m, "Find the first occurrence of a substring or raise error if missing."},
     {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."},
@@ -1537,14 +1530,14 @@ static PyObject *Strs_shuffle(Strs *self, PyObject *args, PyObject *kwargs) {
 
     // Get the parts and their count
     struct reordered_slices_t *reordered = &self->data.reordered;
-    sz_haystack_t *parts = reordered->parts;
+    sz_string_view_t *parts = reordered->parts;
     size_t count = reordered->count;
 
     // Fisher-Yates Shuffle Algorithm
     for (size_t i = count - 1; i > 0; --i) {
         size_t j = rand() % (i + 1);
         // Swap parts[i] and parts[j]
-        sz_haystack_t temp = parts[i];
+        sz_string_view_t temp = parts[i];
         parts[i] = parts[j];
         parts[j] = temp;
     }
@@ -1552,8 +1545,8 @@ static PyObject *Strs_shuffle(Strs *self, PyObject *args, PyObject *kwargs) {
     Py_RETURN_NONE;
 }
 
-static boolean_t Strs_sort_(Strs *self,
-                            sz_haystack_t **parts_output,
+static sz_bool_t Strs_sort_(Strs *self,
+                            sz_string_view_t **parts_output,
                             sz_size_t **order_output,
                             sz_size_t *count_output) {
 
@@ -1565,7 +1558,7 @@ static boolean_t Strs_sort_(Strs *self,
 
     // Get the parts and their count
     // The only possible `self->type` by now is the `STRS_REORDERED`
-    sz_haystack_t *parts = self->data.reordered.parts;
+    sz_string_view_t *parts = self->data.reordered.parts;
     size_t count = self->data.reordered.count;
 
     // Allocate temporary memory to store the ordering offsets
@@ -1627,7 +1620,7 @@ static PyObject *Strs_sort(Strs *self, PyObject *args, PyObject *kwargs) {
         }
     }
 
-    boolean_t reverse = 0; // Default is False
+    sz_bool_t reverse = 0; // Default is False
     if (reverse_obj) {
         if (!PyBool_Check(reverse_obj)) {
             PyErr_SetString(PyExc_TypeError, "The reverse must be a boolean");
@@ -1636,7 +1629,7 @@ static PyObject *Strs_sort(Strs *self, PyObject *args, PyObject *kwargs) {
         reverse = PyObject_IsTrue(reverse_obj);
     }
 
-    sz_haystack_t *parts = NULL;
+    sz_string_view_t *parts = NULL;
     sz_size_t *order = NULL;
     sz_size_t count = 0;
     if (!Strs_sort_(self, &parts, &order, &count)) return NULL;
@@ -1680,7 +1673,7 @@ static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) {
         }
     }
 
-    boolean_t reverse = 0; // Default is False
+    sz_bool_t reverse = 0; // Default is False
     if (reverse_obj) {
         if (!PyBool_Check(reverse_obj)) {
             PyErr_SetString(PyExc_TypeError, "The reverse must be a boolean");
@@ -1689,7 +1682,7 @@ static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) {
         reverse = PyObject_IsTrue(reverse_obj);
     }
 
-    sz_haystack_t *parts = NULL;
+    sz_string_view_t *parts = NULL;
     sz_size_t *order = NULL;
     sz_size_t count = 0;
     if (!Strs_sort_(self, &parts, &order, &count)) return NULL;
diff --git a/scripts/test.cpp b/scripts/test.cpp
index 8dc1a4d2..b61b7d40 100644
--- a/scripts/test.cpp
+++ b/scripts/test.cpp
@@ -232,19 +232,22 @@ int main(int, char const **) {
         bench_search("std::search", full_text, [&]() mutable {
             return std::search(full_text.begin(), full_text.end(), needle.begin(), needle.end()) - full_text.begin();
         });
-        bench_search("sz_find_substr_swar", full_text, [&]() mutable {
-            sz_string_ptr_t ptr = sz_find_substr_swar(full_text.data(), full_text.size(), needle.data(), needle.size());
+        bench_search("sz_find_substring_swar", full_text, [&]() mutable {
+            sz_string_start_t ptr =
+                sz_find_substring_swar(full_text.data(), full_text.size(), needle.data(), needle.size());
             return ptr ? ptr - full_text.data() : full_text.size();
         });
 #if defined(__ARM_NEON)
-        bench_search("sz_find_substr_neon", full_text, [&]() mutable {
-            sz_string_ptr_t ptr = sz_find_substr_neon(full_text.data(), full_text.size(), needle.data(), needle.size());
+        bench_search("sz_find_substring_neon", full_text, [&]() mutable {
+            sz_string_start_t ptr =
+                sz_find_substring_neon(full_text.data(), full_text.size(), needle.data(), needle.size());
             return ptr ? ptr - full_text.data() : full_text.size();
         });
 #endif
 #if defined(__AVX2__)
-        bench_search("sz_find_substr_avx2", full_text, [&]() mutable {
-            sz_string_ptr_t ptr = sz_find_substr_avx2(full_text.data(), full_text.size(), needle.data(), needle.size());
+        bench_search("sz_find_substring_avx2", full_text, [&]() mutable {
+            sz_string_start_t ptr =
+                sz_find_substring_avx2(full_text.data(), full_text.size(), needle.data(), needle.size());
             return ptr ? ptr - full_text.data() : full_text.size();
         });
 #endif
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index ba7f5f39..c7c0ae49 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -45,16 +45,16 @@ typedef unsigned long sz_size_t;
 typedef unsigned sz_size_t;
 #endif
 
-typedef int sz_bool_t;               // Only one relevant bit
-typedef unsigned sz_u32_t;           // Always 32 bits
-typedef unsigned long long sz_u64_t; // Always 64 bits
-typedef char const *sz_string_ptr_t; // A type alias for `char const * `
+typedef int sz_bool_t;                 // Only one relevant bit
+typedef unsigned sz_u32_t;             // Always 32 bits
+typedef unsigned long long sz_u64_t;   // Always 64 bits
+typedef char const *sz_string_start_t; // A type alias for `char const * `
 
 /**
  *  @brief  Helper construct for higher-level bindings.
  */
 typedef struct sz_string_view_t {
-    sz_string_ptr_t start;
+    sz_string_start_t start;
     sz_size_t length;
 } sz_string_view_t;
 
@@ -72,8 +72,8 @@ typedef union _sz_anomaly_t {
  *          Doesn't provide major performance improvements, but helps avoid the LibC dependency.
  *  @return 1 for `true`, and 0 for `false`.
  */
-inline static sz_bool_t sz_equal(sz_string_ptr_t a, sz_string_ptr_t b, sz_size_t length) {
-    sz_string_ptr_t const a_end = a + length;
+inline static sz_bool_t sz_equal(sz_string_start_t a, sz_string_start_t b, sz_size_t length) {
+    sz_string_start_t const a_end = a + length;
     while (a != a_end && *a == *b) a++, b++;
     return a_end == a;
 }
@@ -82,13 +82,13 @@ inline static sz_bool_t sz_equal(sz_string_ptr_t a, sz_string_ptr_t b, sz_size_t
  *  @brief  Count the number of occurrences of a @b single-character needle in an arbitrary length haystack.
  *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_size_t sz_count_char_swar(sz_string_ptr_t const haystack,
+inline static sz_size_t sz_count_char_swar(sz_string_start_t const haystack,
                                            sz_size_t const haystack_length,
-                                           sz_string_ptr_t const needle) {
+                                           sz_string_start_t const needle) {
 
     sz_size_t result = 0;
-    sz_string_ptr_t text = haystack;
-    sz_string_ptr_t const end = haystack + haystack_length;
+    sz_string_start_t text = haystack;
+    sz_string_start_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == *needle;
@@ -117,12 +117,12 @@ inline static sz_size_t sz_count_char_swar(sz_string_ptr_t const haystack,
  *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  *          Identical to `memchr(haystack, needle[0], haystack_length)`.
  */
-inline static sz_string_ptr_t sz_find_1char_swar(sz_string_ptr_t const haystack,
-                                                 sz_size_t const haystack_length,
-                                                 sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_find_1char_swar(sz_string_start_t const haystack,
+                                                   sz_size_t const haystack_length,
+                                                   sz_string_start_t const needle) {
 
-    sz_string_ptr_t text = haystack;
-    sz_string_ptr_t const end = haystack + haystack_length;
+    sz_string_start_t text = haystack;
+    sz_string_start_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text < end; ++text)
@@ -154,12 +154,12 @@ inline static sz_string_ptr_t sz_find_1char_swar(sz_string_ptr_t const haystack,
  *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  *          Identical to `memrchr(haystack, needle[0], haystack_length)`.
  */
-inline static sz_string_ptr_t sz_rfind_1char_swar(sz_string_ptr_t const haystack,
-                                                  sz_size_t const haystack_length,
-                                                  sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_rfind_1char_swar(sz_string_start_t const haystack,
+                                                    sz_size_t const haystack_length,
+                                                    sz_string_start_t const needle) {
 
-    sz_string_ptr_t const end = haystack + haystack_length;
-    sz_string_ptr_t text = end - 1;
+    sz_string_start_t const end = haystack + haystack_length;
+    sz_string_start_t text = end - 1;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text >= haystack; --text)
@@ -190,12 +190,12 @@ inline static sz_string_ptr_t sz_rfind_1char_swar(sz_string_ptr_t const haystack
  *  @brief  Find the first occurrence of a @b two-character needle in an arbitrary length haystack.
  *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_string_ptr_t sz_find_2char_swar(sz_string_ptr_t const haystack,
-                                                 sz_size_t const haystack_length,
-                                                 sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_find_2char_swar(sz_string_start_t const haystack,
+                                                   sz_size_t const haystack_length,
+                                                   sz_string_start_t const needle) {
 
-    sz_string_ptr_t text = haystack;
-    sz_string_ptr_t const end = haystack + haystack_length;
+    sz_string_start_t text = haystack;
+    sz_string_start_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text)
@@ -239,12 +239,12 @@ inline static sz_string_ptr_t sz_find_2char_swar(sz_string_ptr_t const haystack,
  *  @brief  Find the first occurrence of a three-character needle in an arbitrary length haystack.
  *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_string_ptr_t sz_find_3char_swar(sz_string_ptr_t const haystack,
-                                                 sz_size_t const haystack_length,
-                                                 sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_find_3char_swar(sz_string_start_t const haystack,
+                                                   sz_size_t const haystack_length,
+                                                   sz_string_start_t const needle) {
 
-    sz_string_ptr_t text = haystack;
-    sz_string_ptr_t end = haystack + haystack_length;
+    sz_string_start_t text = haystack;
+    sz_string_start_t end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text)
@@ -301,12 +301,12 @@ inline static sz_string_ptr_t sz_find_3char_swar(sz_string_ptr_t const haystack,
  *  @brief  Find the first occurrence of a @b four-character needle in an arbitrary length haystack.
  *          This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time.
  */
-inline static sz_string_ptr_t sz_find_4char_swar(sz_string_ptr_t const haystack,
-                                                 sz_size_t const haystack_length,
-                                                 sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_find_4char_swar(sz_string_start_t const haystack,
+                                                   sz_size_t const haystack_length,
+                                                   sz_string_start_t const needle) {
 
-    sz_string_ptr_t text = haystack;
-    sz_string_ptr_t end = haystack + haystack_length;
+    sz_string_start_t text = haystack;
+    sz_string_start_t end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
     for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text)
@@ -367,10 +367,10 @@ inline static sz_string_ptr_t sz_find_4char_swar(sz_string_ptr_t const haystack,
  *          it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper.
  *          Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core.
  */
-inline static sz_string_ptr_t sz_find_substr_swar( //
-    sz_string_ptr_t const haystack,
+inline static sz_string_start_t sz_find_substring_swar( //
+    sz_string_start_t const haystack,
     sz_size_t const haystack_length,
-    sz_string_ptr_t const needle,
+    sz_string_start_t const needle,
     sz_size_t const needle_length) {
 
     if (haystack_length < needle_length) return NULL;
@@ -383,12 +383,12 @@ inline static sz_string_ptr_t sz_find_substr_swar( //
     case 3: return sz_find_3char_swar(haystack, haystack_length, needle);
     case 4: return sz_find_4char_swar(haystack, haystack_length, needle);
     default: {
-        sz_string_ptr_t text = haystack;
-        sz_string_ptr_t const end = haystack + haystack_length;
+        sz_string_start_t text = haystack;
+        sz_string_start_t const end = haystack + haystack_length;
 
         _sz_anomaly_t n_anomaly, h_anomaly;
         sz_size_t const n_suffix_len = needle_length - 4 - anomaly_offset;
-        sz_string_ptr_t n_suffix_ptr = needle + 4 + anomaly_offset;
+        sz_string_start_t n_suffix_ptr = needle + 4 + anomaly_offset;
         n_anomaly.u8s[0] = needle[anomaly_offset];
         n_anomaly.u8s[1] = needle[anomaly_offset + 1];
         n_anomaly.u8s[2] = needle[anomaly_offset + 2];
@@ -401,10 +401,9 @@ inline static sz_string_ptr_t sz_find_substr_swar( //
         text += anomaly_offset;
         while (text + needle_length <= end) {
             h_anomaly.u8s[3] = text[3];
-            if (h_anomaly.u32 == n_anomaly.u32)                                  // Match anomaly.
-                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len))              // Match suffix.
-                    if (sz_equal(text - anomaly_offset, needle, anomaly_offset)) // Match prefix - optimized out.
-                        return text - anomaly_offset;
+            if (h_anomaly.u32 == n_anomaly.u32)                     // Match anomaly.
+                if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix.
+                    return text;
 
             h_anomaly.u32 >>= 8;
             ++text;
@@ -417,8 +416,8 @@ inline static sz_string_ptr_t sz_find_substr_swar( //
 /**
  *  Helper function, used in substring search operations.
  */
-inline static void _sz_find_substr_populate_anomaly( //
-    sz_string_ptr_t const needle,
+inline static void _sz_find_substring_populate_anomaly( //
+    sz_string_start_t const needle,
     sz_size_t const needle_length,
     _sz_anomaly_t *anomaly_out,
     _sz_anomaly_t *mask_out) {
@@ -455,16 +454,16 @@ inline static void _sz_find_substr_populate_anomaly( //
  *          was practically more efficient than loading once and shifting around, as introduces
  *          less data dependencies.
  */
-inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack,
-                                                  sz_size_t const haystack_length,
-                                                  sz_string_ptr_t const needle,
-                                                  sz_size_t const needle_length) {
+inline static sz_string_start_t sz_find_substring_avx2(sz_string_start_t const haystack,
+                                                       sz_size_t const haystack_length,
+                                                       sz_string_start_t const needle,
+                                                       sz_size_t const needle_length) {
 
     // Precomputed constants
-    sz_string_ptr_t const end = haystack + haystack_length;
+    sz_string_start_t const end = haystack + haystack_length;
     _sz_anomaly_t anomaly;
     _sz_anomaly_t mask;
-    _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask);
+    _sz_find_substring_populate_anomaly(needle, needle_length, &anomaly, &mask);
     __m256i const anomalies = _mm256_set1_epi32(anomaly.u32);
     __m256i const masks = _mm256_set1_epi32(mask.u32);
 
@@ -477,7 +476,7 @@ inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack
     //  + 4 movemasks.
     //  + 3 bitwise ANDs.
     //  + 1 heavy (but very unlikely) branch.
-    sz_string_ptr_t text = haystack;
+    sz_string_start_t text = haystack;
     while (text + needle_length + 32 <= end) {
 
         // Performing many unaligned loads ends up being faster than loading once and shuffling around.
@@ -511,7 +510,7 @@ inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack
     }
 
     // Don't forget the last (up to 35) characters.
-    return sz_find_substr_swar(text, end - text, needle, needle_length);
+    return sz_find_substring_swar(text, end - text, needle, needle_length);
 }
 
 #endif // x86 AVX2
@@ -524,21 +523,21 @@ inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack
  *          was practically more efficient than loading once and shifting around, as introduces
  *          less data dependencies.
  */
-inline static sz_string_ptr_t sz_find_substr_neon(sz_string_ptr_t const haystack,
-                                                  sz_size_t const haystack_length,
-                                                  sz_string_ptr_t const needle,
-                                                  sz_size_t const needle_length) {
+inline static sz_string_start_t sz_find_substring_neon(sz_string_start_t const haystack,
+                                                       sz_size_t const haystack_length,
+                                                       sz_string_start_t const needle,
+                                                       sz_size_t const needle_length) {
 
     // Precomputed constants
-    sz_string_ptr_t const end = haystack + haystack_length;
+    sz_string_start_t const end = haystack + haystack_length;
     _sz_anomaly_t anomaly;
     _sz_anomaly_t mask;
-    _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask);
+    _sz_find_substring_populate_anomaly(needle, needle_length, &anomaly, &mask);
     uint32x4_t const anomalies = vld1q_dup_u32(&anomaly.u32);
     uint32x4_t const masks = vld1q_dup_u32(&mask.u32);
     uint32x4_t matches, matches0, matches1, matches2, matches3;
 
-    sz_string_ptr_t text = haystack;
+    sz_string_start_t text = haystack;
     while (text + needle_length + 16 <= end) {
 
         // Each of the following `matchesX` contains only 4 relevant bits - one per word.
@@ -582,40 +581,40 @@ inline static sz_string_ptr_t sz_find_substr_neon(sz_string_ptr_t const haystack
     }
 
     // Don't forget the last (up to 16+3=19) characters.
-    return sz_find_substr_swar(text, end - text, needle, needle_length);
+    return sz_find_substring_swar(text, end - text, needle, needle_length);
 }
 
 #endif // Arm Neon
 
-inline static sz_size_t sz_count_char(sz_string_ptr_t const haystack,
+inline static sz_size_t sz_count_char(sz_string_start_t const haystack,
                                       sz_size_t const haystack_length,
-                                      sz_string_ptr_t const needle) {
+                                      sz_string_start_t const needle) {
     return sz_count_char_swar(haystack, haystack_length, needle);
 }
 
-inline static sz_string_ptr_t sz_find_1char(sz_string_ptr_t const haystack,
-                                            sz_size_t const haystack_length,
-                                            sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_find_1char(sz_string_start_t const haystack,
+                                              sz_size_t const haystack_length,
+                                              sz_string_start_t const needle) {
     return sz_find_1char_swar(haystack, haystack_length, needle);
 }
 
-inline static sz_string_ptr_t sz_rfind_1char(sz_string_ptr_t const haystack,
-                                             sz_size_t const haystack_length,
-                                             sz_string_ptr_t const needle) {
+inline static sz_string_start_t sz_rfind_1char(sz_string_start_t const haystack,
+                                               sz_size_t const haystack_length,
+                                               sz_string_start_t const needle) {
     return sz_rfind_1char_swar(haystack, haystack_length, needle);
 }
 
-inline static sz_string_ptr_t sz_find_substr(sz_string_ptr_t const haystack,
-                                             sz_size_t const haystack_length,
-                                             sz_string_ptr_t const needle,
-                                             sz_size_t const needle_length) {
+inline static sz_string_start_t sz_find_substring(sz_string_start_t const haystack,
+                                                  sz_size_t const haystack_length,
+                                                  sz_string_start_t const needle,
+                                                  sz_size_t const needle_length) {
     if (haystack_length < needle_length || needle_length == 0) return NULL;
 #if defined(__ARM_NEON)
-    return sz_find_substr_neon(haystack, haystack_length, needle, needle_length);
+    return sz_find_substring_neon(haystack, haystack_length, needle, needle_length);
 #elif defined(__AVX2__)
-    return sz_find_substr_avx2(haystack, haystack_length, needle, needle_length);
+    return sz_find_substring_avx2(haystack, haystack_length, needle, needle_length);
 #else
-    return sz_find_substr_swar(haystack, haystack_length, needle, needle_length);
+    return sz_find_substring_swar(haystack, haystack_length, needle, needle_length);
 #endif
 }
 
@@ -669,30 +668,46 @@ inline static char sz_toupper_ascii(char c) {
     return *(char *)&upped[(int)c];
 }
 
+inline static sz_u64_t sz_u64_unaligned_load(void const *ptr) {
+#ifdef _MSC_VER
+    return *((__unaligned sz_u64_t *)ptr);
+#else
+    __attribute__((aligned(1))) sz_u64_t const *uptr = (sz_u64_t const *)ptr;
+    return *uptr;
+#endif
+}
+
+inline static sz_u64_t sz_u64_byte_reverse(sz_u64_t val) {
+#ifdef _MSC_VER
+    return _byteswap_uint64(val);
+#else
+    return __builtin_bswap64(val);
+#endif
+}
+
 /**
  *  @brief  Char-level lexicographic comparison of two strings.
  *          Doesn't provide major performance improvements, but helps avoid the LibC dependency.
  */
-inline static sz_bool_t sz_is_less_ascii(sz_string_ptr_t const a,
+inline static sz_bool_t sz_is_less_ascii(sz_string_start_t a,
                                          sz_size_t const a_length,
-                                         sz_string_ptr_t const b,
+                                         sz_string_start_t b,
                                          sz_size_t const b_length) {
 
     sz_size_t min_length = (a_length < b_length) ? a_length : b_length;
-    for (sz_size_t i = 0; i < min_length; ++i) {
-        if (a[i] < b[i]) return 1;
-        if (a[i] > b[i]) return 0;
-    }
-    return a_length < b_length;
+    sz_string_start_t const min_end = a + min_length;
+    while (a + 8 <= min_end && sz_u64_unaligned_load(a) == sz_u64_unaligned_load(b)) a += 8, b += 8;
+    while (a != min_end && *a == *b) a++, b++;
+    return a != min_end ? (*a < *b) : (a_length < b_length);
 }
 
 /**
  *  @brief  Char-level lexicographic comparison of two strings, insensitive to the case of ASCII symbols.
  *          Doesn't provide major performance improvements, but helps avoid the LibC dependency.
  */
-inline static sz_bool_t sz_is_less_uncased_ascii(sz_string_ptr_t const a,
+inline static sz_bool_t sz_is_less_uncased_ascii(sz_string_start_t const a,
                                                  sz_size_t const a_length,
-                                                 sz_string_ptr_t const b,
+                                                 sz_string_start_t const b,
                                                  sz_size_t const b_length) {
 
     sz_size_t min_length = (a_length < b_length) ? a_length : b_length;
@@ -716,11 +731,11 @@ inline static void _sz_swap_order(sz_u64_t *a, sz_u64_t *b) {
 
 struct sz_sequence_t;
 
-typedef sz_string_ptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t);
+typedef sz_string_start_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t);
 typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t);
 typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t);
 typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t);
-typedef sz_bool_t (*sz_string_is_less_t)(sz_string_ptr_t, sz_size_t, sz_string_ptr_t, sz_size_t);
+typedef sz_bool_t (*sz_string_is_less_t)(sz_string_start_t, sz_size_t, sz_string_start_t, sz_size_t);
 
 typedef struct sz_sequence_t {
     sz_u64_t *order;
@@ -795,9 +810,12 @@ inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_compar
 
 // Utility functions
 inline static sz_size_t _sz_log2i(sz_size_t n) {
-    sz_size_t log2 = 0;
-    while (n >>= 1) ++log2;
-    return log2;
+    if (n == 0) return 0;                // to avoid undefined behavior with __builtin_clz
+#if defined(__LP64__) || defined(_WIN64) // 64-bit
+    return 63 - __builtin_clzll(n);
+#else // 32-bit
+    return 31 - __builtin_clz(n);
+#endif
 }
 
 inline static void _sz_sift_down(
@@ -893,7 +911,7 @@ inline static void _sz_introsort(
     sz_u64_t pivot = sequence->order[median];
     sz_size_t left = first;
     sz_size_t right = last - 1;
-    while (true) {
+    while (1) {
         while (less(sequence, sequence->order[left], pivot)) left++;
         while (less(sequence, pivot, sequence->order[right])) right--;
         if (left >= right) break;
@@ -962,17 +980,17 @@ inline static void _sz_sort_recursion( //
 }
 
 inline static sz_bool_t _sz_sort_compare_less_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
-    sz_string_ptr_t i_str = sequence->get_start(sequence, i_key);
+    sz_string_start_t i_str = sequence->get_start(sequence, i_key);
     sz_size_t i_len = sequence->get_length(sequence, i_key);
-    sz_string_ptr_t j_str = sequence->get_start(sequence, j_key);
+    sz_string_start_t j_str = sequence->get_start(sequence, j_key);
     sz_size_t j_len = sequence->get_length(sequence, j_key);
     return sz_is_less_ascii(i_str, i_len, j_str, j_len);
 }
 
 inline static sz_bool_t _sz_sort_compare_less_uncased_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) {
-    sz_string_ptr_t i_str = sequence->get_start(sequence, i_key);
+    sz_string_start_t i_str = sequence->get_start(sequence, i_key);
     sz_size_t i_len = sequence->get_length(sequence, i_key);
-    sz_string_ptr_t j_str = sequence->get_start(sequence, j_key);
+    sz_string_start_t j_str = sequence->get_start(sequence, j_key);
     sz_size_t j_len = sequence->get_length(sequence, j_key);
     return sz_is_less_uncased_ascii(i_str, i_len, j_str, j_len);
 }
@@ -994,7 +1012,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
 
     // Export up to 4 bytes into the `sequence` bits themselves
     for (sz_size_t i = 0; i != sequence->count; ++i) {
-        sz_string_ptr_t begin = sequence->get_start(sequence, sequence->order[i]);
+        sz_string_start_t begin = sequence->get_start(sequence, sequence->order[i]);
         sz_size_t length = sequence->get_length(sequence, sequence->order[i]);
         length = length > 4ul ? 4ul : length;
         char *prefix = (char *)&sequence->order[i];
@@ -1038,9 +1056,9 @@ inline static levenstein_distance_t _sz_levenstein_minimum( //
  *          It accepts an upper bound on the possible error. Quadratic complexity in time, linear in space.
  */
 inline static levenstein_distance_t sz_levenstein( //
-    sz_string_ptr_t const a,
+    sz_string_start_t const a,
     sz_size_t const a_length,
-    sz_string_ptr_t const b,
+    sz_string_start_t const b,
     sz_size_t const b_length,
     levenstein_distance_t const bound,
     void *buffer) {
@@ -1093,11 +1111,11 @@ inline static levenstein_distance_t sz_levenstein( //
 /**
  *  @brief  Hashes provided string using hardware-accelerated CRC32 instructions.
  */
-inline static sz_u32_t sz_hash_crc32_native(sz_string_ptr_t start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_native(sz_string_start_t start, sz_size_t length) { return 0; }
 
-inline static sz_u32_t sz_hash_crc32_neon(sz_string_ptr_t start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_neon(sz_string_start_t start, sz_size_t length) { return 0; }
 
-inline static sz_u32_t sz_hash_crc32_sse(sz_string_ptr_t start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32_sse(sz_string_start_t start, sz_size_t length) { return 0; }
 
 #ifdef __cplusplus
 }

From b0a280d783fef546b4b1b3245cdcaa86e169a97b Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 16:51:35 -0700
Subject: [PATCH 63/72] Make: Formatting and docs

---
 .vscode/settings.json |   3 +-
 CMakeLists.txt        | 140 ++++++++++++++++++++++--------------------
 README.md             |  10 +--
 scripts/bench.ipynb   |  20 ++++--
 scripts/test.c        |  14 ++---
 5 files changed, 100 insertions(+), 87 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 08c5bb65..575441f2 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -118,7 +118,8 @@
     "strstream": "cpp",
     "filesystem": "cpp",
     "stringzilla.h": "c",
-    "__memory": "c"
+    "__memory": "c",
+    "charconv": "c"
   },
   "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
   "cSpell.words": [
diff --git a/CMakeLists.txt b/CMakeLists.txt
index df569329..230c2a06 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,105 +1,109 @@
 # This CMake file is heavily inspired by following `stringzilla` CMake:
 # https://github.com/nlohmann/json/blob/develop/CMakeLists.txt
 cmake_minimum_required(VERSION 3.1)
-project(stringzilla VERSION 0.1.0 LANGUAGES C CXX)
+project(
+  stringzilla
+  VERSION 0.1.0
+  LANGUAGES C CXX)
 
-set (CMAKE_C_STANDARD 11)
-set (CMAKE_CXX_STANDARD 17)
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
 
-# Determine if USearch is built as a subproject (using `add_subdirectory`) or if it is the main project
+# Determine if USearch is built as a subproject (using `add_subdirectory`) or if
+# it is the main project
 set(STRINGZILLA_IS_MAIN_PROJECT OFF)
-if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
-    set(STRINGZILLA_IS_MAIN_PROJECT ON)
+if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
+  set(STRINGZILLA_IS_MAIN_PROJECT ON)
 endif()
 
 # Options
 option(STRINGZILLA_INSTALL "Install CMake targets" OFF)
-option(STRINGZILLA_BUILD_TEST "Compile a native unit test in C++" ${STRINGZILLA_IS_MAIN_PROJECT})
-option(STRINGZILLA_BUILD_BENCHMARK "Compile a native benchmark in C++" ${STRINGZILLA_IS_MAIN_PROJECT})
+option(STRINGZILLA_BUILD_TEST "Compile a native unit test in C++"
+       ${STRINGZILLA_IS_MAIN_PROJECT})
+option(STRINGZILLA_BUILD_BENCHMARK "Compile a native benchmark in C++"
+       ${STRINGZILLA_IS_MAIN_PROJECT})
 option(STRINGZILLA_BUILD_WOLFRAM "Compile Wolfram Language bindings" OFF)
 
 # Includes
 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
 include(ExternalProject)
 
-# Allow CMake 3.13+ to override options when using FetchContent / add_subdirectory
-if (POLICY CMP0077)
-    cmake_policy(SET CMP0077 NEW)
-endif ()
+# Allow CMake 3.13+ to override options when using FetchContent /
+# add_subdirectory
+if(POLICY CMP0077)
+  cmake_policy(SET CMP0077 NEW)
+endif()
 
 # Configuration
 include(GNUInstallDirs)
-set(STRINGZILLA_TARGET_NAME               ${PROJECT_NAME})
-set(STRINGZILLA_CONFIG_INSTALL_DIR        "${CMAKE_INSTALL_DATADIR}/cmake/${PROJECT_NAME}" CACHE INTERNAL "")
-set(STRINGZILLA_INCLUDE_INSTALL_DIR       "${CMAKE_INSTALL_INCLUDEDIR}")
-set(STRINGZILLA_TARGETS_EXPORT_NAME       "${PROJECT_NAME}Targets")
-set(STRINGZILLA_CMAKE_CONFIG_TEMPLATE     "cmake/config.cmake.in")
-set(STRINGZILLA_CMAKE_CONFIG_DIR          "${CMAKE_CURRENT_BINARY_DIR}")
-set(STRINGZILLA_CMAKE_VERSION_CONFIG_FILE "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}ConfigVersion.cmake")
-set(STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Config.cmake")
-set(STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Targets.cmake")
-set(STRINGZILLA_PKGCONFIG_INSTALL_DIR      "${CMAKE_INSTALL_DATADIR}/pkgconfig")
-
+set(STRINGZILLA_TARGET_NAME ${PROJECT_NAME})
+set(STRINGZILLA_CONFIG_INSTALL_DIR
+    "${CMAKE_INSTALL_DATADIR}/cmake/${PROJECT_NAME}"
+    CACHE INTERNAL "")
+set(STRINGZILLA_INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}")
+set(STRINGZILLA_TARGETS_EXPORT_NAME "${PROJECT_NAME}Targets")
+set(STRINGZILLA_CMAKE_CONFIG_TEMPLATE "cmake/config.cmake.in")
+set(STRINGZILLA_CMAKE_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+set(STRINGZILLA_CMAKE_VERSION_CONFIG_FILE
+    "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}ConfigVersion.cmake")
+set(STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE
+    "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Config.cmake")
+set(STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE
+    "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Targets.cmake")
+set(STRINGZILLA_PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/pkgconfig")
 
 # Define our header-only library
 add_library(${STRINGZILLA_TARGET_NAME} INTERFACE)
-add_library(${PROJECT_NAME}::${STRINGZILLA_TARGET_NAME} ALIAS ${STRINGZILLA_TARGET_NAME})
+add_library(${PROJECT_NAME}::${STRINGZILLA_TARGET_NAME} ALIAS
+            ${STRINGZILLA_TARGET_NAME})
 set(STRINGZILLA_INCLUDE_BUILD_DIR "${PROJECT_SOURCE_DIR}/include/")
 
 target_compile_definitions(
-    ${STRINGZILLA_TARGET_NAME}
-    INTERFACE
-    $<$<NOT:$<BOOL:${JSON_GlobalUDLs}>>:STRINGZILLA_USE_OPENMP=0>
-)
+  ${STRINGZILLA_TARGET_NAME}
+  INTERFACE $<$<NOT:$<BOOL:${JSON_GlobalUDLs}>>:STRINGZILLA_USE_OPENMP=0>)
 target_include_directories(
-    ${STRINGZILLA_TARGET_NAME}
-    ${STRINGZILLA_SYSTEM_INCLUDE} INTERFACE
-    $<BUILD_INTERFACE:${STRINGZILLA_INCLUDE_BUILD_DIR}>
-    $<INSTALL_INTERFACE:include>
-)
+  ${STRINGZILLA_TARGET_NAME} ${STRINGZILLA_SYSTEM_INCLUDE}
+  INTERFACE $<BUILD_INTERFACE:${STRINGZILLA_INCLUDE_BUILD_DIR}>
+            $<INSTALL_INTERFACE:include>)
 
 if(STRINGZILLA_INSTALL)
-    install(
-        DIRECTORY ${STRINGZILLA_INCLUDE_BUILD_DIR}
-        DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR}
-    )
-    install(
-        FILES ${STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE} ${STRINGZILLA_CMAKE_VERSION_CONFIG_FILE}
-        DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR}
-    )
-    export(
-        TARGETS ${STRINGZILLA_TARGET_NAME}
-        NAMESPACE ${PROJECT_NAME}::
-        FILE ${STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE}
-    )
-    install(
-        TARGETS ${STRINGZILLA_TARGET_NAME}
-        EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME}
-        INCLUDES DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR}
-    )
-    install(
-        EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME}
-        NAMESPACE ${PROJECT_NAME}::
-        DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR}
-    )
-    install(
-        FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc"
-        DESTINATION ${STRINGZILLA_PKGCONFIG_INSTALL_DIR}
-    )
+  install(DIRECTORY ${STRINGZILLA_INCLUDE_BUILD_DIR}
+          DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR})
+  install(FILES ${STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE}
+                ${STRINGZILLA_CMAKE_VERSION_CONFIG_FILE}
+          DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR})
+  export(
+    TARGETS ${STRINGZILLA_TARGET_NAME}
+    NAMESPACE ${PROJECT_NAME}::
+    FILE ${STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE})
+  install(
+    TARGETS ${STRINGZILLA_TARGET_NAME}
+    EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME}
+    INCLUDES
+    DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR})
+  install(
+    EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME}
+    NAMESPACE ${PROJECT_NAME}::
+    DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR})
+  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc"
+          DESTINATION ${STRINGZILLA_PKGCONFIG_INSTALL_DIR})
 endif()
 
 if(${STRINGZILLA_BUILD_TEST} OR ${STRINGZILLA_BUILD_BENCHMARK})
-  add_executable(stringzilla_test scripts/test.c)
+  add_executable(stringzilla_test scripts/test.cpp)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+  set(CMAKE_CXX_FLAGS
+      "${CMAKE_CXX_FLAGS} -O3 -flto -march=native -finline-functions -funroll-loops"
+  )
 
   target_include_directories(stringzilla_test PRIVATE stringzilla)
-  set_target_properties(stringzilla_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+  set_target_properties(stringzilla_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+                                                    ${CMAKE_BINARY_DIR})
 
-  if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER 3.13)
-      include(CTest)
-      enable_testing()
-      add_test(NAME stringzilla_test COMMAND stringzilla_test)
+  if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER
+                                            3.13)
+    include(CTest)
+    enable_testing()
+    add_test(NAME stringzilla_test COMMAND stringzilla_test)
   endif()
 endif()
-
diff --git a/README.md b/README.md
index 85032c34..8f0765c3 100644
--- a/README.md
+++ b/README.md
@@ -35,8 +35,8 @@ Coming soon.
 
 ## Quick Start: Python 🐍
 
-1️. Install via pip: `pip install stringzilla`  
-1. Import the classes you need: `from stringzilla import Str, Strs, File`  
+1. Install via pip: `pip install stringzilla`  
+2. Import the classes you need: `from stringzilla import Str, Strs, File`  
 
 ### Basic Usage
 
@@ -115,13 +115,13 @@ There is an ABI-stable C 99 interface, in case you have a database, an operating
 #include "stringzilla.h"
 
 // Initialize your haystack and needle
-sz_haystack_t haystack = {your_text, your_text_length};
-sz_needle_t needle = {your_subtext, your_subtext_length, your_quadgram_offset};
+sz_string_view_t haystack = {your_text, your_text_length};
+sz_string_view_t needle = {your_subtext, your_subtext_length};
 
 // Perform string-level operations
 size_t character_count = sz_count_char(haystack, 'a');
 size_t character_position = sz_find_unigram(haystack, 'a');
-size_t substring_position = sz_find_substr(haystack, needle);
+size_t substring_position = sz_find_substring(haystack, needle);
 
 // Perform collection level operations
 sz_sequence_t array = {your_order, your_count, your_get_start, your_get_length, your_handle};
diff --git a/scripts/bench.ipynb b/scripts/bench.ipynb
index b3bc4392..492db50a 100644
--- a/scripts/bench.ipynb
+++ b/scripts/bench.ipynb
@@ -88,7 +88,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "150 ms ± 2.01 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
+      "152 ms ± 3.24 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -106,7 +106,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "37.8 ms ± 286 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
+      "38.1 ms ± 312 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -124,7 +124,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "173 ns ± 23.7 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
+      "The slowest run took 7.28 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
+      "186 ns ± 41.1 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -142,8 +143,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The slowest run took 82.51 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
-      "94.3 ns ± 108 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
+      "The slowest run took 120.95 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
+      "99.6 ns ± 155 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -152,6 +153,13 @@
     "sz_str.find(pattern)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -176,7 +184,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.10.11"
   },
   "orig_nbformat": 4
  },
diff --git a/scripts/test.c b/scripts/test.c
index 127975b0..b39fd982 100644
--- a/scripts/test.c
+++ b/scripts/test.c
@@ -18,8 +18,8 @@ void populate_random_string(char *buffer, int length, int variability) {
     buffer[length] = '\0';
 }
 
-// Test function for sz_find_substr
-void test_sz_find_substr() {
+// Test function for sz_find_substring
+void test_sz_find_substring() {
     char buffer[MAX_LENGTH + 1];
     char pattern[6]; // Maximum length of 5 + 1 for '\0'
 
@@ -39,11 +39,11 @@ void test_sz_find_substr() {
             needle.length = pattern_length;
 
             // Comparing the result of your function with the standard library function.
-            sz_string_ptr_t result_libc = strstr(buffer, pattern);
-            sz_string_ptr_t result_stringzilla =
-                sz_find_substr(haystack.start, haystack.length, needle.start, needle.length);
+            sz_string_start_t result_libc = strstr(buffer, pattern);
+            sz_string_start_t result_stringzilla =
+                sz_find_substring(haystack.start, haystack.length, needle.start, needle.length);
 
-            assert(((result_libc == NULL) ^ (result_stringzilla == NULL)) && "Test failed for sz_find_substr");
+            assert(((result_libc == NULL) ^ (result_stringzilla == NULL)) && "Test failed for sz_find_substring");
         }
     }
 }
@@ -51,7 +51,7 @@ void test_sz_find_substr() {
 int main() {
     srand((unsigned int)time(NULL));
 
-    test_sz_find_substr();
+    test_sz_find_substring();
     // Add calls to other test functions as you implement them
 
     printf("All tests passed!\n");

From bcaf7911d962ce641cab910bc0bc491ed6646ddd Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 17:03:25 -0700
Subject: [PATCH 64/72] Add: Benchmarks notebook

---
 scripts/bench.ipynb       | 26 ++++++--------------------
 stringzilla/stringzilla.h | 28 +++++++++++++---------------
 2 files changed, 19 insertions(+), 35 deletions(-)

diff --git a/scripts/bench.ipynb b/scripts/bench.ipynb
index 492db50a..838ca7af 100644
--- a/scripts/bench.ipynb
+++ b/scripts/bench.ipynb
@@ -88,7 +88,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "152 ms ± 3.24 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
+      "152 ms ± 2.43 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -106,7 +106,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "38.1 ms ± 312 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
+      "37.7 ms ± 341 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -124,8 +124,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The slowest run took 7.28 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
-      "186 ns ± 41.1 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
+      "The slowest run took 8.67 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
+      "182 ns ± 35 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -143,8 +143,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The slowest run took 120.95 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
-      "99.6 ns ± 155 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
+      "The slowest run took 40.69 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
+      "90 ns ± 53.2 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -152,20 +152,6 @@
     "%%timeit -n 1 -r 1000\n",
     "sz_str.find(pattern)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index c7c0ae49..94bbde44 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -35,7 +35,7 @@ extern "C" {
 #endif
 
 /**
- *  @brief  Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size.
+ *  @brief  Analogous to `sz_size_t` and `std::sz_size_t`, unsigned integer, identical to pointer size.
  *          64-bit on most platforms where pointers are 64-bit.
  *          32-bit on platforms where pointers are 32-bit.
  */
@@ -490,23 +490,21 @@ inline static sz_string_start_t sz_find_substring_avx2(sz_string_start_t const h
         int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies));
 
         if (matches0 | matches1 | matches2 | matches3) {
-            int matches =                   //
-                (matches0 & 0x1111'1111u) | //
-                (matches1 & 0x2222'2222u) | //
-                (matches2 & 0x4444'4444u) | //
-                (matches3 & 0x8888'8888u);
-            size_t first_match_offset = _tzcnt_u32(matches);
+            int matches =                  //
+                (matches0 & 0x11111111u) | //
+                (matches1 & 0x22222222u) | //
+                (matches2 & 0x44444444u) | //
+                (matches3 & 0x88888888u);
+            sz_size_t first_match_offset = _tzcnt_u32(matches);
             if (needle_length > 4) {
-                if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4))
+                if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4)) {
                     return text + first_match_offset;
-                else
-                    text += first_match_offset + 1;
+                }
+                else { text += first_match_offset + 1; }
             }
-            else
-                return text + first_match_offset;
+            else { return text + first_match_offset; }
         }
-        else
-            text += 32;
+        else { text += 32; }
     }
 
     // Don't forget the last (up to 35) characters.
@@ -566,7 +564,7 @@ inline static sz_string_start_t sz_find_substring_neon(sz_string_start_t const h
                 (vget_lane_u16(matches_u16x4, 3) << 12);
 
             // Find the first match
-            size_t first_match_offset = __builtin_ctz(matches_u16);
+            sz_size_t first_match_offset = __builtin_ctz(matches_u16);
             if (needle_length > 4) {
                 if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4))
                     return text + first_match_offset;

From 9bdbf236c8ca20648c2ccdf350035eba592662a7 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 17:04:35 -0700
Subject: [PATCH 65/72] Make: Automate major releases

---
 .releaserc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.releaserc b/.releaserc
index faa6b938..ab603bc1 100644
--- a/.releaserc
+++ b/.releaserc
@@ -11,6 +11,10 @@
             {
                 "preset": "eslint",
                 "releaseRules": [
+                    {
+                        "tag": "Break",
+                        "release": "major"
+                    },
                     {
                         "tag": "Add",
                         "release": "minor"
@@ -35,6 +39,10 @@
             {
                 "preset": "eslint",
                 "releaseRules": [
+                    {
+                        "tag": "Break",
+                        "release": "major"
+                    },
                     {
                         "tag": "Add",
                         "release": "minor"

From a878eba876b23e5dfc1277c853add7edd43a91dd Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 17:40:23 -0700
Subject: [PATCH 66/72] Fix: MSVC-compliant initialization

---
 python/lib.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/lib.c b/python/lib.c
index c0ad69d4..57902505 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -30,6 +30,8 @@ typedef SSIZE_T ssize_t;
 #include <Python.h>            // Core CPython interfaces
 #include <numpy/arrayobject.h> // NumPy
 
+#include <string.h> // `memset`
+
 #include <stringzilla.h>
 
 #pragma region Forward Declarations
@@ -1573,8 +1575,10 @@ static sz_bool_t Strs_sort_(Strs *self,
     }
 
     // Call our sorting algorithm
-    sz_sequence_t sequence = {};
-    sz_sort_config_t sort_config = {};
+    sz_sequence_t sequence;
+    sz_sort_config_t sort_config;
+    memset(&sequence, 0, sizeof(sequence));
+    memset(&sort_config, 0, sizeof(sort_config));
     sequence.order = (sz_size_t *)temporary_memory.start;
     sequence.count = count;
     sequence.handle = parts;

From 5d333d5af3d60b61a5da36f1df79337a9850d76c Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 17:46:27 -0700
Subject: [PATCH 67/72] Fix: Missing `__builtin_clzll` symbol

---
 stringzilla/stringzilla.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 94bbde44..cfcd0220 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -806,14 +806,26 @@ inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_compar
     }
 }
 
-// Utility functions
 inline static sz_size_t _sz_log2i(sz_size_t n) {
-    if (n == 0) return 0;                // to avoid undefined behavior with __builtin_clz
+    if (n == 0) return 0;
+
 #if defined(__LP64__) || defined(_WIN64) // 64-bit
+#ifdef _MSC_VER
+    unsigned long index;
+    _BitScanReverse64(&index, n);
+    return index;
+#else
     return 63 - __builtin_clzll(n);
+#endif
 #else // 32-bit
+#ifdef _MSC_VER
+    unsigned long index;
+    _BitScanReverse(&index, n);
+    return index;
+#else
     return 31 - __builtin_clz(n);
 #endif
+#endif
 }
 
 inline static void _sz_sift_down(

From 8f9ca8b14bd5b46a22809f077a4bdbf0954ad89a Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 18:02:29 -0700
Subject: [PATCH 68/72] Improve: Identical bit-counting intrinsics

---
 stringzilla/stringzilla.h | 113 ++++++++++++++++++++++----------------
 1 file changed, 66 insertions(+), 47 deletions(-)

diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index cfcd0220..00ef0964 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -9,13 +9,14 @@
 #include <arm_neon.h>
 #endif
 
+/**
+ *  Intrinsics aliases for MSVC, GCC, and Clang.
+ */
 #ifdef _MSC_VER
 #include <intrin.h>
 #define popcount64 __popcnt64
 #define ctz64 _tzcnt_u64
 #define clz64 _lzcnt_u64
-#define strncasecmp _strnicmp
-#define strcasecmp _stricmp
 #else
 #define popcount64 __builtin_popcountll
 #define ctz64 __builtin_ctzll
@@ -23,8 +24,8 @@
 #endif
 
 /**
- *  Generally `NULL` is coming from locale.h, stddef.h, stdio.h, stdlib.h, string.h, time.h, and wchar.h,
- *  according to the C standard.
+ *  @brief  Generally `NULL` is coming from locale.h, stddef.h, stdio.h, stdlib.h, string.h, time.h, and wchar.h,
+ *          according to the C standard.
  */
 #ifndef NULL
 #define NULL ((void *)0)
@@ -50,6 +51,11 @@ typedef unsigned sz_u32_t;             // Always 32 bits
 typedef unsigned long long sz_u64_t;   // Always 64 bits
 typedef char const *sz_string_start_t; // A type alias for `char const * `
 
+/**
+ *  @brief  For faster bounded Levenstein (Edit) distance computation no more than 255 characters are supported.
+ */
+typedef unsigned char levenstein_distance_t;
+
 /**
  *  @brief  Helper construct for higher-level bindings.
  */
@@ -490,12 +496,12 @@ inline static sz_string_start_t sz_find_substring_avx2(sz_string_start_t const h
         int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies));
 
         if (matches0 | matches1 | matches2 | matches3) {
-            int matches =                  //
-                (matches0 & 0x11111111u) | //
-                (matches1 & 0x22222222u) | //
-                (matches2 & 0x44444444u) | //
-                (matches3 & 0x88888888u);
-            sz_size_t first_match_offset = _tzcnt_u32(matches);
+            int matches =                 //
+                (matches0 & 0x11111111) | //
+                (matches1 & 0x22222222) | //
+                (matches2 & 0x44444444) | //
+                (matches3 & 0x88888888);
+            sz_size_t first_match_offset = ctz64(matches);
             if (needle_length > 4) {
                 if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4)) {
                     return text + first_match_offset;
@@ -564,18 +570,16 @@ inline static sz_string_start_t sz_find_substring_neon(sz_string_start_t const h
                 (vget_lane_u16(matches_u16x4, 3) << 12);
 
             // Find the first match
-            sz_size_t first_match_offset = __builtin_ctz(matches_u16);
+            sz_size_t first_match_offset = ctz64(matches_u16);
             if (needle_length > 4) {
-                if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4))
+                if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4)) {
                     return text + first_match_offset;
-                else
-                    text += first_match_offset + 1;
+                }
+                else { text += first_match_offset + 1; }
             }
-            else
-                return text + first_match_offset;
+            else { return text + first_match_offset; }
         }
-        else
-            text += 16;
+        else { text += 16; }
     }
 
     // Don't forget the last (up to 16+3=19) characters.
@@ -666,6 +670,13 @@ inline static char sz_toupper_ascii(char c) {
     return *(char *)&upped[(int)c];
 }
 
+/**
+ *  @brief Load a 64-bit unsigned integer from a potentially unaligned pointer.
+ *
+ *  @note This function uses compiler-specific attributes or keywords to
+ *        ensure correct and efficient unaligned loads. It's designed to work
+ *        with both MSVC and GCC/Clang.
+ */
 inline static sz_u64_t sz_u64_unaligned_load(void const *ptr) {
 #ifdef _MSC_VER
     return *((__unaligned sz_u64_t *)ptr);
@@ -675,6 +686,12 @@ inline static sz_u64_t sz_u64_unaligned_load(void const *ptr) {
 #endif
 }
 
+/**
+ *  @brief Reverse the byte order of a 64-bit unsigned integer.
+ *
+ *  @note This function uses compiler-specific intrinsics to achieve the
+ *        byte-reversal. It's designed to work with both MSVC and GCC/Clang.
+ */
 inline static sz_u64_t sz_u64_byte_reverse(sz_u64_t val) {
 #ifdef _MSC_VER
     return _byteswap_uint64(val);
@@ -683,6 +700,35 @@ inline static sz_u64_t sz_u64_byte_reverse(sz_u64_t val) {
 #endif
 }
 
+/**
+ *  @brief  Compute the logarithm base 2 of an integer.
+ *
+ *  @note If n is 0, the function returns 0 to avoid undefined behavior.
+ *  @note This function uses compiler-specific intrinsics or built-ins
+ *        to achieve the computation. It's designed to work with GCC/Clang and MSVC.
+ */
+inline static sz_size_t sz_log2i(sz_size_t n) {
+    if (n == 0) return 0;
+
+#if defined(__LP64__) || defined(_WIN64) // 64-bit
+#ifdef _MSC_VER
+    unsigned long index;
+    _BitScanReverse64(&index, n);
+    return index;
+#else
+    return 63 - __builtin_clzll(n);
+#endif
+#else // 32-bit
+#ifdef _MSC_VER
+    unsigned long index;
+    _BitScanReverse(&index, n);
+    return index;
+#else
+    return 31 - __builtin_clz(n);
+#endif
+#endif
+}
+
 /**
  *  @brief  Char-level lexicographic comparison of two strings.
  *          Doesn't provide major performance improvements, but helps avoid the LibC dependency.
@@ -806,28 +852,6 @@ inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_compar
     }
 }
 
-inline static sz_size_t _sz_log2i(sz_size_t n) {
-    if (n == 0) return 0;
-
-#if defined(__LP64__) || defined(_WIN64) // 64-bit
-#ifdef _MSC_VER
-    unsigned long index;
-    _BitScanReverse64(&index, n);
-    return index;
-#else
-    return 63 - __builtin_clzll(n);
-#endif
-#else // 32-bit
-#ifdef _MSC_VER
-    unsigned long index;
-    _BitScanReverse(&index, n);
-    return index;
-#else
-    return 31 - __builtin_clz(n);
-#endif
-#endif
-}
-
 inline static void _sz_sift_down(
     sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, sz_size_t end) {
     sz_size_t root = start;
@@ -936,13 +960,10 @@ inline static void _sz_introsort(
 }
 
 inline static void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) {
-    sz_size_t depth_limit = 2 * _sz_log2i(sequence->count);
+    sz_size_t depth_limit = 2 * sz_log2i(sequence->count);
     _sz_introsort(sequence, less, 0, sequence->count, depth_limit);
 }
 
-/**
- *  @brief  Internal Radix sorting procedure.
- */
 inline static void _sz_sort_recursion( //
     sz_sequence_t *sequence,
     sz_size_t bit_idx,
@@ -1012,7 +1033,7 @@ typedef struct sz_sort_config_t {
 
 /**
  *  @brief  Sorting algorithm, combining Radix Sort for the first 32 bits of every word
- *          and a follow-up Quick Sort on resulting structure.
+ *          and a follow-up by a more conventional sorting procedure on equally prefixed parts.
  */
 inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *config) {
 
@@ -1042,8 +1063,6 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
     _sz_sort_recursion(sequence, 0, 32, comparator, partial_order_length);
 }
 
-typedef unsigned char levenstein_distance_t;
-
 /**
  *  @return Amount of temporary memory (in bytes) needed to efficiently compute
  *          the Levenstein distance between two strings of given size.

From 1a5b7260a96307202aea678d1bb22d4c00f19217 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 18:58:07 -0700
Subject: [PATCH 69/72] Add: SSE and Arm variants of CRC32

---
 python/lib.c              |   2 +-
 stringzilla/stringzilla.h | 110 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 106 insertions(+), 6 deletions(-)

diff --git a/python/lib.c b/python/lib.c
index 57902505..64e3ae70 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -561,7 +561,7 @@ static void Str_dealloc(Str *self) {
 
 static PyObject *Str_str(Str *self) { return PyUnicode_FromStringAndSize(self->start, self->length); }
 
-static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash_crc32_native(self->start, self->length); }
+static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash_crc32(self->start, self->length); }
 
 static Py_ssize_t Str_len(Str *self) { return self->length; }
 
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 00ef0964..57d17b89 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -9,6 +9,10 @@
 #include <arm_neon.h>
 #endif
 
+#if defined(__ARM_FEATURE_CRC32)
+#include <arm_acle.h>
+#endif
+
 /**
  *  Intrinsics aliases for MSVC, GCC, and Clang.
  */
@@ -1137,14 +1141,110 @@ inline static levenstein_distance_t sz_levenstein( //
     return previous_distances[b_length] <= bound ? previous_distances[b_length] : bound;
 }
 
+inline static sz_u32_t sz_hash_crc32_swar(sz_string_start_t start, sz_size_t length) {
+    /*
+     * The following CRC lookup table was generated automagically using the
+     * following model parameters:
+     *
+     * Generator Polynomial = ................. 0x1EDC6F41
+     * Generator Polynomial Length = .......... 32 bits
+     * Reflected Bits = ....................... TRUE
+     * Table Generation Offset = .............. 32 bits
+     * Number of Slices = ..................... 8 slices
+     * Slice Lengths = ........................ 8 8 8 8 8 8 8 8
+     */
+
+    static sz_u32_t const table[256] = {
+        0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, //
+        0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, //
+        0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, //
+        0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, //
+        0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, //
+        0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, //
+        0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, //
+        0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, //
+        0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, //
+        0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, //
+        0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, //
+        0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, //
+        0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, //
+        0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, //
+        0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, //
+        0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, //
+        0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, //
+        0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, //
+        0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, //
+        0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, //
+        0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, //
+        0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, //
+        0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, //
+        0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, //
+        0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, //
+        0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, //
+        0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, //
+        0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, //
+        0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, //
+        0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, //
+        0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, //
+        0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351  //
+    };
+
+    sz_u32_t crc = 0xFFFFFFFF;
+    for (sz_string_start_t const end = start + length; start != end; ++start)
+        crc = (crc >> 8) ^ table[(crc ^ (sz_u32_t)*start) & 0xff];
+    return crc ^ 0xFFFFFFFF;
+}
+
+#if defined(__ARM_FEATURE_CRC32)
+inline static sz_u32_t sz_hash_crc32_arm(sz_string_start_t start, sz_size_t length) {
+    sz_u32_t crc = 0xFFFFFFFF;
+    sz_string_start_t const end = start + length;
+
+    // Align the input to the word boundary
+    while (((unsigned long)start & 7ul) && start != end) { crc = __crc32cb(crc, *start), start++; }
+
+    // Process the body 8 bytes at a time
+    while (start + 8 <= end) { crc = __crc32cd(crc, *(unsigned long long *)start), start += 8; }
+
+    // Process the tail bytes
+    if (start + 4 <= end) { crc = __crc32cw(crc, *(unsigned int *)start), start += 4; }
+    if (start + 2 <= end) { crc = __crc32ch(crc, *(unsigned short *)start), start += 2; }
+    if (start < end) { crc = __crc32cb(crc, *start); }
+    return crc ^ 0xFFFFFFFF;
+}
+#endif
+
+#if defined(__SSE4_2__)
+inline static sz_u32_t sz_hash_crc32_sse(sz_string_start_t start, sz_size_t length) {
+    sz_u32_t crc = 0xFFFFFFFF;
+    sz_string_start_t const end = start + length;
+
+    // Align the input to the word boundary
+    while (((unsigned long)start & 7ul) && start != end) { crc = _mm_crc32_u8(crc, *start), start++; }
+
+    // Process the body 8 bytes at a time
+    while (start + 8 <= end) { crc = (sz_u32_t)_mm_crc32_u64(crc, *(unsigned long long *)start), start += 8; }
+
+    // Process the tail bytes
+    if (start + 4 <= end) { crc = _mm_crc32_u32(crc, *(unsigned int *)start), start += 4; }
+    if (start + 2 <= end) { crc = _mm_crc32_u16(crc, *(unsigned short *)start), start += 2; }
+    if (start < end) { crc = _mm_crc32_u8(crc, *start); }
+    return crc ^ 0xFFFFFFFF;
+}
+#endif
+
 /**
  *  @brief  Hashes provided string using hardware-accelerated CRC32 instructions.
  */
-inline static sz_u32_t sz_hash_crc32_native(sz_string_start_t start, sz_size_t length) { return 0; }
-
-inline static sz_u32_t sz_hash_crc32_neon(sz_string_start_t start, sz_size_t length) { return 0; }
-
-inline static sz_u32_t sz_hash_crc32_sse(sz_string_start_t start, sz_size_t length) { return 0; }
+inline static sz_u32_t sz_hash_crc32(sz_string_start_t start, sz_size_t length) {
+#if defined(__ARM_FEATURE_CRC32)
+    return sz_hash_crc32_arm(start, length);
+#elif defined(__SSE4_2__)
+    return sz_hash_crc32_sse(start, length);
+#else
+    return sz_hash_crc32_swar(start, length);
+#endif
+}
 
 #ifdef __cplusplus
 }

From 7b1e170e7ca2407e2d35a043dc34af29551ef7c2 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 19:08:48 -0700
Subject: [PATCH 70/72] Improve: BitScan dispatch on Windows

---
 stringzilla/stringzilla.h | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 57d17b89..136d93c5 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -714,19 +714,26 @@ inline static sz_u64_t sz_u64_byte_reverse(sz_u64_t val) {
 inline static sz_size_t sz_log2i(sz_size_t n) {
     if (n == 0) return 0;
 
-#if defined(__LP64__) || defined(_WIN64) // 64-bit
+#ifdef _WIN64
 #ifdef _MSC_VER
     unsigned long index;
-    _BitScanReverse64(&index, n);
-    return index;
+    if (_BitScanReverse64(&index, n)) return index;
+    return 0; // This line might be redundant due to the initial check, but it's safer to include it.
 #else
     return 63 - __builtin_clzll(n);
 #endif
-#else // 32-bit
+#elif defined(_WIN32)
 #ifdef _MSC_VER
     unsigned long index;
-    _BitScanReverse(&index, n);
-    return index;
+    if (_BitScanReverse(&index, n)) return index;
+    return 0; // Same note as above.
+#else
+    return 31 - __builtin_clz(n);
+#endif
+#else
+// Handle non-Windows platforms. You can further differentiate between 32-bit and 64-bit if needed.
+#if defined(__LP64__)
+    return 63 - __builtin_clzll(n);
 #else
     return 31 - __builtin_clz(n);
 #endif

From 051f0a886ed2438541be1e2430758661662229a8 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 19:15:11 -0700
Subject: [PATCH 71/72] Test: Printing failed cases

---
 scripts/test_fuzzy.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/scripts/test_fuzzy.py b/scripts/test_fuzzy.py
index dbefd485..694c1818 100644
--- a/scripts/test_fuzzy.py
+++ b/scripts/test_fuzzy.py
@@ -20,7 +20,9 @@ def get_random_string(
 
 def is_equal_strings(native_strings, big_strings):
     for native_slice, big_slice in zip(native_strings, big_strings):
-        assert native_slice == big_slice
+        assert (
+            native_slice == big_slice
+        ), f"Mismatch between `{native_slice}` and `{str(big_slice)}`"
 
 
 def check_identical(
@@ -47,7 +49,9 @@ def check_identical(
     if check_iterators:
         for i in range(len(native_strings)):
             assert len(native_strings[i]) == len(big_strings[i])
-            assert native_strings[i] == big_strings[i]
+            assert (
+                native_strings[i] == big_strings[i]
+            ), f"Mismatch between `{native_strings[i]}` and `{str(big_strings[i])}`"
             assert [c for c in native_strings[i]] == [c for c in big_strings[i]]
 
     is_equal_strings(native_strings, big_strings)

From 9a575ce91a945430abb158ff892eef90d546c14c Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 9 Oct 2023 21:28:28 -0700
Subject: [PATCH 72/72] Fix: `sz_size_t` size in MSVC

---
 .gitignore                |  1 +
 python/lib.c              |  9 +++++----
 setup.py                  |  2 +-
 stringzilla/stringzilla.h | 31 ++++++++++++++++++++-----------
 4 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/.gitignore b/.gitignore
index cfbdf78a..ca44f760 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ substr_search_cpp
 *.so
 *.egg-info
 *.whl
+*.pyd
 node_modules/
 
 leipzig1M.txt
\ No newline at end of file
diff --git a/python/lib.c b/python/lib.c
index 64e3ae70..0fa67358 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -71,7 +71,8 @@ typedef struct {
  *      - Str(File("some-path.txt"), from=0, to=sys.maxint)
  */
 typedef struct {
-    PyObject_HEAD PyObject *parent;
+    PyObject_HEAD //
+        PyObject *parent;
     sz_string_start_t start;
     sz_size_t length;
 } Str;
@@ -782,13 +783,13 @@ static int Strs_contains(Str *self, PyObject *arg) { return 0; }
 
 static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) {
 
-    char const *a_start, *b_start;
-    size_t a_length, b_length;
+    sz_string_start_t a_start = NULL, b_start = NULL;
+    sz_size_t a_length = 0, b_length = 0;
     if (!export_string_like(self, &a_start, &a_length) || !export_string_like(other, &b_start, &b_length))
         Py_RETURN_NOTIMPLEMENTED;
 
     // Perform byte-wise comparison up to the minimum length
-    size_t min_length = a_length < b_length ? a_length : b_length;
+    sz_size_t min_length = a_length < b_length ? a_length : b_length;
     int cmp_result = memcmp(a_start, b_start, min_length);
 
     // If the strings are equal up to `min_length`, then the shorter string is smaller
diff --git a/setup.py b/setup.py
index 1b8d83ce..12357369 100644
--- a/setup.py
+++ b/setup.py
@@ -47,7 +47,7 @@
     link_args.append("-Xpreprocessor -lomp")
 
 if sys.platform == "win32":
-    compile_args.append("/std:c++17")
+    compile_args.append("/std:c99")
     compile_args.append("/O2")
 
 
diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h
index 136d93c5..b93e191a 100644
--- a/stringzilla/stringzilla.h
+++ b/stringzilla/stringzilla.h
@@ -35,6 +35,14 @@
 #define NULL ((void *)0)
 #endif
 
+/**
+ * @brief   Compile-time assert macro.
+ */
+#define SZ_STATIC_ASSERT(condition, name)                \
+    typedef struct {                                     \
+        int static_assert_##name : (condition) ? 1 : -1; \
+    } sz_static_assert_##name##_t
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -45,10 +53,11 @@ extern "C" {
  *          32-bit on platforms where pointers are 32-bit.
  */
 #if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64)
-typedef unsigned long sz_size_t;
+typedef unsigned long long sz_size_t;
 #else
 typedef unsigned sz_size_t;
 #endif
+SZ_STATIC_ASSERT(sizeof(sz_size_t) == sizeof(void *), sz_size_t_must_be_pointer_size);
 
 typedef int sz_bool_t;                 // Only one relevant bit
 typedef unsigned sz_u32_t;             // Always 32 bits
@@ -101,7 +110,7 @@ inline static sz_size_t sz_count_char_swar(sz_string_start_t const haystack,
     sz_string_start_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
-    for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == *needle;
+    for (; ((sz_size_t)text & 7ull) && text < end; ++text) result += *text == *needle;
 
     // This code simulates hyper-scalar execution, comparing 8 characters at a time.
     sz_u64_t nnnnnnnn = *needle;
@@ -135,7 +144,7 @@ inline static sz_string_start_t sz_find_1char_swar(sz_string_start_t const hayst
     sz_string_start_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
-    for (; ((unsigned long)text & 7ul) && text < end; ++text)
+    for (; ((sz_size_t)text & 7ull) && text < end; ++text)
         if (*text == *needle) return text;
 
     // This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
@@ -172,7 +181,7 @@ inline static sz_string_start_t sz_rfind_1char_swar(sz_string_start_t const hays
     sz_string_start_t text = end - 1;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
-    for (; ((unsigned long)text & 7ul) && text >= haystack; --text)
+    for (; ((sz_size_t)text & 7ull) && text >= haystack; --text)
         if (*text == *needle) return text;
 
     // This code simulates hyper-scalar execution, analyzing 8 offsets at a time.
@@ -208,7 +217,7 @@ inline static sz_string_start_t sz_find_2char_swar(sz_string_start_t const hayst
     sz_string_start_t const end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
-    for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text)
+    for (; ((sz_size_t)text & 7ull) && text + 2 <= end; ++text)
         if (text[0] == needle[0] && text[1] == needle[1]) return text;
 
     // This code simulates hyper-scalar execution, analyzing 7 offsets at a time.
@@ -257,7 +266,7 @@ inline static sz_string_start_t sz_find_3char_swar(sz_string_start_t const hayst
     sz_string_start_t end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
-    for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text)
+    for (; ((sz_size_t)text & 7ull) && text + 3 <= end; ++text)
         if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2]) return text;
 
     // This code simulates hyper-scalar execution, analyzing 6 offsets at a time.
@@ -319,7 +328,7 @@ inline static sz_string_start_t sz_find_4char_swar(sz_string_start_t const hayst
     sz_string_start_t end = haystack + haystack_length;
 
     // Process the misaligned head, to void UB on unaligned 64-bit loads.
-    for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text)
+    for (; ((sz_size_t)text & 7ull) && text + 4 <= end; ++text)
         if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2] && text[3] == needle[3]) return text;
 
     // This code simulates hyper-scalar execution, analyzing 4 offsets at a time.
@@ -987,7 +996,7 @@ inline static void _sz_sort_recursion( //
     // Partition a range of integers according to a specific bit value
     sz_size_t split = 0;
     {
-        sz_u64_t mask = (1ul << 63) >> bit_idx;
+        sz_u64_t mask = (1ull << 63) >> bit_idx;
         while (split != sequence->count && !(sequence->order[split] & mask)) ++split;
         for (sz_size_t i = split + 1; i < sequence->count; ++i)
             if (!(sequence->order[i] & mask)) _sz_swap_order(sequence->order + i, sequence->order + split), ++split;
@@ -1056,7 +1065,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf
     for (sz_size_t i = 0; i != sequence->count; ++i) {
         sz_string_start_t begin = sequence->get_start(sequence, sequence->order[i]);
         sz_size_t length = sequence->get_length(sequence, sequence->order[i]);
-        length = length > 4ul ? 4ul : length;
+        length = length > 4ull ? 4ull : length;
         char *prefix = (char *)&sequence->order[i];
         for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j];
         if (case_insensitive) {
@@ -1208,7 +1217,7 @@ inline static sz_u32_t sz_hash_crc32_arm(sz_string_start_t start, sz_size_t leng
     sz_string_start_t const end = start + length;
 
     // Align the input to the word boundary
-    while (((unsigned long)start & 7ul) && start != end) { crc = __crc32cb(crc, *start), start++; }
+    while (((unsigned long)start & 7ull) && start != end) { crc = __crc32cb(crc, *start), start++; }
 
     // Process the body 8 bytes at a time
     while (start + 8 <= end) { crc = __crc32cd(crc, *(unsigned long long *)start), start += 8; }
@@ -1227,7 +1236,7 @@ inline static sz_u32_t sz_hash_crc32_sse(sz_string_start_t start, sz_size_t leng
     sz_string_start_t const end = start + length;
 
     // Align the input to the word boundary
-    while (((unsigned long)start & 7ul) && start != end) { crc = _mm_crc32_u8(crc, *start), start++; }
+    while (((unsigned long)start & 7ull) && start != end) { crc = _mm_crc32_u8(crc, *start), start++; }
 
     // Process the body 8 bytes at a time
     while (start + 8 <= end) { crc = (sz_u32_t)_mm_crc32_u64(crc, *(unsigned long long *)start), start += 8; }