From bc4d2e9053d393643a3f5f88371065abc2ca7a69 Mon Sep 17 00:00:00 2001 From: r2rSahakyan <89134416+r2rSahakyan@users.noreply.github.com> Date: Sat, 2 Sep 2023 23:35:57 +0400 Subject: [PATCH 01/72] Ver First Binding Draft in CPython Span class with size , hash, comparison, contains and find functions. all added code under #ifndef PURE_CPYTHON --- python/lib.cpp | 186 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 185 insertions(+), 1 deletion(-) diff --git a/python/lib.cpp b/python/lib.cpp index 979e1f56..4724e176 100644 --- a/python/lib.cpp +++ b/python/lib.cpp @@ -25,6 +25,11 @@ typedef SSIZE_T ssize_t; #include // `std::string` #include // `std::string_view` +#define PURE_CPYTHON +#ifdef PURE_CPYTHON +#include +#endif + #include #include @@ -647,6 +652,185 @@ void define_slice_ops(py::class_> &str_view_struct) { py::keep_alive<0, 1>()); } +#ifdef PURE_CPYTHON +typedef struct +{ + PyObject_HEAD + std::shared_ptr span; +} PySpan; + + +static PyMethodDef PySpan_methods[] = { + {"size", (PyCFunction)PySpan_size, METH_NOARGS, "Get the size"}, + {"__hash__", (PyCFunction) PySpan_hash, METH_NOARGS, "Returns the hash value"}, + {"__eq__", (PyCFunction) PySpan_eq, METH_O, "Equality check"}, + {"__ne__", (PyCFunction) PySpan_ne, METH_O, "Non-equality check"}, + {"__gt__", (PyCFunction) PySpan_gt, METH_O, "Greater than check"}, + {"__lt__", (PyCFunction) PySpan_lt, METH_O, "Less than check"}, + {"contains", (PyCFunction)PyStrView_contains, METH_VARARGS | METH_KEYWORDS, "Check if contains"}, + {"find", (PyCFunction)PyStrView_find, METH_VARARGS | METH_KEYWORDS, "Find needle"}, + {NULL} // Sentinel +}; + +static PyObject *PySpan_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + PySpan *self; + self = (PySpan *)type->tp_alloc(type, 0); + return (PyObject *)self; +} +static int PySpan_init(PySpan *self, PyObject *args, PyObject *kwds) +{ + self->span = std::make_shared(); + return 0; +} + +static PyObject *PySpan_size(PySpan *self, PyObject *Py_UNUSED(ignored)) +{ + return PyLong_FromSsize_t(self->span->size()); +} + +static void PySpan_dealloc(PySpan *self) +{ + // Handle the deallocation of the C++ object + self->span.reset(); + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static PyObject* PySpan_hash(PySpan *self) { + return PyLong_FromUnsignedLong(self->span->hash()); +} + +static PyObject* PySpan_eq(PySpan *self, PyObject *other) { + if (PyUnicode_Check(other)) { + return PyBool_FromLong(self->span->operator==(PyUnicode_AsUTF8(other))); + } else if (PyObject_TypeCheck(other, &PySpanType)) { + return PyBool_FromLong(self->span->operator==(((PySpan *)other)->span)); + } + Py_RETURN_FALSE; +} + +static PyObject* PySpan_ne(PySpan *self, PyObject *other) { + if (PyUnicode_Check(other)) { + return PyBool_FromLong(self->span->operator!=(PyUnicode_AsUTF8(other))); + } else if (PyObject_TypeCheck(other, &PySpanType)) { + return PyBool_FromLong(self->span->operator!=(((PySpan *)other)->span)); + } + Py_RETURN_TRUE; +} + +static PyObject* PySpan_gt(PySpan *self, PyObject *other) { + if (PyUnicode_Check(other)) { + return PyBool_FromLong(self->span->operator>(PyUnicode_AsUTF8(other))); + } else if (PyObject_TypeCheck(other, &PySpanType)) { + return PyBool_FromLong(self->span->operator>(((PySpan *)other)->span)); + } + Py_RETURN_FALSE; +} + +static PyObject* PySpan_lt(PySpan *self, PyObject *other) { + if (PyUnicode_Check(other)) { + return PyBool_FromLong(self->span->operator<(PyUnicode_AsUTF8(other))); + } else if (PyObject_TypeCheck(other, &PySpanType)) { + return PyBool_FromLong(self->span->operator<(((PySpan *)other)->span)); + } + Py_RETURN_FALSE; +} + +static PyObject * PyStrView_contains(PyStrView *self, PyObject *args, PyObject *kwargs) { + char *needle; + int start = 0, end = INT_MAX; + static char *kwlist[] = {"needle", "start", "end", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|ii", kwlist, &needle, &start, &end)) { + return NULL; + } + + if(self->span->contains(needle,start,end)){ + Py_INCREF(Py_True); + return Py_True; + } else { + Py_INCREF(Py_False); + return Py_False; + } +} +static PyObject * PyStrView_find(PyStrView *self, PyObject *args, PyObject *kwargs) { + char *needle; + int start = 0, end = INT_MAX; + static char *kwlist[] = {"needle", "start", "end", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|ii", kwlist, &needle, &start, &end)) { + return NULL; + } + return PyLong_FromLong(self->span->find(needle,start,end)); +} + +static PyTypeObject PySpanType = { + PyVarObject_HEAD_INIT(NULL, 0) /* ob_size */ + "YourModule.Span", /* tp_name */ + sizeof(PySpan), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)PySpan_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + "Span objects", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + PySpan_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)PySpan_init, /* tp_init */ + 0, /* tp_alloc */ + PySpan_new, /* tp_new */ +}; + +static PyModuleDef stringzilla_module = { + PyModuleDef_HEAD_INIT, + "stringzilla", + "Crunch 100+ GB Strings in Python with ease", + -1, + NULL, NULL, NULL, NULL, NULL +}; + +PyMODINIT_FUNC PyInit_stringzilla(void) { + PyObject *m; + + if (PyType_Ready(&PySpanType) < 0) + return NULL; + + m = PyModule_Create(&stringzilla_module); + if (m == NULL) + return NULL; + + Py_INCREF(&PySpanType); + PyModule_AddObject(m, "Span", (PyObject *)&PySpanType); + + return m; +} + +#endif + PYBIND11_MODULE(stringzilla, m) { m.doc() = "Crunch 100+ GB Strings in Python with ease"; @@ -732,4 +916,4 @@ PYBIND11_MODULE(stringzilla, m) { py_strs.def("append", &py_spans_t::append, py::call_guard()); py_strs.def("append", &py_spans_t::append_copy); py_strs.def("extend", &py_spans_t::extend_copy); -} \ No newline at end of file +} From f633d12a1346e729d10aa3f1423316b5daa9ef67 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 10 Sep 2023 17:37:59 +0400 Subject: [PATCH 02/72] Refactor: Restarting CPython bindings --- python/lib.cpp | 919 +++---------------------------------------------- 1 file changed, 42 insertions(+), 877 deletions(-) diff --git a/python/lib.cpp b/python/lib.cpp index 4724e176..77678767 100644 --- a/python/lib.cpp +++ b/python/lib.cpp @@ -1,4 +1,6 @@ - +/** + * @brief + */ #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) #define NOMINMAX #include @@ -16,801 +18,53 @@ typedef SSIZE_T ssize_t; #include // `ssize_t` #endif -#include // `std::random_device` -#include // `std::exchange` -#include // `std::numeric_limits` -#include // `std::iota` -#include // `std::abs` -#include // `std::shuffle` -#include // `std::string` -#include // `std::string_view` - -#define PURE_CPYTHON -#ifdef PURE_CPYTHON #include -#endif - -#include -#include - -#include "stringzilla.h" - -namespace py = pybind11; - -struct py_span_t; -struct py_str_t; -struct py_file_t; -struct py_subspan_t; -struct py_spans_t; - -struct span_t { - char const *ptr; - size_t len; - - explicit operator bool() const noexcept { return ptr; } - char const *data() const noexcept { return ptr; } - size_t size() const noexcept { return len; } - bool contains(char const *fragment) const noexcept { return ptr <= fragment && fragment < (ptr + len); } -}; - -static constexpr ssize_t ssize_max_k = std::numeric_limits::max(); -static constexpr size_t size_max_k = std::numeric_limits::max(); - -inline size_t find_substr(span_t h_span, char n) noexcept { - strzl_haystack_t h {h_span.ptr, h_span.len}; - return strzl_naive_find_char(h, n); -} - -inline size_t find_substr(span_t h_span, span_t n_span) noexcept { - strzl_haystack_t h {h_span.ptr, h_span.len}; - strzl_needle_t n {n_span.ptr, n_span.len, 0}; - -#if defined(__AVX2__) - return strzl_avx2_find_substr(h, n); -#elif defined(__ARM_NEON) - return strzl_neon_find_substr(h, n); -#else - return strzl_naive_find_substr(h, n); -#endif -} - -inline size_t count_char(span_t h_span, char n) noexcept { - strzl_haystack_t h {h_span.ptr, h_span.len}; - return strzl_naive_count_char(h, n); -} - -inline size_t count_substr(span_t h, span_t n, bool overlap = false) noexcept { - - if (n.len == 1) - return count_char(h, *n.ptr); - if (h.len < n.len) - return 0; - - size_t result = 0; - if (overlap) { - while (h.len) { - size_t offset = find_substr(h, n); - bool found = offset != h.len; - result += found; - h.ptr += offset + found; - h.len -= offset + found; - } - } - - else { - while (h.len) { - size_t offset = find_substr(h, n); - bool found = offset != h.len; - result += found; - h.ptr += offset + n.len; - h.len -= offset + n.len * found; - } - } - - return result; -} - -span_t to_span(std::string_view s) { return {s.data(), s.size()}; } -std::string_view to_stl(span_t s) { return {s.data(), s.size()}; } - -struct index_span_t { - size_t offset; - size_t length; -}; - -index_span_t slice(size_t length, ssize_t start, ssize_t end) { - ssize_t len = static_cast(length); - ssize_t abs_start = std::abs(start); - ssize_t abs_end = std::abs(end); - - if (len == 0 || start == end) - return {0ul, 0ul}; - - if (start > end) { - if ((start < 0 && end < 0) || (start >= 0 && end > 0) || len - abs_end < start) - return {0ul, 0ul}; - end = len - abs_end; - } - else if (start < 0 && end < 0) { - if (abs_start <= len && abs_end <= len) { - start = len + start; - end = len + end; - } - else if (abs_start > len && abs_end <= len) { - start = 0; - end = len + end; - } - else if (abs_start <= len && abs_end > len) { - start = len + start; - end = len; - } - else if (abs_start > len && abs_end > len) { - start = 0; - end = len; - } - } - else if (start < 0 && end >= 0) { - end = end == 0 ? len : std::min(end, len); - if (!((start = len - abs_start) < end && start >= 0)) - start = end = 0; - } - else if (start >= 0 && end < 0) { - if (len >= start) { - if ((len + end) >= start) - end = len + end; - else - end = len; - } - else - end = start; - } - else { - start = std::min(start, len); - end = end == 0 ? len : std::min(end, len); - } - return {static_cast(start), static_cast(end - start)}; -} - -size_t unsigned_offset(size_t length, ssize_t idx) { - if (idx >= 0) { - if (static_cast(idx) > length) - throw std::out_of_range("Accessing beyond content length"); - return static_cast(idx); - } - else { - if (static_cast(-idx) > length) - throw std::out_of_range("Accessing beyond content length"); - return static_cast(length + idx); - } -} - -span_t subspan(span_t span, ssize_t start, ssize_t end = ssize_max_k) { - index_span_t index_span = slice(span.size(), start, end); - return {span.ptr + index_span.offset, index_span.length}; -} - -struct py_span_t : public span_t, public std::enable_shared_from_this { - - py_span_t(span_t view = {}) : span_t(view) {} - virtual ~py_span_t() {} - - using span_t::len; - using span_t::ptr; - - span_t span() const { return {ptr, len}; } - ssize_t size() const { return static_cast(len); } - bool contains(std::string_view needle, ssize_t start, ssize_t end) const; - ssize_t find(std::string_view, ssize_t start, ssize_t end) const; - ssize_t count(std::string_view, ssize_t start, ssize_t end, bool allowoverlap) const; - std::shared_ptr splitlines(bool keeplinebreaks, char separator, size_t maxsplit) const; - std::shared_ptr split(std::string_view separator, size_t maxsplit, bool keepseparator) const; - std::shared_ptr sub(ssize_t start, ssize_t end) const; - - char const *begin() const { return reinterpret_cast(ptr); } - char const *end() const { return begin() + len; } - char at(ssize_t offset) const { return begin()[unsigned_offset(len, offset)]; } - py::str to_python() const { return {begin(), len}; } - std::size_t hash() const { return std::hash {}({ptr, len}); } - - bool operator==(py::str const &str) const { return to_stl({ptr, len}) == str.cast(); } - bool operator!=(py::str const &str) const { return !(*this == str); } - bool operator==(py_span_t const &other) const { return to_stl({ptr, len}) == to_stl({other.ptr, other.len}); } - bool operator!=(py_span_t const &other) const { return !(*this == other); } - bool operator>(py::str const &str) const { return to_stl({ptr, len}) > str.cast(); } - bool operator<(py::str const &str) const { return to_stl({ptr, len}) < str.cast(); } - bool operator>(py_span_t const &other) const { return to_stl({ptr, len}) > to_stl({other.ptr, other.len}); } - bool operator<(py_span_t const &other) const { return to_stl({ptr, len}) < to_stl({other.ptr, other.len}); } - - span_t after_n(size_t offset) const noexcept { - return (offset < len) ? span_t {ptr + offset, len - offset} : span_t {}; - } - span_t before_n(size_t tail) const noexcept { - return (tail < len) ? span_t {ptr + len - tail, len - tail} : span_t {}; - } -}; - -struct py_str_t : public py_span_t { - std::string copy_; - - py_str_t(std::string_view string = "") : copy_(string) { ptr = to_span(copy_).ptr, len = to_span(copy_).len; } - ~py_str_t() {} - - using py_span_t::contains; - using py_span_t::count; - using py_span_t::find; - using py_span_t::size; - using py_span_t::split; - using py_span_t::splitlines; -}; - -struct py_file_t : public py_span_t { - std::string path; -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) - HANDLE file_handle = nullptr; - HANDLE mapping_handle = nullptr; -#else - int file_descriptor = 0; -#endif - - public: - py_file_t(std::string const &path) { open(path); } - ~py_file_t() { close(); } - - void reopen() { open(path); } - void open(std::string const &p) { - close(); - path = p; - -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) - - file_handle = - CreateFile(path.c_str(), GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); - if (file_handle == INVALID_HANDLE_VALUE) - throw std::runtime_error("Couldn't map the file!"); - - mapping_handle = CreateFileMapping(file_handle, 0, PAGE_READONLY, 0, 0, 0); - if (mapping_handle == 0) { - CloseHandle(std::exchange(file_handle, nullptr)); - throw std::runtime_error("Couldn't map the file!"); - } - - char *file = (char *)MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, 0); - if (file == 0) { - CloseHandle(std::exchange(mapping_handle, nullptr)); - CloseHandle(std::exchange(file_handle, nullptr)); - throw std::runtime_error("Couldn't map the file!"); - } - ptr = file; - len = GetFileSize(file_handle, 0); -#else - struct stat sb; - file_descriptor = ::open(path.c_str(), O_RDONLY); - if (fstat(file_descriptor, &sb) != 0) { - ::close(std::exchange(file_descriptor, 0)); - throw std::runtime_error("Can't retrieve file size!"); - } - size_t file_size = sb.st_size; - void *map = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, file_descriptor, 0); - if (map == MAP_FAILED) { - ::close(std::exchange(file_descriptor, 0)); - throw std::runtime_error("Couldn't map the file!"); - } - ptr = reinterpret_cast(map); - len = file_size; -#endif - } - - void close() { - -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) - if (ptr) - UnmapViewOfFile(std::exchange(ptr, nullptr)), len = 0; - if (mapping_handle) - CloseHandle(std::exchange(mapping_handle, nullptr)); - if (file_handle) - CloseHandle(std::exchange(file_handle, nullptr)); - -#else - if (ptr) - munmap((void *)std::exchange(ptr, nullptr), std::exchange(len, 0)); - if (file_descriptor != 0) - ::close(std::exchange(file_descriptor, 0)); -#endif - } - - std::shared_ptr load() const { return std::make_shared(to_stl(*this)); } - - using py_span_t::contains; - using py_span_t::count; - using py_span_t::find; - using py_span_t::size; - using py_span_t::split; - using py_span_t::splitlines; -}; - -struct py_subspan_t : public py_span_t { - std::shared_ptr parent_; - - public: - py_subspan_t() = default; - py_subspan_t(py_subspan_t &&) = default; - py_subspan_t &operator=(py_subspan_t &&) = default; - py_subspan_t(std::shared_ptr parent, span_t str) : parent_(std::move(parent)) { - ptr = str.ptr, len = str.len; - } - - using py_span_t::contains; - using py_span_t::count; - using py_span_t::find; - using py_span_t::size; - using py_span_t::split; - using py_span_t::splitlines; -}; - -static std::shared_ptr empty_subspan = std::make_shared(); - -struct py_spans_t : public std::enable_shared_from_this { - - using parent_t = std::shared_ptr; - - struct less_address_t { - using is_transparent = void; - bool operator()(py_span_t const &a, py_span_t const &b) const noexcept { return a.data() < b.data(); } - bool operator()(parent_t const &a, parent_t const &b) const noexcept { return a->data() < b->data(); } - bool operator()(py_span_t const &a, char const *b) const noexcept { - return a.span().contains(b) ? false : a.data() < b; - } - bool operator()(parent_t const &a, char const *b) const noexcept { - return a->span().contains(b) ? false : a->data() < b; - } - bool operator()(char const *a, py_span_t const &b) const noexcept { - return b.span().contains(a) ? false : a < b.data(); - } - bool operator()(char const *a, parent_t const &b) const noexcept { - return b->span().contains(a) ? false : a < b->data(); - } - }; - - using parents_t = std::set; - using parts_t = std::vector; - - private: - parents_t parents_; - parts_t parts_; - static char const *strzl_array_get_begin(void const *raw, size_t i) { return ((span_t *)raw)[i].ptr; } - static size_t strzl_array_get_length(void const *raw, size_t i) { return ((span_t *)raw)[i].len; } - - public: - py_spans_t() = default; - py_spans_t(py_spans_t &&) = default; - py_spans_t &operator=(py_spans_t &&) = default; - py_spans_t(parents_t parents, parts_t parts) : parents_(std::move(parents)), parts_(std::move(parts)) {} - - struct iterator_t { - py_spans_t const *py_spans_ = nullptr; - size_t idx_ = 0; - - bool operator==(iterator_t const &other) const { return idx_ == other.idx_; } - bool operator!=(iterator_t const &other) const { return idx_ != other.idx_; } - std::shared_ptr operator*() const { return py_spans_->at(idx_); } - iterator_t &operator++() { - idx_++; - return *this; - } - iterator_t operator++(int) { - iterator_t old(*this); - ++*this; - return old; - } - }; - - std::shared_ptr pop(ssize_t i) { - std::size_t offset = unsigned_offset(size(), i); - span_t part = parts_[offset]; - if (!part) { - parts_.erase(parts_.begin() + offset); - return empty_subspan; - } - auto parent_iterator = parents_.find(part.data()); - auto popped = std::make_shared(*parent_iterator, part); - parts_.erase(parts_.begin() + offset); - return popped; - } - - std::shared_ptr at(ssize_t i) const { - std::size_t offset = unsigned_offset(size(), i); - span_t part = parts_[offset]; - if (!part) - return empty_subspan; - auto parent_iterator = parents_.find(part.data()); - auto popped = std::make_shared(*parent_iterator, part); - return popped; - } - - std::shared_ptr sub(ssize_t start, ssize_t end, ssize_t step, ssize_t length) const { - if (step == 1) { - auto first_part_it = parts_.begin() + start; - std::vector sub_parts(first_part_it, first_part_it + length); - return std::make_shared(parents_, std::move(sub_parts)); - } - std::vector sub_parts(length); - for (ssize_t parts_idx = start, sub_idx = 0; sub_idx < length; parts_idx += step, ++sub_idx) - sub_parts[sub_idx] = parts_[parts_idx]; - return std::make_shared(parents_, std::move(sub_parts)); - } - - iterator_t begin() const { return {this, 0}; } - iterator_t end() const { return {this, parts_.size()}; } - ssize_t size() const { return static_cast(parts_.size()); } - - void sort() { - std::vector permute(parts_.size()); - std::iota(permute.begin(), permute.end(), 0ul); - strzl_array_t array; - array.order = permute.data(); - array.count = permute.size(); - array.handle = parts_.data(); - array.get_begin = strzl_array_get_begin; - array.get_length = strzl_array_get_length; - strzl_sort(&array, nullptr); - std::vector new_parts(parts_.size()); - for (std::size_t i = 0; i != parts_.size(); ++i) - new_parts[i] = parts_[permute[i]]; - parts_ = new_parts; - } - - void shuffle(std::optional maybe_seed) { - std::random_device device; - std::size_t seed = maybe_seed ? *maybe_seed : device(); - using seed_t = typename std::mt19937::result_type; - std::mt19937 generator {static_cast(seed)}; - std::shuffle(parts_.begin(), parts_.end(), generator); - } - - void reverse() { std::reverse(parts_.begin(), parts_.end()); } - - void extend(py_spans_t const &other) { - parents_.insert(other.parents_.begin(), other.parents_.end()); - parts_.insert(parts_.end(), other.parts_.begin(), other.parts_.end()); - } - - template - void append(std::shared_ptr const &other) { - parents_.insert(std::dynamic_pointer_cast(other)); - parts_.push_back(other->span()); - } - - void append_copy(std::string_view other) { append(std::make_shared(other)); } - - void extend_copy(std::vector const &others) { - // `std::set` doesn't ahve such an interface: - // parents_.reserve(parents_.size() + others.size()); - parts_.reserve(parts_.size() + others.size()); - for (std::string_view other : others) - append_copy(other); - } - - std::shared_ptr sorted() const { - auto copy = std::make_shared(parents_, parts_); - copy->sort(); - return copy; - } - - std::shared_ptr shuffled(std::optional maybe_seed) const { - auto copy = std::make_shared(parents_, parts_); - copy->shuffle(maybe_seed); - return copy; - } -}; - -bool py_span_t::contains(std::string_view needle, ssize_t start, ssize_t end) const { - if (needle.size() == 0) - return true; - span_t part = subspan(span(), start, end); - size_t offset = needle.size() == 1 // - ? find_substr(part, needle.front()) - : find_substr(part, to_span(needle)); - return offset != part.len; -} - -ssize_t py_span_t::find(std::string_view needle, ssize_t start, ssize_t end) const { - if (needle.size() == 0) - return 0; - span_t part = subspan(span(), start, end); - size_t offset = needle.size() == 1 // - ? find_substr(part, needle.front()) - : find_substr(part, to_span(needle)); - return offset != part.len ? offset : -1; -} - -ssize_t py_span_t::count(std::string_view needle, ssize_t start, ssize_t end, bool allowoverlap) const { - if (needle.size() == 0) - return 0; - span_t part = subspan(span(), start, end); - auto result = needle.size() == 1 // - ? count_char(part, needle.front()) - : count_substr(part, to_span(needle), allowoverlap); - return result; -} - -std::shared_ptr py_span_t::splitlines(bool keeplinebreaks, char separator, size_t maxsplit) const { - - size_t count_separators = count_char(span(), separator); - std::vector parts(std::min(count_separators + 1, maxsplit)); - size_t last_start = 0; - for (size_t i = 0; i + 1 < parts.size(); ++i) { - span_t remaining = after_n(last_start); - size_t offset_in_remaining = find_substr(remaining, separator); - parts[i] = span_t {ptr + last_start, offset_in_remaining + keeplinebreaks}; - last_start += offset_in_remaining + 1; - } - parts[count_separators] = after_n(last_start); - py_spans_t::parent_t parent = shared_from_this(); - return std::make_shared(py_spans_t::parents_t {std::move(parent)}, std::move(parts)); -} - -std::shared_ptr py_span_t::split(std::string_view separator, size_t maxsplit, bool keepseparator) const { - - if (separator.size() == 1 && maxsplit == ssize_max_k) - return splitlines(keepseparator, separator.front(), maxsplit); - - std::vector parts; - size_t last_start = 0; - bool will_continue = true; - while (last_start < len && parts.size() + 1 < maxsplit) { - span_t remaining = after_n(last_start); - size_t offset_in_remaining = find_substr(remaining, to_span(separator)); - will_continue = offset_in_remaining != remaining.size(); - size_t part_len = offset_in_remaining + separator.size() * keepseparator * will_continue; - parts.emplace_back(span_t {remaining.data(), part_len}); - last_start += offset_in_remaining + separator.size(); - } - // Python marks includes empy ending as well - if (will_continue) - parts.emplace_back(after_n(last_start)); - py_spans_t::parent_t parent = shared_from_this(); - return std::make_shared(py_spans_t::parents_t {std::move(parent)}, std::move(parts)); -} - -std::shared_ptr py_span_t::sub(ssize_t start, ssize_t end) const { - index_span_t index_span = slice(size(), start, end); - return std::make_shared(shared_from_this(), span_t {ptr + index_span.offset, index_span.length}); -} - -template -void define_comparsion_ops(py::class_> &str_view_struct) { - str_view_struct.def("__hash__", [](at const &self) { return self.hash(); }); - str_view_struct.def("__eq__", [](at const &self, py::str const &str) { return self == str; }); - str_view_struct.def("__ne__", [](at const &self, py::str const &str) { return self != str; }); - str_view_struct.def("__eq__", [](at const &self, at const &other) { return self == other; }); - str_view_struct.def("__ne__", [](at const &self, at const &other) { return self != other; }); - str_view_struct.def("__gt__", [](at const &self, py::str const &str) { return self > str; }); - str_view_struct.def("__lt__", [](at const &self, py::str const &str) { return self < str; }); - str_view_struct.def("__gt__", [](at const &self, at const &other) { return self > other; }); - str_view_struct.def("__lt__", [](at const &self, at const &other) { return self < other; }); -} - -template -void define_slice_ops(py::class_> &str_view_struct) { - - str_view_struct.def( // - "contains", - &at::contains, - py::arg("needle"), - py::arg("start") = 0, - py::arg("end") = ssize_max_k, - py::call_guard()); - str_view_struct.def( // - "find", - &at::find, - py::arg("needle"), - py::arg("start") = 0, - py::arg("end") = ssize_max_k, - py::call_guard()); - str_view_struct.def( // - "count", - &at::count, - py::arg("needle"), - py::arg("start") = 0, - py::arg("end") = ssize_max_k, - py::arg("allowoverlap") = false, - py::call_guard()); - str_view_struct.def( // - "splitlines", - &at::splitlines, - py::arg("keeplinebreaks") = false, - py::arg("separator") = '\n', - py::kw_only(), - py::arg("maxsplit") = size_max_k, - py::call_guard()); - str_view_struct.def( // - "split", - &at::split, - py::arg("separator") = " ", - py::arg("maxsplit") = size_max_k, - py::kw_only(), - py::arg("keepseparator") = false, - py::call_guard()); - str_view_struct.def( // - "sub", - &at::sub, - py::arg("start") = 0, - py::arg("end") = 0); - - // Substring presence operator - str_view_struct.def("__contains__", - [](at const &str, std::string_view needle) { return str.contains(needle, 0, ssize_max_k); }); - - // Character access operators - str_view_struct.def("__str__", &at::to_python); - str_view_struct.def("__getitem__", &at::at, py::arg("index")); - str_view_struct.def("__len__", &at::size); - str_view_struct.def( - "__iter__", - [](at const &s) { return py::make_iterator(s.begin(), s.end()); }, - py::keep_alive<0, 1>()); -} - -#ifdef PURE_CPYTHON -typedef struct -{ - PyObject_HEAD - std::shared_ptr span; -} PySpan; - - -static PyMethodDef PySpan_methods[] = { - {"size", (PyCFunction)PySpan_size, METH_NOARGS, "Get the size"}, - {"__hash__", (PyCFunction) PySpan_hash, METH_NOARGS, "Returns the hash value"}, - {"__eq__", (PyCFunction) PySpan_eq, METH_O, "Equality check"}, - {"__ne__", (PyCFunction) PySpan_ne, METH_O, "Non-equality check"}, - {"__gt__", (PyCFunction) PySpan_gt, METH_O, "Greater than check"}, - {"__lt__", (PyCFunction) PySpan_lt, METH_O, "Less than check"}, - {"contains", (PyCFunction)PyStrView_contains, METH_VARARGS | METH_KEYWORDS, "Check if contains"}, - {"find", (PyCFunction)PyStrView_find, METH_VARARGS | METH_KEYWORDS, "Find needle"}, - {NULL} // Sentinel -}; - -static PyObject *PySpan_new(PyTypeObject *type, PyObject *args, PyObject *kwds) -{ - PySpan *self; - self = (PySpan *)type->tp_alloc(type, 0); - return (PyObject *)self; -} -static int PySpan_init(PySpan *self, PyObject *args, PyObject *kwds) -{ - self->span = std::make_shared(); - return 0; -} - -static PyObject *PySpan_size(PySpan *self, PyObject *Py_UNUSED(ignored)) -{ - return PyLong_FromSsize_t(self->span->size()); -} - -static void PySpan_dealloc(PySpan *self) -{ - // Handle the deallocation of the C++ object - self->span.reset(); - Py_TYPE(self)->tp_free((PyObject *)self); -} - -static PyObject* PySpan_hash(PySpan *self) { - return PyLong_FromUnsignedLong(self->span->hash()); -} - -static PyObject* PySpan_eq(PySpan *self, PyObject *other) { - if (PyUnicode_Check(other)) { - return PyBool_FromLong(self->span->operator==(PyUnicode_AsUTF8(other))); - } else if (PyObject_TypeCheck(other, &PySpanType)) { - return PyBool_FromLong(self->span->operator==(((PySpan *)other)->span)); - } - Py_RETURN_FALSE; -} - -static PyObject* PySpan_ne(PySpan *self, PyObject *other) { - if (PyUnicode_Check(other)) { - return PyBool_FromLong(self->span->operator!=(PyUnicode_AsUTF8(other))); - } else if (PyObject_TypeCheck(other, &PySpanType)) { - return PyBool_FromLong(self->span->operator!=(((PySpan *)other)->span)); - } - Py_RETURN_TRUE; -} - -static PyObject* PySpan_gt(PySpan *self, PyObject *other) { - if (PyUnicode_Check(other)) { - return PyBool_FromLong(self->span->operator>(PyUnicode_AsUTF8(other))); - } else if (PyObject_TypeCheck(other, &PySpanType)) { - return PyBool_FromLong(self->span->operator>(((PySpan *)other)->span)); - } - Py_RETURN_FALSE; -} - -static PyObject* PySpan_lt(PySpan *self, PyObject *other) { - if (PyUnicode_Check(other)) { - return PyBool_FromLong(self->span->operator<(PyUnicode_AsUTF8(other))); - } else if (PyObject_TypeCheck(other, &PySpanType)) { - return PyBool_FromLong(self->span->operator<(((PySpan *)other)->span)); - } - Py_RETURN_FALSE; -} - -static PyObject * PyStrView_contains(PyStrView *self, PyObject *args, PyObject *kwargs) { - char *needle; - int start = 0, end = INT_MAX; - static char *kwlist[] = {"needle", "start", "end", NULL}; - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|ii", kwlist, &needle, &start, &end)) { - return NULL; - } - - if(self->span->contains(needle,start,end)){ - Py_INCREF(Py_True); - return Py_True; - } else { - Py_INCREF(Py_False); - return Py_False; - } -} -static PyObject * PyStrView_find(PyStrView *self, PyObject *args, PyObject *kwargs) { - char *needle; - int start = 0, end = INT_MAX; - static char *kwlist[] = {"needle", "start", "end", NULL}; - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|ii", kwlist, &needle, &start, &end)) { - return NULL; - } - return PyLong_FromLong(self->span->find(needle,start,end)); -} - -static PyTypeObject PySpanType = { - PyVarObject_HEAD_INIT(NULL, 0) /* ob_size */ - "YourModule.Span", /* tp_name */ - sizeof(PySpan), /* tp_basicsize */ - 0, /* tp_itemsize */ - (destructor)PySpan_dealloc, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_compare */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - "Span objects", /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - PySpan_methods, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - (initproc)PySpan_init, /* tp_init */ - 0, /* tp_alloc */ - PySpan_new, /* tp_new */ -}; +/** + * @brief Type-punned StringZilla-string, that may either be an immutable in-memory string, + * similar to Python's native `str`, or a memory-mapped immutable file from disk, + * or a slice of one of those classes or the Python's native `str` and `bytes` classes. + * + * When a slice is being used, the `parent` object's reference count is being incremented. + * When an in-memory string is used - we avoid the second memory allocation and allocate the `HEAD`, + * the length, and the content region in a single continuous chunk. + */ +typedef struct { + PyObject_HEAD; + + typedef enum { + in_memory_k, + on_disk_k, + slice_k, + } variant; + + typedef struct { + size_t length; + } in_memory_t; + + typedef struct { + void *start; + size_t length; + int file_descriptor; + } on_disk_t; + + typedef struct { + PyObject *parent; + void *start; + size_t length; + } slice_t; +} strzl_t; static PyModuleDef stringzilla_module = { PyModuleDef_HEAD_INIT, "stringzilla", "Crunch 100+ GB Strings in Python with ease", -1, - NULL, NULL, NULL, NULL, NULL + NULL, + NULL, + NULL, + NULL, + NULL, }; PyMODINIT_FUNC PyInit_stringzilla(void) { @@ -828,92 +82,3 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { return m; } - -#endif - -PYBIND11_MODULE(stringzilla, m) { - m.doc() = "Crunch 100+ GB Strings in Python with ease"; - - auto py_span = py::class_>(m, "Span"); - define_comparsion_ops(py_span); - define_slice_ops(py_span); - - auto py_subspan = py::class_>(m, "SubSpan"); - define_comparsion_ops(py_subspan); - define_slice_ops(py_subspan); - - auto py_str = py::class_>(m, "Str"); - py_str.def(py::init([](std::string arg) { return std::make_shared(std::move(arg)); }), py::arg("str")); - py_str.def("__getitem__", [](py_str_t &s, py::slice slice) { - ssize_t start, stop, step, length; - if (!slice.compute(s.size(), &start, &stop, &step, &length)) - throw py::error_already_set(); - if (step != 1) - throw std::invalid_argument("Step argument is not supported for Str"); - return s.sub(start, stop); - }); - define_comparsion_ops(py_str); - define_slice_ops(py_str); - - auto py_file = py::class_>(m, "File"); - py_file.def( // - py::init([](std::string path) { return std::make_shared(std::move(path)); }), - py::arg("path")); - define_slice_ops(py_file); - py_file.def("open", &py_file_t::open, py::arg("path")); - py_file.def("open", &py_file_t::reopen); - py_file.def("load", &py_file_t::load); - py_file.def("close", &py_file_t::close); - py_file.def("__getitem__", [](py_file_t &s, py::slice slice) { - ssize_t start, stop, step, length; - if (!slice.compute(s.size(), &start, &stop, &step, &length)) - throw py::error_already_set(); - if (step != 1) - throw std::invalid_argument("Step argument is not supported for File"); - return s.sub(start, stop); - }); - - auto py_strs = py::class_>(m, "Strs"); - py_strs.def(py::init([]() { return std::make_shared(); })); - py_strs.def("__len__", &py_spans_t::size); - py_strs.def("__getitem__", &py_spans_t::at, py::arg("index")); - py_strs.def( - "__iter__", - [](py_spans_t const &s) { return py::make_iterator(s.begin(), s.end()); }, - py::keep_alive<0, 1>()); - py_strs.def("pop", &py_spans_t::pop, py::call_guard()); - py_strs.def("sort", &py_spans_t::sort, py::call_guard()); - py_strs.def("reverse", &py_spans_t::reverse, py::call_guard()); - py_strs.def("shuffle", - &py_spans_t::shuffle, - py::arg("seed") = std::nullopt, - py::call_guard()); - py_strs.def("__getitem__", [](py_spans_t &s, py::slice slice) { - ssize_t start, stop, step, length; - if (!slice.compute(s.size(), &start, &stop, &step, &length)) - throw py::error_already_set(); - return s.sub(start, stop, step, length); - }); - py_strs.def( // - "sub", - [](py_spans_t &s, ssize_t start, ssize_t stop, ssize_t step = 1) { - auto index_span = slice(s.size(), start, stop); - ssize_t length = stop = index_span.length; - start = index_span.offset; - return s.sub(start, stop, step, length); - }); - - py_strs.def("shuffled", - &py_spans_t::shuffled, - py::arg("seed") = std::nullopt, - py::call_guard()); - py_strs.def("sorted", &py_spans_t::sorted, py::call_guard()); - - py_strs.def("extend", &py_spans_t::extend, py::call_guard()); - py_strs.def("append", &py_spans_t::append, py::call_guard()); - py_strs.def("append", &py_spans_t::append, py::call_guard()); - py_strs.def("append", &py_spans_t::append, py::call_guard()); - py_strs.def("append", &py_spans_t::append, py::call_guard()); - py_strs.def("append", &py_spans_t::append_copy); - py_strs.def("extend", &py_spans_t::extend_copy); -} From 963cd4d90f0cf30ed6ba361e73d6915a6eee1ff5 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 11 Sep 2023 18:09:57 +0400 Subject: [PATCH 03/72] Docs: Annotate SWAR methods --- .vscode/settings.json | 2 ++ stringzilla/stringzilla.h | 25 ++++++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 0ed99251..3a4c79d2 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -121,6 +121,7 @@ "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", "cSpell.words": [ "allowoverlap", + "bigram", "cibuildwheel", "getitem", "keeplinebreaks", @@ -129,6 +130,7 @@ "maxsplit", "memcpy", "pytest", + "quadgram", "readlines", "SIMD", "splitlines", diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 303c8dbd..3e7df130 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -55,8 +55,7 @@ typedef struct strzl_needle_t { } strzl_needle_t; /** - * @brief A naive subtring matching algorithm with O(|h|*|n|) comparisons. - * Matching performance fluctuates between 200 MB/s and 2 GB/s. + * @brief SWAR single-character counting procedure, jumping 8 bytes at a time. */ inline static size_t strzl_naive_count_char(strzl_haystack_t h, char n) { @@ -67,7 +66,7 @@ inline static size_t strzl_naive_count_char(strzl_haystack_t h, char n) { for (; (uint64_t)text % 8 != 0 && text < end; ++text) result += *text == n; - // This code simulates hyperscalar execution, comparing 8 characters at a time. + // This code simulates hyper-scalar execution, comparing 8 characters at a time. uint64_t nnnnnnnn = n; nnnnnnnn |= nnnnnnnn << 8; nnnnnnnn |= nnnnnnnn << 16; @@ -87,6 +86,9 @@ inline static size_t strzl_naive_count_char(strzl_haystack_t h, char n) { return result; } +/** + * @brief SWAR single-character search in string, jumping 8 bytes at a time. + */ inline static size_t strzl_naive_find_char(strzl_haystack_t h, char n) { char const *text = h.ptr; @@ -96,7 +98,7 @@ inline static size_t strzl_naive_find_char(strzl_haystack_t h, char n) { if (*text == n) return text - h.ptr; - // This code simulates hyperscalar execution, analyzing 8 offsets at a time. + // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. uint64_t nnnnnnnn = n; nnnnnnnn |= nnnnnnnn << 8; // broadcast `n` into `nnnnnnnn` nnnnnnnn |= nnnnnnnn << 16; // broadcast `n` into `nnnnnnnn` @@ -119,12 +121,15 @@ inline static size_t strzl_naive_find_char(strzl_haystack_t h, char n) { return h.len; } +/** + * @brief SWAR character-bigram search in string, jumping 8 bytes at a time. + */ inline static size_t strzl_naive_find_2chars(strzl_haystack_t h, char const *n) { char const *text = h.ptr; char const *end = h.ptr + h.len; - // This code simulates hyperscalar execution, analyzing 7 offsets at a time. + // This code simulates hyper-scalar execution, analyzing 7 offsets at a time. uint64_t nnnn = (uint64_t(n[0]) << 0) | (uint64_t(n[1]) << 8); // broadcast `n` into `nnnn` nnnn |= nnnn << 16; // broadcast `n` into `nnnn` nnnn |= nnnn << 32; // broadcast `n` into `nnnn` @@ -158,12 +163,15 @@ inline static size_t strzl_naive_find_2chars(strzl_haystack_t h, char const *n) return h.len; } +/** + * @brief SWAR character-trigram search in string, jumping 8 bytes at a time. + */ inline static size_t strzl_naive_find_3chars(strzl_haystack_t h, char const *n) { char const *text = h.ptr; char const *end = h.ptr + h.len; - // This code simulates hyperscalar execution, analyzing 6 offsets at a time. + // This code simulates hyper-scalar execution, analyzing 6 offsets at a time. // We have two unused bytes at the end. uint64_t nn = uint64_t(n[0] << 0) | (uint64_t(n[1]) << 8) | (uint64_t(n[2]) << 16); // broadcast `n` into `nn` nn |= nn << 24; // broadcast `n` into `nn` @@ -210,12 +218,15 @@ inline static size_t strzl_naive_find_3chars(strzl_haystack_t h, char const *n) return h.len; } +/** + * @brief SWAR character-quadgram search in string, jumping 8 bytes at a time. + */ inline static size_t strzl_naive_find_4chars(strzl_haystack_t h, char const *n) { char const *text = h.ptr; char const *end = h.ptr + h.len; - // This code simulates hyperscalar execution, analyzing 4 offsets at a time. + // This code simulates hyper-scalar execution, analyzing 4 offsets at a time. uint64_t nn = uint64_t(n[0] << 0) | (uint64_t(n[1]) << 8) | (uint64_t(n[2]) << 16) | (uint64_t(n[3]) << 24); nn |= nn << 32; nn = nn; From aa57b4f63e98bab674945a4c4a14a75dcba0e7ee Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 11 Sep 2023 18:11:34 +0400 Subject: [PATCH 04/72] Add: `MemoryMappedFile` --- python/lib.cpp | 303 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 270 insertions(+), 33 deletions(-) diff --git a/python/lib.cpp b/python/lib.cpp index 77678767..5267cff4 100644 --- a/python/lib.cpp +++ b/python/lib.cpp @@ -1,5 +1,6 @@ /** - * @brief + * @brief Very light-weight CPython wrapper for StringZilla, with support for memory-mapping, + * native Python strings, Apache Arrow collections, and more. */ #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) #define NOMINMAX @@ -20,40 +21,264 @@ typedef SSIZE_T ssize_t; #include +#pragma region Helpers + +void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset, size_t *normalized_length) { + + // clang-format off + // Normalize negative indices + if (start < 0) start += length; + if (end < 0) end += length; + + // Clamp indices to a valid range + if (start < 0) start = 0; + if (end < 0) end = 0; + if (start > length) start = length; + if (end > length) end = length; + + // Ensure start <= end + if (start > end) start = end; + // clang-format on + + *normalized_offset = start; + *normalized_length = end - start; +} + +#pragma endregion + +#pragma region MemoryMappingFile + +/** + * @brief Describes an on-disk file mapped into RAM, which is different from Python's + * native `mmap` module, as it exposes the address of the mapping in memory. + */ +typedef struct { + PyObject_HEAD; +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) + HANDLE file_handle; + HANDLE mapping_handle; +#else + int file_descriptor; +#endif + void *ptr; + size_t len; +} MemoryMappedFile; + +static void MemoryMappedFile_dealloc(MemoryMappedFile *self) { +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) + if (self->ptr) { + UnmapViewOfFile(self->ptr); + self->ptr = NULL; + } + if (self->mapping_handle) { + CloseHandle(self->mapping_handle); + self->mapping_handle = NULL; + } + if (self->file_handle) { + CloseHandle(self->file_handle); + self->file_handle = NULL; + } +#else + if (self->ptr) { + munmap(self->ptr, self->len); + self->ptr = NULL; + self->len = 0; + } + if (self->file_descriptor != 0) { + close(self->file_descriptor); + self->file_descriptor = 0; + } +#endif + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static PyObject *MemoryMappedFile_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) { + MemoryMappedFile *self; + self = (MemoryMappedFile *)type->tp_alloc(type, 0); + if (self != NULL) { + self->ptr = NULL; + self->len = 0; +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) + self->file_handle = NULL; + self->mapping_handle = NULL; +#else + self->file_descriptor = 0; +#endif + } + return (PyObject *)self; +} + +static int MemoryMappedFile_init(MemoryMappedFile *self, PyObject *positional_args, PyObject *named_args) { + const char *path; + if (!PyArg_ParseTuple(positional_args, "s", &path)) { + return -1; + } + +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) + self->file_handle = CreateFile(path, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); + if (self->file_handle == INVALID_HANDLE_VALUE) { + PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); + return -1; + } + + self->mapping_handle = CreateFileMapping(self->file_handle, 0, PAGE_READONLY, 0, 0, 0); + if (self->mapping_handle == 0) { + CloseHandle(self->file_handle); + self->file_handle = NULL; + PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); + return -1; + } + + char *file = (char *)MapViewOfFile(self->mapping_handle, FILE_MAP_READ, 0, 0, 0); + if (file == 0) { + CloseHandle(self->mapping_handle); + self->mapping_handle = NULL; + CloseHandle(self->file_handle); + self->file_handle = NULL; + PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); + return -1; + } + self->ptr = file; + self->len = GetFileSize(self->file_handle, 0); +#else + struct stat sb; + self->file_descriptor = open(path, O_RDONLY); + if (fstat(self->file_descriptor, &sb) != 0) { + close(self->file_descriptor); + self->file_descriptor = 0; + PyErr_SetString(PyExc_RuntimeError, "Can't retrieve file size!"); + return -1; + } + size_t file_size = sb.st_size; + void *map = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, self->file_descriptor, 0); + if (map == MAP_FAILED) { + close(self->file_descriptor); + self->file_descriptor = 0; + PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); + return -1; + } + self->ptr = map; + self->len = file_size; +#endif + + return 0; +} + +static PyMethodDef MemoryMappedFile_methods[] = { + // Your method definitions here + {NULL} /* Sentinel */ +}; + +static PyTypeObject MemoryMappedFileType = { + PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.MemoryMappedFile", + .tp_doc = "MemoryMappedFile objects", + .tp_basicsize = sizeof(MemoryMappedFile), + .tp_itemsize = 0, + .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, + .tp_new = MemoryMappedFile_new, + .tp_init = (initproc)MemoryMappedFile_init, + .tp_dealloc = (destructor)MemoryMappedFile_dealloc, + .tp_methods = MemoryMappedFile_methods, +}; + +#pragma endregion + +#pragma region Str + /** - * @brief Type-punned StringZilla-string, that may either be an immutable in-memory string, - * similar to Python's native `str`, or a memory-mapped immutable file from disk, - * or a slice of one of those classes or the Python's native `str` and `bytes` classes. + * @brief Type-punned StringZilla-string, that points to a slice of an existing Python `str` + * or a `MemoryMappedFile`. * - * When a slice is being used, the `parent` object's reference count is being incremented. - * When an in-memory string is used - we avoid the second memory allocation and allocate the `HEAD`, - * the length, and the content region in a single continuous chunk. + * When a slice is constructed, the `parent` object's reference count is being incremented to preserve lifetime. + * It usage in Python would look like: + * + * - Str() # Empty string + * - Str("some-string") # Full-range slice of a Python `str` + * - Str(File("some-path.txt")) # Full-range view of a persisted file + * - Str(File("some-path.txt"), from=0, to=sys.maxint) */ typedef struct { PyObject_HEAD; + PyObject *parent; + void *start; + size_t length; +} Str; + +static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args) { + PyObject *parent = NULL; + Py_ssize_t from = 0; + Py_ssize_t to = PY_SSIZE_T_MAX; + + // The `named_args` would be `NULL` + if (named_args) { + static char *names[] = {"parent", "from", "to", NULL}; + if (!PyArg_ParseTupleAndKeywords(positional_args, named_args, "|Onn", names, &parent, &from, &to)) + return -1; + } + else if (!PyArg_ParseTuple(positional_args, "|Onn", &parent, &from, &to)) + return -1; - typedef enum { - in_memory_k, - on_disk_k, - slice_k, - } variant; - - typedef struct { - size_t length; - } in_memory_t; - - typedef struct { - void *start; - size_t length; - int file_descriptor; - } on_disk_t; - - typedef struct { - PyObject *parent; - void *start; - size_t length; - } slice_t; -} strzl_t; + self->parent = parent; + if (PyUnicode_Check(parent)) { + // Handle Python str + self->start = PyUnicode_DATA(parent); + self->length = PyUnicode_GET_DATA_SIZE(parent); + Py_INCREF(parent); // Increment the reference count of the parent + } + else if (PyObject_TypeCheck(parent, &MemoryMappedFileType)) { + // Handle MemoryMappedFile + MemoryMappedFile *file = (MemoryMappedFile *)parent; + self->start = file->ptr; + self->length = file->len; + Py_INCREF(parent); // Increment the reference count of the parent + } + else if (parent == NULL) { + // Handle empty string + self->start = NULL; + self->length = 0; + } + else { + PyErr_SetString(PyExc_TypeError, "Unsupported parent type"); + return -1; + } + + // Apply slicing + size_t normalized_offset, normalized_length; + slice(self->length, from, to, &normalized_offset, &normalized_length); + self->start = ((char *)self->start) + normalized_offset; + self->length = normalized_length; + return 0; +} + +static PyObject *Str_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { + Str *self; + self = (Str *)type->tp_alloc(type, 0); + if (!self) + return NULL; + + self->parent = NULL; + self->start = NULL; + self->length = 0; + return (PyObject *)self; +} + +static void Str_dealloc(Str *self) { + if (self->parent) + Py_XDECREF(self->parent); + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static PyTypeObject StrType = { + PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.Str", + .tp_doc = "Stringzilla Str objects", + .tp_basicsize = sizeof(Str), + .tp_itemsize = 0, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_new = Str_new, + .tp_dealloc = (destructor)Str_dealloc, +}; + +#pragma endregion static PyModuleDef stringzilla_module = { PyModuleDef_HEAD_INIT, @@ -70,15 +295,27 @@ static PyModuleDef stringzilla_module = { PyMODINIT_FUNC PyInit_stringzilla(void) { PyObject *m; - if (PyType_Ready(&PySpanType) < 0) + if (PyType_Ready(&StrType) < 0) return NULL; m = PyModule_Create(&stringzilla_module); if (m == NULL) return NULL; - Py_INCREF(&PySpanType); - PyModule_AddObject(m, "Span", (PyObject *)&PySpanType); + Py_INCREF(&StrType); + if (PyModule_AddObject(m, "Str", (PyObject *)&StrType) < 0) { + Py_XDECREF(&StrType); + Py_XDECREF(m); + return NULL; + } + + Py_INCREF(&MemoryMappedFileType); + if (PyModule_AddObject(m, "MemoryMappedFile", (PyObject *)&MemoryMappedFileType) < 0) { + Py_XDECREF(&MemoryMappedFileType); + Py_XDECREF(&StrType); + Py_XDECREF(m); + return NULL; + } return m; -} +} \ No newline at end of file From 21a3737f5e9050bc5bed1164f76f11a25e83da32 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 11 Sep 2023 18:12:55 +0400 Subject: [PATCH 05/72] Make: Switch to pure C --- python/{lib.cpp => lib.c} | 0 setup.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename python/{lib.cpp => lib.c} (100%) diff --git a/python/lib.cpp b/python/lib.c similarity index 100% rename from python/lib.cpp rename to python/lib.c diff --git a/setup.py b/setup.py index 32bad412..44cf4cf8 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ ext_modules = [ Pybind11Extension( "stringzilla", - ["python/lib.cpp"], + ["python/lib.c"], include_dirs=["stringzilla"], extra_compile_args=compile_args, extra_link_args=link_args, From 45cb82b5836033b352c0ac3cc2a1856ec3e532a3 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 16 Sep 2023 20:41:45 +0400 Subject: [PATCH 06/72] Improve: Passing basic tests --- .vscode/settings.json | 9 + python/lib.c | 356 ++++++++++++++++++++++++++++---------- scripts/test.py | 22 ++- setup.py | 4 +- stringzilla/stringzilla.h | 63 ++++--- 5 files changed, 336 insertions(+), 118 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 3a4c79d2..a32765cc 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -121,14 +121,22 @@ "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", "cSpell.words": [ "allowoverlap", + "basicsize", "bigram", "cibuildwheel", + "endregion", "getitem", + "getslice", + "initproc", + "itemsize", "keeplinebreaks", "keepseparator", "levenstein", "maxsplit", "memcpy", + "newfunc", + "NOARGS", + "NOMINMAX", "pytest", "quadgram", "readlines", @@ -139,6 +147,7 @@ "strzl", "substr", "SWAR", + "TPFLAGS", "Zilla" ] } \ No newline at end of file diff --git a/python/lib.c b/python/lib.c index 5267cff4..fb42b7df 100644 --- a/python/lib.c +++ b/python/lib.c @@ -21,6 +21,50 @@ typedef SSIZE_T ssize_t; #include +#include + +#pragma region Forward Declarations + +static PyTypeObject MemoryMappedFileType; +static PyTypeObject StrType; + +/** + * @brief Describes an on-disk file mapped into RAM, which is different from Python's + * native `mmap` module, as it exposes the address of the mapping in memory. + */ +typedef struct { + PyObject_HEAD; +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) + HANDLE file_handle; + HANDLE mapping_handle; +#else + int file_descriptor; +#endif + void *start; + size_t length; +} MemoryMappedFile; + +/** + * @brief Type-punned StringZilla-string, that points to a slice of an existing Python `str` + * or a `MemoryMappedFile`. + * + * When a slice is constructed, the `parent` object's reference count is being incremented to preserve lifetime. + * It usage in Python would look like: + * + * - Str() # Empty string + * - Str("some-string") # Full-range slice of a Python `str` + * - Str(File("some-path.txt")) # Full-range view of a persisted file + * - Str(File("some-path.txt"), from=0, to=sys.maxint) + */ +typedef struct { + PyObject_HEAD; + PyObject *parent; + char const *start; + size_t length; +} Str; + +#pragma endregion + #pragma region Helpers void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset, size_t *normalized_length) { @@ -33,8 +77,8 @@ void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset, // Clamp indices to a valid range if (start < 0) start = 0; if (end < 0) end = 0; - if (start > length) start = length; - if (end > length) end = length; + if (start > (ssize_t)length) start = length; + if (end > (ssize_t)length) end = length; // Ensure start <= end if (start > end) start = end; @@ -48,27 +92,11 @@ void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset, #pragma region MemoryMappingFile -/** - * @brief Describes an on-disk file mapped into RAM, which is different from Python's - * native `mmap` module, as it exposes the address of the mapping in memory. - */ -typedef struct { - PyObject_HEAD; -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) - HANDLE file_handle; - HANDLE mapping_handle; -#else - int file_descriptor; -#endif - void *ptr; - size_t len; -} MemoryMappedFile; - static void MemoryMappedFile_dealloc(MemoryMappedFile *self) { #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) - if (self->ptr) { - UnmapViewOfFile(self->ptr); - self->ptr = NULL; + if (self->start) { + UnmapViewOfFile(self->start); + self->start = NULL; } if (self->mapping_handle) { CloseHandle(self->mapping_handle); @@ -79,10 +107,10 @@ static void MemoryMappedFile_dealloc(MemoryMappedFile *self) { self->file_handle = NULL; } #else - if (self->ptr) { - munmap(self->ptr, self->len); - self->ptr = NULL; - self->len = 0; + if (self->start) { + munmap(self->start, self->length); + self->start = NULL; + self->length = 0; } if (self->file_descriptor != 0) { close(self->file_descriptor); @@ -95,24 +123,23 @@ static void MemoryMappedFile_dealloc(MemoryMappedFile *self) { static PyObject *MemoryMappedFile_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) { MemoryMappedFile *self; self = (MemoryMappedFile *)type->tp_alloc(type, 0); - if (self != NULL) { - self->ptr = NULL; - self->len = 0; + if (self == NULL) + return NULL; + #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) - self->file_handle = NULL; - self->mapping_handle = NULL; + self->file_handle = NULL; + self->mapping_handle = NULL; #else - self->file_descriptor = 0; + self->file_descriptor = 0; #endif - } - return (PyObject *)self; + self->start = NULL; + self->length = 0; } static int MemoryMappedFile_init(MemoryMappedFile *self, PyObject *positional_args, PyObject *named_args) { const char *path; - if (!PyArg_ParseTuple(positional_args, "s", &path)) { + if (!PyArg_ParseTuple(positional_args, "s", &path)) return -1; - } #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) self->file_handle = CreateFile(path, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); @@ -138,8 +165,8 @@ static int MemoryMappedFile_init(MemoryMappedFile *self, PyObject *positional_ar PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); return -1; } - self->ptr = file; - self->len = GetFileSize(self->file_handle, 0); + self->start = file; + self->length = GetFileSize(self->file_handle, 0); #else struct stat sb; self->file_descriptor = open(path, O_RDONLY); @@ -157,52 +184,74 @@ static int MemoryMappedFile_init(MemoryMappedFile *self, PyObject *positional_ar PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); return -1; } - self->ptr = map; - self->len = file_size; + self->start = map; + self->length = file_size; #endif return 0; } -static PyMethodDef MemoryMappedFile_methods[] = { - // Your method definitions here - {NULL} /* Sentinel */ -}; +static PyMethodDef MemoryMappedFile_methods[] = { // + {NULL, NULL, 0, NULL}}; static PyTypeObject MemoryMappedFileType = { - PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.MemoryMappedFile", - .tp_doc = "MemoryMappedFile objects", + PyObject_HEAD_INIT(NULL).tp_name = "stringzilla.MemoryMappedFile", + .tp_doc = "Memory mapped file class, that exposes the memory range for low-level access", .tp_basicsize = sizeof(MemoryMappedFile), - .tp_itemsize = 0, - .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, - .tp_new = MemoryMappedFile_new, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_methods = MemoryMappedFile_methods, + .tp_new = (newfunc)MemoryMappedFile_new, .tp_init = (initproc)MemoryMappedFile_init, .tp_dealloc = (destructor)MemoryMappedFile_dealloc, - .tp_methods = MemoryMappedFile_methods, + + // PyBufferProcs *tp_as_buffer; + + // reprfunc tp_repr; + // PyNumberMethods *tp_as_number; + // PySequenceMethods *tp_as_sequence; + // PyMappingMethods *tp_as_mapping; + // ternaryfunc tp_call; + // reprfunc tp_str; + // getattrofunc tp_getattro; + // setattrofunc tp_setattro; }; #pragma endregion -#pragma region Str +int export_string_like(PyObject *object, char const **start, size_t *length) { + if (PyUnicode_Check(object)) { + // Handle Python str + Py_ssize_t signed_length; + *start = PyUnicode_AsUTF8AndSize(object, &signed_length); + *length = (size_t)signed_length; + return 1; + } + else if (PyBytes_Check(object)) { + // Handle Python str + Py_ssize_t signed_length; + if (PyBytes_AsStringAndSize(object, start, signed_length) == -1) { + PyErr_SetString(PyExc_TypeError, "Mapping bytes failed"); + return 0; + } + *length = (size_t)signed_length; + return 1; + } + else if (PyObject_TypeCheck(object, &StrType)) { + Str *str = (Str *)object; + *start = str->start; + *length = str->length; + return 1; + } + else if (PyObject_TypeCheck(object, &MemoryMappedFileType)) { + MemoryMappedFile *file = (MemoryMappedFile *)object; + *start = file->start; + *length = file->length; + return 1; + } + return 0; +} -/** - * @brief Type-punned StringZilla-string, that points to a slice of an existing Python `str` - * or a `MemoryMappedFile`. - * - * When a slice is constructed, the `parent` object's reference count is being incremented to preserve lifetime. - * It usage in Python would look like: - * - * - Str() # Empty string - * - Str("some-string") # Full-range slice of a Python `str` - * - Str(File("some-path.txt")) # Full-range view of a persisted file - * - Str(File("some-path.txt"), from=0, to=sys.maxint) - */ -typedef struct { - PyObject_HEAD; - PyObject *parent; - void *start; - size_t length; -} Str; +#pragma region Str static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args) { PyObject *parent = NULL; @@ -218,25 +267,16 @@ static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args) else if (!PyArg_ParseTuple(positional_args, "|Onn", &parent, &from, &to)) return -1; - self->parent = parent; - if (PyUnicode_Check(parent)) { - // Handle Python str - self->start = PyUnicode_DATA(parent); - self->length = PyUnicode_GET_DATA_SIZE(parent); - Py_INCREF(parent); // Increment the reference count of the parent - } - else if (PyObject_TypeCheck(parent, &MemoryMappedFileType)) { - // Handle MemoryMappedFile - MemoryMappedFile *file = (MemoryMappedFile *)parent; - self->start = file->ptr; - self->length = file->len; - Py_INCREF(parent); // Increment the reference count of the parent - } - else if (parent == NULL) { - // Handle empty string + // Handle empty string + if (parent == NULL) { self->start = NULL; self->length = 0; } + // Increment the reference count of the parent + else if (export_string_like(parent, &self->start, &self->length)) { + self->parent = parent; + Py_INCREF(parent); + } else { PyErr_SetString(PyExc_TypeError, "Unsupported parent type"); return -1; @@ -268,24 +308,157 @@ static void Str_dealloc(Str *self) { Py_TYPE(self)->tp_free((PyObject *)self); } +static Py_ssize_t Str_len(Str *self) { return self->length; } + +static PyObject *Str_getitem(Str *self, Py_ssize_t i) { + + // Negative indexing + if (i < 0) + i += self->length; + + if (i < 0 || (size_t)i >= self->length) { + PyErr_SetString(PyExc_IndexError, "Index out of range"); + return NULL; + } + + // Assuming the underlying data is UTF-8 encoded + return PyUnicode_FromStringAndSize(self->start + i, 1); +} + +// Will be called by the `PySequence_Contains` +static int Str_contains(Str *self, PyObject *arg) { + + struct strzl_needle_t needle_struct; + needle_struct.anomaly_offset = 0; + if (!export_string_like(arg, &needle_struct.ptr, &needle_struct.len)) { + PyErr_SetString(PyExc_TypeError, "Unsupported argument type"); + return -1; + } + + struct strzl_haystack_t haystack; + haystack.ptr = self->start; + haystack.len = self->length; + size_t position = strzl_neon_find_substr(haystack, needle_struct); + return position != haystack.len; +} + +static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)strzl_hash_crc32_native(self->start, self->length); } + +static PyObject *Str_getslice(Str *self, PyObject *args) { + PyObject *start_obj = NULL, *end_obj = NULL; + ssize_t start = 0, end = self->length; // Default values + + if (!PyArg_ParseTuple(args, "|OO", &start_obj, &end_obj)) + return NULL; + + if (start_obj != NULL && start_obj != Py_None) { + if (!PyLong_Check(start_obj)) { + PyErr_SetString(PyExc_TypeError, "Start index must be an integer or None"); + return NULL; + } + start = PyLong_AsSsize_t(start_obj); + } + + if (end_obj != NULL && end_obj != Py_None) { + if (!PyLong_Check(end_obj)) { + PyErr_SetString(PyExc_TypeError, "End index must be an integer or None"); + return NULL; + } + end = PyLong_AsSsize_t(end_obj); + } + + size_t normalized_offset, normalized_length; + slice(self->length, start, end, &normalized_offset, &normalized_length); + + if (normalized_length == 0) + return PyUnicode_FromString(""); + + // Create a new Str object + Str *new_str = (Str *)PyObject_New(Str, &StrType); + if (new_str == NULL) + return NULL; + + // Set the parent to the original Str object and increment its reference count + new_str->parent = (PyObject *)self; + Py_INCREF(self); + + // Set the start and length to point to the slice + new_str->start = self->start + normalized_offset; + new_str->length = normalized_length; + return (PyObject *)new_str; +} + +static PyObject *Str_str(Str *self, PyObject *args) { return PyUnicode_FromStringAndSize(self->start, self->length); } + +static PyObject *Str_find_substr(Str *self, PyObject *args) { + PyObject *needle_obj; + if (!PyArg_ParseTuple(args, "O", &needle_obj)) + return NULL; + + struct strzl_needle_t needle_struct; + needle_struct.anomaly_offset = 0; + + if (PyObject_TypeCheck(needle_obj, &StrType)) { + Str *needle = (Str *)needle_obj; + needle_struct.ptr = needle->start; + needle_struct.len = needle->length; + } + else if (PyUnicode_Check(needle_obj)) { + needle_struct.ptr = PyUnicode_AsUTF8AndSize(needle_obj, (Py_ssize_t *)&needle_struct.len); + if (needle_struct.ptr == NULL) + return NULL; // Error case, likely a UnicodeEncodeError + } + else { + PyErr_SetString(PyExc_TypeError, "Argument must be an instance of Str or a native Python str"); + return NULL; + } + + struct strzl_haystack_t haystack; + haystack.ptr = self->start; + haystack.len = self->length; + size_t position = strzl_neon_find_substr(haystack, needle_struct); + return PyLong_FromSize_t(position); +} + +static PySequenceMethods Str_as_sequence = { + .sq_length = (lenfunc)Str_len, // + .sq_item = (ssizeargfunc)Str_getitem, // + .sq_contains = (objobjproc)Str_contains, // +}; + +static PyMethodDef Str_methods[] = { // + {"contains", (PyCFunction)Str_str, METH_NOARGS, "Convert to Python `str`"}, + {"find", (PyCFunction)Str_len, METH_NOARGS, "Get length"}, + {"find", (PyCFunction)Str_len, METH_NOARGS, "Get length"}, + {"__getitem__", (PyCFunction)Str_getitem, METH_O, "Indexing"}, + {NULL, NULL, 0, NULL}}; + static PyTypeObject StrType = { - PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.Str", - .tp_doc = "Stringzilla Str objects", + PyObject_HEAD_INIT(NULL).tp_name = "stringzilla.Str", + .tp_doc = "Immutable string/slice class with SIMD and SWAR-accelerated operations", .tp_basicsize = sizeof(Str), - .tp_itemsize = 0, .tp_flags = Py_TPFLAGS_DEFAULT, - .tp_new = Str_new, + .tp_methods = Str_methods, + .tp_new = (newfunc)Str_new, + .tp_init = (initproc)Str_init, .tp_dealloc = (destructor)Str_dealloc, + .tp_as_sequence = &Str_as_sequence, + .tp_hash = (hashfunc)Str_hash, // String hashing functions + // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer + // .tp_vectorcall = (vectorcallfunc)NULL, // Faster function dispatch }; #pragma endregion +static PyMethodDef stringzilla_methods[] = { // + {NULL, NULL, 0, NULL}}; + static PyModuleDef stringzilla_module = { PyModuleDef_HEAD_INIT, "stringzilla", "Crunch 100+ GB Strings in Python with ease", -1, - NULL, + stringzilla_methods, NULL, NULL, NULL, @@ -298,6 +471,9 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { if (PyType_Ready(&StrType) < 0) return NULL; + if (PyType_Ready(&MemoryMappedFileType) < 0) + return NULL; + m = PyModule_Create(&stringzilla_module); if (m == NULL) return NULL; @@ -318,4 +494,4 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { } return m; -} \ No newline at end of file +} diff --git a/scripts/test.py b/scripts/test.py index 68cb105c..189345e0 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -5,7 +5,27 @@ import pytest -from stringzilla import Str, File, Strs, levenstein +from stringzilla import Str + + +def test_construct(): + native = "aaaaa" + big = Str(native) + assert len(big) == len(native) + + +def test_indexing(): + native = "abcdef" + big = Str(native) + for i in range(len(native)): + assert big[i] == native[i] + + +def test_contains(): + big = Str("abcdef") + assert "a" in big + assert "ab" in big + assert "xxx" not in big def get_random_string( diff --git a/setup.py b/setup.py index 44cf4cf8..83c22aea 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ macros_args = [] if sys.platform == "linux": - compile_args.append("-std=c++17") + compile_args.append("-std=c99") compile_args.append("-O3") compile_args.append("-pedantic") compile_args.append("-Wno-unknown-pragmas") @@ -36,7 +36,7 @@ if sys.platform == "darwin": - compile_args.append("-std=c++17") + compile_args.append("-std=c99") compile_args.append("-O3") compile_args.append("-pedantic") compile_args.append("-Wno-unknown-pragmas") diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 3e7df130..ca58b1ed 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -35,8 +35,9 @@ inline static size_t strzl_divide_round_up(size_t x, size_t divisor) { return (x /** * @brief This is a faster alternative to `strncmp(a, b, len) == 0`. + * @return 1 for `true`, and 0 for `false`. */ -inline static bool strzl_equal(char const *a, char const *b, size_t len) { +inline static int strzl_equal(char const *a, char const *b, size_t len) { char const *const a_end = a + len; while (a != a_end && *a == *b) a++, b++; @@ -130,9 +131,9 @@ inline static size_t strzl_naive_find_2chars(strzl_haystack_t h, char const *n) char const *end = h.ptr + h.len; // This code simulates hyper-scalar execution, analyzing 7 offsets at a time. - uint64_t nnnn = (uint64_t(n[0]) << 0) | (uint64_t(n[1]) << 8); // broadcast `n` into `nnnn` - nnnn |= nnnn << 16; // broadcast `n` into `nnnn` - nnnn |= nnnn << 32; // broadcast `n` into `nnnn` + uint64_t nnnn = ((uint64_t)(n[0]) << 0) | ((uint64_t)(n[1]) << 8); // broadcast `n` into `nnnn` + nnnn |= nnnn << 16; // broadcast `n` into `nnnn` + nnnn |= nnnn << 32; // broadcast `n` into `nnnn` uint64_t text_slice; for (; text + 8 <= end; text += 7) { memcpy(&text_slice, text, 8); @@ -173,9 +174,9 @@ inline static size_t strzl_naive_find_3chars(strzl_haystack_t h, char const *n) // This code simulates hyper-scalar execution, analyzing 6 offsets at a time. // We have two unused bytes at the end. - uint64_t nn = uint64_t(n[0] << 0) | (uint64_t(n[1]) << 8) | (uint64_t(n[2]) << 16); // broadcast `n` into `nn` - nn |= nn << 24; // broadcast `n` into `nn` - nn <<= 16; // broadcast `n` into `nn` + uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16); // broadcast `n` into `nn` + nn |= nn << 24; // broadcast `n` into `nn` + nn <<= 16; // broadcast `n` into `nn` for (; text + 8 <= end; text += 6) { uint64_t text_slice; @@ -227,9 +228,8 @@ inline static size_t strzl_naive_find_4chars(strzl_haystack_t h, char const *n) char const *end = h.ptr + h.len; // This code simulates hyper-scalar execution, analyzing 4 offsets at a time. - uint64_t nn = uint64_t(n[0] << 0) | (uint64_t(n[1]) << 8) | (uint64_t(n[2]) << 16) | (uint64_t(n[3]) << 24); + uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16) | ((uint64_t)(n[3]) << 24); nn |= nn << 32; - nn = nn; // uint8_t lookup[16] = {0}; @@ -264,8 +264,8 @@ inline static size_t strzl_naive_find_4chars(strzl_haystack_t h, char const *n) if (text01_indicators + text23_indicators) { // Assuming we have performed 4 comparisons, we can only have 2^4=16 outcomes. - // Which is small enought for a lookup table. - uint8_t match_indicators = uint8_t( // + // Which is small enough for a lookup table. + uint8_t match_indicators = (uint8_t)( // (text01_indicators >> 31) | (text01_indicators << 0) | // (text23_indicators >> 29) | (text23_indicators << 2)); return text - h.ptr + lookup[match_indicators]; @@ -370,8 +370,10 @@ size_t strzl_avx2_find_substr(strzl_haystack_t h, strzl_needle_t n) { } // Don't forget the last (up to 35) characters. - size_t tail_len = end - text; - size_t tail_match = strzl_naive_find_substr({text, tail_len}, n); + strzl_haystack_t h_remainder; + h_remainder.ptr = text; + h_remainder.len = end - text; + size_t tail_match = strzl_naive_find_substr(h_remainder, n); return text + tail_match - h.ptr; } @@ -415,7 +417,7 @@ inline static size_t strzl_neon_find_substr(strzl_haystack_t h, strzl_needle_t n // vorrq_u32 (all) uint32x4_t matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3)); uint64x2_t matches64x2 = vreinterpretq_u64_u32(matches); - bool has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1); + int has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1); if (has_match) { for (size_t i = 0; i < 16; i++) { @@ -426,8 +428,10 @@ inline static size_t strzl_neon_find_substr(strzl_haystack_t h, strzl_needle_t n } // Don't forget the last (up to 16+3=19) characters. - size_t tail_len = end - text; - size_t tail_match = strzl_naive_find_substr({text, tail_len}, n); + strzl_haystack_t h_remainder; + h_remainder.ptr = text; + h_remainder.len = end - text; + size_t tail_match = strzl_naive_find_substr(h_remainder, n); return text + tail_match - h.ptr; } @@ -441,16 +445,16 @@ inline static void strzl_swap(size_t *a, size_t *b) { typedef char const *(*strzl_array_get_begin_t)(void const *, size_t); typedef size_t (*strzl_array_get_length_t)(void const *, size_t); -typedef bool (*strzl_array_predicate_t)(void const *, size_t); -typedef bool (*strzl_array_comparator_t)(void const *, size_t, size_t); +typedef int (*strzl_array_predicate_t)(void const *, size_t); +typedef int (*strzl_array_comparator_t)(void const *, size_t, size_t); -struct strzl_array_t { +typedef struct strzl_array_t { size_t *order; size_t count; strzl_array_get_begin_t get_begin; strzl_array_get_length_t get_length; void const *handle; -}; +} strzl_array_t; /** * @brief Similar to `std::partition`, given a predicate splits the @@ -610,9 +614,9 @@ inline static int _strzl_sort_array_strncasecmp( return res ? res : a_len - b_len; } -struct strzl_sort_config_t { - bool case_insensitive; -}; +typedef struct strzl_sort_config_t { + int case_insensitive; +} strzl_sort_config_t; /** * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word @@ -620,7 +624,7 @@ struct strzl_sort_config_t { */ inline static void strzl_sort(strzl_array_t *array, strzl_sort_config_t const *config) { - bool case_insensitive = config && config->case_insensitive; + int case_insensitive = config && config->case_insensitive; // Export up to 4 bytes into the `array` bits themselves for (size_t i = 0; i != array->count; ++i) { @@ -657,7 +661,7 @@ typedef uint8_t levenstein_distance_t; * @return Amount of temporary memory (in bytes) needed to efficiently compute * the Levenstein distance between two strings of given size. */ -inline static size_t strzl_levenstein_memory_needed(size_t, size_t b_length) { return b_length + b_length + 2; } +inline static size_t strzl_levenstein_memory_needed(size_t _, size_t b_length) { return b_length + b_length + 2; } /** * @brief Auxiliary function, that computes the minimum of three values. @@ -712,6 +716,15 @@ inline static levenstein_distance_t strzl_levenstein( // return previous_distances[b_length]; } +/** + * @brief Hashes provided string using hardware-accelerated CRC32 instructions. + */ +inline static uint32_t strzl_hash_crc32_native(char const *start, size_t length) { return 0; } + +inline static uint32_t strzl_hash_crc32_neon(char const *start, size_t length) { return 0; } + +inline static uint32_t strzl_hash_crc32_sse(char const *start, size_t length) { return 0; } + #ifdef __cplusplus } #endif From faacdd138bcbbef07381ae81a70be0011c98a2d1 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 16 Sep 2023 21:15:32 +0400 Subject: [PATCH 07/72] Improve: Vectorized function calls --- .vscode/settings.json | 2 + python/lib.c | 184 +++++++++++++-------- scripts/test.py | 370 +++++++++++++++++++++--------------------- 3 files changed, 310 insertions(+), 246 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index a32765cc..1696d8d4 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -131,9 +131,11 @@ "itemsize", "keeplinebreaks", "keepseparator", + "kwnames", "levenstein", "maxsplit", "memcpy", + "nargsf", "newfunc", "NOARGS", "NOMINMAX", diff --git a/python/lib.c b/python/lib.c index fb42b7df..4dfc90e9 100644 --- a/python/lib.c +++ b/python/lib.c @@ -88,6 +88,80 @@ void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset, *normalized_length = end - start; } +int export_string_like(PyObject *object, char const **start, size_t *length) { + if (PyUnicode_Check(object)) { + // Handle Python str + Py_ssize_t signed_length; + *start = PyUnicode_AsUTF8AndSize(object, &signed_length); + *length = (size_t)signed_length; + return 1; + } + else if (PyBytes_Check(object)) { + // Handle Python str + Py_ssize_t signed_length; + if (PyBytes_AsStringAndSize(object, (char **)start, &signed_length) == -1) { + PyErr_SetString(PyExc_TypeError, "Mapping bytes failed"); + return 0; + } + *length = (size_t)signed_length; + return 1; + } + else if (PyObject_TypeCheck(object, &StrType)) { + Str *str = (Str *)object; + *start = str->start; + *length = str->length; + return 1; + } + else if (PyObject_TypeCheck(object, &MemoryMappedFileType)) { + MemoryMappedFile *file = (MemoryMappedFile *)object; + *start = file->start; + *length = file->length; + return 1; + } + return 0; +} + +#pragma endregion + +#pragma region Global Functions + +static PyObject *str_find_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { + // Check the number of arguments and types + Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); + if (nargs < 2 || nargs > 4) { + PyErr_SetString(PyExc_TypeError, "Invalid arguments"); + return NULL; + } + + // Parse the haystack. + PyObject *haystack_obj = args[0]; + struct strzl_haystack_t haystack; + if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len)) { + PyErr_SetString(PyExc_TypeError, "First argument (haystack) must be string-like"); + return NULL; + } + + // Parse the needle. + PyObject *needle_obj = args[1]; + struct strzl_needle_t needle; + needle.anomaly_offset = 0; + if (!export_string_like(needle_obj, &needle.ptr, &needle.len)) { + PyErr_SetString(PyExc_TypeError, "Second argument (needle) must be string-like"); + return NULL; + } + + // Limit the haystack range. + Py_ssize_t start = (nargs > 2) ? PyLong_AsSsize_t(args[2]) : 0; + Py_ssize_t end = (nargs > 3) ? PyLong_AsSsize_t(args[3]) : PY_SSIZE_T_MAX; + size_t normalized_offset, normalized_length; + slice(haystack.len, start, end, &normalized_offset, &normalized_length); + + haystack.ptr = haystack.ptr + normalized_offset; + haystack.len = normalized_length; + size_t position = strzl_neon_find_substr(haystack, needle); + return PyLong_FromSize_t(position); +} + #pragma endregion #pragma region MemoryMappingFile @@ -218,39 +292,6 @@ static PyTypeObject MemoryMappedFileType = { #pragma endregion -int export_string_like(PyObject *object, char const **start, size_t *length) { - if (PyUnicode_Check(object)) { - // Handle Python str - Py_ssize_t signed_length; - *start = PyUnicode_AsUTF8AndSize(object, &signed_length); - *length = (size_t)signed_length; - return 1; - } - else if (PyBytes_Check(object)) { - // Handle Python str - Py_ssize_t signed_length; - if (PyBytes_AsStringAndSize(object, start, signed_length) == -1) { - PyErr_SetString(PyExc_TypeError, "Mapping bytes failed"); - return 0; - } - *length = (size_t)signed_length; - return 1; - } - else if (PyObject_TypeCheck(object, &StrType)) { - Str *str = (Str *)object; - *start = str->start; - *length = str->length; - return 1; - } - else if (PyObject_TypeCheck(object, &MemoryMappedFileType)) { - MemoryMappedFile *file = (MemoryMappedFile *)object; - *start = file->start; - *length = file->length; - return 1; - } - return 0; -} - #pragma region Str static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args) { @@ -390,36 +431,6 @@ static PyObject *Str_getslice(Str *self, PyObject *args) { static PyObject *Str_str(Str *self, PyObject *args) { return PyUnicode_FromStringAndSize(self->start, self->length); } -static PyObject *Str_find_substr(Str *self, PyObject *args) { - PyObject *needle_obj; - if (!PyArg_ParseTuple(args, "O", &needle_obj)) - return NULL; - - struct strzl_needle_t needle_struct; - needle_struct.anomaly_offset = 0; - - if (PyObject_TypeCheck(needle_obj, &StrType)) { - Str *needle = (Str *)needle_obj; - needle_struct.ptr = needle->start; - needle_struct.len = needle->length; - } - else if (PyUnicode_Check(needle_obj)) { - needle_struct.ptr = PyUnicode_AsUTF8AndSize(needle_obj, (Py_ssize_t *)&needle_struct.len); - if (needle_struct.ptr == NULL) - return NULL; // Error case, likely a UnicodeEncodeError - } - else { - PyErr_SetString(PyExc_TypeError, "Argument must be an instance of Str or a native Python str"); - return NULL; - } - - struct strzl_haystack_t haystack; - haystack.ptr = self->start; - haystack.len = self->length; - size_t position = strzl_neon_find_substr(haystack, needle_struct); - return PyLong_FromSize_t(position); -} - static PySequenceMethods Str_as_sequence = { .sq_length = (lenfunc)Str_len, // .sq_item = (ssizeargfunc)Str_getitem, // @@ -445,7 +456,6 @@ static PyTypeObject StrType = { .tp_as_sequence = &Str_as_sequence, .tp_hash = (hashfunc)Str_hash, // String hashing functions // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer - // .tp_vectorcall = (vectorcallfunc)NULL, // Faster function dispatch }; #pragma endregion @@ -465,6 +475,13 @@ static PyModuleDef stringzilla_module = { NULL, }; +static PyObject *vectorized_find = NULL; +static PyObject *vectorized_count = NULL; +static PyObject *vectorized_contains = NULL; +static PyObject *vectorized_split = NULL; +static PyObject *vectorized_sort = NULL; +static PyObject *vectorized_shuffle = NULL; + PyMODINIT_FUNC PyInit_stringzilla(void) { PyObject *m; @@ -493,5 +510,44 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { return NULL; } + // Create the 'find' function + vectorized_find = PyObject_Malloc(sizeof(PyCFunctionObject)); + if (vectorized_find == NULL) { + Py_XDECREF(&MemoryMappedFileType); + Py_XDECREF(&StrType); + Py_XDECREF(m); + PyErr_NoMemory(); + return NULL; + } + PyObject_Init(vectorized_find, &PyCFunction_Type); + ((PyCFunctionObject *)vectorized_find)->m_ml = NULL; // No regular PyMethodDef + ((PyCFunctionObject *)vectorized_find)->vectorcall = str_find_vectorcall; + + // Add the 'find' function to the module + if (PyModule_AddObject(m, "find", vectorized_find) < 0) { + PyObject_Free(vectorized_find); + Py_XDECREF(&MemoryMappedFileType); + Py_XDECREF(&StrType); + Py_XDECREF(m); + return NULL; + } + return m; + +cleanup: + if (vectorized_find) + Py_XDECREF(vectorized_find); + if (vectorized_count) + Py_XDECREF(vectorized_count); + if (vectorized_contains) + Py_XDECREF(vectorized_contains); + if (vectorized_split) + Py_XDECREF(vectorized_split); + if (vectorized_sort) + Py_XDECREF(vectorized_sort); + if (vectorized_shuffle) + Py_XDECREF(vectorized_shuffle); + Py_XDECREF(m); + PyErr_NoMemory(); + return NULL; } diff --git a/scripts/test.py b/scripts/test.py index 189345e0..a846b6ae 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -5,6 +5,7 @@ import pytest +import stringzilla as sz from stringzilla import Str @@ -28,202 +29,207 @@ def test_contains(): assert "xxx" not in big -def get_random_string( - length: Optional[int] = None, variability: Optional[int] = None -) -> str: - if length is None: - length = randint(3, 300) - if variability is None: - variability = len(ascii_lowercase) - return "".join(choice(ascii_lowercase[:variability]) for _ in range(length)) +def test_globals(): + assert sz.find("abcdef", "bcdef") == 1 + assert sz.find("abcdef", "x") == 6 -def is_equal_strings(native_strings, big_strings): - for native_slice, big_slice in zip(native_strings, big_strings): - assert native_slice == big_slice +# def get_random_string( +# length: Optional[int] = None, variability: Optional[int] = None +# ) -> str: +# if length is None: +# length = randint(3, 300) +# if variability is None: +# variability = len(ascii_lowercase) +# return "".join(choice(ascii_lowercase[:variability]) for _ in range(length)) -def check_identical( - native: str, - big: Union[Str, File], - needle: Optional[str] = None, - check_iterators: bool = False, -): - if needle is None: - part_offset = randint(0, len(native) - 1) - part_length = randint(1, len(native) - part_offset) - needle = native[part_offset:part_length] +# def is_equal_strings(native_strings, big_strings): +# for native_slice, big_slice in zip(native_strings, big_strings): +# assert native_slice == big_slice - present_in_native: bool = needle in native - present_in_big = needle in big - assert present_in_native == present_in_big - assert native.find(needle) == big.find(needle) - assert native.count(needle) == big.count(needle) - native_strings = native.split(needle) - big_strings: Strs = big.split(needle) - assert len(native_strings) == len(big_strings) +# def check_identical( +# native: str, +# big: Union[Str, File], +# needle: Optional[str] = None, +# check_iterators: bool = False, +# ): +# if needle is None: +# part_offset = randint(0, len(native) - 1) +# part_length = randint(1, len(native) - part_offset) +# needle = native[part_offset:part_length] - if check_iterators: - for i in range(len(native_strings)): - assert len(native_strings[i]) == len(big_strings[i]) - assert native_strings[i] == big_strings[i] - assert [c for c in native_strings[i]] == [c for c in big_strings[i]] +# present_in_native: bool = needle in native +# present_in_big = needle in big +# assert present_in_native == present_in_big +# assert native.find(needle) == big.find(needle) +# assert native.count(needle) == big.count(needle) - is_equal_strings(native_strings, big_strings) +# native_strings = native.split(needle) +# big_strings: Strs = big.split(needle) +# assert len(native_strings) == len(big_strings) +# if check_iterators: +# for i in range(len(native_strings)): +# assert len(native_strings[i]) == len(big_strings[i]) +# assert native_strings[i] == big_strings[i] +# assert [c for c in native_strings[i]] == [c for c in big_strings[i]] -@pytest.mark.parametrize("haystack_length", range(1, 65)) -@pytest.mark.parametrize("variability", range(1, 25)) -def test_contains(haystack_length: int, variability: int): - native = get_random_string(variability=variability, length=haystack_length) - big = Str(native) - pattern = get_random_string(variability=variability, length=randint(1, 5)) - assert (pattern in native) == big.contains(pattern) - - -def test_count_overlap(): - native = "aaaaa" - big = Str(native) - assert native.count("aa") == big.count("aa") - assert 4 == big.count("aa", allowoverlap=True) - - -def test_splitlines(): - native = "line1\nline2\nline3" - big = Str(native) - assert native.splitlines() == list(big.splitlines()) - assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True)) - - -def test_split_keepseparator(): - native = "word1 word2 word3" - big = Str(native) - assert ["word1 ", "word2 ", "word3"] == list(big.split(" ", keepseparator=True)) - - -def test_strs_operations(): - native = "line1\nline2\nline3" - big = Str(native) - lines = big.splitlines() - lines.sort() - assert ["line1", "line2", "line3"] == list(lines) +# is_equal_strings(native_strings, big_strings) - shuffled_copy = lines.shuffled(seed=42) - assert set(lines) == set(shuffled_copy) - lines.append("line4") - assert 4 == len(lines) - lines.extend(["line5", "line6"]) - assert 6 == len(lines) +# @pytest.mark.parametrize("haystack_length", range(1, 65)) +# @pytest.mark.parametrize("variability", range(1, 25)) +# def test_contains(haystack_length: int, variability: int): +# native = get_random_string(variability=variability, length=haystack_length) +# big = Str(native) +# pattern = get_random_string(variability=variability, length=randint(1, 5)) +# assert (pattern in native) == big.contains(pattern) - lines.append(lines[0]) - assert 7 == len(lines) - assert lines[6] == "line1" - lines.extend(lines) - assert 14 == len(lines) - assert lines[7] == "line1" - assert lines[8] == "line2" - assert lines[12] == "line6" +# def test_count_overlap(): +# native = "aaaaa" +# big = Str(native) +# assert native.count("aa") == big.count("aa") +# assert 4 == big.count("aa", allowoverlap=True) - # Test that shuffles are reproducible with the same `seed` - a = [str(s) for s in lines.shuffled(seed=42)] - b = [str(s) for s in lines.shuffled(seed=42)] - assert a == b + +# def test_splitlines(): +# native = "line1\nline2\nline3" +# big = Str(native) +# assert native.splitlines() == list(big.splitlines()) +# assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True)) -@pytest.mark.parametrize("repetitions", range(1, 10)) -def test_basic(repetitions: int): - native = "abcd" * repetitions - big = Str(native) - - check_identical(native, big, "a", True) - check_identical(native, big, "ab", True) - check_identical(native, big, "abc", True) - check_identical(native, big, "abcd", True) - check_identical(native, big, "abcde", True) # Missing pattern - - -@pytest.mark.parametrize("pattern_length", [1, 2, 4, 5]) -@pytest.mark.parametrize("haystack_length", range(1, 69, 3)) -@pytest.mark.parametrize("variability", range(1, 27, 3)) -def test_fuzzy(pattern_length: int, haystack_length: int, variability: int): - native = get_random_string(variability=variability, length=haystack_length) - big = Str(native) - - # Start by matching the prefix and the suffix - check_identical(native, big, native[:pattern_length]) - check_identical(native, big, native[-pattern_length:]) - - # Continue with random strs - for _ in range(haystack_length // pattern_length): - pattern = get_random_string(variability=variability, length=pattern_length) - check_identical(native, big, pattern) - - -def test_strs(): - native = get_random_string(length=10) - big = Str(native) - - assert native[0:5] == big.sub(0, 5) and native[0:5] == big[0:5] - assert native[5:10] == big.sub(5, 10) and native[5:10] == big[5:10] - - assert native[5:5] == big.sub(5, 5) and native[5:5] == big[5:5] - assert native[-5:-5] == big.sub(-5, -5) and native[-5:-5] == big[-5:-5] - assert native[2:-2] == big.sub(2, -2) and native[2:-2] == big[2:-2] - assert native[7:-7] == big.sub(7, -7) and native[7:-7] == big[7:-7] - - assert native[5:3] == big.sub(5, 3) and native[5:3] == big[5:3] - assert native[5:7] == big.sub(5, 7) and native[5:7] == big[5:7] - assert native[5:-3] == big.sub(5, -3) and native[5:-3] == big[5:-3] - assert native[5:-7] == big.sub(5, -7) and native[5:-7] == big[5:-7] - - assert native[-5:3] == big.sub(-5, 3) and native[-5:3] == big[-5:3] - assert native[-5:7] == big.sub(-5, 7) and native[-5:7] == big[-5:7] - assert native[-5:-3] == big.sub(-5, -3) and native[-5:-3] == big[-5:-3] - assert native[-5:-7] == big.sub(-5, -7) and native[-5:-7] == big[-5:-7] - - assert native[2:] == big.sub(2) and native[2:] == big[2:] - assert native[:7] == big.sub(end=7) and native[:7] == big[:7] - assert native[-2:] == big.sub(-2) and native[-2:] == big[-2:] - assert native[:-7] == big.sub(end=-7) and native[:-7] == big[:-7] - assert native[:-10] == big.sub(end=-10) and native[:-10] == big[:-10] - assert native[:-1] == big.sub(end=-1) and native[:-1] == big[:-1] - - length = 1000 - native = get_random_string(length=length) - big = Str(native) - - needle = native[0 : randint(2, 5)] - native_strings = native.split(needle) - big_strings: Strs = big.split(needle) - - length = len(native_strings) - for i in range(length): - start = randint(1 - length, length - 1) - stop = randint(1 - length, length - 1) - step = 0 - while step == 0: - step = randint(-int(math.sqrt(length)), int(math.sqrt(length))) - - is_equal_strings(native_strings[start:stop:step], big_strings[start:stop:step]) - is_equal_strings( - native_strings[start:stop:step], - big_strings.sub(start, stop, step), - ) - - -def test_levenstein(): - # Create a new string by slicing and concatenating - def insert_char_at(s, char_to_insert, index): - return s[:index] + char_to_insert + s[index:] - - for _ in range(100): - a = get_random_string(length=20) - b = a - for i in range(150): - source_offset = randint(0, len(ascii_lowercase) - 1) - target_offset = randint(0, len(b) - 1) - b = insert_char_at(b, ascii_lowercase[source_offset], target_offset) - assert levenstein(a, b, 200) == i + 1 +# def test_split_keepseparator(): +# native = "word1 word2 word3" +# big = Str(native) +# assert ["word1 ", "word2 ", "word3"] == list(big.split(" ", keepseparator=True)) + + +# def test_strs_operations(): +# native = "line1\nline2\nline3" +# big = Str(native) +# lines = big.splitlines() +# lines.sort() +# assert ["line1", "line2", "line3"] == list(lines) + +# shuffled_copy = lines.shuffled(seed=42) +# assert set(lines) == set(shuffled_copy) + +# lines.append("line4") +# assert 4 == len(lines) +# lines.extend(["line5", "line6"]) +# assert 6 == len(lines) + +# lines.append(lines[0]) +# assert 7 == len(lines) +# assert lines[6] == "line1" + +# lines.extend(lines) +# assert 14 == len(lines) +# assert lines[7] == "line1" +# assert lines[8] == "line2" +# assert lines[12] == "line6" + +# # Test that shuffles are reproducible with the same `seed` +# a = [str(s) for s in lines.shuffled(seed=42)] +# b = [str(s) for s in lines.shuffled(seed=42)] +# assert a == b + + +# @pytest.mark.parametrize("repetitions", range(1, 10)) +# def test_basic(repetitions: int): +# native = "abcd" * repetitions +# big = Str(native) + +# check_identical(native, big, "a", True) +# check_identical(native, big, "ab", True) +# check_identical(native, big, "abc", True) +# check_identical(native, big, "abcd", True) +# check_identical(native, big, "abcde", True) # Missing pattern + + +# @pytest.mark.parametrize("pattern_length", [1, 2, 4, 5]) +# @pytest.mark.parametrize("haystack_length", range(1, 69, 3)) +# @pytest.mark.parametrize("variability", range(1, 27, 3)) +# def test_fuzzy(pattern_length: int, haystack_length: int, variability: int): +# native = get_random_string(variability=variability, length=haystack_length) +# big = Str(native) + +# # Start by matching the prefix and the suffix +# check_identical(native, big, native[:pattern_length]) +# check_identical(native, big, native[-pattern_length:]) + +# # Continue with random strs +# for _ in range(haystack_length // pattern_length): +# pattern = get_random_string(variability=variability, length=pattern_length) +# check_identical(native, big, pattern) + + +# def test_strs(): +# native = get_random_string(length=10) +# big = Str(native) + +# assert native[0:5] == big.sub(0, 5) and native[0:5] == big[0:5] +# assert native[5:10] == big.sub(5, 10) and native[5:10] == big[5:10] + +# assert native[5:5] == big.sub(5, 5) and native[5:5] == big[5:5] +# assert native[-5:-5] == big.sub(-5, -5) and native[-5:-5] == big[-5:-5] +# assert native[2:-2] == big.sub(2, -2) and native[2:-2] == big[2:-2] +# assert native[7:-7] == big.sub(7, -7) and native[7:-7] == big[7:-7] + +# assert native[5:3] == big.sub(5, 3) and native[5:3] == big[5:3] +# assert native[5:7] == big.sub(5, 7) and native[5:7] == big[5:7] +# assert native[5:-3] == big.sub(5, -3) and native[5:-3] == big[5:-3] +# assert native[5:-7] == big.sub(5, -7) and native[5:-7] == big[5:-7] + +# assert native[-5:3] == big.sub(-5, 3) and native[-5:3] == big[-5:3] +# assert native[-5:7] == big.sub(-5, 7) and native[-5:7] == big[-5:7] +# assert native[-5:-3] == big.sub(-5, -3) and native[-5:-3] == big[-5:-3] +# assert native[-5:-7] == big.sub(-5, -7) and native[-5:-7] == big[-5:-7] + +# assert native[2:] == big.sub(2) and native[2:] == big[2:] +# assert native[:7] == big.sub(end=7) and native[:7] == big[:7] +# assert native[-2:] == big.sub(-2) and native[-2:] == big[-2:] +# assert native[:-7] == big.sub(end=-7) and native[:-7] == big[:-7] +# assert native[:-10] == big.sub(end=-10) and native[:-10] == big[:-10] +# assert native[:-1] == big.sub(end=-1) and native[:-1] == big[:-1] + +# length = 1000 +# native = get_random_string(length=length) +# big = Str(native) + +# needle = native[0 : randint(2, 5)] +# native_strings = native.split(needle) +# big_strings: Strs = big.split(needle) + +# length = len(native_strings) +# for i in range(length): +# start = randint(1 - length, length - 1) +# stop = randint(1 - length, length - 1) +# step = 0 +# while step == 0: +# step = randint(-int(math.sqrt(length)), int(math.sqrt(length))) + +# is_equal_strings(native_strings[start:stop:step], big_strings[start:stop:step]) +# is_equal_strings( +# native_strings[start:stop:step], +# big_strings.sub(start, stop, step), +# ) + + +# def test_levenstein(): +# # Create a new string by slicing and concatenating +# def insert_char_at(s, char_to_insert, index): +# return s[:index] + char_to_insert + s[index:] + +# for _ in range(100): +# a = get_random_string(length=20) +# b = a +# for i in range(150): +# source_offset = randint(0, len(ascii_lowercase) - 1) +# target_offset = randint(0, len(b) - 1) +# b = insert_char_at(b, ascii_lowercase[source_offset], target_offset) +# assert levenstein(a, b, 200) == i + 1 From 9a38259fc0f120a34ee2c369443fbc0d5dffc8b3 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 16 Sep 2023 21:42:24 +0400 Subject: [PATCH 08/72] Add: Slices and rich comparisons --- .vscode/settings.json | 2 + python/lib.c | 92 ++++++++++++++++++++++++++++++++++++++----- scripts/test.py | 6 +++ 3 files changed, 90 insertions(+), 10 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 1696d8d4..6e4162b1 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -139,9 +139,11 @@ "newfunc", "NOARGS", "NOMINMAX", + "NOTIMPLEMENTED", "pytest", "quadgram", "readlines", + "richcompare", "SIMD", "splitlines", "stringzilla", diff --git a/python/lib.c b/python/lib.c index 4dfc90e9..01240c44 100644 --- a/python/lib.c +++ b/python/lib.c @@ -366,6 +366,42 @@ static PyObject *Str_getitem(Str *self, Py_ssize_t i) { return PyUnicode_FromStringAndSize(self->start + i, 1); } +static PyObject *Str_subscript(Str *self, PyObject *key) { + if (PySlice_Check(key)) { + Py_ssize_t start, stop, step; + if (PySlice_Unpack(key, &start, &stop, &step) < 0) + return NULL; + if (PySlice_AdjustIndices(self->length, &start, &stop, step) < 0) + return NULL; + + if (step != 1) { + PyErr_SetString(PyExc_IndexError, "Efficient step is not supported"); + return NULL; + } + + // Create a new `Str` object + Str *self_slice = (Str *)StrType.tp_alloc(&StrType, 0); + if (self_slice == NULL && PyErr_NoMemory()) + return NULL; + + // Set its properties based on the slice + self_slice->start = self->start + start; + self_slice->length = stop - start; + self_slice->parent = (PyObject *)self; // Set parent to keep it alive + + // Increment the reference count of the parent + Py_INCREF(self); + return (PyObject *)self_slice; + } + else if (PyLong_Check(key)) { + return Str_getitem(self, PyLong_AsSsize_t(key)); + } + else { + PyErr_SetString(PyExc_TypeError, "Str indices must be integers or slices"); + return NULL; + } +} + // Will be called by the `PySequence_Contains` static int Str_contains(Str *self, PyObject *arg) { @@ -431,17 +467,47 @@ static PyObject *Str_getslice(Str *self, PyObject *args) { static PyObject *Str_str(Str *self, PyObject *args) { return PyUnicode_FromStringAndSize(self->start, self->length); } +static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) { + + char const *a_start, *b_start; + size_t a_length, b_length; + if (!export_string_like(self, &a_start, &a_length) || !export_string_like(other, &b_start, &b_length)) + Py_RETURN_NOTIMPLEMENTED; + + // Perform byte-wise comparison up to the minimum length + size_t min_length = a_length < b_length ? a_length : b_length; + int cmp_result = memcmp(a_start, b_start, min_length); + + // If the strings are equal up to `min_length`, then the shorter string is smaller + if (cmp_result == 0) + cmp_result = (a_length > b_length) - (a_length < b_length); + + switch (op) { + case Py_LT: return PyBool_FromLong(cmp_result < 0); + case Py_LE: return PyBool_FromLong(cmp_result <= 0); + case Py_EQ: return PyBool_FromLong(cmp_result == 0); + case Py_NE: return PyBool_FromLong(cmp_result != 0); + case Py_GT: return PyBool_FromLong(cmp_result > 0); + case Py_GE: return PyBool_FromLong(cmp_result >= 0); + default: Py_RETURN_NOTIMPLEMENTED; + } +} + static PySequenceMethods Str_as_sequence = { - .sq_length = (lenfunc)Str_len, // - .sq_item = (ssizeargfunc)Str_getitem, // - .sq_contains = (objobjproc)Str_contains, // + .sq_length = Str_len, // + .sq_item = Str_getitem, // + .sq_contains = Str_contains, // +}; + +static PyMappingMethods Str_as_mapping = { + .mp_length = Str_len, // + .mp_subscript = Str_subscript, // Is used to implement slices in Python }; static PyMethodDef Str_methods[] = { // {"contains", (PyCFunction)Str_str, METH_NOARGS, "Convert to Python `str`"}, - {"find", (PyCFunction)Str_len, METH_NOARGS, "Get length"}, - {"find", (PyCFunction)Str_len, METH_NOARGS, "Get length"}, - {"__getitem__", (PyCFunction)Str_getitem, METH_O, "Indexing"}, + // {"find", (PyCFunction)Str_len, METH_NOARGS, "Get length"}, + // {"__getitem__", (PyCFunction)Str_getitem, METH_O, "Indexing"}, {NULL, NULL, 0, NULL}}; static PyTypeObject StrType = { @@ -450,11 +516,13 @@ static PyTypeObject StrType = { .tp_basicsize = sizeof(Str), .tp_flags = Py_TPFLAGS_DEFAULT, .tp_methods = Str_methods, - .tp_new = (newfunc)Str_new, - .tp_init = (initproc)Str_init, - .tp_dealloc = (destructor)Str_dealloc, + .tp_new = Str_new, + .tp_init = Str_init, + .tp_dealloc = Str_dealloc, .tp_as_sequence = &Str_as_sequence, - .tp_hash = (hashfunc)Str_hash, // String hashing functions + .tp_as_mapping = &Str_as_mapping, + .tp_hash = Str_hash, // String hashing functions + .tp_richcompare = Str_richcompare, // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer }; @@ -475,9 +543,13 @@ static PyModuleDef stringzilla_module = { NULL, }; +// String functions: static PyObject *vectorized_find = NULL; static PyObject *vectorized_count = NULL; static PyObject *vectorized_contains = NULL; +static PyObject *vectorized_levenstein = NULL; + +// String collections: static PyObject *vectorized_split = NULL; static PyObject *vectorized_sort = NULL; static PyObject *vectorized_shuffle = NULL; diff --git a/scripts/test.py b/scripts/test.py index a846b6ae..61b121ad 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -29,6 +29,12 @@ def test_contains(): assert "xxx" not in big +def test_rich_comparisons(): + assert Str("aa") == "aa" + assert Str("aa") < "b" + assert Str("abb")[1:] == "bb" + + def test_globals(): assert sz.find("abcdef", "bcdef") == 1 assert sz.find("abcdef", "x") == 6 From 6f9a9bdd0eb5366e695b3e20014b727d77c7779b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 17 Sep 2023 13:20:33 +0400 Subject: [PATCH 09/72] Make: Remove PyBind11 dependency --- .vscode/settings.json | 1 + README.md | 2 +- pyproject.toml | 2 +- setup.py | 6 ++---- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 6e4162b1..ae2ecb67 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -152,6 +152,7 @@ "substr", "SWAR", "TPFLAGS", + "Vardanian", "Zilla" ] } \ No newline at end of file diff --git a/README.md b/README.md index 3b31b262..1049ef63 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ strzl_sort(&array, &your_config); Future development plans include: -- Replace PyBind11 with CPython. +- [x] Replace PyBind11 with CPython. - Reverse-order operations in Python #12. - Bindings for JavaScript #25, Java, and Rust. - Faster string sorting algorithm. diff --git a/pyproject.toml b/pyproject.toml index 84e82bf3..fe8221c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=42", "wheel", "cmake>=3.22", "pybind11"] +requires = ["setuptools>=42", "wheel", "cmake>=3.22"] build-backend = "setuptools.build_meta" [tool.pytest.ini_options] diff --git a/setup.py b/setup.py index 83c22aea..546aed5a 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,7 @@ import os import sys -from setuptools import setup import platform - -from pybind11.setup_helpers import Pybind11Extension +from setuptools import setup, Extension compile_args = [] @@ -49,7 +47,7 @@ ext_modules = [ - Pybind11Extension( + Extension( "stringzilla", ["python/lib.c"], include_dirs=["stringzilla"], From b5689b748637f17bed07fe20a0a4311ee3868665 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 17 Sep 2023 15:27:59 +0400 Subject: [PATCH 10/72] Add: Vectorized `count` --- .vscode/settings.json | 3 + README.md | 28 ++++-- python/lib.c | 228 +++++++++++++++++++++++++++++++++--------- scripts/test.py | 15 ++- 4 files changed, 216 insertions(+), 58 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index ae2ecb67..7ed8de05 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -131,10 +131,12 @@ "itemsize", "keeplinebreaks", "keepseparator", + "kwds", "kwnames", "levenstein", "maxsplit", "memcpy", + "MODINIT", "nargsf", "newfunc", "NOARGS", @@ -153,6 +155,7 @@ "SWAR", "TPFLAGS", "Vardanian", + "vectorcallfunc", "Zilla" ] } \ No newline at end of file diff --git a/README.md b/README.md index 1049ef63..973deaee 100644 --- a/README.md +++ b/README.md @@ -36,21 +36,21 @@ Coming soon. ## Quick Start: Python 🐍 1️. Install via pip: `pip install stringzilla` -2. Import classes: `from stringzilla import Str, File, Strs` +1. Import the classes you need: `from stringzilla import Str, Strs, MemoryMappedFile` ### Basic Usage StringZilla offers two mostly interchangeable core classes: ```python -from stringzilla import Str, File +from stringzilla import Str, MemoryMappedFile -text1 = Str('some-string') -text2 = File('some-file.txt') +text_from_str = Str('some-string') +text_from_file = Str(MemoryMappedFile('some-file.txt')) ``` The `Str` is designed to replace long Python `str` strings and wrap our C-level API. -On the other hand, the `File` memory-maps a file from persistent memory without loading its copy into RAM. +On the other hand, the `MemoryMappedFile` memory-maps a file from persistent memory without loading its copy into RAM. The contents of that file would remain immutable, and the mapping can be shared by multiple Python processes simultaneously. A standard dataset pre-processing use case would be to map a sizeable textual dataset like Common Crawl into memory, spawn child processes, and split the job between them. @@ -58,11 +58,12 @@ A standard dataset pre-processing use case would be to map a sizeable textual da - Length: `len(text) -> int` - Indexing: `text[42] -> str` -- Slicing: `text[42:46] -> str` +- Slicing: `text[42:46] -> Str` +- String conversion: `str(text) -> str` +- Substring check: `'substring' in text -> bool` ### Advanced Operations -- `'substring' in text -> bool` - `text.contains('substring', start=0, end=9223372036854775807) -> bool` - `text.find('substring', start=0, end=9223372036854775807) -> int` - `text.count('substring', start=0, end=9223372036854775807, allowoverlap=False) -> int` @@ -93,6 +94,19 @@ lines.append('Pythonic string') lines.extend(shuffled_copy) ``` +### Low-Level Python API + +The StringZilla CPython bindings implement vector-call conventions for faster calls. + +```py +import stringzilla as sz + +contains: bool = sz.contains("haystack", "needle", start=0, end=9223372036854775807) +offset: int = sz.find("haystack", "needle", start=0, end=9223372036854775807) +count: int = sz.count("haystack", "needle", start=0, end=9223372036854775807, allowoverlap=False) +levenstein: int = sz.levenstein("needle", "nidl") +``` + ## Quick Start: C 🛠️ There is an ABI-stable C 99 interface, in case you have a database, an operating system, or a runtime you want to integrate with StringZilla. diff --git a/python/lib.c b/python/lib.c index 01240c44..93bbce93 100644 --- a/python/lib.c +++ b/python/lib.c @@ -125,41 +125,160 @@ int export_string_like(PyObject *object, char const **start, size_t *length) { #pragma region Global Functions -static PyObject *str_find_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { - // Check the number of arguments and types +static size_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); - if (nargs < 2 || nargs > 4) { - PyErr_SetString(PyExc_TypeError, "Invalid arguments"); + + // Initialize defaults + Py_ssize_t start = 0; + Py_ssize_t end = PY_SSIZE_T_MAX; + + // Parse positional arguments: haystack and needle + if (nargs < 2) { + PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); return NULL; } - // Parse the haystack. PyObject *haystack_obj = args[0]; + PyObject *needle_obj = args[1]; + struct strzl_haystack_t haystack; - if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len)) { - PyErr_SetString(PyExc_TypeError, "First argument (haystack) must be string-like"); + struct strzl_needle_t needle; + if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) || + !export_string_like(needle_obj, &needle.ptr, &needle.len)) { + PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); return NULL; } - // Parse the needle. + // Parse additional positional arguments + if (nargs > 2) + start = PyLong_AsSsize_t(args[2]); + if (nargs > 3) + end = PyLong_AsSsize_t(args[3]); + + // Parse keyword arguments + if (kwnames != NULL) { + for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) { + PyObject *key = PyTuple_GetItem(kwnames, i); + PyObject *value = args[nargs + i]; + if (PyUnicode_CompareWithASCIIString(key, "start") == 0) + start = PyLong_AsSsize_t(value); + else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) + end = PyLong_AsSsize_t(value); + else { + PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key); + return NULL; + } + } + } + + // Limit the haystack range + size_t normalized_offset, normalized_length; + slice(haystack.len, start, end, &normalized_offset, &normalized_length); + haystack.ptr += normalized_offset; + haystack.len = normalized_length; + + // Perform contains operation + return strzl_neon_find_substr(haystack, needle); +} + +static PyObject *str_find_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { + size_t offset = str_find_vectorcall_(NULL, args, nargsf, kwnames); + return PyLong_FromSize_t(offset); +} + +static PyObject *str_contains_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { + size_t offset = str_find_vectorcall_(NULL, args, nargsf, kwnames); + if (offset != haystack.len) { + Py_RETURN_TRUE; + } + else { + Py_RETURN_FALSE; + } +} + +static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { + Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); + + // Initialize defaults + Py_ssize_t start = 0; + Py_ssize_t end = PY_SSIZE_T_MAX; + int allow_overlap = 0; + + // Parse positional arguments: haystack and needle + if (nargs < 2) { + PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); + return NULL; + } + + PyObject *haystack_obj = args[0]; PyObject *needle_obj = args[1]; + + struct strzl_haystack_t haystack; struct strzl_needle_t needle; - needle.anomaly_offset = 0; - if (!export_string_like(needle_obj, &needle.ptr, &needle.len)) { - PyErr_SetString(PyExc_TypeError, "Second argument (needle) must be string-like"); + if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) || + !export_string_like(needle_obj, &needle.ptr, &needle.len)) { + PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); return NULL; } - // Limit the haystack range. - Py_ssize_t start = (nargs > 2) ? PyLong_AsSsize_t(args[2]) : 0; - Py_ssize_t end = (nargs > 3) ? PyLong_AsSsize_t(args[3]) : PY_SSIZE_T_MAX; + // Parse additional positional arguments + if (nargs > 2) + start = PyLong_AsSsize_t(args[2]); + if (nargs > 3) + end = PyLong_AsSsize_t(args[3]); + + // Parse keyword arguments + if (kwnames != NULL) { + for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) { + PyObject *key = PyTuple_GetItem(kwnames, i); + PyObject *value = args[nargs + i]; + if (PyUnicode_CompareWithASCIIString(key, "start") == 0) + start = PyLong_AsSsize_t(value); + else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) + end = PyLong_AsSsize_t(value); + else if (PyUnicode_CompareWithASCIIString(key, "allowoverlap") == 0) + allow_overlap = PyObject_IsTrue(value); + else { + PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key); + return NULL; + } + } + } + + // Limit the haystack range size_t normalized_offset, normalized_length; slice(haystack.len, start, end, &normalized_offset, &normalized_length); - - haystack.ptr = haystack.ptr + normalized_offset; + haystack.ptr += normalized_offset; haystack.len = normalized_length; - size_t position = strzl_neon_find_substr(haystack, needle); - return PyLong_FromSize_t(position); + + // Perform counting operation + size_t count = 0; + if (needle.len == 1) { + count = strzl_naive_count_char(haystack, *needle.ptr); + } + else { + // Your existing logic for count_substr can be embedded here + if (allow_overlap) { + while (haystack.len) { + size_t offset = strzl_neon_find_substr(haystack, needle); + int found = offset != haystack.len; + count += found; + haystack.ptr += offset + found; + haystack.len -= offset + found; + } + } + else { + while (haystack.len) { + size_t offset = strzl_neon_find_substr(haystack, needle); + int found = offset != haystack.len; + count += found; + haystack.ptr += offset + needle.len; + haystack.len -= offset + needle.len * found; + } + } + } + + return PyLong_FromSize_t(count); } #pragma endregion @@ -349,8 +468,12 @@ static void Str_dealloc(Str *self) { Py_TYPE(self)->tp_free((PyObject *)self); } +static PyObject *Str_str(Str *self) { return PyUnicode_FromStringAndSize(self->start, self->length); } + static Py_ssize_t Str_len(Str *self) { return self->length; } +static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)strzl_hash_crc32_native(self->start, self->length); } + static PyObject *Str_getitem(Str *self, Py_ssize_t i) { // Negative indexing @@ -419,8 +542,6 @@ static int Str_contains(Str *self, PyObject *arg) { return position != haystack.len; } -static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)strzl_hash_crc32_native(self->start, self->length); } - static PyObject *Str_getslice(Str *self, PyObject *args) { PyObject *start_obj = NULL, *end_obj = NULL; ssize_t start = 0, end = self->length; // Default values @@ -465,8 +586,6 @@ static PyObject *Str_getslice(Str *self, PyObject *args) { return (PyObject *)new_str; } -static PyObject *Str_str(Str *self, PyObject *args) { return PyUnicode_FromStringAndSize(self->start, self->length); } - static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) { char const *a_start, *b_start; @@ -505,9 +624,9 @@ static PyMappingMethods Str_as_mapping = { }; static PyMethodDef Str_methods[] = { // - {"contains", (PyCFunction)Str_str, METH_NOARGS, "Convert to Python `str`"}, - // {"find", (PyCFunction)Str_len, METH_NOARGS, "Get length"}, - // {"__getitem__", (PyCFunction)Str_getitem, METH_O, "Indexing"}, + // {"contains", (PyCFunction)..., METH_NOARGS, "Convert to Python `str`"}, + // {"find", (PyCFunction)..., METH_NOARGS, "Get length"}, + // {"__getitem__", (PyCFunction)..., METH_O, "Indexing"}, {NULL, NULL, 0, NULL}}; static PyTypeObject StrType = { @@ -523,6 +642,7 @@ static PyTypeObject StrType = { .tp_as_mapping = &Str_as_mapping, .tp_hash = Str_hash, // String hashing functions .tp_richcompare = Str_richcompare, + .tp_str = Str_str, // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer }; @@ -554,6 +674,24 @@ static PyObject *vectorized_split = NULL; static PyObject *vectorized_sort = NULL; static PyObject *vectorized_shuffle = NULL; +PyObject *register_vectorcall(PyObject *module, char const *name, vectorcallfunc vectorcall) { + + PyCFunctionObject *vectorcall_object = (PyCFunctionObject *)PyObject_Malloc(sizeof(PyCFunctionObject)); + if (vectorcall_object == NULL) + return NULL; + + PyObject_Init(vectorcall_object, &PyCFunction_Type); + vectorcall_object->m_ml = NULL; // No regular `PyMethodDef` + vectorcall_object->vectorcall = vectorcall; + + // Add the 'find' function to the module + if (PyModule_AddObject(module, name, vectorcall_object) < 0) { + Py_XDECREF(vectorcall_object); + return NULL; + } + return vectorcall_object; +} + PyMODINIT_FUNC PyInit_stringzilla(void) { PyObject *m; @@ -582,26 +720,19 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { return NULL; } - // Create the 'find' function - vectorized_find = PyObject_Malloc(sizeof(PyCFunctionObject)); - if (vectorized_find == NULL) { - Py_XDECREF(&MemoryMappedFileType); - Py_XDECREF(&StrType); - Py_XDECREF(m); - PyErr_NoMemory(); - return NULL; - } - PyObject_Init(vectorized_find, &PyCFunction_Type); - ((PyCFunctionObject *)vectorized_find)->m_ml = NULL; // No regular PyMethodDef - ((PyCFunctionObject *)vectorized_find)->vectorcall = str_find_vectorcall; + // Register the vectorized functions + vectorized_find = register_vectorcall(m, "find", str_find_vectorcall); + vectorized_contains = register_vectorcall(m, "contains", str_contains_vectorcall); + vectorized_count = register_vectorcall(m, "count", str_count_vectorcall); + vectorized_levenstein = register_vectorcall(m, "levenstein", str_find_vectorcall); - // Add the 'find' function to the module - if (PyModule_AddObject(m, "find", vectorized_find) < 0) { - PyObject_Free(vectorized_find); - Py_XDECREF(&MemoryMappedFileType); - Py_XDECREF(&StrType); - Py_XDECREF(m); - return NULL; + vectorized_split = register_vectorcall(m, "split", str_find_vectorcall); + vectorized_sort = register_vectorcall(m, "sort", str_find_vectorcall); + vectorized_shuffle = register_vectorcall(m, "shuffle", str_find_vectorcall); + if (!vectorized_find || !vectorized_count || // + !vectorized_contains || !vectorized_levenstein || // + !vectorized_split || !vectorized_sort || !vectorized_shuffle) { + goto cleanup; } return m; @@ -609,16 +740,21 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { cleanup: if (vectorized_find) Py_XDECREF(vectorized_find); - if (vectorized_count) - Py_XDECREF(vectorized_count); if (vectorized_contains) Py_XDECREF(vectorized_contains); + if (vectorized_count) + Py_XDECREF(vectorized_count); + if (vectorized_levenstein) + Py_XDECREF(vectorized_levenstein); if (vectorized_split) Py_XDECREF(vectorized_split); if (vectorized_sort) Py_XDECREF(vectorized_sort); if (vectorized_shuffle) Py_XDECREF(vectorized_shuffle); + + Py_XDECREF(&MemoryMappedFileType); + Py_XDECREF(&StrType); Py_XDECREF(m); PyErr_NoMemory(); return NULL; diff --git a/scripts/test.py b/scripts/test.py index 61b121ad..21d5d7ba 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -9,6 +9,16 @@ from stringzilla import Str +def test_globals(): + assert sz.find("abcdef", "bcdef") == 1 + assert sz.find("abcdef", "x") == 6 + + assert sz.count("abcdef", "x") == 0 + assert sz.count("aaaaa", "a") == 5 + assert sz.count("aaaaa", "aa") == 2 + assert sz.count("aaaaa", "aa", allowoverlap=True) == 4 + + def test_construct(): native = "aaaaa" big = Str(native) @@ -35,11 +45,6 @@ def test_rich_comparisons(): assert Str("abb")[1:] == "bb" -def test_globals(): - assert sz.find("abcdef", "bcdef") == 1 - assert sz.find("abcdef", "x") == 6 - - # def get_random_string( # length: Optional[int] = None, variability: Optional[int] = None # ) -> str: From 4ed2e9a83d286ac299d84f22ea0172152a40f0b0 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 17 Sep 2023 16:20:15 +0400 Subject: [PATCH 11/72] Fix: Bounded Levenstein distance --- .vscode/settings.json | 4 +- README.md | 8 +- python/lib.c | 183 +++++++++++++++++++++++++++----------- scripts/test.py | 8 +- stringzilla/stringzilla.h | 35 ++++++-- 5 files changed, 175 insertions(+), 63 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 7ed8de05..2a8fa9c2 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -116,10 +116,12 @@ "stop_token": "cpp", "__verbose_abort": "cpp", "strstream": "cpp", - "filesystem": "cpp" + "filesystem": "cpp", + "__memory": "c" }, "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", "cSpell.words": [ + "abababab", "allowoverlap", "basicsize", "bigram", diff --git a/README.md b/README.md index 973deaee..8b4630de 100644 --- a/README.md +++ b/README.md @@ -36,21 +36,21 @@ Coming soon. ## Quick Start: Python 🐍 1️. Install via pip: `pip install stringzilla` -1. Import the classes you need: `from stringzilla import Str, Strs, MemoryMappedFile` +1. Import the classes you need: `from stringzilla import Str, Strs, File` ### Basic Usage StringZilla offers two mostly interchangeable core classes: ```python -from stringzilla import Str, MemoryMappedFile +from stringzilla import Str, File text_from_str = Str('some-string') -text_from_file = Str(MemoryMappedFile('some-file.txt')) +text_from_file = Str(File('some-file.txt')) ``` The `Str` is designed to replace long Python `str` strings and wrap our C-level API. -On the other hand, the `MemoryMappedFile` memory-maps a file from persistent memory without loading its copy into RAM. +On the other hand, the `File` memory-maps a file from persistent memory without loading its copy into RAM. The contents of that file would remain immutable, and the mapping can be shared by multiple Python processes simultaneously. A standard dataset pre-processing use case would be to map a sizeable textual dataset like Common Crawl into memory, spawn child processes, and split the job between them. diff --git a/python/lib.c b/python/lib.c index 93bbce93..768b0d7a 100644 --- a/python/lib.c +++ b/python/lib.c @@ -25,9 +25,14 @@ typedef SSIZE_T ssize_t; #pragma region Forward Declarations -static PyTypeObject MemoryMappedFileType; +static PyTypeObject FileType; static PyTypeObject StrType; +struct { + void *ptr; + size_t len; +} temporary_memory = {NULL, 0}; + /** * @brief Describes an on-disk file mapped into RAM, which is different from Python's * native `mmap` module, as it exposes the address of the mapping in memory. @@ -42,11 +47,11 @@ typedef struct { #endif void *start; size_t length; -} MemoryMappedFile; +} File; /** * @brief Type-punned StringZilla-string, that points to a slice of an existing Python `str` - * or a `MemoryMappedFile`. + * or a `File`. * * When a slice is constructed, the `parent` object's reference count is being incremented to preserve lifetime. * It usage in Python would look like: @@ -112,8 +117,8 @@ int export_string_like(PyObject *object, char const **start, size_t *length) { *length = str->length; return 1; } - else if (PyObject_TypeCheck(object, &MemoryMappedFileType)) { - MemoryMappedFile *file = (MemoryMappedFile *)object; + else if (PyObject_TypeCheck(object, &FileType)) { + File *file = (File *)object; *start = file->start; *length = file->length; return 1; @@ -125,7 +130,7 @@ int export_string_like(PyObject *object, char const **start, size_t *length) { #pragma region Global Functions -static size_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { +static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); // Initialize defaults @@ -140,9 +145,9 @@ static size_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t na PyObject *haystack_obj = args[0]; PyObject *needle_obj = args[1]; - struct strzl_haystack_t haystack; struct strzl_needle_t needle; + needle.anomaly_offset = 0; if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) || !export_string_like(needle_obj, &needle.ptr, &needle.len)) { PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); @@ -178,21 +183,24 @@ static size_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t na haystack.len = normalized_length; // Perform contains operation - return strzl_neon_find_substr(haystack, needle); + size_t offset = strzl_neon_find_substr(haystack, needle); + if (offset == haystack.len) + return -1; + return (Py_ssize_t)offset; } static PyObject *str_find_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { - size_t offset = str_find_vectorcall_(NULL, args, nargsf, kwnames); - return PyLong_FromSize_t(offset); + Py_ssize_t signed_offset = str_find_vectorcall_(NULL, args, nargsf, kwnames); + return PyLong_FromSsize_t(signed_offset); } static PyObject *str_contains_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { - size_t offset = str_find_vectorcall_(NULL, args, nargsf, kwnames); - if (offset != haystack.len) { - Py_RETURN_TRUE; + Py_ssize_t signed_offset = str_find_vectorcall_(NULL, args, nargsf, kwnames); + if (signed_offset == -1) { + Py_RETURN_FALSE; } else { - Py_RETURN_FALSE; + Py_RETURN_TRUE; } } @@ -215,6 +223,7 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t struct strzl_haystack_t haystack; struct strzl_needle_t needle; + needle.anomaly_offset = 0; if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) || !export_string_like(needle_obj, &needle.ptr, &needle.len)) { PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); @@ -281,11 +290,81 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t return PyLong_FromSize_t(count); } +static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { + Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); + + // Validate the number of arguments + if (nargs < 2 || nargs > 3) { + PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); + return NULL; + } + + PyObject *str1_obj = args[0]; + PyObject *str2_obj = args[1]; + + struct strzl_haystack_t str1, str2; + if (!export_string_like(str1_obj, &str1.ptr, &str1.len) || !export_string_like(str2_obj, &str2.ptr, &str2.len)) { + PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); + return NULL; + } + + // Initialize bound argument + int bound = 255; + + // Check if `bound` is given as a positional argument + if (nargs == 3) { + bound = PyLong_AsLong(args[2]); + if (bound > 255 || bound < 0) { + PyErr_SetString(PyExc_ValueError, "Bound must be an integer between 0 and 255"); + return NULL; + } + } + + // Parse keyword arguments + if (kwnames != NULL) { + for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) { + PyObject *key = PyTuple_GetItem(kwnames, i); + PyObject *value = args[nargs + i]; + if (PyUnicode_CompareWithASCIIString(key, "bound") == 0) { + if (nargs == 3) { + PyErr_SetString(PyExc_TypeError, "Received bound both as positional and keyword argument"); + return NULL; + } + bound = PyLong_AsLong(value); + if (bound > 255 || bound < 0) { + PyErr_SetString(PyExc_ValueError, "Bound must be an integer between 0 and 255"); + return NULL; + } + } + } + } + + // Initialize or reallocate the Levenshtein distance matrix + size_t memory_needed = strzl_levenstein_memory_needed(str1.len, str2.len); + if (temporary_memory.len < memory_needed) { + temporary_memory.ptr = realloc(temporary_memory.ptr, memory_needed); + temporary_memory.len = memory_needed; + } + if (temporary_memory.ptr == NULL) { + PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix"); + return NULL; + } + + levenstein_distance_t distance = strzl_levenstein( // + str1.ptr, + str1.len, + str2.ptr, + str2.len, + (levenstein_distance_t)bound, + temporary_memory.ptr); + return PyLong_FromLong(distance); +} + #pragma endregion #pragma region MemoryMappingFile -static void MemoryMappedFile_dealloc(MemoryMappedFile *self) { +static void File_dealloc(File *self) { #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) if (self->start) { UnmapViewOfFile(self->start); @@ -313,9 +392,9 @@ static void MemoryMappedFile_dealloc(MemoryMappedFile *self) { Py_TYPE(self)->tp_free((PyObject *)self); } -static PyObject *MemoryMappedFile_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) { - MemoryMappedFile *self; - self = (MemoryMappedFile *)type->tp_alloc(type, 0); +static PyObject *File_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) { + File *self; + self = (File *)type->tp_alloc(type, 0); if (self == NULL) return NULL; @@ -329,7 +408,7 @@ static PyObject *MemoryMappedFile_new(PyTypeObject *type, PyObject *positional_a self->length = 0; } -static int MemoryMappedFile_init(MemoryMappedFile *self, PyObject *positional_args, PyObject *named_args) { +static int File_init(File *self, PyObject *positional_args, PyObject *named_args) { const char *path; if (!PyArg_ParseTuple(positional_args, "s", &path)) return -1; @@ -384,18 +463,18 @@ static int MemoryMappedFile_init(MemoryMappedFile *self, PyObject *positional_ar return 0; } -static PyMethodDef MemoryMappedFile_methods[] = { // +static PyMethodDef File_methods[] = { // {NULL, NULL, 0, NULL}}; -static PyTypeObject MemoryMappedFileType = { - PyObject_HEAD_INIT(NULL).tp_name = "stringzilla.MemoryMappedFile", +static PyTypeObject FileType = { + PyObject_HEAD_INIT(NULL).tp_name = "stringzilla.File", .tp_doc = "Memory mapped file class, that exposes the memory range for low-level access", - .tp_basicsize = sizeof(MemoryMappedFile), + .tp_basicsize = sizeof(File), .tp_flags = Py_TPFLAGS_DEFAULT, - .tp_methods = MemoryMappedFile_methods, - .tp_new = (newfunc)MemoryMappedFile_new, - .tp_init = (initproc)MemoryMappedFile_init, - .tp_dealloc = (destructor)MemoryMappedFile_dealloc, + .tp_methods = File_methods, + .tp_new = (newfunc)File_new, + .tp_init = (initproc)File_init, + .tp_dealloc = (destructor)File_dealloc, // PyBufferProcs *tp_as_buffer; @@ -663,17 +742,6 @@ static PyModuleDef stringzilla_module = { NULL, }; -// String functions: -static PyObject *vectorized_find = NULL; -static PyObject *vectorized_count = NULL; -static PyObject *vectorized_contains = NULL; -static PyObject *vectorized_levenstein = NULL; - -// String collections: -static PyObject *vectorized_split = NULL; -static PyObject *vectorized_sort = NULL; -static PyObject *vectorized_shuffle = NULL; - PyObject *register_vectorcall(PyObject *module, char const *name, vectorcallfunc vectorcall) { PyCFunctionObject *vectorcall_object = (PyCFunctionObject *)PyObject_Malloc(sizeof(PyCFunctionObject)); @@ -692,13 +760,19 @@ PyObject *register_vectorcall(PyObject *module, char const *name, vectorcallfunc return vectorcall_object; } +void cleanup_module(void) { + free(temporary_memory.ptr); + temporary_memory.ptr = NULL; + temporary_memory.len = 0; +} + PyMODINIT_FUNC PyInit_stringzilla(void) { PyObject *m; if (PyType_Ready(&StrType) < 0) return NULL; - if (PyType_Ready(&MemoryMappedFileType) < 0) + if (PyType_Ready(&FileType) < 0) return NULL; m = PyModule_Create(&stringzilla_module); @@ -712,23 +786,30 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { return NULL; } - Py_INCREF(&MemoryMappedFileType); - if (PyModule_AddObject(m, "MemoryMappedFile", (PyObject *)&MemoryMappedFileType) < 0) { - Py_XDECREF(&MemoryMappedFileType); + Py_INCREF(&FileType); + if (PyModule_AddObject(m, "File", (PyObject *)&FileType) < 0) { + Py_XDECREF(&FileType); Py_XDECREF(&StrType); Py_XDECREF(m); return NULL; } + // Initialize temporary_memory, if needed + // For example, allocate an initial chunk + temporary_memory.ptr = malloc(4096); + temporary_memory.len = 4096 * (temporary_memory.ptr != NULL); + atexit(cleanup_module); + // Register the vectorized functions - vectorized_find = register_vectorcall(m, "find", str_find_vectorcall); - vectorized_contains = register_vectorcall(m, "contains", str_contains_vectorcall); - vectorized_count = register_vectorcall(m, "count", str_count_vectorcall); - vectorized_levenstein = register_vectorcall(m, "levenstein", str_find_vectorcall); - - vectorized_split = register_vectorcall(m, "split", str_find_vectorcall); - vectorized_sort = register_vectorcall(m, "sort", str_find_vectorcall); - vectorized_shuffle = register_vectorcall(m, "shuffle", str_find_vectorcall); + PyObject *vectorized_find = register_vectorcall(m, "find", str_find_vectorcall); + PyObject *vectorized_contains = register_vectorcall(m, "contains", str_contains_vectorcall); + PyObject *vectorized_count = register_vectorcall(m, "count", str_count_vectorcall); + PyObject *vectorized_levenstein = register_vectorcall(m, "levenstein", str_levenstein_vectorcall); + + PyObject *vectorized_split = register_vectorcall(m, "split", str_find_vectorcall); + PyObject *vectorized_sort = register_vectorcall(m, "sort", str_find_vectorcall); + PyObject *vectorized_shuffle = register_vectorcall(m, "shuffle", str_find_vectorcall); + if (!vectorized_find || !vectorized_count || // !vectorized_contains || !vectorized_levenstein || // !vectorized_split || !vectorized_sort || !vectorized_shuffle) { @@ -753,7 +834,7 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { if (vectorized_shuffle) Py_XDECREF(vectorized_shuffle); - Py_XDECREF(&MemoryMappedFileType); + Py_XDECREF(&FileType); Py_XDECREF(&StrType); Py_XDECREF(m); PyErr_NoMemory(); diff --git a/scripts/test.py b/scripts/test.py index 21d5d7ba..c3f70523 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -11,13 +11,19 @@ def test_globals(): assert sz.find("abcdef", "bcdef") == 1 - assert sz.find("abcdef", "x") == 6 + assert sz.find("abcdef", "x") == -1 assert sz.count("abcdef", "x") == 0 assert sz.count("aaaaa", "a") == 5 assert sz.count("aaaaa", "aa") == 2 assert sz.count("aaaaa", "aa", allowoverlap=True) == 4 + assert sz.levenstein("aaa", "aaa") == 0 + assert sz.levenstein("aaa", "bbb") == 3 + assert sz.levenstein("abababab", "aaaaaaaa") == 4 + assert sz.levenstein("abababab", "aaaaaaaa", 2) == 2 + assert sz.levenstein("abababab", "aaaaaaaa", bound=2) == 2 + def test_construct(): native = "aaaaa" diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index ca58b1ed..dc89934c 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -412,9 +412,9 @@ inline static size_t strzl_neon_find_substr(strzl_haystack_t h, strzl_needle_t n uint32x4_t matches3 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 3)), masks), anomalies); // Extracting matches from matches: - // vmaxvq_u32 (only a64) - // vgetq_lane_u32 (all) - // vorrq_u32 (all) + // vmaxvq_u32 (only a64) + // vgetq_lane_u32 (all) + // vorrq_u32 (all) uint32x4_t matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3)); uint64x2_t matches64x2 = vreinterpretq_u64_u32(matches); int has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1); @@ -686,10 +686,21 @@ inline static levenstein_distance_t strzl_levenstein( // levenstein_distance_t bound, void *buffer) { + // If one of the strings is empty - the edit distance is equal to the length of the other one if (a_length == 0) - return b_length <= bound ? b_length : bound + 1; + return b_length <= bound ? b_length : bound; if (b_length == 0) - return a_length <= bound ? a_length : bound + 1; + return a_length <= bound ? a_length : bound; + + // If the difference in length is beyond the `bound`, there is no need to check at all + if (a_length > b_length) { + if (a_length - b_length > bound) + return bound + 1; + } + else { + if (b_length - a_length > bound) + return bound + 1; + } levenstein_distance_t *previous_distances = (levenstein_distance_t *)buffer; levenstein_distance_t *current_distances = previous_distances + b_length + 1; @@ -700,20 +711,32 @@ inline static levenstein_distance_t strzl_levenstein( // for (size_t idx_a = 0; idx_a != a_length; ++idx_a) { current_distances[0] = idx_a + 1; + // Initialize min_distance with a value greater than bound + levenstein_distance_t min_distance = bound; + for (size_t idx_b = 0; idx_b != b_length; ++idx_b) { levenstein_distance_t cost_deletion = previous_distances[idx_b + 1] + 1; levenstein_distance_t cost_insertion = current_distances[idx_b] + 1; levenstein_distance_t cost_substitution = previous_distances[idx_b] + (a[idx_a] != b[idx_b]); current_distances[idx_b + 1] = _strzl_levenstein_minimum(cost_deletion, cost_insertion, cost_substitution); + + // Keep track of the minimum distance seen so far in this row + if (current_distances[idx_b + 1] < min_distance) { + min_distance = current_distances[idx_b + 1]; + } } + // If the minimum distance in this row exceeded the bound, return early + if (min_distance > bound) + return bound; + // Swap previous_distances and current_distances pointers levenstein_distance_t *temp = previous_distances; previous_distances = current_distances; current_distances = temp; } - return previous_distances[b_length]; + return previous_distances[b_length] <= bound ? previous_distances[b_length] : bound; } /** From d37b3422d99d1cfa20532ff9ff29ae6200acee07 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 17 Sep 2023 16:28:57 +0400 Subject: [PATCH 12/72] Break: Shorter function prefixes --- .vscode/settings.json | 1 + README.md | 14 +-- python/lib.c | 30 +++--- scripts/test.cpp | 36 +++---- stringzilla/stringzilla.h | 221 +++++++++++++++++++------------------- 5 files changed, 152 insertions(+), 150 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 2a8fa9c2..48034254 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -158,6 +158,7 @@ "TPFLAGS", "Vardanian", "vectorcallfunc", + "XDECREF", "Zilla" ] } \ No newline at end of file diff --git a/README.md b/README.md index 8b4630de..1f9cea00 100644 --- a/README.md +++ b/README.md @@ -115,17 +115,17 @@ There is an ABI-stable C 99 interface, in case you have a database, an operating #include "stringzilla.h" // Initialize your haystack and needle -strzl_haystack_t haystack = {your_text, your_text_length}; -strzl_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset}; +sz_haystack_t haystack = {your_text, your_text_length}; +sz_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset}; // Perform string-level operations -size_t character_count = strzl_naive_count_char(haystack, 'a'); -size_t character_position = strzl_naive_find_char(haystack, 'a'); -size_t substring_position = strzl_naive_find_substr(haystack, needle); +size_t character_count = sz_naive_count_char(haystack, 'a'); +size_t character_position = sz_naive_find_char(haystack, 'a'); +size_t substring_position = sz_naive_find_substr(haystack, needle); // Perform collection level operations -strzl_array_t array = {your_order, your_count, your_get_begin, your_get_length, your_handle}; -strzl_sort(&array, &your_config); +sz_array_t array = {your_order, your_count, your_get_begin, your_get_length, your_handle}; +sz_sort(&array, &your_config); ``` ## Contributing 👾 diff --git a/python/lib.c b/python/lib.c index 768b0d7a..84582d32 100644 --- a/python/lib.c +++ b/python/lib.c @@ -145,8 +145,8 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_ PyObject *haystack_obj = args[0]; PyObject *needle_obj = args[1]; - struct strzl_haystack_t haystack; - struct strzl_needle_t needle; + struct sz_haystack_t haystack; + struct sz_needle_t needle; needle.anomaly_offset = 0; if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) || !export_string_like(needle_obj, &needle.ptr, &needle.len)) { @@ -183,7 +183,7 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_ haystack.len = normalized_length; // Perform contains operation - size_t offset = strzl_neon_find_substr(haystack, needle); + size_t offset = sz_neon_find_substr(haystack, needle); if (offset == haystack.len) return -1; return (Py_ssize_t)offset; @@ -221,8 +221,8 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t PyObject *haystack_obj = args[0]; PyObject *needle_obj = args[1]; - struct strzl_haystack_t haystack; - struct strzl_needle_t needle; + struct sz_haystack_t haystack; + struct sz_needle_t needle; needle.anomaly_offset = 0; if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) || !export_string_like(needle_obj, &needle.ptr, &needle.len)) { @@ -263,13 +263,13 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t // Perform counting operation size_t count = 0; if (needle.len == 1) { - count = strzl_naive_count_char(haystack, *needle.ptr); + count = sz_naive_count_char(haystack, *needle.ptr); } else { // Your existing logic for count_substr can be embedded here if (allow_overlap) { while (haystack.len) { - size_t offset = strzl_neon_find_substr(haystack, needle); + size_t offset = sz_neon_find_substr(haystack, needle); int found = offset != haystack.len; count += found; haystack.ptr += offset + found; @@ -278,7 +278,7 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t } else { while (haystack.len) { - size_t offset = strzl_neon_find_substr(haystack, needle); + size_t offset = sz_neon_find_substr(haystack, needle); int found = offset != haystack.len; count += found; haystack.ptr += offset + needle.len; @@ -302,7 +302,7 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s PyObject *str1_obj = args[0]; PyObject *str2_obj = args[1]; - struct strzl_haystack_t str1, str2; + struct sz_haystack_t str1, str2; if (!export_string_like(str1_obj, &str1.ptr, &str1.len) || !export_string_like(str2_obj, &str2.ptr, &str2.len)) { PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); return NULL; @@ -340,7 +340,7 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s } // Initialize or reallocate the Levenshtein distance matrix - size_t memory_needed = strzl_levenstein_memory_needed(str1.len, str2.len); + size_t memory_needed = sz_levenstein_memory_needed(str1.len, str2.len); if (temporary_memory.len < memory_needed) { temporary_memory.ptr = realloc(temporary_memory.ptr, memory_needed); temporary_memory.len = memory_needed; @@ -350,7 +350,7 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s return NULL; } - levenstein_distance_t distance = strzl_levenstein( // + levenstein_distance_t distance = sz_levenstein( // str1.ptr, str1.len, str2.ptr, @@ -551,7 +551,7 @@ static PyObject *Str_str(Str *self) { return PyUnicode_FromStringAndSize(self->s static Py_ssize_t Str_len(Str *self) { return self->length; } -static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)strzl_hash_crc32_native(self->start, self->length); } +static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash_crc32_native(self->start, self->length); } static PyObject *Str_getitem(Str *self, Py_ssize_t i) { @@ -607,17 +607,17 @@ static PyObject *Str_subscript(Str *self, PyObject *key) { // Will be called by the `PySequence_Contains` static int Str_contains(Str *self, PyObject *arg) { - struct strzl_needle_t needle_struct; + struct sz_needle_t needle_struct; needle_struct.anomaly_offset = 0; if (!export_string_like(arg, &needle_struct.ptr, &needle_struct.len)) { PyErr_SetString(PyExc_TypeError, "Unsupported argument type"); return -1; } - struct strzl_haystack_t haystack; + struct sz_haystack_t haystack; haystack.ptr = self->start; haystack.len = self->length; - size_t position = strzl_neon_find_substr(haystack, needle_struct); + size_t position = sz_neon_find_substr(haystack, needle_struct); return position != haystack.len; } diff --git a/scripts/test.cpp b/scripts/test.cpp index c6fa28ab..3e76248e 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -195,23 +195,23 @@ int main(int, char const **) { bench_search("std::search", full_text, [&]() { return std::search(full_text.begin(), full_text.end(), needle.begin(), needle.end()) - full_text.begin(); }); - bench_search("strzl_naive_find_substr", full_text, [&]() { - strzl_haystack_t h {full_text.data(), full_text.size()}; - strzl_needle_t n {needle.data(), needle.size()}; - return strzl_naive_find_substr(h, n); + bench_search("sz_naive_find_substr", full_text, [&]() { + sz_haystack_t h {full_text.data(), full_text.size()}; + sz_needle_t n {needle.data(), needle.size()}; + return sz_naive_find_substr(h, n); }); #if defined(__ARM_NEON) - bench_search("strzl_neon_find_substr", full_text, [&]() { - strzl_haystack_t h {full_text.data(), full_text.size()}; - strzl_needle_t n {needle.data(), needle.size()}; - return strzl_neon_find_substr(h, n); + bench_search("sz_neon_find_substr", full_text, [&]() { + sz_haystack_t h {full_text.data(), full_text.size()}; + sz_needle_t n {needle.data(), needle.size()}; + return sz_neon_find_substr(h, n); }); #endif #if defined(__AVX2__) - bench_search("strzl_avx2_find_substr", full_text, [&]() { - strzl_haystack_t h {full_text.data(), full_text.size()}; - strzl_needle_t n {needle.data(), needle.size()}; - return strzl_avx2_find_substr(h, n); + bench_search("sz_avx2_find_substr", full_text, [&]() { + sz_haystack_t h {full_text.data(), full_text.size()}; + sz_needle_t n {needle.data(), needle.size()}; + return sz_avx2_find_substr(h, n); }); #endif } @@ -233,12 +233,12 @@ int main(int, char const **) { }); expect_partitioned_by_length(strings, permute_base); - bench_permute("strzl_partition", strings, permute_new, [](strings_t const &strings, permute_t &permute) { - strzl_array_t array; + bench_permute("sz_partition", strings, permute_new, [](strings_t const &strings, permute_t &permute) { + sz_array_t array; array.order = permute.data(); array.count = strings.size(); array.handle = &strings; - strzl_partition(&array, &has_under_four_chars); + sz_partition(&array, &has_under_four_chars); }); expect_partitioned_by_length(strings, permute_new); // TODO: expect_same(permute_base, permute_new); @@ -252,14 +252,14 @@ int main(int, char const **) { }); expect_sorted(strings, permute_base); - bench_permute("strzl_sort", strings, permute_new, [](strings_t const &strings, permute_t &permute) { - strzl_array_t array; + bench_permute("sz_sort", strings, permute_new, [](strings_t const &strings, permute_t &permute) { + sz_array_t array; array.order = permute.data(); array.count = strings.size(); array.handle = &strings; array.get_begin = get_begin; array.get_length = get_length; - strzl_sort(&array, nullptr); + sz_sort(&array, nullptr); }); expect_sorted(strings, permute_new); diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index dc89934c..5e60adb5 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -2,7 +2,7 @@ #define STRINGZILLA_H_ #include // `uint8_t` -#include // `size_t` +#include // `sz_size_t` #include // `memcpy` #include // `qsort_r` #include // `qsort_s` @@ -29,38 +29,39 @@ extern "C" { #endif -typedef uint32_t strzl_anomaly_t; +typedef uint32_t sz_anomaly_t; +typedef uint64_t sz_size_t; -inline static size_t strzl_divide_round_up(size_t x, size_t divisor) { return (x + (divisor - 1)) / divisor; } +inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { return (x + (divisor - 1)) / divisor; } /** * @brief This is a faster alternative to `strncmp(a, b, len) == 0`. * @return 1 for `true`, and 0 for `false`. */ -inline static int strzl_equal(char const *a, char const *b, size_t len) { +inline static int sz_equal(char const *a, char const *b, sz_size_t len) { char const *const a_end = a + len; while (a != a_end && *a == *b) a++, b++; return a_end == a; } -typedef struct strzl_haystack_t { +typedef struct sz_haystack_t { char const *ptr; - size_t len; -} strzl_haystack_t; + sz_size_t len; +} sz_haystack_t; -typedef struct strzl_needle_t { +typedef struct sz_needle_t { char const *ptr; - size_t len; - size_t anomaly_offset; -} strzl_needle_t; + sz_size_t len; + sz_size_t anomaly_offset; +} sz_needle_t; /** * @brief SWAR single-character counting procedure, jumping 8 bytes at a time. */ -inline static size_t strzl_naive_count_char(strzl_haystack_t h, char n) { +inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) { - size_t result = 0; + sz_size_t result = 0; char const *text = h.ptr; char const *end = h.ptr + h.len; @@ -90,7 +91,7 @@ inline static size_t strzl_naive_count_char(strzl_haystack_t h, char n) { /** * @brief SWAR single-character search in string, jumping 8 bytes at a time. */ -inline static size_t strzl_naive_find_char(strzl_haystack_t h, char n) { +inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) { char const *text = h.ptr; char const *end = h.ptr + h.len; @@ -125,7 +126,7 @@ inline static size_t strzl_naive_find_char(strzl_haystack_t h, char n) { /** * @brief SWAR character-bigram search in string, jumping 8 bytes at a time. */ -inline static size_t strzl_naive_find_2chars(strzl_haystack_t h, char const *n) { +inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) { char const *text = h.ptr; char const *end = h.ptr + h.len; @@ -167,7 +168,7 @@ inline static size_t strzl_naive_find_2chars(strzl_haystack_t h, char const *n) /** * @brief SWAR character-trigram search in string, jumping 8 bytes at a time. */ -inline static size_t strzl_naive_find_3chars(strzl_haystack_t h, char const *n) { +inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) { char const *text = h.ptr; char const *end = h.ptr + h.len; @@ -222,7 +223,7 @@ inline static size_t strzl_naive_find_3chars(strzl_haystack_t h, char const *n) /** * @brief SWAR character-quadgram search in string, jumping 8 bytes at a time. */ -inline static size_t strzl_naive_find_4chars(strzl_haystack_t h, char const *n) { +inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) { char const *text = h.ptr; char const *end = h.ptr + h.len; @@ -283,7 +284,7 @@ inline static size_t strzl_naive_find_4chars(strzl_haystack_t h, char const *n) * it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper. * Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core. */ -inline static size_t strzl_naive_find_substr(strzl_haystack_t h, strzl_needle_t n) { +inline static sz_size_t sz_naive_find_substr(sz_haystack_t h, sz_needle_t n) { if (h.len < n.len) return h.len; @@ -292,22 +293,22 @@ inline static size_t strzl_naive_find_substr(strzl_haystack_t h, strzl_needle_t char const *const end = h.ptr + h.len; switch (n.len) { case 0: return 0; - case 1: return strzl_naive_find_char(h, *n.ptr); - case 2: return strzl_naive_find_2chars(h, n.ptr); - case 3: return strzl_naive_find_3chars(h, n.ptr); - case 4: return strzl_naive_find_4chars(h, n.ptr); + case 1: return sz_naive_find_char(h, *n.ptr); + case 2: return sz_naive_find_2chars(h, n.ptr); + case 3: return sz_naive_find_3chars(h, n.ptr); + case 4: return sz_naive_find_4chars(h, n.ptr); default: { - strzl_anomaly_t n_anomaly, h_anomaly; - size_t const n_suffix_len = n.len - 4 - n.anomaly_offset; + sz_anomaly_t n_anomaly, h_anomaly; + sz_size_t const n_suffix_len = n.len - 4 - n.anomaly_offset; char const *n_suffix_ptr = n.ptr + 4 + n.anomaly_offset; memcpy(&n_anomaly, n.ptr + n.anomaly_offset, 4); text += n.anomaly_offset; for (; text + n.len <= end; text++) { memcpy(&h_anomaly, text, 4); - if (h_anomaly == n_anomaly) // Match anomaly. - if (strzl_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. - if (strzl_equal(text - n.anomaly_offset, n.ptr, n.anomaly_offset)) // Match prefix. + if (h_anomaly == n_anomaly) // Match anomaly. + if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. + if (sz_equal(text - n.anomaly_offset, n.ptr, n.anomaly_offset)) // Match prefix. return text - h.ptr - n.anomaly_offset; } return h.len; @@ -323,7 +324,7 @@ inline static size_t strzl_naive_find_substr(strzl_haystack_t h, strzl_needle_t * was practically more efficient than loading once and shifting around, as introduces * less data dependencies. */ -size_t strzl_avx2_find_substr(strzl_haystack_t h, strzl_needle_t n) { +sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) { // Precomputed constants char const *const end = h.ptr + h.len; @@ -362,18 +363,18 @@ size_t strzl_avx2_find_substr(strzl_haystack_t h, strzl_needle_t n) { int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies)); if (matches0 | matches1 | matches2 | matches3) { - for (size_t i = 0; i < 32; i++) { - if (strzl_equal(text + i, n.ptr, n.len)) + for (sz_size_t i = 0; i < 32; i++) { + if (sz_equal(text + i, n.ptr, n.len)) return i + (text - h.ptr); } } } // Don't forget the last (up to 35) characters. - strzl_haystack_t h_remainder; + sz_haystack_t h_remainder; h_remainder.ptr = text; h_remainder.len = end - text; - size_t tail_match = strzl_naive_find_substr(h_remainder, n); + sz_size_t tail_match = sz_naive_find_substr(h_remainder, n); return text + tail_match - h.ptr; } @@ -387,7 +388,7 @@ size_t strzl_avx2_find_substr(strzl_haystack_t h, strzl_needle_t n) { * was practically more efficient than loading once and shifting around, as introduces * less data dependencies. */ -inline static size_t strzl_neon_find_substr(strzl_haystack_t h, strzl_needle_t n) { +inline static sz_size_t sz_neon_find_substr(sz_haystack_t h, sz_needle_t n) { // Precomputed constants char const *const end = h.ptr + h.len; @@ -420,55 +421,55 @@ inline static size_t strzl_neon_find_substr(strzl_haystack_t h, strzl_needle_t n int has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1); if (has_match) { - for (size_t i = 0; i < 16; i++) { - if (strzl_equal(text + i, n.ptr, n.len)) + for (sz_size_t i = 0; i < 16; i++) { + if (sz_equal(text + i, n.ptr, n.len)) return i + (text - h.ptr); } } } // Don't forget the last (up to 16+3=19) characters. - strzl_haystack_t h_remainder; + sz_haystack_t h_remainder; h_remainder.ptr = text; h_remainder.len = end - text; - size_t tail_match = strzl_naive_find_substr(h_remainder, n); + sz_size_t tail_match = sz_naive_find_substr(h_remainder, n); return text + tail_match - h.ptr; } #endif // Arm Neon -inline static void strzl_swap(size_t *a, size_t *b) { - size_t t = *a; +inline static void sz_swap(sz_size_t *a, sz_size_t *b) { + sz_size_t t = *a; *a = *b; *b = t; } -typedef char const *(*strzl_array_get_begin_t)(void const *, size_t); -typedef size_t (*strzl_array_get_length_t)(void const *, size_t); -typedef int (*strzl_array_predicate_t)(void const *, size_t); -typedef int (*strzl_array_comparator_t)(void const *, size_t, size_t); +typedef char const *(*sz_array_get_begin_t)(void const *, sz_size_t); +typedef sz_size_t (*sz_array_get_length_t)(void const *, sz_size_t); +typedef int (*sz_array_predicate_t)(void const *, sz_size_t); +typedef int (*sz_array_comparator_t)(void const *, sz_size_t, sz_size_t); -typedef struct strzl_array_t { - size_t *order; - size_t count; - strzl_array_get_begin_t get_begin; - strzl_array_get_length_t get_length; +typedef struct sz_array_t { + sz_size_t *order; + sz_size_t count; + sz_array_get_begin_t get_begin; + sz_array_get_length_t get_length; void const *handle; -} strzl_array_t; +} sz_array_t; /** * @brief Similar to `std::partition`, given a predicate splits the * array into two parts. */ -inline static size_t strzl_partition(strzl_array_t *array, strzl_array_predicate_t predicate) { +inline static sz_size_t sz_partition(sz_array_t *array, sz_array_predicate_t predicate) { - size_t matches = 0; + sz_size_t matches = 0; while (matches != array->count && predicate(array->handle, array->order[matches])) ++matches; - for (size_t i = matches + 1; i < array->count; ++i) + for (sz_size_t i = matches + 1; i < array->count; ++i) if (predicate(array->handle, array->order[i])) - strzl_swap(array->order + i, array->order + matches), ++matches; + sz_swap(array->order + i, array->order + matches), ++matches; return matches; } @@ -477,15 +478,15 @@ inline static size_t strzl_partition(strzl_array_t *array, strzl_array_predicate * @brief Inplace `std::set_union` for two consecutive chunks forming * the same continuous array. */ -inline static void strzl_merge(strzl_array_t *array, size_t partition, strzl_array_comparator_t less) { +inline static void sz_merge(sz_array_t *array, sz_size_t partition, sz_array_comparator_t less) { - size_t start_b = partition + 1; + sz_size_t start_b = partition + 1; // If the direct merge is already sorted if (!less(array->handle, array->order[start_b], array->order[partition])) return; - size_t start_a = 0; + sz_size_t start_a = 0; while (start_a <= partition && start_b <= array->count) { // If element 1 is in right place @@ -493,8 +494,8 @@ inline static void strzl_merge(strzl_array_t *array, size_t partition, strzl_arr start_a++; } else { - size_t value = array->order[start_b]; - size_t index = start_b; + sz_size_t value = array->order[start_b]; + sz_size_t index = start_b; // Shift all the elements between element 1 // element 2, right by 1. @@ -512,10 +513,10 @@ inline static void strzl_merge(strzl_array_t *array, size_t partition, strzl_arr } } -inline static void _strzl_sort_recursion( // - strzl_array_t *array, - size_t bit_idx, - size_t bit_max, +inline static void _sz_sort_recursion( // + sz_array_t *array, + sz_size_t bit_idx, + sz_size_t bit_max, #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__ int (*libc_comparator)(void *, void const *, void const *) #else @@ -527,52 +528,52 @@ inline static void _strzl_sort_recursion( // return; // Partition a range of integers according to a specific bit value - size_t split = 0; + sz_size_t split = 0; { - size_t mask = (1ul << 63) >> bit_idx; + sz_size_t mask = (1ul << 63) >> bit_idx; while (split != array->count && !(array->order[split] & mask)) ++split; - for (size_t i = split + 1; i < array->count; ++i) + for (sz_size_t i = split + 1; i < array->count; ++i) if (!(array->order[i] & mask)) - strzl_swap(array->order + i, array->order + split), ++split; + sz_swap(array->order + i, array->order + split), ++split; } // Go down recursively if (bit_idx < bit_max) { - strzl_array_t a = *array; + sz_array_t a = *array; a.count = split; - _strzl_sort_recursion(&a, bit_idx + 1, bit_max, libc_comparator); + _sz_sort_recursion(&a, bit_idx + 1, bit_max, libc_comparator); - strzl_array_t b = *array; + sz_array_t b = *array; b.order += split; b.count -= split; - _strzl_sort_recursion(&b, bit_idx + 1, bit_max, libc_comparator); + _sz_sort_recursion(&b, bit_idx + 1, bit_max, libc_comparator); } // Reached the end of recursion else { // Discard the prefixes - for (size_t i = 0; i != array->count; ++i) + for (sz_size_t i = 0; i != array->count; ++i) memset((char *)(&array->order[i]) + 4, 0, 4ul); #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) // Perform sorts on smaller chunks instead of the whole handle // https://stackoverflow.com/a/39561369 // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/qsort-s?view=msvc-170 - qsort_s(array->order, split, sizeof(size_t), libc_comparator, (void *)array); - qsort_s(array->order + split, array->count - split, sizeof(size_t), libc_comparator, (void *)array); + qsort_s(array->order, split, sizeof(sz_size_t), libc_comparator, (void *)array); + qsort_s(array->order + split, array->count - split, sizeof(sz_size_t), libc_comparator, (void *)array); #elif __APPLE__ - qsort_r(array->order, split, sizeof(size_t), (void *)array, libc_comparator); - qsort_r(array->order + split, array->count - split, sizeof(size_t), (void *)array, libc_comparator); + qsort_r(array->order, split, sizeof(sz_size_t), (void *)array, libc_comparator); + qsort_r(array->order + split, array->count - split, sizeof(sz_size_t), (void *)array, libc_comparator); #else // https://linux.die.net/man/3/qsort_r - qsort_r(array->order, split, sizeof(size_t), libc_comparator, (void *)array); - qsort_r(array->order + split, array->count - split, sizeof(size_t), libc_comparator, (void *)array); + qsort_r(array->order, split, sizeof(sz_size_t), libc_comparator, (void *)array); + qsort_r(array->order + split, array->count - split, sizeof(sz_size_t), libc_comparator, (void *)array); #endif } } -inline static int _strzl_sort_array_strncmp( +inline static int _sz_sort_array_strncmp( #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__ void *array_raw, void const *a_raw, void const *b_raw #else @@ -581,11 +582,11 @@ inline static int _strzl_sort_array_strncmp( ) { // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1 // https://www.man7.org/linux/man-pages/man3/strcmp.3.html - strzl_array_t *array = (strzl_array_t *)array_raw; - size_t a = *(size_t *)a_raw; - size_t b = *(size_t *)b_raw; - size_t a_len = array->get_length(array->handle, a); - size_t b_len = array->get_length(array->handle, b); + sz_array_t *array = (sz_array_t *)array_raw; + sz_size_t a = *(sz_size_t *)a_raw; + sz_size_t b = *(sz_size_t *)b_raw; + sz_size_t a_len = array->get_length(array->handle, a); + sz_size_t b_len = array->get_length(array->handle, b); int res = strncmp( // array->get_begin(array->handle, a), array->get_begin(array->handle, b), @@ -593,7 +594,7 @@ inline static int _strzl_sort_array_strncmp( return res ? res : a_len - b_len; } -inline static int _strzl_sort_array_strncasecmp( +inline static int _sz_sort_array_strncasecmp( #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__ void *array_raw, void const *a_raw, void const *b_raw #else @@ -602,11 +603,11 @@ inline static int _strzl_sort_array_strncasecmp( ) { // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1 // https://www.man7.org/linux/man-pages/man3/strcmp.3.html - strzl_array_t *array = (strzl_array_t *)array_raw; - size_t a = *(size_t *)a_raw; - size_t b = *(size_t *)b_raw; - size_t a_len = array->get_length(array->handle, a); - size_t b_len = array->get_length(array->handle, b); + sz_array_t *array = (sz_array_t *)array_raw; + sz_size_t a = *(sz_size_t *)a_raw; + sz_size_t b = *(sz_size_t *)b_raw; + sz_size_t a_len = array->get_length(array->handle, a); + sz_size_t b_len = array->get_length(array->handle, b); int res = strncasecmp( // array->get_begin(array->handle, a), array->get_begin(array->handle, b), @@ -614,25 +615,25 @@ inline static int _strzl_sort_array_strncasecmp( return res ? res : a_len - b_len; } -typedef struct strzl_sort_config_t { +typedef struct sz_sort_config_t { int case_insensitive; -} strzl_sort_config_t; +} sz_sort_config_t; /** * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word * and a follow-up Quick Sort on resulting structure. */ -inline static void strzl_sort(strzl_array_t *array, strzl_sort_config_t const *config) { +inline static void sz_sort(sz_array_t *array, sz_sort_config_t const *config) { int case_insensitive = config && config->case_insensitive; // Export up to 4 bytes into the `array` bits themselves - for (size_t i = 0; i != array->count; ++i) { + for (sz_size_t i = 0; i != array->count; ++i) { char const *begin = array->get_begin(array->handle, array->order[i]); - size_t length = array->get_length(array->handle, array->order[i]); + sz_size_t length = array->get_length(array->handle, array->order[i]); length = length > 4ul ? 4ul : length; char *prefix = (char *)&array->order[i]; - for (size_t j = 0; j != length; ++j) + for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; if (case_insensitive) { prefix[0] = tolower(prefix[0]); @@ -647,12 +648,12 @@ inline static void strzl_sort(strzl_array_t *array, strzl_sort_config_t const *c #else int (*comparator)(void const *, void const *, void *); #endif - comparator = _strzl_sort_array_strncmp; + comparator = _sz_sort_array_strncmp; if (case_insensitive) - comparator = _strzl_sort_array_strncasecmp; + comparator = _sz_sort_array_strncasecmp; // Perform optionally-parallel radix sort on them - _strzl_sort_recursion(array, 0, 32, comparator); + _sz_sort_recursion(array, 0, 32, comparator); } typedef uint8_t levenstein_distance_t; @@ -661,12 +662,12 @@ typedef uint8_t levenstein_distance_t; * @return Amount of temporary memory (in bytes) needed to efficiently compute * the Levenstein distance between two strings of given size. */ -inline static size_t strzl_levenstein_memory_needed(size_t _, size_t b_length) { return b_length + b_length + 2; } +inline static sz_size_t sz_levenstein_memory_needed(sz_size_t _, sz_size_t b_length) { return b_length + b_length + 2; } /** * @brief Auxiliary function, that computes the minimum of three values. */ -inline static levenstein_distance_t _strzl_levenstein_minimum( // +inline static levenstein_distance_t _sz_levenstein_minimum( // levenstein_distance_t a, levenstein_distance_t b, levenstein_distance_t c) { @@ -678,11 +679,11 @@ inline static levenstein_distance_t _strzl_levenstein_minimum( // * @brief Levenshtein String Similarity function, implemented with linear memory consumption. * It accepts an upper bound on the possible error. Quadratic complexity in time, linear in space. */ -inline static levenstein_distance_t strzl_levenstein( // +inline static levenstein_distance_t sz_levenstein( // char const *a, - size_t a_length, + sz_size_t a_length, char const *b, - size_t b_length, + sz_size_t b_length, levenstein_distance_t bound, void *buffer) { @@ -705,20 +706,20 @@ inline static levenstein_distance_t strzl_levenstein( // levenstein_distance_t *previous_distances = (levenstein_distance_t *)buffer; levenstein_distance_t *current_distances = previous_distances + b_length + 1; - for (size_t idx_b = 0; idx_b != (b_length + 1); ++idx_b) + for (sz_size_t idx_b = 0; idx_b != (b_length + 1); ++idx_b) previous_distances[idx_b] = idx_b; - for (size_t idx_a = 0; idx_a != a_length; ++idx_a) { + for (sz_size_t idx_a = 0; idx_a != a_length; ++idx_a) { current_distances[0] = idx_a + 1; // Initialize min_distance with a value greater than bound levenstein_distance_t min_distance = bound; - for (size_t idx_b = 0; idx_b != b_length; ++idx_b) { + for (sz_size_t idx_b = 0; idx_b != b_length; ++idx_b) { levenstein_distance_t cost_deletion = previous_distances[idx_b + 1] + 1; levenstein_distance_t cost_insertion = current_distances[idx_b] + 1; levenstein_distance_t cost_substitution = previous_distances[idx_b] + (a[idx_a] != b[idx_b]); - current_distances[idx_b + 1] = _strzl_levenstein_minimum(cost_deletion, cost_insertion, cost_substitution); + current_distances[idx_b + 1] = _sz_levenstein_minimum(cost_deletion, cost_insertion, cost_substitution); // Keep track of the minimum distance seen so far in this row if (current_distances[idx_b + 1] < min_distance) { @@ -742,11 +743,11 @@ inline static levenstein_distance_t strzl_levenstein( // /** * @brief Hashes provided string using hardware-accelerated CRC32 instructions. */ -inline static uint32_t strzl_hash_crc32_native(char const *start, size_t length) { return 0; } +inline static uint32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; } -inline static uint32_t strzl_hash_crc32_neon(char const *start, size_t length) { return 0; } +inline static uint32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; } -inline static uint32_t strzl_hash_crc32_sse(char const *start, size_t length) { return 0; } +inline static uint32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; } #ifdef __cplusplus } From cf324fb0bc12cd59a4d90ca4d4462255d3d3f0ec Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 17 Sep 2023 16:40:06 +0400 Subject: [PATCH 13/72] Refactor: `start` and `length` common member names --- README.md | 2 +- python/lib.c | 87 +++++------ scripts/test.cpp | 8 +- stringzilla/stringzilla.h | 312 ++++++++++++++++++++------------------ 4 files changed, 210 insertions(+), 199 deletions(-) diff --git a/README.md b/README.md index 1f9cea00..bd0ee0c7 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ size_t character_position = sz_naive_find_char(haystack, 'a'); size_t substring_position = sz_naive_find_substr(haystack, needle); // Perform collection level operations -sz_array_t array = {your_order, your_count, your_get_begin, your_get_length, your_handle}; +sz_sequence_t array = {your_order, your_count, your_get_start, your_get_length, your_handle}; sz_sort(&array, &your_config); ``` diff --git a/python/lib.c b/python/lib.c index 84582d32..c670ae81 100644 --- a/python/lib.c +++ b/python/lib.c @@ -29,8 +29,8 @@ static PyTypeObject FileType; static PyTypeObject StrType; struct { - void *ptr; - size_t len; + void *start; + size_t length; } temporary_memory = {NULL, 0}; /** @@ -148,8 +148,8 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_ struct sz_haystack_t haystack; struct sz_needle_t needle; needle.anomaly_offset = 0; - if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) || - !export_string_like(needle_obj, &needle.ptr, &needle.len)) { + if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || + !export_string_like(needle_obj, &needle.start, &needle.length)) { PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); return NULL; } @@ -178,13 +178,13 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_ // Limit the haystack range size_t normalized_offset, normalized_length; - slice(haystack.len, start, end, &normalized_offset, &normalized_length); - haystack.ptr += normalized_offset; - haystack.len = normalized_length; + slice(haystack.length, start, end, &normalized_offset, &normalized_length); + haystack.start += normalized_offset; + haystack.length = normalized_length; // Perform contains operation size_t offset = sz_neon_find_substr(haystack, needle); - if (offset == haystack.len) + if (offset == haystack.length) return -1; return (Py_ssize_t)offset; } @@ -224,8 +224,8 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t struct sz_haystack_t haystack; struct sz_needle_t needle; needle.anomaly_offset = 0; - if (!export_string_like(haystack_obj, &haystack.ptr, &haystack.len) || - !export_string_like(needle_obj, &needle.ptr, &needle.len)) { + if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || + !export_string_like(needle_obj, &needle.start, &needle.length)) { PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); return NULL; } @@ -256,33 +256,33 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t // Limit the haystack range size_t normalized_offset, normalized_length; - slice(haystack.len, start, end, &normalized_offset, &normalized_length); - haystack.ptr += normalized_offset; - haystack.len = normalized_length; + slice(haystack.length, start, end, &normalized_offset, &normalized_length); + haystack.start += normalized_offset; + haystack.length = normalized_length; // Perform counting operation size_t count = 0; - if (needle.len == 1) { - count = sz_naive_count_char(haystack, *needle.ptr); + if (needle.length == 1) { + count = sz_naive_count_char(haystack, *needle.start); } else { // Your existing logic for count_substr can be embedded here if (allow_overlap) { - while (haystack.len) { + while (haystack.length) { size_t offset = sz_neon_find_substr(haystack, needle); - int found = offset != haystack.len; + int found = offset != haystack.length; count += found; - haystack.ptr += offset + found; - haystack.len -= offset + found; + haystack.start += offset + found; + haystack.length -= offset + found; } } else { - while (haystack.len) { + while (haystack.length) { size_t offset = sz_neon_find_substr(haystack, needle); - int found = offset != haystack.len; + int found = offset != haystack.length; count += found; - haystack.ptr += offset + needle.len; - haystack.len -= offset + needle.len * found; + haystack.start += offset + needle.length; + haystack.length -= offset + needle.length * found; } } } @@ -303,7 +303,8 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s PyObject *str2_obj = args[1]; struct sz_haystack_t str1, str2; - if (!export_string_like(str1_obj, &str1.ptr, &str1.len) || !export_string_like(str2_obj, &str2.ptr, &str2.len)) { + if (!export_string_like(str1_obj, &str1.start, &str1.length) || + !export_string_like(str2_obj, &str2.start, &str2.length)) { PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); return NULL; } @@ -340,23 +341,23 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s } // Initialize or reallocate the Levenshtein distance matrix - size_t memory_needed = sz_levenstein_memory_needed(str1.len, str2.len); - if (temporary_memory.len < memory_needed) { - temporary_memory.ptr = realloc(temporary_memory.ptr, memory_needed); - temporary_memory.len = memory_needed; + size_t memory_needed = sz_levenstein_memory_needed(str1.length, str2.length); + if (temporary_memory.length < memory_needed) { + temporary_memory.start = realloc(temporary_memory.start, memory_needed); + temporary_memory.length = memory_needed; } - if (temporary_memory.ptr == NULL) { + if (temporary_memory.start == NULL) { PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix"); return NULL; } levenstein_distance_t distance = sz_levenstein( // - str1.ptr, - str1.len, - str2.ptr, - str2.len, + str1.start, + str1.length, + str2.start, + str2.length, (levenstein_distance_t)bound, - temporary_memory.ptr); + temporary_memory.start); return PyLong_FromLong(distance); } @@ -609,16 +610,16 @@ static int Str_contains(Str *self, PyObject *arg) { struct sz_needle_t needle_struct; needle_struct.anomaly_offset = 0; - if (!export_string_like(arg, &needle_struct.ptr, &needle_struct.len)) { + if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) { PyErr_SetString(PyExc_TypeError, "Unsupported argument type"); return -1; } struct sz_haystack_t haystack; - haystack.ptr = self->start; - haystack.len = self->length; + haystack.start = self->start; + haystack.length = self->length; size_t position = sz_neon_find_substr(haystack, needle_struct); - return position != haystack.len; + return position != haystack.length; } static PyObject *Str_getslice(Str *self, PyObject *args) { @@ -761,9 +762,9 @@ PyObject *register_vectorcall(PyObject *module, char const *name, vectorcallfunc } void cleanup_module(void) { - free(temporary_memory.ptr); - temporary_memory.ptr = NULL; - temporary_memory.len = 0; + free(temporary_memory.start); + temporary_memory.start = NULL; + temporary_memory.length = 0; } PyMODINIT_FUNC PyInit_stringzilla(void) { @@ -796,8 +797,8 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { // Initialize temporary_memory, if needed // For example, allocate an initial chunk - temporary_memory.ptr = malloc(4096); - temporary_memory.len = 4096 * (temporary_memory.ptr != NULL); + temporary_memory.start = malloc(4096); + temporary_memory.length = 4096 * (temporary_memory.start != NULL); atexit(cleanup_module); // Register the vectorized functions diff --git a/scripts/test.cpp b/scripts/test.cpp index 3e76248e..5c97c452 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -17,7 +17,7 @@ using permute_t = std::vector; #pragma region - C callbacks -static char const *get_begin(void const *array_c, size_t i) { +static char const *get_start(void const *array_c, size_t i) { strings_t const &array = *reinterpret_cast(array_c); return array[i].c_str(); } @@ -234,7 +234,7 @@ int main(int, char const **) { expect_partitioned_by_length(strings, permute_base); bench_permute("sz_partition", strings, permute_new, [](strings_t const &strings, permute_t &permute) { - sz_array_t array; + sz_sequence_t array; array.order = permute.data(); array.count = strings.size(); array.handle = &strings; @@ -253,11 +253,11 @@ int main(int, char const **) { expect_sorted(strings, permute_base); bench_permute("sz_sort", strings, permute_new, [](strings_t const &strings, permute_t &permute) { - sz_array_t array; + sz_sequence_t array; array.order = permute.data(); array.count = strings.size(); array.handle = &strings; - array.get_begin = get_begin; + array.get_start = get_start; array.get_length = get_length; sz_sort(&array, nullptr); }); diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 5e60adb5..a60c0dea 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -35,24 +35,24 @@ typedef uint64_t sz_size_t; inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { return (x + (divisor - 1)) / divisor; } /** - * @brief This is a faster alternative to `strncmp(a, b, len) == 0`. + * @brief This is a faster alternative to `strncmp(a, b, length) == 0`. * @return 1 for `true`, and 0 for `false`. */ -inline static int sz_equal(char const *a, char const *b, sz_size_t len) { - char const *const a_end = a + len; +inline static int sz_equal(char const *a, char const *b, sz_size_t length) { + char const *const a_end = a + length; while (a != a_end && *a == *b) a++, b++; return a_end == a; } typedef struct sz_haystack_t { - char const *ptr; - sz_size_t len; + char const *start; + sz_size_t length; } sz_haystack_t; typedef struct sz_needle_t { - char const *ptr; - sz_size_t len; + char const *start; + sz_size_t length; sz_size_t anomaly_offset; } sz_needle_t; @@ -62,8 +62,8 @@ typedef struct sz_needle_t { inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) { sz_size_t result = 0; - char const *text = h.ptr; - char const *end = h.ptr + h.len; + char const *text = h.start; + char const *end = h.start + h.length; for (; (uint64_t)text % 8 != 0 && text < end; ++text) result += *text == n; @@ -93,12 +93,12 @@ inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) { */ inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) { - char const *text = h.ptr; - char const *end = h.ptr + h.len; + char const *text = h.start; + char const *end = h.start + h.length; for (; (uint64_t)text % 8 != 0 && text < end; ++text) if (*text == n) - return text - h.ptr; + return text - h.start; // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. uint64_t nnnnnnnn = n; @@ -114,13 +114,13 @@ inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) { match_indicators &= 0x0101010101010101; if (match_indicators != 0) - return text - h.ptr + ctz64(match_indicators) / 8; + return text - h.start + ctz64(match_indicators) / 8; } for (; text < end; ++text) if (*text == n) - return text - h.ptr; - return h.len; + return text - h.start; + return h.length; } /** @@ -128,8 +128,8 @@ inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) { */ inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) { - char const *text = h.ptr; - char const *end = h.ptr + h.len; + char const *text = h.start; + char const *end = h.start + h.length; // This code simulates hyper-scalar execution, analyzing 7 offsets at a time. uint64_t nnnn = ((uint64_t)(n[0]) << 0) | ((uint64_t)(n[1]) << 8); // broadcast `n` into `nnnn` @@ -155,14 +155,14 @@ inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) { if (even_indicators + odd_indicators) { uint64_t match_indicators = even_indicators | (odd_indicators >> 8); - return text - h.ptr + ctz64(match_indicators) / 8; + return text - h.start + ctz64(match_indicators) / 8; } } for (; text + 2 <= end; ++text) if (text[0] == n[0] && text[1] == n[1]) - return text - h.ptr; - return h.len; + return text - h.start; + return h.length; } /** @@ -170,8 +170,8 @@ inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) { */ inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) { - char const *text = h.ptr; - char const *end = h.ptr + h.len; + char const *text = h.start; + char const *end = h.start + h.length; // This code simulates hyper-scalar execution, analyzing 6 offsets at a time. // We have two unused bytes at the end. @@ -211,13 +211,13 @@ inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) { uint64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16); if (match_indicators != 0) - return text - h.ptr + ctz64(match_indicators) / 8; + return text - h.start + ctz64(match_indicators) / 8; } for (; text + 3 <= end; ++text) if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) - return text - h.ptr; - return h.len; + return text - h.start; + return h.length; } /** @@ -225,8 +225,8 @@ inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) { */ inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) { - char const *text = h.ptr; - char const *end = h.ptr + h.len; + char const *text = h.start; + char const *end = h.start + h.length; // This code simulates hyper-scalar execution, analyzing 4 offsets at a time. uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16) | ((uint64_t)(n[3]) << 24); @@ -269,14 +269,14 @@ inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) { uint8_t match_indicators = (uint8_t)( // (text01_indicators >> 31) | (text01_indicators << 0) | // (text23_indicators >> 29) | (text23_indicators << 2)); - return text - h.ptr + lookup[match_indicators]; + return text - h.start + lookup[match_indicators]; } } for (; text + 4 <= end; ++text) if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) - return text - h.ptr; - return h.len; + return text - h.start; + return h.length; } /** @@ -286,32 +286,32 @@ inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) { */ inline static sz_size_t sz_naive_find_substr(sz_haystack_t h, sz_needle_t n) { - if (h.len < n.len) - return h.len; + if (h.length < n.length) + return h.length; - char const *text = h.ptr; - char const *const end = h.ptr + h.len; - switch (n.len) { + char const *text = h.start; + char const *const end = h.start + h.length; + switch (n.length) { case 0: return 0; - case 1: return sz_naive_find_char(h, *n.ptr); - case 2: return sz_naive_find_2chars(h, n.ptr); - case 3: return sz_naive_find_3chars(h, n.ptr); - case 4: return sz_naive_find_4chars(h, n.ptr); + case 1: return sz_naive_find_char(h, *n.start); + case 2: return sz_naive_find_2chars(h, n.start); + case 3: return sz_naive_find_3chars(h, n.start); + case 4: return sz_naive_find_4chars(h, n.start); default: { sz_anomaly_t n_anomaly, h_anomaly; - sz_size_t const n_suffix_len = n.len - 4 - n.anomaly_offset; - char const *n_suffix_ptr = n.ptr + 4 + n.anomaly_offset; - memcpy(&n_anomaly, n.ptr + n.anomaly_offset, 4); + sz_size_t const n_suffix_len = n.length - 4 - n.anomaly_offset; + char const *n_suffix_ptr = n.start + 4 + n.anomaly_offset; + memcpy(&n_anomaly, n.start + n.anomaly_offset, 4); text += n.anomaly_offset; - for (; text + n.len <= end; text++) { + for (; text + n.length <= end; text++) { memcpy(&h_anomaly, text, 4); - if (h_anomaly == n_anomaly) // Match anomaly. - if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. - if (sz_equal(text - n.anomaly_offset, n.ptr, n.anomaly_offset)) // Match prefix. - return text - h.ptr - n.anomaly_offset; + if (h_anomaly == n_anomaly) // Match anomaly. + if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. + if (sz_equal(text - n.anomaly_offset, n.start, n.anomaly_offset)) // Match prefix. + return text - h.start - n.anomaly_offset; } - return h.len; + return h.length; } } } @@ -327,14 +327,14 @@ inline static sz_size_t sz_naive_find_substr(sz_haystack_t h, sz_needle_t n) { sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) { // Precomputed constants - char const *const end = h.ptr + h.len; + char const *const end = h.start + h.length; uint32_t anomaly = 0; uint32_t mask = 0; - switch (n.len) { - case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.ptr, 1); break; - case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.ptr, 2); break; - case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.ptr, 3); break; - default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.ptr, 4); break; + switch (n.length) { + case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.start, 1); break; + case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.start, 2); break; + case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.start, 3); break; + default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.start, 4); break; } __m256i const anomalies = _mm256_set1_epi32(*(uint32_t const *)&anomaly); @@ -349,8 +349,8 @@ sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) { // + 4 movemasks. // + 3 bitwise ANDs. // + 1 heavy (but very unlikely) branch. - char const *text = h.ptr; - for (; (text + n.len + 32) <= end; text += 32) { + char const *text = h.start; + for (; (text + n.length + 32) <= end; text += 32) { // Performing many unaligned loads ends up being faster than loading once and shuffling around. __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks); @@ -364,18 +364,18 @@ sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) { if (matches0 | matches1 | matches2 | matches3) { for (sz_size_t i = 0; i < 32; i++) { - if (sz_equal(text + i, n.ptr, n.len)) - return i + (text - h.ptr); + if (sz_equal(text + i, n.start, n.length)) + return i + (text - h.start); } } } // Don't forget the last (up to 35) characters. sz_haystack_t h_remainder; - h_remainder.ptr = text; - h_remainder.len = end - text; + h_remainder.start = text; + h_remainder.length = end - text; sz_size_t tail_match = sz_naive_find_substr(h_remainder, n); - return text + tail_match - h.ptr; + return text + tail_match - h.start; } #endif // x86 AVX2 @@ -391,21 +391,21 @@ sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) { inline static sz_size_t sz_neon_find_substr(sz_haystack_t h, sz_needle_t n) { // Precomputed constants - char const *const end = h.ptr + h.len; + char const *const end = h.start + h.length; uint32_t anomaly = 0; uint32_t mask = 0; - switch (n.len) { - case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.ptr, 1); break; - case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.ptr, 2); break; - case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.ptr, 3); break; - default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.ptr, 4); break; + switch (n.length) { + case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.start, 1); break; + case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.start, 2); break; + case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.start, 3); break; + default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.start, 4); break; } uint32x4_t const anomalies = vld1q_dup_u32(&anomaly); uint32x4_t const masks = vld1q_dup_u32(&mask); - char const *text = h.ptr; - for (; (text + n.len + 16) <= end; text += 16) { + char const *text = h.start; + for (; (text + n.length + 16) <= end; text += 16) { uint32x4_t matches0 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 0)), masks), anomalies); uint32x4_t matches1 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 1)), masks), anomalies); @@ -422,18 +422,18 @@ inline static sz_size_t sz_neon_find_substr(sz_haystack_t h, sz_needle_t n) { if (has_match) { for (sz_size_t i = 0; i < 16; i++) { - if (sz_equal(text + i, n.ptr, n.len)) - return i + (text - h.ptr); + if (sz_equal(text + i, n.start, n.length)) + return i + (text - h.start); } } } // Don't forget the last (up to 16+3=19) characters. sz_haystack_t h_remainder; - h_remainder.ptr = text; - h_remainder.len = end - text; + h_remainder.start = text; + h_remainder.length = end - text; sz_size_t tail_match = sz_naive_find_substr(h_remainder, n); - return text + tail_match - h.ptr; + return text + tail_match - h.start; } #endif // Arm Neon @@ -444,66 +444,73 @@ inline static void sz_swap(sz_size_t *a, sz_size_t *b) { *b = t; } -typedef char const *(*sz_array_get_begin_t)(void const *, sz_size_t); -typedef sz_size_t (*sz_array_get_length_t)(void const *, sz_size_t); -typedef int (*sz_array_predicate_t)(void const *, sz_size_t); -typedef int (*sz_array_comparator_t)(void const *, sz_size_t, sz_size_t); +typedef char const *(*sz_sequence_get_start_t)(void const *, sz_size_t); +typedef sz_size_t (*sz_sequence_get_length_t)(void const *, sz_size_t); +typedef int (*sz_sequence_predicate_t)(void const *, sz_size_t); +typedef int (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t); -typedef struct sz_array_t { +// Define a type for the comparison function, depending on the platform. +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || defined(__APPLE__) +typedef int (*sz_qsort_comparison_func_t)(void *, void const *, void const *); +#else +typedef int (*sz_qsort_comparison_func_t)(void const *, void const *, void *); +#endif + +typedef struct sz_sequence_t { sz_size_t *order; sz_size_t count; - sz_array_get_begin_t get_begin; - sz_array_get_length_t get_length; + sz_sequence_get_start_t get_start; + sz_sequence_get_length_t get_length; void const *handle; -} sz_array_t; +} sz_sequence_t; /** * @brief Similar to `std::partition`, given a predicate splits the - * array into two parts. + * sequence into two parts. */ -inline static sz_size_t sz_partition(sz_array_t *array, sz_array_predicate_t predicate) { +inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { sz_size_t matches = 0; - while (matches != array->count && predicate(array->handle, array->order[matches])) + while (matches != sequence->count && predicate(sequence->handle, sequence->order[matches])) ++matches; - for (sz_size_t i = matches + 1; i < array->count; ++i) - if (predicate(array->handle, array->order[i])) - sz_swap(array->order + i, array->order + matches), ++matches; + for (sz_size_t i = matches + 1; i < sequence->count; ++i) + if (predicate(sequence->handle, sequence->order[i])) + sz_swap(sequence->order + i, sequence->order + matches), ++matches; return matches; } /** * @brief Inplace `std::set_union` for two consecutive chunks forming - * the same continuous array. + * the same continuous sequence. */ -inline static void sz_merge(sz_array_t *array, sz_size_t partition, sz_array_comparator_t less) { +inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) { sz_size_t start_b = partition + 1; // If the direct merge is already sorted - if (!less(array->handle, array->order[start_b], array->order[partition])) + if (!less(sequence->handle, sequence->order[start_b], sequence->order[partition])) return; sz_size_t start_a = 0; - while (start_a <= partition && start_b <= array->count) { + while (start_a <= partition && start_b <= sequence->count) { // If element 1 is in right place - if (!less(array->handle, array->order[start_b], array->order[start_a])) { + if (!less(sequence->handle, sequence->order[start_b], sequence->order[start_a])) { start_a++; } else { - sz_size_t value = array->order[start_b]; + sz_size_t value = sequence->order[start_b]; sz_size_t index = start_b; // Shift all the elements between element 1 // element 2, right by 1. while (index != start_a) { - array->order[index] = array->order[index - 1]; + sequence->order[index] = sequence->order[index - 1]; index--; } - array->order[start_a] = value; + sequence->order[start_a] = value; // Update all the pointers start_a++; @@ -514,103 +521,111 @@ inline static void sz_merge(sz_array_t *array, sz_size_t partition, sz_array_com } inline static void _sz_sort_recursion( // - sz_array_t *array, + sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__ - int (*libc_comparator)(void *, void const *, void const *) -#else - int (*libc_comparator)(void const *, void const *, void *) -#endif -) { + sz_qsort_comparison_func_t qsort_comparator) { - if (!array->count) + if (!sequence->count) return; // Partition a range of integers according to a specific bit value sz_size_t split = 0; { sz_size_t mask = (1ul << 63) >> bit_idx; - while (split != array->count && !(array->order[split] & mask)) + while (split != sequence->count && !(sequence->order[split] & mask)) ++split; - for (sz_size_t i = split + 1; i < array->count; ++i) - if (!(array->order[i] & mask)) - sz_swap(array->order + i, array->order + split), ++split; + for (sz_size_t i = split + 1; i < sequence->count; ++i) + if (!(sequence->order[i] & mask)) + sz_swap(sequence->order + i, sequence->order + split), ++split; } // Go down recursively if (bit_idx < bit_max) { - sz_array_t a = *array; + sz_sequence_t a = *sequence; a.count = split; - _sz_sort_recursion(&a, bit_idx + 1, bit_max, libc_comparator); + _sz_sort_recursion(&a, bit_idx + 1, bit_max, qsort_comparator); - sz_array_t b = *array; + sz_sequence_t b = *sequence; b.order += split; b.count -= split; - _sz_sort_recursion(&b, bit_idx + 1, bit_max, libc_comparator); + _sz_sort_recursion(&b, bit_idx + 1, bit_max, qsort_comparator); } // Reached the end of recursion else { // Discard the prefixes - for (sz_size_t i = 0; i != array->count; ++i) - memset((char *)(&array->order[i]) + 4, 0, 4ul); + for (sz_size_t i = 0; i != sequence->count; ++i) { + memset((char *)(&sequence->order[i]) + 4, 0, 4ul); + } -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) // Perform sorts on smaller chunks instead of the whole handle +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) // https://stackoverflow.com/a/39561369 // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/qsort-s?view=msvc-170 - qsort_s(array->order, split, sizeof(sz_size_t), libc_comparator, (void *)array); - qsort_s(array->order + split, array->count - split, sizeof(sz_size_t), libc_comparator, (void *)array); + qsort_s(sequence->order, split, sizeof(sz_size_t), qsort_comparator, (void *)sequence); + qsort_s(sequence->order + split, + sequence->count - split, + sizeof(sz_size_t), + qsort_comparator, + (void *)sequence); #elif __APPLE__ - qsort_r(array->order, split, sizeof(sz_size_t), (void *)array, libc_comparator); - qsort_r(array->order + split, array->count - split, sizeof(sz_size_t), (void *)array, libc_comparator); + qsort_r(sequence->order, split, sizeof(sz_size_t), (void *)sequence, qsort_comparator); + qsort_r(sequence->order + split, + sequence->count - split, + sizeof(sz_size_t), + (void *)sequence, + qsort_comparator); #else // https://linux.die.net/man/3/qsort_r - qsort_r(array->order, split, sizeof(sz_size_t), libc_comparator, (void *)array); - qsort_r(array->order + split, array->count - split, sizeof(sz_size_t), libc_comparator, (void *)array); + qsort_r(sequence->order, split, sizeof(sz_size_t), qsort_comparator, (void *)sequence); + qsort_r(sequence->order + split, + sequence->count - split, + sizeof(sz_size_t), + qsort_comparator, + (void *)sequence); #endif } } -inline static int _sz_sort_array_strncmp( +inline static int _sz_sort_sequence_strncmp( #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__ - void *array_raw, void const *a_raw, void const *b_raw + void *sequence_raw, void const *a_raw, void const *b_raw #else - void const *a_raw, void const *b_raw, void *array_raw + void const *a_raw, void const *b_raw, void *sequence_raw #endif ) { // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1 // https://www.man7.org/linux/man-pages/man3/strcmp.3.html - sz_array_t *array = (sz_array_t *)array_raw; + sz_sequence_t *sequence = (sz_sequence_t *)sequence_raw; sz_size_t a = *(sz_size_t *)a_raw; sz_size_t b = *(sz_size_t *)b_raw; - sz_size_t a_len = array->get_length(array->handle, a); - sz_size_t b_len = array->get_length(array->handle, b); + sz_size_t a_len = sequence->get_length(sequence->handle, a); + sz_size_t b_len = sequence->get_length(sequence->handle, b); int res = strncmp( // - array->get_begin(array->handle, a), - array->get_begin(array->handle, b), + sequence->get_start(sequence->handle, a), + sequence->get_start(sequence->handle, b), a_len > b_len ? b_len : a_len); return res ? res : a_len - b_len; } -inline static int _sz_sort_array_strncasecmp( +inline static int _sz_sort_sequence_strncasecmp( #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__ - void *array_raw, void const *a_raw, void const *b_raw + void *sequence_raw, void const *a_raw, void const *b_raw #else - void const *a_raw, void const *b_raw, void *array_raw + void const *a_raw, void const *b_raw, void *sequence_raw #endif ) { // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1 // https://www.man7.org/linux/man-pages/man3/strcmp.3.html - sz_array_t *array = (sz_array_t *)array_raw; + sz_sequence_t *sequence = (sz_sequence_t *)sequence_raw; sz_size_t a = *(sz_size_t *)a_raw; sz_size_t b = *(sz_size_t *)b_raw; - sz_size_t a_len = array->get_length(array->handle, a); - sz_size_t b_len = array->get_length(array->handle, b); + sz_size_t a_len = sequence->get_length(sequence->handle, a); + sz_size_t b_len = sequence->get_length(sequence->handle, b); int res = strncasecmp( // - array->get_begin(array->handle, a), - array->get_begin(array->handle, b), + sequence->get_start(sequence->handle, a), + sequence->get_start(sequence->handle, b), a_len > b_len ? b_len : a_len); return res ? res : a_len - b_len; } @@ -623,16 +638,16 @@ typedef struct sz_sort_config_t { * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word * and a follow-up Quick Sort on resulting structure. */ -inline static void sz_sort(sz_array_t *array, sz_sort_config_t const *config) { +inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *config) { int case_insensitive = config && config->case_insensitive; - // Export up to 4 bytes into the `array` bits themselves - for (sz_size_t i = 0; i != array->count; ++i) { - char const *begin = array->get_begin(array->handle, array->order[i]); - sz_size_t length = array->get_length(array->handle, array->order[i]); + // Export up to 4 bytes into the `sequence` bits themselves + for (sz_size_t i = 0; i != sequence->count; ++i) { + char const *begin = sequence->get_start(sequence->handle, sequence->order[i]); + sz_size_t length = sequence->get_length(sequence->handle, sequence->order[i]); length = length > 4ul ? 4ul : length; - char *prefix = (char *)&array->order[i]; + char *prefix = (char *)&sequence->order[i]; for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; if (case_insensitive) { @@ -643,17 +658,12 @@ inline static void sz_sort(sz_array_t *array, sz_sort_config_t const *config) { } } -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__ - int (*comparator)(void *, void const *, void const *); -#else - int (*comparator)(void const *, void const *, void *); -#endif - comparator = _sz_sort_array_strncmp; + sz_qsort_comparison_func_t comparator = _sz_sort_sequence_strncmp; if (case_insensitive) - comparator = _sz_sort_array_strncasecmp; + comparator = _sz_sort_sequence_strncasecmp; // Perform optionally-parallel radix sort on them - _sz_sort_recursion(array, 0, 32, comparator); + _sz_sort_recursion(sequence, 0, 32, comparator); } typedef uint8_t levenstein_distance_t; From 21200c8311f4af37e8f5c55af508222cd8e29aa9 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 17 Sep 2023 16:48:02 +0400 Subject: [PATCH 14/72] Fix: Benchmarks compilation --- scripts/test.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/test.cpp b/scripts/test.cpp index 5c97c452..c1462c6d 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -12,27 +12,27 @@ #include using strings_t = std::vector; -using idx_t = std::size_t; +using idx_t = sz_size_t; using permute_t = std::vector; #pragma region - C callbacks -static char const *get_start(void const *array_c, size_t i) { +static char const *get_start(void const *array_c, sz_size_t i) { strings_t const &array = *reinterpret_cast(array_c); return array[i].c_str(); } -static size_t get_length(void const *array_c, size_t i) { +static sz_size_t get_length(void const *array_c, sz_size_t i) { strings_t const &array = *reinterpret_cast(array_c); return array[i].size(); } -static bool is_less(void const *array_c, size_t i, size_t j) { +static int is_less(void const *array_c, sz_size_t i, sz_size_t j) { strings_t const &array = *reinterpret_cast(array_c); return array[i] < array[j]; } -static bool has_under_four_chars(void const *array_c, size_t i) { +static int has_under_four_chars(void const *array_c, sz_size_t i) { strings_t const &array = *reinterpret_cast(array_c); return array[i].size() < 4; } From 12b6d0b844cd0506bbfc75885fbe61691a6068d7 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 17 Sep 2023 17:29:56 +0400 Subject: [PATCH 15/72] Add: `Strs` structure in CPython --- python/lib.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/python/lib.c b/python/lib.c index c670ae81..a74f9816 100644 --- a/python/lib.c +++ b/python/lib.c @@ -27,6 +27,7 @@ typedef SSIZE_T ssize_t; static PyTypeObject FileType; static PyTypeObject StrType; +static PyTypeObject StrsType; struct { void *start; @@ -68,6 +69,69 @@ typedef struct { size_t length; } Str; +/** + * @brief Variable length Python object similar to `Tuple[Union[Str, str]]`, + * for faster sorting, shuffling, joins, and lookups. + */ +typedef struct { + PyObject_HEAD; + + enum { + STRS_CONSECUTIVE_32, + STRS_CONSECUTIVE_64, + STRS_REORDERED, + STRS_MULTI_SOURCE, + } type; + + union { + /** + * Simple structure resembling Apache Arrow arrays of variable length strings. + * When you split a `Str`, that is under 4 GB in size, this is used for space-efficiency. + */ + struct consecutive_slices_32bit_t { + size_t count; + PyObject *parent; + char const *start; + uint32_t *offsets; + } consecutive_32bit; + + /** + * Simple structure resembling Apache Arrow arrays of variable length strings. + * When you split a `Str`, over 4 GB long, this structure is used to indicate chunk offsets. + */ + struct consecutive_slices_64bit_t { + size_t count; + PyObject *parent; + char const *start; + uint64_t *offsets; + } consecutive_64bit; + + /** + * Once you sort, shuffle, or reorganize slices making up a larger string, this structure + * cn be used for space-efficient lookups. + */ + struct reordered_slices_t { + size_t count; + PyObject *parent; + sz_haystack_t *parts; + } reordered; + + /** + * Complex structure with two variable length chunks inside - for the parents and their slices. + * The parents are sorted in ascending order of their memory ranges, to let us rapidly locate the source + * with a binary search. The slices are preserved + */ + struct multi_source_strings_t { + size_t count; + size_t parents_count; + + PyObject **parents; + sz_haystack_t *parts; + } multi_source; + } data; + +} Strs; + #pragma endregion #pragma region Helpers @@ -726,6 +790,15 @@ static PyTypeObject StrType = { // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer }; +static PyTypeObject StrsType = { + PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.Strs", + .tp_doc = "Space-efficient container for large collections of strings and their slices", + .tp_basicsize = sizeof(Strs), + .tp_itemsize = 0, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_new = PyType_GenericNew, +}; + #pragma endregion static PyMethodDef stringzilla_methods[] = { // @@ -776,6 +849,9 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { if (PyType_Ready(&FileType) < 0) return NULL; + if (PyType_Ready(&StrsType) < 0) + return NULL; + m = PyModule_Create(&stringzilla_module); if (m == NULL) return NULL; @@ -795,6 +871,15 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { return NULL; } + Py_INCREF(&StrsType); + if (PyModule_AddObject(m, "Strs", (PyObject *)&StrsType) < 0) { + Py_XDECREF(&StrsType); + Py_XDECREF(&FileType); + Py_XDECREF(&StrType); + Py_XDECREF(m); + return NULL; + } + // Initialize temporary_memory, if needed // For example, allocate an initial chunk temporary_memory.start = malloc(4096); From 8f76c291241c955df1845ac8be9f39e29e9c6de9 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 17 Sep 2023 17:30:13 +0400 Subject: [PATCH 16/72] Add: Purely `qsort`-based hybrid sort benchmark --- scripts/test.cpp | 74 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 9 deletions(-) diff --git a/scripts/test.cpp b/scripts/test.cpp index c1462c6d..1cf34bb2 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -65,7 +65,7 @@ void populate_with_test(strings_t &strings) { constexpr size_t offset_in_word = 0; -inline static idx_t hybrid_sort(strings_t const &strings, idx_t *order) { +inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) { // What if we take up-to 4 first characters and the index for (size_t i = 0; i != strings.size(); ++i) @@ -87,7 +87,50 @@ inline static idx_t hybrid_sort(strings_t const &strings, idx_t *order) { return strings.size(); } -inline static idx_t hybrid_stable_sort(strings_t const &strings, idx_t *order) { +int hybrid_sort_c_compare_uint32_t(const void *a, const void *b) { + uint32_t int_a = *((uint32_t *)(((char *)a) + sizeof(sz_size_t) - 4)); + uint32_t int_b = *((uint32_t *)(((char *)b) + sizeof(sz_size_t) - 4)); + return (int_a < int_b) ? -1 : (int_a > int_b); +} + +int hybrid_sort_c_compare_strings(void *arg, const void *a, const void *b) { + sz_sequence_t *seq = (sz_sequence_t *)arg; + sz_size_t idx_a = *(sz_size_t *)a; + sz_size_t idx_b = *(sz_size_t *)b; + + const char *str_a = seq->get_start(seq->handle, idx_a); + const char *str_b = seq->get_start(seq->handle, idx_b); + sz_size_t len_a = seq->get_length(seq->handle, idx_a); + sz_size_t len_b = seq->get_length(seq->handle, idx_b); + + int res = strncmp(str_a, str_b, len_a < len_b ? len_a : len_b); + return res ? res : (int)(len_a - len_b); +} + +sz_size_t hybrid_sort_c(sz_sequence_t *sequence) { + // Copy up to 4 first characters into the 'order' array. + for (sz_size_t i = 0; i < sequence->count; ++i) { + const char *str = sequence->get_start(sequence->handle, sequence->order[i]); + sz_size_t len = sequence->get_length(sequence->handle, sequence->order[i]); + len = len > 4 ? 4 : len; + memcpy((char *)&sequence->order[i] + sizeof(sz_size_t) - 4, str, len); + } + + // Sort based on the first 4 bytes. + qsort(sequence->order, sequence->count, sizeof(sz_size_t), hybrid_sort_c_compare_uint32_t); + + // Clear the 4 bytes used for the initial sort. + for (sz_size_t i = 0; i < sequence->count; ++i) { + memset((char *)&sequence->order[i] + sizeof(sz_size_t) - 4, 0, 4); + } + + // Sort the full strings. + qsort_r(sequence->order, sequence->count, sizeof(sz_size_t), sequence, hybrid_sort_c_compare_strings); + + return sequence->count; +} + +inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *order) { // What if we take up-to 4 first characters and the index for (size_t i = 0; i != strings.size(); ++i) @@ -189,7 +232,7 @@ int main(int, char const **) { }; // Search substring - for (std::size_t needle_len = 1; needle_len <= 5; ++needle_len) { + for (std::size_t needle_len = 1; needle_len <= 0; ++needle_len) { std::string needle(needle_len, '\4'); std::printf("---- Needle length: %zu\n", needle_len); bench_search("std::search", full_text, [&]() { @@ -221,7 +264,7 @@ int main(int, char const **) { permute_new.resize(strings.size()); // Partitioning - if (true) { + if (false) { std::printf("---- Partitioning:\n"); bench_permute("std::partition", strings, permute_base, [](strings_t const &strings, permute_t &permute) { std::partition(permute.begin(), permute.end(), [&](size_t i) { return strings[i].size() < 4; }); @@ -263,8 +306,19 @@ int main(int, char const **) { }); expect_sorted(strings, permute_new); - bench_permute("hybrid_sort", strings, permute_new, [](strings_t const &strings, permute_t &permute) { - hybrid_sort(strings, permute.data()); + bench_permute("hybrid_sort_c", strings, permute_new, [](strings_t const &strings, permute_t &permute) { + sz_sequence_t array; + array.order = permute.data(); + array.count = strings.size(); + array.handle = &strings; + array.get_start = get_start; + array.get_length = get_length; + hybrid_sort_c(&array); + }); + expect_sorted(strings, permute_new); + + bench_permute("hybrid_sort_cpp", strings, permute_new, [](strings_t const &strings, permute_t &permute) { + hybrid_sort_cpp(strings, permute.data()); }); expect_sorted(strings, permute_new); @@ -274,9 +328,11 @@ int main(int, char const **) { }); expect_sorted(strings, permute_base); - bench_permute("hybrid_stable_sort", strings, permute_base, [](strings_t const &strings, permute_t &permute) { - hybrid_stable_sort(strings, permute.data()); - }); + bench_permute( + "hybrid_stable_sort_cpp", + strings, + permute_base, + [](strings_t const &strings, permute_t &permute) { hybrid_stable_sort_cpp(strings, permute.data()); }); expect_sorted(strings, permute_new); expect_same(permute_base, permute_new); } From e53e1b9b9a272c40c1f5f26c2cbac12a4ad9dde2 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 17 Sep 2023 21:10:06 +0400 Subject: [PATCH 17/72] Add: Vectorized `split` for Python --- python/lib.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++- scripts/test.py | 11 +++-- 2 files changed, 129 insertions(+), 6 deletions(-) diff --git a/python/lib.c b/python/lib.c index a74f9816..bd16cb23 100644 --- a/python/lib.c +++ b/python/lib.c @@ -90,6 +90,7 @@ typedef struct { */ struct consecutive_slices_32bit_t { size_t count; + size_t separator_length; PyObject *parent; char const *start; uint32_t *offsets; @@ -101,6 +102,7 @@ typedef struct { */ struct consecutive_slices_64bit_t { size_t count; + size_t separator_length; PyObject *parent; char const *start; uint64_t *offsets; @@ -425,6 +427,125 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s return PyLong_FromLong(distance); } +static PyObject *strs_split_vectorcall(PyObject *self, PyObject *const *args, size_t nargsf, PyObject *kwnames) { + Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); + + // Validate the number of arguments + if (nargs < 1) { + PyErr_SetString(PyExc_TypeError, "sz.split() requires at least 1 argument"); + return NULL; + } + + PyObject *text_obj = args[0]; + struct sz_haystack_t text; + if (!export_string_like(text_obj, &text.start, &text.length)) { + PyErr_SetString(PyExc_TypeError, "First argument must be string-like"); + return NULL; + } + + struct sz_needle_t separator; + separator.start = " "; + separator.length = 1; + separator.anomaly_offset = 0; + int keepseparator = 0; + Py_ssize_t maxsplit = PY_SSIZE_T_MAX; + + // Parse additional positional arguments and keyword arguments + if (kwnames != NULL) { + for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) { + PyObject *key = PyTuple_GetItem(kwnames, i); + PyObject *value = args[nargs + i]; + if (PyUnicode_CompareWithASCIIString(key, "separator") == 0) { + // Assume separator is passed as a Python Unicode object + Py_ssize_t len; + separator.start = PyUnicode_AsUTF8AndSize(value, &len); + separator.length = (size_t)len; + } + else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) + maxsplit = PyLong_AsSsize_t(value); + else if (PyUnicode_CompareWithASCIIString(key, "keepseparator") == 0) + keepseparator = PyObject_IsTrue(value); + else { + PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key); + return NULL; + } + } + } + + // Create Strs object + Strs *result = (Strs *)PyObject_New(Strs, &StrsType); + if (!result) + return NULL; + + // Initialize Strs object based on the splitting logic + void *offsets = NULL; + size_t offsets_capacity = 0; + size_t offsets_count = 0; + size_t bytes_per_offset; + if (text.length >= UINT32_MAX) { + bytes_per_offset = 8; + result->type = STRS_CONSECUTIVE_64; + result->data.consecutive_64bit.start = text.start; + result->data.consecutive_64bit.parent = text_obj; + result->data.consecutive_64bit.separator_length = keepseparator * separator.length; + } + else { + bytes_per_offset = 4; + result->type = STRS_CONSECUTIVE_32; + result->data.consecutive_32bit.start = text.start; + result->data.consecutive_32bit.parent = text_obj; + result->data.consecutive_32bit.separator_length = keepseparator * separator.length; + } + + // Iterate through string, keeping track of the + sz_size_t last_start = 0; + while (last_start < text.length && offsets_count < maxsplit) { + sz_haystack_t text_remaining; + text_remaining.start = text.start + last_start; + text_remaining.length = text.length - last_start; + sz_size_t offset_in_remaining = sz_neon_find_substr(text_remaining, separator); + + // Reallocate offsets array if needed + if (offsets_count >= offsets_capacity) { + offsets_capacity = (offsets_capacity + 1) * 2; + void *new_offsets = realloc(offsets, offsets_capacity * bytes_per_offset); + if (!new_offsets) { + if (offsets) + free(offsets); + } + offsets = new_offsets; + } + + // If the memory allocation has failed - discard the response + if (!offsets) { + Py_XDECREF(result); + PyErr_NoMemory(); + return NULL; + } + + // Export the offset + if (text.length >= UINT32_MAX) + ((uint64_t *)offsets)[offsets_count++] = (uint64_t)(last_start + offset_in_remaining); + else + ((uint32_t *)offsets)[offsets_count++] = (uint32_t)(last_start + offset_in_remaining); + + // Next time we want to start + last_start = last_start + offset_in_remaining + separator.length; + } + + // Populate the Strs object with the offsets + if (text.length >= UINT32_MAX) { + result->data.consecutive_64bit.offsets = offsets; + result->data.consecutive_64bit.count = offsets_count; + } + else { + result->data.consecutive_32bit.offsets = offsets; + result->data.consecutive_32bit.count = offsets_count; + } + + return (PyObject *)result; +} + #pragma endregion #pragma region MemoryMappingFile @@ -881,7 +1002,6 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { } // Initialize temporary_memory, if needed - // For example, allocate an initial chunk temporary_memory.start = malloc(4096); temporary_memory.length = 4096 * (temporary_memory.start != NULL); atexit(cleanup_module); @@ -892,7 +1012,7 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { PyObject *vectorized_count = register_vectorcall(m, "count", str_count_vectorcall); PyObject *vectorized_levenstein = register_vectorcall(m, "levenstein", str_levenstein_vectorcall); - PyObject *vectorized_split = register_vectorcall(m, "split", str_find_vectorcall); + PyObject *vectorized_split = register_vectorcall(m, "split", strs_split_vectorcall); PyObject *vectorized_sort = register_vectorcall(m, "sort", str_find_vectorcall); PyObject *vectorized_shuffle = register_vectorcall(m, "shuffle", str_find_vectorcall); diff --git a/scripts/test.py b/scripts/test.py index c3f70523..0c5d095f 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -119,10 +119,13 @@ def test_rich_comparisons(): # assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True)) -# def test_split_keepseparator(): -# native = "word1 word2 word3" -# big = Str(native) -# assert ["word1 ", "word2 ", "word3"] == list(big.split(" ", keepseparator=True)) +def test_split_keepseparator(): + native = "word1 word2 word3" + big = Str(native) + words = sz.split(big, " ") + parts = sz.split(big, " ", keepseparator=True) + # assert words[0] == "word1" + # assert parts[0] == "word1 " # def test_strs_operations(): From 6f6b389c363cfc4a1a1ef89a560796430b23b709 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 18 Sep 2023 18:53:32 +0400 Subject: [PATCH 18/72] Add: Split into consecutive slices --- python/lib.c | 404 +++++++++++++++++++++++++++++++++--------------- scripts/test.py | 35 +++-- 2 files changed, 300 insertions(+), 139 deletions(-) diff --git a/python/lib.c b/python/lib.c index bd16cb23..38fd38af 100644 --- a/python/lib.c +++ b/python/lib.c @@ -87,25 +87,35 @@ typedef struct { /** * Simple structure resembling Apache Arrow arrays of variable length strings. * When you split a `Str`, that is under 4 GB in size, this is used for space-efficiency. + * The `end_offsets` contains `count`-many integers marking the end offset of part at a given + * index. The length of consecutive elements can be determined as the difference in consecutive + * offsets. The starting offset of the first element is zero bytes after the `start`. + * Every chunk will include a separator of length `separator_length` at the end, except for the + * last one. */ struct consecutive_slices_32bit_t { size_t count; size_t separator_length; PyObject *parent; char const *start; - uint32_t *offsets; + uint32_t *end_offsets; } consecutive_32bit; /** * Simple structure resembling Apache Arrow arrays of variable length strings. * When you split a `Str`, over 4 GB long, this structure is used to indicate chunk offsets. + * The `end_offsets` contains `count`-many integers marking the end offset of part at a given + * index. The length of consecutive elements can be determined as the difference in consecutive + * offsets. The starting offset of the first element is zero bytes after the `start`. + * Every chunk will include a separator of length `separator_length` at the end, except for the + * last one. */ struct consecutive_slices_64bit_t { size_t count; size_t separator_length; PyObject *parent; char const *start; - uint64_t *offsets; + uint64_t *end_offsets; } consecutive_64bit; /** @@ -199,50 +209,72 @@ int export_string_like(PyObject *object, char const **start, size_t *length) { static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); - // Initialize defaults - Py_ssize_t start = 0; - Py_ssize_t end = PY_SSIZE_T_MAX; - - // Parse positional arguments: haystack and needle if (nargs < 2) { PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); return NULL; } + // Initialize with default values or positional arguments PyObject *haystack_obj = args[0]; PyObject *needle_obj = args[1]; - struct sz_haystack_t haystack; - struct sz_needle_t needle; - needle.anomaly_offset = 0; - if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || - !export_string_like(needle_obj, &needle.start, &needle.length)) { - PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); - return NULL; - } - - // Parse additional positional arguments - if (nargs > 2) - start = PyLong_AsSsize_t(args[2]); - if (nargs > 3) - end = PyLong_AsSsize_t(args[3]); + PyObject *start_obj = (nargs > 2) ? args[2] : NULL; + PyObject *end_obj = (nargs > 3) ? args[3] : NULL; - // Parse keyword arguments + // Parse keyword arguments to overwrite positional ones if (kwnames != NULL) { for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) { PyObject *key = PyTuple_GetItem(kwnames, i); PyObject *value = args[nargs + i]; if (PyUnicode_CompareWithASCIIString(key, "start") == 0) - start = PyLong_AsSsize_t(value); + start_obj = value; else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) - end = PyLong_AsSsize_t(value); + end_obj = value; else { PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key); return NULL; } + if (PyErr_Occurred()) + return NULL; } } - // Limit the haystack range + struct sz_haystack_t haystack; + struct sz_needle_t needle; + Py_ssize_t start, end; + + // Validate and convert `haystack` and `needle` + needle.anomaly_offset = 0; + if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || + !export_string_like(needle_obj, &needle.start, &needle.length)) { + PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); + return NULL; + } + + // Validate and convert `start` + if (start_obj) { + start = PyLong_AsSsize_t(start_obj); + if (start == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The start argument must be an integer"); + return NULL; + } + } + else { + start = 0; + } + + // Validate and convert `end` + if (end_obj) { + end = PyLong_AsSsize_t(end_obj); + if (end == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The end argument must be an integer"); + return NULL; + } + } + else { + end = PY_SSIZE_T_MAX; + } + + // Limit the `haystack` range size_t normalized_offset, normalized_length; slice(haystack.length, start, end, &normalized_offset, &normalized_length); haystack.start += normalized_offset; @@ -273,12 +305,7 @@ static PyObject *str_contains_vectorcall(PyObject *_, PyObject *const *args, siz static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); - // Initialize defaults - Py_ssize_t start = 0; - Py_ssize_t end = PY_SSIZE_T_MAX; - int allow_overlap = 0; - - // Parse positional arguments: haystack and needle + // Initialize with default values or positional arguments if (nargs < 2) { PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); return NULL; @@ -286,40 +313,79 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t PyObject *haystack_obj = args[0]; PyObject *needle_obj = args[1]; + PyObject *start_obj = (nargs > 2) ? args[2] : NULL; + PyObject *end_obj = (nargs > 3) ? args[3] : NULL; + PyObject *allowoverlap_obj = (nargs > 4) ? args[4] : NULL; - struct sz_haystack_t haystack; - struct sz_needle_t needle; - needle.anomaly_offset = 0; - if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || - !export_string_like(needle_obj, &needle.start, &needle.length)) { - PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); - return NULL; - } - - // Parse additional positional arguments - if (nargs > 2) - start = PyLong_AsSsize_t(args[2]); - if (nargs > 3) - end = PyLong_AsSsize_t(args[3]); - - // Parse keyword arguments + // Parse keyword arguments to overwrite positional ones if (kwnames != NULL) { for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) { PyObject *key = PyTuple_GetItem(kwnames, i); PyObject *value = args[nargs + i]; if (PyUnicode_CompareWithASCIIString(key, "start") == 0) - start = PyLong_AsSsize_t(value); + start_obj = value; else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) - end = PyLong_AsSsize_t(value); + end_obj = value; else if (PyUnicode_CompareWithASCIIString(key, "allowoverlap") == 0) - allow_overlap = PyObject_IsTrue(value); + allowoverlap_obj = value; else { PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key); return NULL; } + if (PyErr_Occurred()) + return NULL; } } + struct sz_haystack_t haystack; + struct sz_needle_t needle; + int allowoverlap; + Py_ssize_t start, end; + + // Validate and convert `haystack` and `needle` + needle.anomaly_offset = 0; + if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || + !export_string_like(needle_obj, &needle.start, &needle.length)) { + PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); + return NULL; + } + + // Validate and convert `start` + if (start_obj) { + start = PyLong_AsSsize_t(start_obj); + if (start == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The start argument must be an integer"); + return NULL; + } + } + else { + start = 0; + } + + // Validate and convert `end` + if (end_obj) { + end = PyLong_AsSsize_t(end_obj); + if (end == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The end argument must be an integer"); + return NULL; + } + } + else { + end = PY_SSIZE_T_MAX; + } + + // Validate and convert `allowoverlap` + if (allowoverlap_obj) { + allowoverlap = PyObject_IsTrue(allowoverlap_obj); + if (allowoverlap == -1) { + PyErr_SetString(PyExc_TypeError, "The allowoverlap argument must be a boolean"); + return NULL; + } + } + else { + allowoverlap = 0; + } + // Limit the haystack range size_t normalized_offset, normalized_length; slice(haystack.length, start, end, &normalized_offset, &normalized_length); @@ -333,7 +399,7 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t } else { // Your existing logic for count_substr can be embedded here - if (allow_overlap) { + if (allowoverlap) { while (haystack.length) { size_t offset = sz_neon_find_substr(haystack, needle); int found = offset != haystack.length; @@ -427,50 +493,92 @@ static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, s return PyLong_FromLong(distance); } -static PyObject *strs_split_vectorcall(PyObject *self, PyObject *const *args, size_t nargsf, PyObject *kwnames) { +static PyObject *strs_split_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); - // Validate the number of arguments if (nargs < 1) { PyErr_SetString(PyExc_TypeError, "sz.split() requires at least 1 argument"); return NULL; } + // Initialize with default values or positional arguments PyObject *text_obj = args[0]; - struct sz_haystack_t text; - if (!export_string_like(text_obj, &text.start, &text.length)) { - PyErr_SetString(PyExc_TypeError, "First argument must be string-like"); - return NULL; - } + PyObject *separator_obj = (nargs > 1) ? args[1] : NULL; + PyObject *maxsplit_obj = (nargs > 2) ? args[2] : NULL; + PyObject *keepseparator_obj = (nargs > 3) ? args[3] : NULL; - struct sz_needle_t separator; - separator.start = " "; - separator.length = 1; - separator.anomaly_offset = 0; - int keepseparator = 0; - Py_ssize_t maxsplit = PY_SSIZE_T_MAX; - - // Parse additional positional arguments and keyword arguments + // Parse keyword arguments to overwrite positional ones if (kwnames != NULL) { for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) { PyObject *key = PyTuple_GetItem(kwnames, i); PyObject *value = args[nargs + i]; - if (PyUnicode_CompareWithASCIIString(key, "separator") == 0) { - // Assume separator is passed as a Python Unicode object - Py_ssize_t len; - separator.start = PyUnicode_AsUTF8AndSize(value, &len); - separator.length = (size_t)len; - } + + if (PyUnicode_CompareWithASCIIString(key, "separator") == 0) + separator_obj = value; else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) - maxsplit = PyLong_AsSsize_t(value); + maxsplit_obj = value; else if (PyUnicode_CompareWithASCIIString(key, "keepseparator") == 0) - keepseparator = PyObject_IsTrue(value); + keepseparator_obj = value; else { PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key); return NULL; } + + // Check for errors during conversion + if (PyErr_Occurred()) + return NULL; + } + } + + struct sz_haystack_t text; + struct sz_needle_t separator; + int keepseparator; + Py_ssize_t maxsplit; + separator.anomaly_offset = 0; + + // Validate and convert `text` + if (!export_string_like(text_obj, &text.start, &text.length)) { + PyErr_SetString(PyExc_TypeError, "The text argument must be string-like"); + return NULL; + } + + // Validate and convert `separator` + if (separator_obj) { + Py_ssize_t len; + if (!export_string_like(separator_obj, &separator.start, &len)) { + PyErr_SetString(PyExc_TypeError, "The separator argument must be string-like"); + return NULL; + } + separator.length = (size_t)len; + } + else { + separator.start = " "; + separator.length = 1; + } + + // Validate and convert `keepseparator` + if (keepseparator_obj) { + keepseparator = PyObject_IsTrue(keepseparator_obj); + if (keepseparator == -1) { + PyErr_SetString(PyExc_TypeError, "The keepseparator argument must be a boolean"); + return NULL; + } + } + else { + keepseparator = 0; + } + + // Validate and convert `maxsplit` + if (maxsplit_obj) { + maxsplit = PyLong_AsSsize_t(maxsplit_obj); + if (maxsplit == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The maxsplit argument must be an integer"); + return NULL; } } + else { + maxsplit = PY_SSIZE_T_MAX; + } // Create Strs object Strs *result = (Strs *)PyObject_New(Strs, &StrsType); @@ -478,7 +586,7 @@ static PyObject *strs_split_vectorcall(PyObject *self, PyObject *const *args, si return NULL; // Initialize Strs object based on the splitting logic - void *offsets = NULL; + void *offsets_endings = NULL; size_t offsets_capacity = 0; size_t offsets_count = 0; size_t bytes_per_offset; @@ -487,14 +595,14 @@ static PyObject *strs_split_vectorcall(PyObject *self, PyObject *const *args, si result->type = STRS_CONSECUTIVE_64; result->data.consecutive_64bit.start = text.start; result->data.consecutive_64bit.parent = text_obj; - result->data.consecutive_64bit.separator_length = keepseparator * separator.length; + result->data.consecutive_64bit.separator_length = !keepseparator * separator.length; } else { bytes_per_offset = 4; result->type = STRS_CONSECUTIVE_32; result->data.consecutive_32bit.start = text.start; result->data.consecutive_32bit.parent = text_obj; - result->data.consecutive_32bit.separator_length = keepseparator * separator.length; + result->data.consecutive_32bit.separator_length = !keepseparator * separator.length; } // Iterate through string, keeping track of the @@ -508,26 +616,28 @@ static PyObject *strs_split_vectorcall(PyObject *self, PyObject *const *args, si // Reallocate offsets array if needed if (offsets_count >= offsets_capacity) { offsets_capacity = (offsets_capacity + 1) * 2; - void *new_offsets = realloc(offsets, offsets_capacity * bytes_per_offset); + void *new_offsets = realloc(offsets_endings, offsets_capacity * bytes_per_offset); if (!new_offsets) { - if (offsets) - free(offsets); + if (offsets_endings) + free(offsets_endings); } - offsets = new_offsets; + offsets_endings = new_offsets; } // If the memory allocation has failed - discard the response - if (!offsets) { + if (!offsets_endings) { Py_XDECREF(result); PyErr_NoMemory(); return NULL; } // Export the offset + size_t will_continue = offset_in_remaining != text_remaining.length; + size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue; if (text.length >= UINT32_MAX) - ((uint64_t *)offsets)[offsets_count++] = (uint64_t)(last_start + offset_in_remaining); + ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; else - ((uint32_t *)offsets)[offsets_count++] = (uint32_t)(last_start + offset_in_remaining); + ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; // Next time we want to start last_start = last_start + offset_in_remaining + separator.length; @@ -535,14 +645,15 @@ static PyObject *strs_split_vectorcall(PyObject *self, PyObject *const *args, si // Populate the Strs object with the offsets if (text.length >= UINT32_MAX) { - result->data.consecutive_64bit.offsets = offsets; + result->data.consecutive_64bit.end_offsets = offsets_endings; result->data.consecutive_64bit.count = offsets_count; } else { - result->data.consecutive_32bit.offsets = offsets; + result->data.consecutive_32bit.end_offsets = offsets_endings; result->data.consecutive_32bit.count = offsets_count; } + Py_INCREF(text_obj); return (PyObject *)result; } @@ -735,10 +846,10 @@ static void Str_dealloc(Str *self) { static PyObject *Str_str(Str *self) { return PyUnicode_FromStringAndSize(self->start, self->length); } -static Py_ssize_t Str_len(Str *self) { return self->length; } - static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash_crc32_native(self->start, self->length); } +static Py_ssize_t Str_len(Str *self) { return self->length; } + static PyObject *Str_getitem(Str *self, Py_ssize_t i) { // Negative indexing @@ -807,50 +918,80 @@ static int Str_contains(Str *self, PyObject *arg) { return position != haystack.length; } -static PyObject *Str_getslice(Str *self, PyObject *args) { - PyObject *start_obj = NULL, *end_obj = NULL; - ssize_t start = 0, end = self->length; // Default values +static Py_ssize_t Strs_len(Strs *self) { + switch (self->type) { + case STRS_CONSECUTIVE_32: return self->data.consecutive_32bit.count; + case STRS_CONSECUTIVE_64: return self->data.consecutive_64bit.count; + case STRS_REORDERED: return self->data.reordered.count; + case STRS_MULTI_SOURCE: return self->data.multi_source.count; + default: return 0; + } +} - if (!PyArg_ParseTuple(args, "|OO", &start_obj, &end_obj)) +static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) { + // Check for negative index and convert to positive + Py_ssize_t count = Strs_len(self); + if (i < 0) + i += count; + if (i < 0 || i >= count) { + PyErr_SetString(PyExc_IndexError, "Index out of range"); return NULL; - - if (start_obj != NULL && start_obj != Py_None) { - if (!PyLong_Check(start_obj)) { - PyErr_SetString(PyExc_TypeError, "Start index must be an integer or None"); - return NULL; - } - start = PyLong_AsSsize_t(start_obj); } - if (end_obj != NULL && end_obj != Py_None) { - if (!PyLong_Check(end_obj)) { - PyErr_SetString(PyExc_TypeError, "End index must be an integer or None"); - return NULL; - } - end = PyLong_AsSsize_t(end_obj); + PyObject *parent = NULL; + char const *start = NULL; + size_t length = 0; + + // Extract a member element based on + switch (self->type) { + case STRS_CONSECUTIVE_32: { + uint32_t start_offset = (i == 0) ? 0 : self->data.consecutive_32bit.end_offsets[i - 1]; + uint32_t end_offset = self->data.consecutive_32bit.end_offsets[i]; + start = self->data.consecutive_32bit.start + start_offset; + length = end_offset - start_offset - self->data.consecutive_32bit.separator_length * (i + 1 != count); + parent = self->data.consecutive_32bit.parent; + break; + } + case STRS_CONSECUTIVE_64: { + uint64_t start_offset = (i == 0) ? 0 : self->data.consecutive_64bit.end_offsets[i - 1]; + uint64_t end_offset = self->data.consecutive_64bit.end_offsets[i]; + start = self->data.consecutive_64bit.start + start_offset; + length = end_offset - start_offset - self->data.consecutive_64bit.separator_length * (i + 1 != count); + parent = self->data.consecutive_64bit.parent; + break; + } + case STRS_REORDERED: { + // + break; + } + case STRS_MULTI_SOURCE: { + // + break; + } + default: PyErr_SetString(PyExc_TypeError, "Unknown Strs kind"); return NULL; } - size_t normalized_offset, normalized_length; - slice(self->length, start, end, &normalized_offset, &normalized_length); - - if (normalized_length == 0) - return PyUnicode_FromString(""); - - // Create a new Str object - Str *new_str = (Str *)PyObject_New(Str, &StrType); - if (new_str == NULL) + // Create a new `Str` object + Str *parent_slice = (Str *)StrType.tp_alloc(&StrType, 0); + if (parent_slice == NULL && PyErr_NoMemory()) return NULL; - // Set the parent to the original Str object and increment its reference count - new_str->parent = (PyObject *)self; - Py_INCREF(self); + parent_slice->start = start; + parent_slice->length = length; + parent_slice->parent = parent; + Py_INCREF(parent); + return parent_slice; +} - // Set the start and length to point to the slice - new_str->start = self->start + normalized_offset; - new_str->length = normalized_length; - return (PyObject *)new_str; +static PyObject *Strs_subscript(Str *self, PyObject *key) { + if (PyLong_Check(key)) + return Strs_getitem(self, PyLong_AsSsize_t(key)); + return NULL; } +// Will be called by the `PySequence_Contains` +static int Strs_contains(Str *self, PyObject *arg) { return 0; } + static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) { char const *a_start, *b_start; @@ -888,7 +1029,7 @@ static PyMappingMethods Str_as_mapping = { .mp_subscript = Str_subscript, // Is used to implement slices in Python }; -static PyMethodDef Str_methods[] = { // +static PyMethodDef Str_methods[] = { // {"contains", (PyCFunction)..., METH_NOARGS, "Convert to Python `str`"}, // {"find", (PyCFunction)..., METH_NOARGS, "Get length"}, // {"__getitem__", (PyCFunction)..., METH_O, "Indexing"}, @@ -911,6 +1052,17 @@ static PyTypeObject StrType = { // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer }; +static PySequenceMethods Strs_as_sequence = { + .sq_length = Strs_len, // + .sq_item = Strs_getitem, // + .sq_contains = Strs_contains, // +}; + +static PyMappingMethods Strs_as_mapping = { + .mp_length = Strs_len, // + .mp_subscript = Strs_subscript, // Is used to implement slices in Python +}; + static PyTypeObject StrsType = { PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.Strs", .tp_doc = "Space-efficient container for large collections of strings and their slices", @@ -918,6 +1070,8 @@ static PyTypeObject StrsType = { .tp_itemsize = 0, .tp_flags = Py_TPFLAGS_DEFAULT, .tp_new = PyType_GenericNew, + .tp_as_sequence = &Strs_as_sequence, + .tp_as_mapping = &Strs_as_mapping, }; #pragma endregion @@ -956,7 +1110,8 @@ PyObject *register_vectorcall(PyObject *module, char const *name, vectorcallfunc } void cleanup_module(void) { - free(temporary_memory.start); + if (temporary_memory.start) + free(temporary_memory.start); temporary_memory.start = NULL; temporary_memory.length = 0; } @@ -1004,7 +1159,7 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { // Initialize temporary_memory, if needed temporary_memory.start = malloc(4096); temporary_memory.length = 4096 * (temporary_memory.start != NULL); - atexit(cleanup_module); + // atexit(cleanup_module); // Register the vectorized functions PyObject *vectorized_find = register_vectorcall(m, "find", str_find_vectorcall); @@ -1043,6 +1198,5 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { Py_XDECREF(&FileType); Py_XDECREF(&StrType); Py_XDECREF(m); - PyErr_NoMemory(); return NULL; } diff --git a/scripts/test.py b/scripts/test.py index 0c5d095f..1fc2193f 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -38,17 +38,17 @@ def test_indexing(): assert big[i] == native[i] -def test_contains(): - big = Str("abcdef") - assert "a" in big - assert "ab" in big - assert "xxx" not in big +# def test_contains(): +# big = Str("abcdef") +# assert "a" in big +# assert "ab" in big +# assert "xxx" not in big -def test_rich_comparisons(): - assert Str("aa") == "aa" - assert Str("aa") < "b" - assert Str("abb")[1:] == "bb" +# def test_rich_comparisons(): +# assert Str("aa") == "aa" +# assert Str("aa") < "b" +# assert Str("abb")[1:] == "bb" # def get_random_string( @@ -120,12 +120,19 @@ def test_rich_comparisons(): def test_split_keepseparator(): - native = "word1 word2 word3" + native = "word1_word2_word3" big = Str(native) - words = sz.split(big, " ") - parts = sz.split(big, " ", keepseparator=True) - # assert words[0] == "word1" - # assert parts[0] == "word1 " + + words = sz.split(big, "_") + assert len(words) == 3 + + parts = sz.split(big, "_", keepseparator=True) + assert len(parts) == 3 + + assert str(words[0]) == "word1" + assert str(parts[0]) == "word1_" + assert str(words[2]) == "word3" + assert str(parts[2]) == "word3" # def test_strs_operations(): From febbdf57b0d75f2ce7a0e0e6b92f91404a8a8468 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 19 Sep 2023 18:01:07 +0400 Subject: [PATCH 19/72] Improve: Same functions as global and members --- .clang-format | 6 +- .gitignore | 1 + .vscode/settings.json | 1 + python/lib.c | 494 ++++++++++++++---------------------------- scripts/test.py | 58 ++--- 5 files changed, 200 insertions(+), 360 deletions(-) diff --git a/.clang-format b/.clang-format index ab9f350a..b1adf3b0 100644 --- a/.clang-format +++ b/.clang-format @@ -16,12 +16,12 @@ AlignTrailingComments: true AllowAllArgumentsOnNextLine: false AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false +AllowShortBlocksOnASingleLine: Always AllowShortCaseLabelsOnASingleLine: true AllowShortFunctionsOnASingleLine: true -AllowShortIfStatementsOnASingleLine: Never +AllowShortIfStatementsOnASingleLine: Always AllowShortLambdasOnASingleLine: true -AllowShortLoopsOnASingleLine: false +AllowShortLoopsOnASingleLine: true AlwaysBreakBeforeMultilineStrings: true AlwaysBreakTemplateDeclarations: Yes AlwaysBreakBeforeMultilineStrings: true diff --git a/.gitignore b/.gitignore index a96d24d0..cfbdf78a 100644 --- a/.gitignore +++ b/.gitignore @@ -13,5 +13,6 @@ substr_search_cpp *.so *.egg-info *.whl +node_modules/ leipzig1M.txt \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index 48034254..b75f1ba8 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -133,6 +133,7 @@ "itemsize", "keeplinebreaks", "keepseparator", + "kwargs", "kwds", "kwnames", "levenstein", diff --git a/python/lib.c b/python/lib.c index 38fd38af..b0360866 100644 --- a/python/lib.c +++ b/python/lib.c @@ -1,6 +1,8 @@ /** * @brief Very light-weight CPython wrapper for StringZilla, with support for memory-mapping, * native Python strings, Apache Arrow collections, and more. + * + * To minimize latency this implementation avoids `PyArg_ParseTupleAndKeywords` calls. */ #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) #define NOMINMAX @@ -29,7 +31,7 @@ static PyTypeObject FileType; static PyTypeObject StrType; static PyTypeObject StrsType; -struct { +static struct { void *start; size_t length; } temporary_memory = {NULL, 0}; @@ -206,35 +208,30 @@ int export_string_like(PyObject *object, char const **start, size_t *length) { #pragma region Global Functions -static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { - Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); - - if (nargs < 2) { +static Py_ssize_t api_find_(PyObject *self, PyObject *args, PyObject *kwargs) { + int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType)); + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member + 1 || nargs > !is_member + 3) { PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); - return NULL; + return 0; } - // Initialize with default values or positional arguments - PyObject *haystack_obj = args[0]; - PyObject *needle_obj = args[1]; - PyObject *start_obj = (nargs > 2) ? args[2] : NULL; - PyObject *end_obj = (nargs > 3) ? args[3] : NULL; - - // Parse keyword arguments to overwrite positional ones - if (kwnames != NULL) { - for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) { - PyObject *key = PyTuple_GetItem(kwnames, i); - PyObject *value = args[nargs + i]; - if (PyUnicode_CompareWithASCIIString(key, "start") == 0) - start_obj = value; - else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) - end_obj = value; + PyObject *haystack_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *needle_obj = PyTuple_GET_ITEM(args, !is_member + 0); + PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; + PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; + + // Parse keyword arguments + if (kwargs) { + Py_ssize_t pos = 0; + PyObject *key, *value; + while (PyDict_Next(kwargs, &pos, &key, &value)) { + if (PyUnicode_CompareWithASCIIString(key, "start") == 0) { start_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) { end_obj = value; } else { PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key); - return NULL; + return 0; } - if (PyErr_Occurred()) - return NULL; } } @@ -247,7 +244,7 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_ if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || !export_string_like(needle_obj, &needle.start, &needle.length)) { PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); - return NULL; + return 0; } // Validate and convert `start` @@ -255,24 +252,20 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_ start = PyLong_AsSsize_t(start_obj); if (start == -1 && PyErr_Occurred()) { PyErr_SetString(PyExc_TypeError, "The start argument must be an integer"); - return NULL; + return 0; } } - else { - start = 0; - } + else { start = 0; } // Validate and convert `end` if (end_obj) { end = PyLong_AsSsize_t(end_obj); if (end == -1 && PyErr_Occurred()) { PyErr_SetString(PyExc_TypeError, "The end argument must be an integer"); - return NULL; + return 0; } } - else { - end = PY_SSIZE_T_MAX; - } + else { end = PY_SSIZE_T_MAX; } // Limit the `haystack` range size_t normalized_offset, normalized_length; @@ -282,123 +275,68 @@ static Py_ssize_t str_find_vectorcall_(PyObject *_, PyObject *const *args, size_ // Perform contains operation size_t offset = sz_neon_find_substr(haystack, needle); - if (offset == haystack.length) - return -1; + if (offset == haystack.length) return -1; return (Py_ssize_t)offset; } -static PyObject *str_find_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { - Py_ssize_t signed_offset = str_find_vectorcall_(NULL, args, nargsf, kwnames); +static PyObject *api_find(PyObject *self, PyObject *args, PyObject *kwargs) { + Py_ssize_t signed_offset = api_find_(self, args, kwargs); + if (PyErr_Occurred()) return NULL; return PyLong_FromSsize_t(signed_offset); } -static PyObject *str_contains_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { - Py_ssize_t signed_offset = str_find_vectorcall_(NULL, args, nargsf, kwnames); - if (signed_offset == -1) { - Py_RETURN_FALSE; - } - else { - Py_RETURN_TRUE; - } +static PyObject *api_contains(PyObject *self, PyObject *args, PyObject *kwargs) { + Py_ssize_t signed_offset = api_find_(self, args, kwargs); + if (PyErr_Occurred()) return NULL; + if (signed_offset == -1) { Py_RETURN_FALSE; } + else { Py_RETURN_TRUE; } } -static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { - Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); - - // Initialize with default values or positional arguments - if (nargs < 2) { - PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); +static PyObject *api_count(PyObject *self, PyObject *args, PyObject *kwargs) { + int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType)); + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member + 1 || nargs > !is_member + 4) { + PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); return NULL; } - PyObject *haystack_obj = args[0]; - PyObject *needle_obj = args[1]; - PyObject *start_obj = (nargs > 2) ? args[2] : NULL; - PyObject *end_obj = (nargs > 3) ? args[3] : NULL; - PyObject *allowoverlap_obj = (nargs > 4) ? args[4] : NULL; - - // Parse keyword arguments to overwrite positional ones - if (kwnames != NULL) { - for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) { - PyObject *key = PyTuple_GetItem(kwnames, i); - PyObject *value = args[nargs + i]; - if (PyUnicode_CompareWithASCIIString(key, "start") == 0) - start_obj = value; - else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) - end_obj = value; - else if (PyUnicode_CompareWithASCIIString(key, "allowoverlap") == 0) - allowoverlap_obj = value; - else { - PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key); - return NULL; - } - if (PyErr_Occurred()) + PyObject *haystack_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *needle_obj = PyTuple_GET_ITEM(args, !is_member + 0); + PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; + PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; + PyObject *allowoverlap_obj = nargs > !is_member + 3 ? PyTuple_GET_ITEM(args, !is_member + 3) : NULL; + + if (kwargs) { + Py_ssize_t pos = 0; + PyObject *key, *value; + while (PyDict_Next(kwargs, &pos, &key, &value)) + if (PyUnicode_CompareWithASCIIString(key, "start") == 0) { start_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) { end_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "allowoverlap") == 0) { allowoverlap_obj = value; } + else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) return NULL; - } } struct sz_haystack_t haystack; struct sz_needle_t needle; - int allowoverlap; - Py_ssize_t start, end; + Py_ssize_t start = start_obj ? PyLong_AsSsize_t(start_obj) : 0; + Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX; + int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0; - // Validate and convert `haystack` and `needle` needle.anomaly_offset = 0; if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || - !export_string_like(needle_obj, &needle.start, &needle.length)) { - PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); - return NULL; - } - - // Validate and convert `start` - if (start_obj) { - start = PyLong_AsSsize_t(start_obj); - if (start == -1 && PyErr_Occurred()) { - PyErr_SetString(PyExc_TypeError, "The start argument must be an integer"); - return NULL; - } - } - else { - start = 0; - } + !export_string_like(needle_obj, &needle.start, &needle.length)) + return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL; - // Validate and convert `end` - if (end_obj) { - end = PyLong_AsSsize_t(end_obj); - if (end == -1 && PyErr_Occurred()) { - PyErr_SetString(PyExc_TypeError, "The end argument must be an integer"); - return NULL; - } - } - else { - end = PY_SSIZE_T_MAX; - } + if ((start == -1 || end == -1 || allowoverlap == -1) && PyErr_Occurred()) return NULL; - // Validate and convert `allowoverlap` - if (allowoverlap_obj) { - allowoverlap = PyObject_IsTrue(allowoverlap_obj); - if (allowoverlap == -1) { - PyErr_SetString(PyExc_TypeError, "The allowoverlap argument must be a boolean"); - return NULL; - } - } - else { - allowoverlap = 0; - } - - // Limit the haystack range size_t normalized_offset, normalized_length; slice(haystack.length, start, end, &normalized_offset, &normalized_length); haystack.start += normalized_offset; haystack.length = normalized_length; - // Perform counting operation - size_t count = 0; - if (needle.length == 1) { - count = sz_naive_count_char(haystack, *needle.start); - } - else { - // Your existing logic for count_substr can be embedded here + size_t count = needle.length == 1 ? sz_naive_count_char(haystack, *needle.start) : 0; + if (needle.length != 1) { if (allowoverlap) { while (haystack.length) { size_t offset = sz_neon_find_substr(haystack, needle); @@ -418,114 +356,87 @@ static PyObject *str_count_vectorcall(PyObject *_, PyObject *const *args, size_t } } } - return PyLong_FromSize_t(count); } -static PyObject *str_levenstein_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { - Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); - - // Validate the number of arguments - if (nargs < 2 || nargs > 3) { - PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); - return NULL; - } - - PyObject *str1_obj = args[0]; - PyObject *str2_obj = args[1]; - - struct sz_haystack_t str1, str2; - if (!export_string_like(str1_obj, &str1.start, &str1.length) || - !export_string_like(str2_obj, &str2.start, &str2.length)) { - PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); +static PyObject *api_levenstein(PyObject *self, PyObject *args, PyObject *kwargs) { + int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType)); + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member + 1 || nargs > !is_member + 2) { + PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); return NULL; } - // Initialize bound argument - int bound = 255; + PyObject *str1_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *str2_obj = PyTuple_GET_ITEM(args, !is_member + 0); + PyObject *bound_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; - // Check if `bound` is given as a positional argument - if (nargs == 3) { - bound = PyLong_AsLong(args[2]); - if (bound > 255 || bound < 0) { - PyErr_SetString(PyExc_ValueError, "Bound must be an integer between 0 and 255"); - return NULL; - } - } - - // Parse keyword arguments - if (kwnames != NULL) { - for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) { - PyObject *key = PyTuple_GetItem(kwnames, i); - PyObject *value = args[nargs + i]; + if (kwargs) { + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(kwargs, &pos, &key, &value)) if (PyUnicode_CompareWithASCIIString(key, "bound") == 0) { - if (nargs == 3) { - PyErr_SetString(PyExc_TypeError, "Received bound both as positional and keyword argument"); - return NULL; - } - bound = PyLong_AsLong(value); - if (bound > 255 || bound < 0) { - PyErr_SetString(PyExc_ValueError, "Bound must be an integer between 0 and 255"); + if (bound_obj) { + PyErr_Format(PyExc_TypeError, "Received bound both as positional and keyword argument"); return NULL; } + bound_obj = value; } - } } - // Initialize or reallocate the Levenshtein distance matrix + int bound = 255; // Default value for bound + if (bound_obj && ((bound = PyLong_AsLong(bound_obj)) > 255 || bound < 0)) { + PyErr_Format(PyExc_ValueError, "Bound must be an integer between 0 and 255"); + return NULL; + } + + struct sz_haystack_t str1, str2; + if (!export_string_like(str1_obj, &str1.start, &str1.length) || + !export_string_like(str2_obj, &str2.start, &str2.length)) { + PyErr_Format(PyExc_TypeError, "Both arguments must be string-like"); + return NULL; + } + size_t memory_needed = sz_levenstein_memory_needed(str1.length, str2.length); if (temporary_memory.length < memory_needed) { temporary_memory.start = realloc(temporary_memory.start, memory_needed); temporary_memory.length = memory_needed; } - if (temporary_memory.start == NULL) { - PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix"); + if (!temporary_memory.start) { + PyErr_Format(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix"); return NULL; } - levenstein_distance_t distance = sz_levenstein( // - str1.start, - str1.length, - str2.start, - str2.length, - (levenstein_distance_t)bound, - temporary_memory.start); + levenstein_distance_t small_bound = (levenstein_distance_t)bound; + levenstein_distance_t distance = + sz_levenstein(str1.start, str1.length, str2.start, str2.length, small_bound, temporary_memory.start); + return PyLong_FromLong(distance); } -static PyObject *strs_split_vectorcall(PyObject *_, PyObject *const *args, size_t nargsf, PyObject *kwnames) { - Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); +static PyObject *api_split(PyObject *self, PyObject *args, PyObject *kwargs) { - if (nargs < 1) { + // Check minimum arguments + int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType)); + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member + 1 || nargs > !is_member + 3) { PyErr_SetString(PyExc_TypeError, "sz.split() requires at least 1 argument"); return NULL; } - // Initialize with default values or positional arguments - PyObject *text_obj = args[0]; - PyObject *separator_obj = (nargs > 1) ? args[1] : NULL; - PyObject *maxsplit_obj = (nargs > 2) ? args[2] : NULL; - PyObject *keepseparator_obj = (nargs > 3) ? args[3] : NULL; - - // Parse keyword arguments to overwrite positional ones - if (kwnames != NULL) { - for (Py_ssize_t i = 0; i < PyTuple_Size(kwnames); ++i) { - PyObject *key = PyTuple_GetItem(kwnames, i); - PyObject *value = args[nargs + i]; - - if (PyUnicode_CompareWithASCIIString(key, "separator") == 0) - separator_obj = value; - else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) - maxsplit_obj = value; - else if (PyUnicode_CompareWithASCIIString(key, "keepseparator") == 0) - keepseparator_obj = value; - else { - PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key); - return NULL; - } + PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *separator_obj = nargs > !is_member + 0 ? PyTuple_GET_ITEM(args, !is_member + 0) : NULL; + PyObject *maxsplit_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; + PyObject *keepseparator_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; - // Check for errors during conversion - if (PyErr_Occurred()) + if (kwargs) { + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(kwargs, &pos, &key, &value)) { + if (PyUnicode_CompareWithASCIIString(key, "separator") == 0) { separator_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) { maxsplit_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "keepseparator") == 0) { keepseparator_obj = value; } + else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) return NULL; } } @@ -564,9 +475,7 @@ static PyObject *strs_split_vectorcall(PyObject *_, PyObject *const *args, size_ return NULL; } } - else { - keepseparator = 0; - } + else { keepseparator = 0; } // Validate and convert `maxsplit` if (maxsplit_obj) { @@ -576,14 +485,11 @@ static PyObject *strs_split_vectorcall(PyObject *_, PyObject *const *args, size_ return NULL; } } - else { - maxsplit = PY_SSIZE_T_MAX; - } + else { maxsplit = PY_SSIZE_T_MAX; } // Create Strs object Strs *result = (Strs *)PyObject_New(Strs, &StrsType); - if (!result) - return NULL; + if (!result) return NULL; // Initialize Strs object based on the splitting logic void *offsets_endings = NULL; @@ -618,8 +524,7 @@ static PyObject *strs_split_vectorcall(PyObject *_, PyObject *const *args, size_ offsets_capacity = (offsets_capacity + 1) * 2; void *new_offsets = realloc(offsets_endings, offsets_capacity * bytes_per_offset); if (!new_offsets) { - if (offsets_endings) - free(offsets_endings); + if (offsets_endings) free(offsets_endings); } offsets_endings = new_offsets; } @@ -634,10 +539,8 @@ static PyObject *strs_split_vectorcall(PyObject *_, PyObject *const *args, size_ // Export the offset size_t will_continue = offset_in_remaining != text_remaining.length; size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue; - if (text.length >= UINT32_MAX) - ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; - else - ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; + if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; } + else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; } // Next time we want to start last_start = last_start + offset_in_remaining + separator.length; @@ -692,8 +595,7 @@ static void File_dealloc(File *self) { static PyObject *File_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) { File *self; self = (File *)type->tp_alloc(type, 0); - if (self == NULL) - return NULL; + if (self == NULL) return NULL; #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) self->file_handle = NULL; @@ -707,8 +609,7 @@ static PyObject *File_new(PyTypeObject *type, PyObject *positional_args, PyObjec static int File_init(File *self, PyObject *positional_args, PyObject *named_args) { const char *path; - if (!PyArg_ParseTuple(positional_args, "s", &path)) - return -1; + if (!PyArg_ParseTuple(positional_args, "s", &path)) return -1; #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) self->file_handle = CreateFile(path, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); @@ -772,17 +673,6 @@ static PyTypeObject FileType = { .tp_new = (newfunc)File_new, .tp_init = (initproc)File_init, .tp_dealloc = (destructor)File_dealloc, - - // PyBufferProcs *tp_as_buffer; - - // reprfunc tp_repr; - // PyNumberMethods *tp_as_number; - // PySequenceMethods *tp_as_sequence; - // PyMappingMethods *tp_as_mapping; - // ternaryfunc tp_call; - // reprfunc tp_str; - // getattrofunc tp_getattro; - // setattrofunc tp_setattro; }; #pragma endregion @@ -797,8 +687,7 @@ static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args) // The `named_args` would be `NULL` if (named_args) { static char *names[] = {"parent", "from", "to", NULL}; - if (!PyArg_ParseTupleAndKeywords(positional_args, named_args, "|Onn", names, &parent, &from, &to)) - return -1; + if (!PyArg_ParseTupleAndKeywords(positional_args, named_args, "|Onn", names, &parent, &from, &to)) return -1; } else if (!PyArg_ParseTuple(positional_args, "|Onn", &parent, &from, &to)) return -1; @@ -829,8 +718,7 @@ static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args) static PyObject *Str_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { Str *self; self = (Str *)type->tp_alloc(type, 0); - if (!self) - return NULL; + if (!self) return NULL; self->parent = NULL; self->start = NULL; @@ -839,8 +727,8 @@ static PyObject *Str_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { } static void Str_dealloc(Str *self) { - if (self->parent) - Py_XDECREF(self->parent); + if (self->parent) Py_XDECREF(self->parent); + self->parent = NULL; Py_TYPE(self)->tp_free((PyObject *)self); } @@ -853,8 +741,7 @@ static Py_ssize_t Str_len(Str *self) { return self->length; } static PyObject *Str_getitem(Str *self, Py_ssize_t i) { // Negative indexing - if (i < 0) - i += self->length; + if (i < 0) i += self->length; if (i < 0 || (size_t)i >= self->length) { PyErr_SetString(PyExc_IndexError, "Index out of range"); @@ -867,12 +754,10 @@ static PyObject *Str_getitem(Str *self, Py_ssize_t i) { static PyObject *Str_subscript(Str *self, PyObject *key) { if (PySlice_Check(key)) { + // Sanity checks Py_ssize_t start, stop, step; - if (PySlice_Unpack(key, &start, &stop, &step) < 0) - return NULL; - if (PySlice_AdjustIndices(self->length, &start, &stop, step) < 0) - return NULL; - + if (PySlice_Unpack(key, &start, &stop, &step) < 0) return NULL; + if (PySlice_AdjustIndices(self->length, &start, &stop, step) < 0) return NULL; if (step != 1) { PyErr_SetString(PyExc_IndexError, "Efficient step is not supported"); return NULL; @@ -880,8 +765,7 @@ static PyObject *Str_subscript(Str *self, PyObject *key) { // Create a new `Str` object Str *self_slice = (Str *)StrType.tp_alloc(&StrType, 0); - if (self_slice == NULL && PyErr_NoMemory()) - return NULL; + if (self_slice == NULL && PyErr_NoMemory()) return NULL; // Set its properties based on the slice self_slice->start = self->start + start; @@ -892,9 +776,7 @@ static PyObject *Str_subscript(Str *self, PyObject *key) { Py_INCREF(self); return (PyObject *)self_slice; } - else if (PyLong_Check(key)) { - return Str_getitem(self, PyLong_AsSsize_t(key)); - } + else if (PyLong_Check(key)) { return Str_getitem(self, PyLong_AsSsize_t(key)); } else { PyErr_SetString(PyExc_TypeError, "Str indices must be integers or slices"); return NULL; @@ -931,8 +813,7 @@ static Py_ssize_t Strs_len(Strs *self) { static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) { // Check for negative index and convert to positive Py_ssize_t count = Strs_len(self); - if (i < 0) - i += count; + if (i < 0) i += count; if (i < 0 || i >= count) { PyErr_SetString(PyExc_IndexError, "Index out of range"); return NULL; @@ -973,8 +854,7 @@ static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) { // Create a new `Str` object Str *parent_slice = (Str *)StrType.tp_alloc(&StrType, 0); - if (parent_slice == NULL && PyErr_NoMemory()) - return NULL; + if (parent_slice == NULL && PyErr_NoMemory()) return NULL; parent_slice->start = start; parent_slice->length = length; @@ -984,8 +864,7 @@ static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) { } static PyObject *Strs_subscript(Str *self, PyObject *key) { - if (PyLong_Check(key)) - return Strs_getitem(self, PyLong_AsSsize_t(key)); + if (PyLong_Check(key)) return Strs_getitem(self, PyLong_AsSsize_t(key)); return NULL; } @@ -1004,8 +883,7 @@ static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) { int cmp_result = memcmp(a_start, b_start, min_length); // If the strings are equal up to `min_length`, then the shorter string is smaller - if (cmp_result == 0) - cmp_result = (a_length > b_length) - (a_length < b_length); + if (cmp_result == 0) cmp_result = (a_length > b_length) - (a_length < b_length); switch (op) { case Py_LT: return PyBool_FromLong(cmp_result < 0); @@ -1029,10 +907,14 @@ static PyMappingMethods Str_as_mapping = { .mp_subscript = Str_subscript, // Is used to implement slices in Python }; -static PyMethodDef Str_methods[] = { - // {"contains", (PyCFunction)..., METH_NOARGS, "Convert to Python `str`"}, - // {"find", (PyCFunction)..., METH_NOARGS, "Get length"}, - // {"__getitem__", (PyCFunction)..., METH_O, "Indexing"}, +#define sz_method_flags_m METH_VARARGS | METH_KEYWORDS + +static PyMethodDef Str_methods[] = { // + {"find", api_find, sz_method_flags_m, "Find the first occurrence of a substring."}, + {"contains", api_contains, sz_method_flags_m, "Check if a string contains a substring."}, + {"count", api_count, sz_method_flags_m, "Count the occurrences of a substring."}, + {"levenstein", api_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, + {"split", api_split, sz_method_flags_m, "Split a string by a separator."}, {NULL, NULL, 0, NULL}}; static PyTypeObject StrType = { @@ -1076,8 +958,20 @@ static PyTypeObject StrsType = { #pragma endregion -static PyMethodDef stringzilla_methods[] = { // - {NULL, NULL, 0, NULL}}; +static void stringzilla_cleanup(PyObject *m) { + if (temporary_memory.start) free(temporary_memory.start); + temporary_memory.start = NULL; + temporary_memory.length = 0; +} + +static PyMethodDef stringzilla_methods[] = { + {"find", api_find, sz_method_flags_m, "Find the first occurrence of a substring."}, + {"contains", api_contains, sz_method_flags_m, "Check if a string contains a substring."}, + {"count", api_count, sz_method_flags_m, "Count the occurrences of a substring."}, + {"levenstein", api_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, + {"split", api_split, sz_method_flags_m, "Split a string by a separator."}, + {NULL, NULL, 0, NULL} /* Sentinel */ +}; static PyModuleDef stringzilla_module = { PyModuleDef_HEAD_INIT, @@ -1088,49 +982,18 @@ static PyModuleDef stringzilla_module = { NULL, NULL, NULL, - NULL, + stringzilla_cleanup, }; -PyObject *register_vectorcall(PyObject *module, char const *name, vectorcallfunc vectorcall) { - - PyCFunctionObject *vectorcall_object = (PyCFunctionObject *)PyObject_Malloc(sizeof(PyCFunctionObject)); - if (vectorcall_object == NULL) - return NULL; - - PyObject_Init(vectorcall_object, &PyCFunction_Type); - vectorcall_object->m_ml = NULL; // No regular `PyMethodDef` - vectorcall_object->vectorcall = vectorcall; - - // Add the 'find' function to the module - if (PyModule_AddObject(module, name, vectorcall_object) < 0) { - Py_XDECREF(vectorcall_object); - return NULL; - } - return vectorcall_object; -} - -void cleanup_module(void) { - if (temporary_memory.start) - free(temporary_memory.start); - temporary_memory.start = NULL; - temporary_memory.length = 0; -} - PyMODINIT_FUNC PyInit_stringzilla(void) { PyObject *m; - if (PyType_Ready(&StrType) < 0) - return NULL; - - if (PyType_Ready(&FileType) < 0) - return NULL; - - if (PyType_Ready(&StrsType) < 0) - return NULL; + if (PyType_Ready(&StrType) < 0) return NULL; + if (PyType_Ready(&FileType) < 0) return NULL; + if (PyType_Ready(&StrsType) < 0) return NULL; m = PyModule_Create(&stringzilla_module); - if (m == NULL) - return NULL; + if (m == NULL) return NULL; Py_INCREF(&StrType); if (PyModule_AddObject(m, "Str", (PyObject *)&StrType) < 0) { @@ -1159,42 +1022,9 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { // Initialize temporary_memory, if needed temporary_memory.start = malloc(4096); temporary_memory.length = 4096 * (temporary_memory.start != NULL); - // atexit(cleanup_module); - - // Register the vectorized functions - PyObject *vectorized_find = register_vectorcall(m, "find", str_find_vectorcall); - PyObject *vectorized_contains = register_vectorcall(m, "contains", str_contains_vectorcall); - PyObject *vectorized_count = register_vectorcall(m, "count", str_count_vectorcall); - PyObject *vectorized_levenstein = register_vectorcall(m, "levenstein", str_levenstein_vectorcall); - - PyObject *vectorized_split = register_vectorcall(m, "split", strs_split_vectorcall); - PyObject *vectorized_sort = register_vectorcall(m, "sort", str_find_vectorcall); - PyObject *vectorized_shuffle = register_vectorcall(m, "shuffle", str_find_vectorcall); - - if (!vectorized_find || !vectorized_count || // - !vectorized_contains || !vectorized_levenstein || // - !vectorized_split || !vectorized_sort || !vectorized_shuffle) { - goto cleanup; - } - return m; cleanup: - if (vectorized_find) - Py_XDECREF(vectorized_find); - if (vectorized_contains) - Py_XDECREF(vectorized_contains); - if (vectorized_count) - Py_XDECREF(vectorized_count); - if (vectorized_levenstein) - Py_XDECREF(vectorized_levenstein); - if (vectorized_split) - Py_XDECREF(vectorized_split); - if (vectorized_sort) - Py_XDECREF(vectorized_sort); - if (vectorized_shuffle) - Py_XDECREF(vectorized_shuffle); - Py_XDECREF(&FileType); Py_XDECREF(&StrType); Py_XDECREF(m); diff --git a/scripts/test.py b/scripts/test.py index 1fc2193f..8163e0e5 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -25,6 +25,22 @@ def test_globals(): assert sz.levenstein("abababab", "aaaaaaaa", bound=2) == 2 +def test_split_keepseparator(): + native = "word1_word2_word3" + big = Str(native) + + words = sz.split(big, "_") + assert len(words) == 3 + + parts = sz.split(big, "_", keepseparator=True) + assert len(parts) == 3 + + assert str(words[0]) == "word1" + assert str(parts[0]) == "word1_" + assert str(words[2]) == "word3" + assert str(parts[2]) == "word3" + + def test_construct(): native = "aaaaa" big = Str(native) @@ -38,17 +54,25 @@ def test_indexing(): assert big[i] == native[i] -# def test_contains(): -# big = Str("abcdef") -# assert "a" in big -# assert "ab" in big -# assert "xxx" not in big +def test_count(): + native = "aaaaa" + big = Str(native) + assert big.count("a") == 5 + assert big.count("aa") == 2 + assert big.count("aa", allowoverlap=True) == 4 + + +def test_contains(): + big = Str("abcdef") + assert "a" in big + assert "ab" in big + assert "xxx" not in big -# def test_rich_comparisons(): -# assert Str("aa") == "aa" -# assert Str("aa") < "b" -# assert Str("abb")[1:] == "bb" +def test_rich_comparisons(): + assert Str("aa") == "aa" + assert Str("aa") < "b" + assert Str("abb")[1:] == "bb" # def get_random_string( @@ -119,22 +143,6 @@ def test_indexing(): # assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True)) -def test_split_keepseparator(): - native = "word1_word2_word3" - big = Str(native) - - words = sz.split(big, "_") - assert len(words) == 3 - - parts = sz.split(big, "_", keepseparator=True) - assert len(parts) == 3 - - assert str(words[0]) == "word1" - assert str(parts[0]) == "word1_" - assert str(words[2]) == "word3" - assert str(parts[2]) == "word3" - - # def test_strs_operations(): # native = "line1\nline2\nline3" # big = Str(native) From 7dc3b2821b7b5642cb3221b306ab559af31f8189 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 19 Sep 2023 18:26:36 +0400 Subject: [PATCH 20/72] Add: Buffer protocol support --- .vscode/settings.json | 2 ++ python/lib.c | 48 +++++++++++++++++++++++++++++++------ scripts/test.py | 56 +++++++++++++++++++++++++------------------ 3 files changed, 76 insertions(+), 30 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 2ee49d0a..6fa841e1 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -142,6 +142,7 @@ "MODINIT", "napi", "nargsf", + "ndim", "newfunc", "NOARGS", "NOMINMAX", @@ -149,6 +150,7 @@ "pytest", "quadgram", "readlines", + "releasebuffer", "richcompare", "SIMD", "splitlines", diff --git a/python/lib.c b/python/lib.c index b0360866..8035e0ba 100644 --- a/python/lib.c +++ b/python/lib.c @@ -2,7 +2,8 @@ * @brief Very light-weight CPython wrapper for StringZilla, with support for memory-mapping, * native Python strings, Apache Arrow collections, and more. * - * To minimize latency this implementation avoids `PyArg_ParseTupleAndKeywords` calls. + * - Doesn't use PyBind11, NanoBind, Boost.Python, or any other high-level libs, only CPython API. + * - To minimize latency this implementation avoids `PyArg_ParseTupleAndKeywords` calls. */ #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) #define NOMINMAX @@ -783,7 +784,35 @@ static PyObject *Str_subscript(Str *self, PyObject *key) { } } -// Will be called by the `PySequence_Contains` +static int Str_getbuffer(Str *self, Py_buffer *view, int flags) { + if (view == NULL) { + PyErr_SetString(PyExc_ValueError, "NULL view in getbuffer"); + return -1; + } + + static Py_ssize_t itemsize[1] = {1}; + view->obj = (PyObject *)self; + view->buf = self->start; + view->len = self->length; + view->readonly = 1; + view->itemsize = sizeof(char); + view->format = "c"; // https://docs.python.org/3/library/struct.html#format-characters + view->ndim = 1; + view->shape = &self->length; // 1-D array, so shape is just a pointer to the length + view->strides = itemsize; // strides in a 1-D array is just the item size + view->suboffsets = NULL; + view->internal = NULL; + + Py_INCREF(self); + return 0; +} + +static void Str_releasebuffer(PyObject *_, Py_buffer *view) { + // This function MUST NOT decrement view->obj, since that is done automatically + // in PyBuffer_Release() (this scheme is useful for breaking reference cycles). + // https://docs.python.org/3/c-api/typeobj.html#c.PyBufferProcs.bf_releasebuffer +} + static int Str_contains(Str *self, PyObject *arg) { struct sz_needle_t needle_struct; @@ -907,6 +936,11 @@ static PyMappingMethods Str_as_mapping = { .mp_subscript = Str_subscript, // Is used to implement slices in Python }; +static PyBufferProcs Str_as_buffer = { + .bf_getbuffer = Str_getbuffer, + .bf_releasebuffer = Str_releasebuffer, +}; + #define sz_method_flags_m METH_VARARGS | METH_KEYWORDS static PyMethodDef Str_methods[] = { // @@ -922,16 +956,16 @@ static PyTypeObject StrType = { .tp_doc = "Immutable string/slice class with SIMD and SWAR-accelerated operations", .tp_basicsize = sizeof(Str), .tp_flags = Py_TPFLAGS_DEFAULT, - .tp_methods = Str_methods, .tp_new = Str_new, .tp_init = Str_init, .tp_dealloc = Str_dealloc, - .tp_as_sequence = &Str_as_sequence, - .tp_as_mapping = &Str_as_mapping, - .tp_hash = Str_hash, // String hashing functions + .tp_hash = Str_hash, .tp_richcompare = Str_richcompare, .tp_str = Str_str, - // .tp_as_buffer = (PyBufferProcs *)NULL, // Functions to access object as input/output buffer + .tp_methods = Str_methods, + .tp_as_sequence = &Str_as_sequence, + .tp_as_mapping = &Str_as_mapping, + .tp_as_buffer = &Str_as_buffer, }; static PySequenceMethods Strs_as_sequence = { diff --git a/scripts/test.py b/scripts/test.py index 8163e0e5..b9083ea6 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -75,19 +75,29 @@ def test_rich_comparisons(): assert Str("abb")[1:] == "bb" -# def get_random_string( -# length: Optional[int] = None, variability: Optional[int] = None -# ) -> str: -# if length is None: -# length = randint(3, 300) -# if variability is None: -# variability = len(ascii_lowercase) -# return "".join(choice(ascii_lowercase[:variability]) for _ in range(length)) +def test_buffer_protocol(): + import numpy as np + my_str = Str("hello") + arr = np.array(my_str) + assert arr.dtype == np.dtype("c") + assert arr.shape == (len("hello"),) + assert "".join([c.decode("utf-8") for c in arr.tolist()]) == "hello" -# def is_equal_strings(native_strings, big_strings): -# for native_slice, big_slice in zip(native_strings, big_strings): -# assert native_slice == big_slice + +def get_random_string( + length: Optional[int] = None, variability: Optional[int] = None +) -> str: + if length is None: + length = randint(3, 300) + if variability is None: + variability = len(ascii_lowercase) + return "".join(choice(ascii_lowercase[:variability]) for _ in range(length)) + + +def is_equal_strings(native_strings, big_strings): + for native_slice, big_slice in zip(native_strings, big_strings): + assert native_slice == big_slice # def check_identical( @@ -255,16 +265,16 @@ def test_rich_comparisons(): # ) -# def test_levenstein(): -# # Create a new string by slicing and concatenating -# def insert_char_at(s, char_to_insert, index): -# return s[:index] + char_to_insert + s[index:] +def test_levenstein(): + # Create a new string by slicing and concatenating + def insert_char_at(s, char_to_insert, index): + return s[:index] + char_to_insert + s[index:] -# for _ in range(100): -# a = get_random_string(length=20) -# b = a -# for i in range(150): -# source_offset = randint(0, len(ascii_lowercase) - 1) -# target_offset = randint(0, len(b) - 1) -# b = insert_char_at(b, ascii_lowercase[source_offset], target_offset) -# assert levenstein(a, b, 200) == i + 1 + for _ in range(100): + a = get_random_string(length=20) + b = a + for i in range(150): + source_offset = randint(0, len(ascii_lowercase) - 1) + target_offset = randint(0, len(b) - 1) + b = insert_char_at(b, ascii_lowercase[source_offset], target_offset) + assert sz.levenstein(a, b, 200) == i + 1 From 177005e40201893c4eb67540959fcfba44e19e7f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 19 Sep 2023 18:26:58 +0400 Subject: [PATCH 21/72] Format: Compact code style --- stringzilla/stringzilla.h | 102 +++++++++++++------------------------- 1 file changed, 35 insertions(+), 67 deletions(-) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index fea8ac47..8bd32fa1 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -41,8 +41,7 @@ inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { ret */ inline static int sz_equal(char const *a, char const *b, sz_size_t length) { char const *const a_end = a + length; - while (a != a_end && *a == *b) - a++, b++; + while (a != a_end && *a == *b) a++, b++; return a_end == a; } @@ -66,8 +65,7 @@ inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) { char const *text = h.start; char const *end = h.start + h.length; - for (; (uint64_t)text % 8 != 0 && text < end; ++text) - result += *text == n; + for (; (uint64_t)text % 8 != 0 && text < end; ++text) result += *text == n; // This code simulates hyper-scalar execution, comparing 8 characters at a time. uint64_t nnnnnnnn = n; @@ -84,8 +82,7 @@ inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) { result += popcount64(match_indicators); } - for (; text < end; ++text) - result += *text == n; + for (; text < end; ++text) result += *text == n; return result; } @@ -98,8 +95,7 @@ inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) { char const *end = h.start + h.length; for (; (uint64_t)text % 8 != 0 && text < end; ++text) - if (*text == n) - return text - h.start; + if (*text == n) return text - h.start; // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. uint64_t nnnnnnnn = n; @@ -114,13 +110,11 @@ inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) { match_indicators &= match_indicators >> 4; match_indicators &= 0x0101010101010101; - if (match_indicators != 0) - return text - h.start + ctz64(match_indicators) / 8; + if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8; } for (; text < end; ++text) - if (*text == n) - return text - h.start; + if (*text == n) return text - h.start; return h.length; } @@ -161,8 +155,7 @@ inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) { } for (; text + 2 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1]) - return text - h.start; + if (text[0] == n[0] && text[1] == n[1]) return text - h.start; return h.length; } @@ -211,13 +204,11 @@ inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) { (third_indicators >> 16) & (third_indicators >> 8) & (third_indicators >> 0) & 0x0000010000010000; uint64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16); - if (match_indicators != 0) - return text - h.start + ctz64(match_indicators) / 8; + if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8; } for (; text + 3 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) - return text - h.start; + if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start; return h.length; } @@ -275,8 +266,7 @@ inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) { } for (; text + 4 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) - return text - h.start; + if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start; return h.length; } @@ -287,8 +277,7 @@ inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) { */ inline static sz_size_t sz_naive_find_substr(sz_haystack_t h, sz_needle_t n) { - if (h.length < n.length) - return h.length; + if (h.length < n.length) return h.length; char const *text = h.start; char const *const end = h.start + h.length; @@ -365,18 +354,17 @@ sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) { if (matches0 | matches1 | matches2 | matches3) { for (sz_size_t i = 0; i < 32; i++) { - if (sz_equal(text + i, n.start, n.length)) - return i + (text - h.start); + if (sz_equal(text + i, n.start, n.length)) return i + (text - h.start); } } } // Don't forget the last (up to 35) characters. sz_haystack_t tail; - tail.ptr = text; - tail.len = end - text; + tail.start = text; + tail.length = end - text; size_t tail_match = sz_naive_find_substr(tail, n); - return text + tail_match - h.ptr; + return text + tail_match - h.start; } #endif // x86 AVX2 @@ -423,18 +411,17 @@ inline static sz_size_t sz_neon_find_substr(sz_haystack_t h, sz_needle_t n) { if (has_match) { for (sz_size_t i = 0; i < 16; i++) { - if (sz_equal(text + i, n.start, n.length)) - return i + (text - h.start); + if (sz_equal(text + i, n.start, n.length)) return i + (text - h.start); } } } // Don't forget the last (up to 16+3=19) characters. sz_haystack_t tail; - tail.ptr = text; - tail.len = end - text; + tail.start = text; + tail.length = end - text; size_t tail_match = sz_naive_find_substr(tail, n); - return text + tail_match - h.ptr; + return text + tail_match - h.start; } #endif // Arm Neon @@ -472,8 +459,7 @@ typedef struct sz_sequence_t { inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { sz_size_t matches = 0; - while (matches != sequence->count && predicate(sequence->handle, sequence->order[matches])) - ++matches; + while (matches != sequence->count && predicate(sequence->handle, sequence->order[matches])) ++matches; for (sz_size_t i = matches + 1; i < sequence->count; ++i) if (predicate(sequence->handle, sequence->order[i])) @@ -491,16 +477,13 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq sz_size_t start_b = partition + 1; // If the direct merge is already sorted - if (!less(sequence->handle, sequence->order[start_b], sequence->order[partition])) - return; + if (!less(sequence->handle, sequence->order[start_b], sequence->order[partition])) return; sz_size_t start_a = 0; while (start_a <= partition && start_b <= sequence->count) { // If element 1 is in right place - if (!less(sequence->handle, sequence->order[start_b], sequence->order[start_a])) { - start_a++; - } + if (!less(sequence->handle, sequence->order[start_b], sequence->order[start_a])) { start_a++; } else { sz_size_t value = sequence->order[start_b]; sz_size_t index = start_b; @@ -527,19 +510,16 @@ inline static void _sz_sort_recursion( // sz_size_t bit_max, sz_qsort_comparison_func_t qsort_comparator) { - if (!sequence->count) - return; + if (!sequence->count) return; // Partition a range of integers according to a specific bit value sz_size_t split = 0; { sz_size_t mask = (1ul << 63) >> bit_idx; - while (split != sequence->count && !(sequence->order[split] & mask)) - ++split; + while (split != sequence->count && !(sequence->order[split] & mask)) ++split; for (sz_size_t i = split + 1; i < sequence->count; ++i) - if (!(sequence->order[i] & mask)) - sz_swap(sequence->order + i, sequence->order + split), ++split; + if (!(sequence->order[i] & mask)) sz_swap(sequence->order + i, sequence->order + split), ++split; } // Go down recursively @@ -556,9 +536,7 @@ inline static void _sz_sort_recursion( // // Reached the end of recursion else { // Discard the prefixes - for (sz_size_t i = 0; i != sequence->count; ++i) { - memset((char *)(&sequence->order[i]) + 4, 0, 4ul); - } + for (sz_size_t i = 0; i != sequence->count; ++i) { memset((char *)(&sequence->order[i]) + 4, 0, 4ul); } // Perform sorts on smaller chunks instead of the whole handle #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) @@ -649,8 +627,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf sz_size_t length = sequence->get_length(sequence->handle, sequence->order[i]); length = length > 4ul ? 4ul : length; char *prefix = (char *)&sequence->order[i]; - for (sz_size_t j = 0; j != length; ++j) - prefix[7 - j] = begin[j]; + for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; if (case_insensitive) { prefix[0] = tolower(prefix[0]); prefix[1] = tolower(prefix[1]); @@ -660,8 +637,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf } sz_qsort_comparison_func_t comparator = _sz_sort_sequence_strncmp; - if (case_insensitive) - comparator = _sz_sort_sequence_strncasecmp; + if (case_insensitive) comparator = _sz_sort_sequence_strncasecmp; // Perform optionally-parallel radix sort on them _sz_sort_recursion(sequence, 0, 32, comparator); @@ -699,26 +675,21 @@ inline static levenstein_distance_t sz_levenstein( // void *buffer) { // If one of the strings is empty - the edit distance is equal to the length of the other one - if (a_length == 0) - return b_length <= bound ? b_length : bound; - if (b_length == 0) - return a_length <= bound ? a_length : bound; + if (a_length == 0) return b_length <= bound ? b_length : bound; + if (b_length == 0) return a_length <= bound ? a_length : bound; // If the difference in length is beyond the `bound`, there is no need to check at all if (a_length > b_length) { - if (a_length - b_length > bound) - return bound + 1; + if (a_length - b_length > bound) return bound + 1; } else { - if (b_length - a_length > bound) - return bound + 1; + if (b_length - a_length > bound) return bound + 1; } levenstein_distance_t *previous_distances = (levenstein_distance_t *)buffer; levenstein_distance_t *current_distances = previous_distances + b_length + 1; - for (sz_size_t idx_b = 0; idx_b != (b_length + 1); ++idx_b) - previous_distances[idx_b] = idx_b; + for (sz_size_t idx_b = 0; idx_b != (b_length + 1); ++idx_b) previous_distances[idx_b] = idx_b; for (sz_size_t idx_a = 0; idx_a != a_length; ++idx_a) { current_distances[0] = idx_a + 1; @@ -733,14 +704,11 @@ inline static levenstein_distance_t sz_levenstein( // current_distances[idx_b + 1] = _sz_levenstein_minimum(cost_deletion, cost_insertion, cost_substitution); // Keep track of the minimum distance seen so far in this row - if (current_distances[idx_b + 1] < min_distance) { - min_distance = current_distances[idx_b + 1]; - } + if (current_distances[idx_b + 1] < min_distance) { min_distance = current_distances[idx_b + 1]; } } // If the minimum distance in this row exceeded the bound, return early - if (min_distance > bound) - return bound; + if (min_distance > bound) return bound; // Swap previous_distances and current_distances pointers levenstein_distance_t *temp = previous_distances; From b771739f66e42d26ab5321025d3aa098dadcbf2a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 19 Sep 2023 22:14:25 +0400 Subject: [PATCH 22/72] Add: `startswith` & `endswith` --- .vscode/settings.json | 2 ++ python/lib.c | 61 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 56 insertions(+), 7 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 6fa841e1..97c0113c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -127,6 +127,7 @@ "bigram", "cibuildwheel", "endregion", + "endswith", "getitem", "getslice", "initproc", @@ -154,6 +155,7 @@ "richcompare", "SIMD", "splitlines", + "startswith", "stringzilla", "Strs", "strzl", diff --git a/python/lib.c b/python/lib.c index 8035e0ba..faf14a4d 100644 --- a/python/lib.c +++ b/python/lib.c @@ -210,7 +210,7 @@ int export_string_like(PyObject *object, char const **start, size_t *length) { #pragma region Global Functions static Py_ssize_t api_find_(PyObject *self, PyObject *args, PyObject *kwargs) { - int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType)); + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); Py_ssize_t nargs = PyTuple_Size(args); if (nargs < !is_member + 1 || nargs > !is_member + 3) { PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); @@ -294,7 +294,7 @@ static PyObject *api_contains(PyObject *self, PyObject *args, PyObject *kwargs) } static PyObject *api_count(PyObject *self, PyObject *args, PyObject *kwargs) { - int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType)); + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); Py_ssize_t nargs = PyTuple_Size(args); if (nargs < !is_member + 1 || nargs > !is_member + 4) { PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); @@ -361,7 +361,7 @@ static PyObject *api_count(PyObject *self, PyObject *args, PyObject *kwargs) { } static PyObject *api_levenstein(PyObject *self, PyObject *args, PyObject *kwargs) { - int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType)); + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); Py_ssize_t nargs = PyTuple_Size(args); if (nargs < !is_member + 1 || nargs > !is_member + 2) { PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); @@ -415,10 +415,54 @@ static PyObject *api_levenstein(PyObject *self, PyObject *args, PyObject *kwargs return PyLong_FromLong(distance); } +static PyObject *api_startswith(PyObject *self, PyObject *args, PyObject *kwargs) { + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); + if (PyTuple_Size(args) != !is_member + 1) { + PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); + return NULL; + } + + PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *prefix_obj = PyTuple_GET_ITEM(args, !is_member); + + struct sz_haystack_t str, prefix; + if (!export_string_like(str_obj, &str.start, &str.length) || + !export_string_like(prefix_obj, &prefix.start, &prefix.length)) { + PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); + return NULL; + } + + if (str.length < prefix.length) { Py_RETURN_FALSE; } + else if (strncmp(str.start, prefix.start, prefix.length) == 0) { Py_RETURN_TRUE; } + else { Py_RETURN_FALSE; } +} + +static PyObject *api_endswith(PyObject *self, PyObject *args, PyObject *kwargs) { + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); + if (PyTuple_Size(args) != !is_member + 1) { + PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); + return NULL; + } + + PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *suffix_obj = PyTuple_GET_ITEM(args, !is_member); + + struct sz_haystack_t str, suffix; + if (!export_string_like(str_obj, &str.start, &str.length) || + !export_string_like(suffix_obj, &suffix.start, &suffix.length)) { + PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); + return NULL; + } + + if (str.length < suffix.length) { Py_RETURN_FALSE; } + else if (strncmp(str.start + (str.length - suffix.length), suffix.start, suffix.length) == 0) { Py_RETURN_TRUE; } + else { Py_RETURN_FALSE; } +} + static PyObject *api_split(PyObject *self, PyObject *args, PyObject *kwargs) { // Check minimum arguments - int is_member = (self != NULL && PyObject_TypeCheck(self, &StrType)); + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); Py_ssize_t nargs = PyTuple_Size(args); if (nargs < !is_member + 1 || nargs > !is_member + 3) { PyErr_SetString(PyExc_TypeError, "sz.split() requires at least 1 argument"); @@ -691,7 +735,7 @@ static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args) if (!PyArg_ParseTupleAndKeywords(positional_args, named_args, "|Onn", names, &parent, &from, &to)) return -1; } else if (!PyArg_ParseTuple(positional_args, "|Onn", &parent, &from, &to)) - return -1; + return -1; // Handle empty string if (parent == NULL) { @@ -949,6 +993,8 @@ static PyMethodDef Str_methods[] = { // {"count", api_count, sz_method_flags_m, "Count the occurrences of a substring."}, {"levenstein", api_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, {"split", api_split, sz_method_flags_m, "Split a string by a separator."}, + {"startswith", api_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."}, + {"endswith", api_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."}, {NULL, NULL, 0, NULL}}; static PyTypeObject StrType = { @@ -1004,8 +1050,9 @@ static PyMethodDef stringzilla_methods[] = { {"count", api_count, sz_method_flags_m, "Count the occurrences of a substring."}, {"levenstein", api_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, {"split", api_split, sz_method_flags_m, "Split a string by a separator."}, - {NULL, NULL, 0, NULL} /* Sentinel */ -}; + {"startswith", api_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."}, + {"endswith", api_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."}, + {NULL, NULL, 0, NULL}}; static PyModuleDef stringzilla_module = { PyModuleDef_HEAD_INIT, From 173197fd4f5098ae17c79a4d8b8d673d73ee0317 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 19 Sep 2023 22:14:35 +0400 Subject: [PATCH 23/72] Improve: Faster `Str` constructor --- python/lib.c | 68 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/python/lib.c b/python/lib.c index faf14a4d..185b7c09 100644 --- a/python/lib.c +++ b/python/lib.c @@ -724,24 +724,74 @@ static PyTypeObject FileType = { #pragma region Str -static int Str_init(Str *self, PyObject *positional_args, PyObject *named_args) { - PyObject *parent = NULL; - Py_ssize_t from = 0; - Py_ssize_t to = PY_SSIZE_T_MAX; +static int Str_init(Str *self, PyObject *args, PyObject *kwargs) { - // The `named_args` would be `NULL` - if (named_args) { - static char *names[] = {"parent", "from", "to", NULL}; - if (!PyArg_ParseTupleAndKeywords(positional_args, named_args, "|Onn", names, &parent, &from, &to)) return -1; + // Parse all arguments into PyObjects first + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs > 3) { + PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); + return -1; } - else if (!PyArg_ParseTuple(positional_args, "|Onn", &parent, &from, &to)) + PyObject *parent_obj = nargs >= 1 ? PyTuple_GET_ITEM(args, 0) : NULL; + PyObject *from_obj = nargs >= 2 ? PyTuple_GET_ITEM(args, 1) : NULL; + PyObject *to_obj = nargs >= 3 ? PyTuple_GET_ITEM(args, 2) : NULL; + + // Parse keyword arguments, if provided, and ensure no duplicates + if (kwargs) { + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(kwargs, &pos, &key, &value)) { + if (PyUnicode_CompareWithASCIIString(key, "parent") == 0) { + if (parent_obj) { + PyErr_SetString(PyExc_TypeError, "Received `parent` both as positional and keyword argument"); return -1; + } + parent_obj = value; + } + else if (PyUnicode_CompareWithASCIIString(key, "from") == 0) { + if (from_obj) { + PyErr_SetString(PyExc_TypeError, "Received `from` both as positional and keyword argument"); + return -1; + } + from_obj = value; + } + else if (PyUnicode_CompareWithASCIIString(key, "to") == 0) { + if (to_obj) { + PyErr_SetString(PyExc_TypeError, "Received `to` both as positional and keyword argument"); + return -1; + } + to_obj = value; + } + else { + PyErr_SetString(PyExc_TypeError, "Invalid keyword argument"); + return -1; + } + } + } // Handle empty string if (parent == NULL) { self->start = NULL; self->length = 0; } + + // Now, type-check and cast each argument + Py_ssize_t from = 0, to = PY_SSIZE_T_MAX; + if (from_obj) { + from = PyLong_AsSsize_t(from_obj); + if (from == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The `from` argument must be an integer"); + return -1; + } + } + if (to_obj) { + to = PyLong_AsSsize_t(to_obj); + if (to == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The `to` argument must be an integer"); + return -1; + } + } + // Increment the reference count of the parent else if (export_string_like(parent, &self->start, &self->length)) { self->parent = parent; From a480c0bcdc3e5d219a299675b7b990d693f111ad Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 19 Sep 2023 22:53:22 +0400 Subject: [PATCH 24/72] Add: `Str` concatenation --- python/lib.c | 385 ++++++++++++++++++++++++++++++++++++--------------- setup.py | 2 + 2 files changed, 275 insertions(+), 112 deletions(-) diff --git a/python/lib.c b/python/lib.c index 185b7c09..7d4c59ef 100644 --- a/python/lib.c +++ b/python/lib.c @@ -209,7 +209,7 @@ int export_string_like(PyObject *object, char const **start, size_t *length) { #pragma region Global Functions -static Py_ssize_t api_find_(PyObject *self, PyObject *args, PyObject *kwargs) { +static Py_ssize_t Str_find_(PyObject *self, PyObject *args, PyObject *kwargs) { int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); Py_ssize_t nargs = PyTuple_Size(args); if (nargs < !is_member + 1 || nargs > !is_member + 3) { @@ -236,8 +236,8 @@ static Py_ssize_t api_find_(PyObject *self, PyObject *args, PyObject *kwargs) { } } - struct sz_haystack_t haystack; - struct sz_needle_t needle; + sz_haystack_t haystack; + sz_needle_t needle; Py_ssize_t start, end; // Validate and convert `haystack` and `needle` @@ -280,20 +280,30 @@ static Py_ssize_t api_find_(PyObject *self, PyObject *args, PyObject *kwargs) { return (Py_ssize_t)offset; } -static PyObject *api_find(PyObject *self, PyObject *args, PyObject *kwargs) { - Py_ssize_t signed_offset = api_find_(self, args, kwargs); +static PyObject *Str_find(PyObject *self, PyObject *args, PyObject *kwargs) { + Py_ssize_t signed_offset = Str_find_(self, args, kwargs); if (PyErr_Occurred()) return NULL; return PyLong_FromSsize_t(signed_offset); } -static PyObject *api_contains(PyObject *self, PyObject *args, PyObject *kwargs) { - Py_ssize_t signed_offset = api_find_(self, args, kwargs); +static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) { + Py_ssize_t signed_offset = Str_find_(self, args, kwargs); + if (PyErr_Occurred()) return NULL; + if (signed_offset == -1) { + PyErr_SetString(PyExc_ValueError, "substring not found"); + return NULL; + } + return PyLong_FromSsize_t(signed_offset); +} + +static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs) { + Py_ssize_t signed_offset = Str_find_(self, args, kwargs); if (PyErr_Occurred()) return NULL; if (signed_offset == -1) { Py_RETURN_FALSE; } else { Py_RETURN_TRUE; } } -static PyObject *api_count(PyObject *self, PyObject *args, PyObject *kwargs) { +static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); Py_ssize_t nargs = PyTuple_Size(args); if (nargs < !is_member + 1 || nargs > !is_member + 4) { @@ -318,8 +328,8 @@ static PyObject *api_count(PyObject *self, PyObject *args, PyObject *kwargs) { return NULL; } - struct sz_haystack_t haystack; - struct sz_needle_t needle; + sz_haystack_t haystack; + sz_needle_t needle; Py_ssize_t start = start_obj ? PyLong_AsSsize_t(start_obj) : 0; Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX; int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0; @@ -360,7 +370,7 @@ static PyObject *api_count(PyObject *self, PyObject *args, PyObject *kwargs) { return PyLong_FromSize_t(count); } -static PyObject *api_levenstein(PyObject *self, PyObject *args, PyObject *kwargs) { +static PyObject *Str_levenstein(PyObject *self, PyObject *args, PyObject *kwargs) { int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); Py_ssize_t nargs = PyTuple_Size(args); if (nargs < !is_member + 1 || nargs > !is_member + 2) { @@ -391,7 +401,7 @@ static PyObject *api_levenstein(PyObject *self, PyObject *args, PyObject *kwargs return NULL; } - struct sz_haystack_t str1, str2; + sz_haystack_t str1, str2; if (!export_string_like(str1_obj, &str1.start, &str1.length) || !export_string_like(str2_obj, &str2.start, &str2.length)) { PyErr_Format(PyExc_TypeError, "Both arguments must be string-like"); @@ -415,51 +425,169 @@ static PyObject *api_levenstein(PyObject *self, PyObject *args, PyObject *kwargs return PyLong_FromLong(distance); } -static PyObject *api_startswith(PyObject *self, PyObject *args, PyObject *kwargs) { +static PyObject *Str_startswith(PyObject *self, PyObject *args, PyObject *kwargs) { int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); - if (PyTuple_Size(args) != !is_member + 1) { + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member + 1 || nargs > !is_member + 3) { PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); return NULL; } PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); PyObject *prefix_obj = PyTuple_GET_ITEM(args, !is_member); + PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; + PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; + + // Optional start and end arguments + Py_ssize_t start = 0, end = PY_SSIZE_T_MAX; + + if (start_obj && ((start = PyLong_AsSsize_t(start_obj)) == -1 && PyErr_Occurred())) { + PyErr_SetString(PyExc_TypeError, "start must be an integer"); + return NULL; + } + + if (end_obj && ((end = PyLong_AsSsize_t(end_obj)) == -1 && PyErr_Occurred())) { + PyErr_SetString(PyExc_TypeError, "end must be an integer"); + return NULL; + } - struct sz_haystack_t str, prefix; + sz_haystack_t str, prefix; if (!export_string_like(str_obj, &str.start, &str.length) || !export_string_like(prefix_obj, &prefix.start, &prefix.length)) { PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); return NULL; } + // Apply start and end arguments + str.start += start; + str.length -= start; + if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; } + if (str.length < prefix.length) { Py_RETURN_FALSE; } else if (strncmp(str.start, prefix.start, prefix.length) == 0) { Py_RETURN_TRUE; } else { Py_RETURN_FALSE; } } -static PyObject *api_endswith(PyObject *self, PyObject *args, PyObject *kwargs) { +static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs) { int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); - if (PyTuple_Size(args) != !is_member + 1) { + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member + 1 || nargs > !is_member + 3) { PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); return NULL; } PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); PyObject *suffix_obj = PyTuple_GET_ITEM(args, !is_member); + PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; + PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; + + // Optional start and end arguments + Py_ssize_t start = 0, end = PY_SSIZE_T_MAX; + + if (start_obj && ((start = PyLong_AsSsize_t(start_obj)) == -1 && PyErr_Occurred())) { + PyErr_SetString(PyExc_TypeError, "start must be an integer"); + return NULL; + } - struct sz_haystack_t str, suffix; + if (end_obj && ((end = PyLong_AsSsize_t(end_obj)) == -1 && PyErr_Occurred())) { + PyErr_SetString(PyExc_TypeError, "end must be an integer"); + return NULL; + } + + sz_haystack_t str, suffix; if (!export_string_like(str_obj, &str.start, &str.length) || !export_string_like(suffix_obj, &suffix.start, &suffix.length)) { PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); return NULL; } + // Apply start and end arguments + str.start += start; + str.length -= start; + if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; } + if (str.length < suffix.length) { Py_RETURN_FALSE; } else if (strncmp(str.start + (str.length - suffix.length), suffix.start, suffix.length) == 0) { Py_RETURN_TRUE; } else { Py_RETURN_FALSE; } } -static PyObject *api_split(PyObject *self, PyObject *args, PyObject *kwargs) { +static Strs *Str_split_( + PyObject *parent, sz_haystack_t text, sz_needle_t separator, int keepseparator, Py_ssize_t maxsplit) { + + // Create Strs object + Strs *result = (Strs *)PyObject_New(Strs, &StrsType); + if (!result) return NULL; + + // Initialize Strs object based on the splitting logic + void *offsets_endings = NULL; + size_t offsets_capacity = 0; + size_t offsets_count = 0; + size_t bytes_per_offset; + if (text.length >= UINT32_MAX) { + bytes_per_offset = 8; + result->type = STRS_CONSECUTIVE_64; + result->data.consecutive_64bit.start = text.start; + result->data.consecutive_64bit.parent = parent; + result->data.consecutive_64bit.separator_length = !keepseparator * separator.length; + } + else { + bytes_per_offset = 4; + result->type = STRS_CONSECUTIVE_32; + result->data.consecutive_32bit.start = text.start; + result->data.consecutive_32bit.parent = parent; + result->data.consecutive_32bit.separator_length = !keepseparator * separator.length; + } + + // Iterate through string, keeping track of the + sz_size_t last_start = 0; + while (last_start < text.length && offsets_count < maxsplit) { + sz_haystack_t text_remaining; + text_remaining.start = text.start + last_start; + text_remaining.length = text.length - last_start; + sz_size_t offset_in_remaining = sz_neon_find_substr(text_remaining, separator); + + // Reallocate offsets array if needed + if (offsets_count >= offsets_capacity) { + offsets_capacity = (offsets_capacity + 1) * 2; + void *new_offsets = realloc(offsets_endings, offsets_capacity * bytes_per_offset); + if (!new_offsets) { + if (offsets_endings) free(offsets_endings); + } + offsets_endings = new_offsets; + } + + // If the memory allocation has failed - discard the response + if (!offsets_endings) { + Py_XDECREF(result); + PyErr_NoMemory(); + return NULL; + } + + // Export the offset + size_t will_continue = offset_in_remaining != text_remaining.length; + size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue; + if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; } + else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; } + + // Next time we want to start + last_start = last_start + offset_in_remaining + separator.length; + } + + // Populate the Strs object with the offsets + if (text.length >= UINT32_MAX) { + result->data.consecutive_64bit.end_offsets = offsets_endings; + result->data.consecutive_64bit.count = offsets_count; + } + else { + result->data.consecutive_32bit.end_offsets = offsets_endings; + result->data.consecutive_32bit.count = offsets_count; + } + + Py_INCREF(parent); + return result; +} + +static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) { // Check minimum arguments int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); @@ -486,8 +614,8 @@ static PyObject *api_split(PyObject *self, PyObject *args, PyObject *kwargs) { } } - struct sz_haystack_t text; - struct sz_needle_t separator; + sz_haystack_t text; + sz_needle_t separator; int keepseparator; Py_ssize_t maxsplit; separator.anomaly_offset = 0; @@ -532,77 +660,107 @@ static PyObject *api_split(PyObject *self, PyObject *args, PyObject *kwargs) { } else { maxsplit = PY_SSIZE_T_MAX; } - // Create Strs object - Strs *result = (Strs *)PyObject_New(Strs, &StrsType); - if (!result) return NULL; + return Str_split_(text_obj, text, separator, keepseparator, maxsplit); +} - // Initialize Strs object based on the splitting logic - void *offsets_endings = NULL; - size_t offsets_capacity = 0; - size_t offsets_count = 0; - size_t bytes_per_offset; - if (text.length >= UINT32_MAX) { - bytes_per_offset = 8; - result->type = STRS_CONSECUTIVE_64; - result->data.consecutive_64bit.start = text.start; - result->data.consecutive_64bit.parent = text_obj; - result->data.consecutive_64bit.separator_length = !keepseparator * separator.length; +static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs) { + + // Check minimum arguments + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member || nargs > !is_member + 2) { + PyErr_SetString(PyExc_TypeError, "splitlines() requires at least 1 argument"); + return NULL; } - else { - bytes_per_offset = 4; - result->type = STRS_CONSECUTIVE_32; - result->data.consecutive_32bit.start = text.start; - result->data.consecutive_32bit.parent = text_obj; - result->data.consecutive_32bit.separator_length = !keepseparator * separator.length; + + PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *keeplinebreaks_obj = nargs > !is_member ? PyTuple_GET_ITEM(args, !is_member) : NULL; + PyObject *maxsplit_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; + + if (kwargs) { + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(kwargs, &pos, &key, &value)) { + if (PyUnicode_CompareWithASCIIString(key, "keeplinebreaks") == 0) { keeplinebreaks_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) { maxsplit_obj = value; } + else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) { return NULL; } + } } - // Iterate through string, keeping track of the - sz_size_t last_start = 0; - while (last_start < text.length && offsets_count < maxsplit) { - sz_haystack_t text_remaining; - text_remaining.start = text.start + last_start; - text_remaining.length = text.length - last_start; - sz_size_t offset_in_remaining = sz_neon_find_substr(text_remaining, separator); + sz_haystack_t text; + int keeplinebreaks; + Py_ssize_t maxsplit = PY_SSIZE_T_MAX; // Default value for maxsplit - // Reallocate offsets array if needed - if (offsets_count >= offsets_capacity) { - offsets_capacity = (offsets_capacity + 1) * 2; - void *new_offsets = realloc(offsets_endings, offsets_capacity * bytes_per_offset); - if (!new_offsets) { - if (offsets_endings) free(offsets_endings); - } - offsets_endings = new_offsets; + // Validate and convert `text` + if (!export_string_like(text_obj, &text.start, &text.length)) { + PyErr_SetString(PyExc_TypeError, "The text argument must be string-like"); + return NULL; + } + + // Validate and convert `keeplinebreaks` + if (keeplinebreaks_obj) { + keeplinebreaks = PyObject_IsTrue(keeplinebreaks_obj); + if (keeplinebreaks == -1) { + PyErr_SetString(PyExc_TypeError, "The keeplinebreaks argument must be a boolean"); + return NULL; } + } + else { keeplinebreaks = 0; } - // If the memory allocation has failed - discard the response - if (!offsets_endings) { - Py_XDECREF(result); - PyErr_NoMemory(); + // Validate and convert `maxsplit` + if (maxsplit_obj) { + maxsplit = PyLong_AsSsize_t(maxsplit_obj); + if (maxsplit == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The maxsplit argument must be an integer"); return NULL; } + } - // Export the offset - size_t will_continue = offset_in_remaining != text_remaining.length; - size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue; - if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; } - else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; } + // TODO: Support arbitrary newline characters: + // https://docs.python.org/3/library/stdtypes.html#str.splitlines + // \n, \r, \r\n, \v or \x0b, \f or \x0c, \x1c, \x1d, \x1e, \x85, \u2028, \u2029 + // https://github.com/ashvardanian/StringZilla/issues/29 + sz_needle_t separator; + separator.start = "\n"; + separator.length = 1; + return Str_split_(text_obj, text, separator, keeplinebreaks, maxsplit); +} - // Next time we want to start - last_start = last_start + offset_in_remaining + separator.length; +static PyObject *Str_concat(PyObject *self, PyObject *other) { + struct sz_haystack_t self_str, other_str; + + // Validate and convert `self` + if (!export_string_like(self, &self_str.start, &self_str.length)) { + PyErr_SetString(PyExc_TypeError, "The self object must be string-like"); + return NULL; } - // Populate the Strs object with the offsets - if (text.length >= UINT32_MAX) { - result->data.consecutive_64bit.end_offsets = offsets_endings; - result->data.consecutive_64bit.count = offsets_count; + // Validate and convert `other` + if (!export_string_like(other, &other_str.start, &other_str.length)) { + PyErr_SetString(PyExc_TypeError, "The other object must be string-like"); + return NULL; } - else { - result->data.consecutive_32bit.end_offsets = offsets_endings; - result->data.consecutive_32bit.count = offsets_count; + + // Allocate a new Str instance + Str *result_str = PyObject_New(Str, &StrType); + if (result_str == NULL) { return NULL; } + + // Calculate the total length of the new string + result_str->parent = NULL; + result_str->length = self_str.length + other_str.length; + + // Allocate memory for the new string + result_str->start = malloc(result_str->length); + if (result_str->start == NULL) { + PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for string concatenation"); + return NULL; } - Py_INCREF(text_obj); - return (PyObject *)result; + // Perform the string concatenation + memcpy(result_str->start, self_str.start, self_str.length); + memcpy(result_str->start + self_str.length, other_str.start, other_str.length); + + return (PyObject *)result_str; } #pragma endregion @@ -769,12 +927,6 @@ static int Str_init(Str *self, PyObject *args, PyObject *kwargs) { } } - // Handle empty string - if (parent == NULL) { - self->start = NULL; - self->length = 0; - } - // Now, type-check and cast each argument Py_ssize_t from = 0, to = PY_SSIZE_T_MAX; if (from_obj) { @@ -792,10 +944,15 @@ static int Str_init(Str *self, PyObject *args, PyObject *kwargs) { } } + // Handle empty string + if (parent_obj == NULL) { + self->start = NULL; + self->length = 0; + } // Increment the reference count of the parent - else if (export_string_like(parent, &self->start, &self->length)) { - self->parent = parent; - Py_INCREF(parent); + else if (export_string_like(parent_obj, &self->start, &self->length)) { + self->parent = parent_obj; + Py_INCREF(parent_obj); } else { PyErr_SetString(PyExc_TypeError, "Unsupported parent type"); @@ -822,7 +979,8 @@ static PyObject *Str_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { } static void Str_dealloc(Str *self) { - if (self->parent) Py_XDECREF(self->parent); + if (self->parent) { Py_XDECREF(self->parent); } + else if (self->start) { free(self->start); } self->parent = NULL; Py_TYPE(self)->tp_free((PyObject *)self); } @@ -907,16 +1065,16 @@ static void Str_releasebuffer(PyObject *_, Py_buffer *view) { // https://docs.python.org/3/c-api/typeobj.html#c.PyBufferProcs.bf_releasebuffer } -static int Str_contains(Str *self, PyObject *arg) { +static int Str_in(Str *self, PyObject *arg) { - struct sz_needle_t needle_struct; + sz_needle_t needle_struct; needle_struct.anomaly_offset = 0; if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) { PyErr_SetString(PyExc_TypeError, "Unsupported argument type"); return -1; } - struct sz_haystack_t haystack; + sz_haystack_t haystack; haystack.start = self->start; haystack.length = self->length; size_t position = sz_neon_find_substr(haystack, needle_struct); @@ -1020,9 +1178,9 @@ static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) { } static PySequenceMethods Str_as_sequence = { - .sq_length = Str_len, // - .sq_item = Str_getitem, // - .sq_contains = Str_contains, // + .sq_length = Str_len, // + .sq_item = Str_getitem, // + .sq_contains = Str_in, // }; static PyMappingMethods Str_as_mapping = { @@ -1035,16 +1193,22 @@ static PyBufferProcs Str_as_buffer = { .bf_releasebuffer = Str_releasebuffer, }; +static PyNumberMethods Str_as_number = { + .nb_add = Str_concat, +}; + #define sz_method_flags_m METH_VARARGS | METH_KEYWORDS static PyMethodDef Str_methods[] = { // - {"find", api_find, sz_method_flags_m, "Find the first occurrence of a substring."}, - {"contains", api_contains, sz_method_flags_m, "Check if a string contains a substring."}, - {"count", api_count, sz_method_flags_m, "Count the occurrences of a substring."}, - {"levenstein", api_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, - {"split", api_split, sz_method_flags_m, "Split a string by a separator."}, - {"startswith", api_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."}, - {"endswith", api_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."}, + {"find", Str_find, sz_method_flags_m, "Find the first occurrence of a substring."}, + {"index", Str_index, sz_method_flags_m, "Find the first occurrence of a substring or raise error if missing."}, + {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."}, + {"count", Str_count, sz_method_flags_m, "Count the occurrences of a substring."}, + {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, + {"split", Str_split, sz_method_flags_m, "Split a string by a separator."}, + {"splitlines", Str_splitlines, sz_method_flags_m, "Split a string by line breaks."}, + {"startswith", Str_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."}, + {"endswith", Str_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."}, {NULL, NULL, 0, NULL}}; static PyTypeObject StrType = { @@ -1062,6 +1226,7 @@ static PyTypeObject StrType = { .tp_as_sequence = &Str_as_sequence, .tp_as_mapping = &Str_as_mapping, .tp_as_buffer = &Str_as_buffer, + .tp_as_number = &Str_as_number, }; static PySequenceMethods Strs_as_sequence = { @@ -1095,13 +1260,15 @@ static void stringzilla_cleanup(PyObject *m) { } static PyMethodDef stringzilla_methods[] = { - {"find", api_find, sz_method_flags_m, "Find the first occurrence of a substring."}, - {"contains", api_contains, sz_method_flags_m, "Check if a string contains a substring."}, - {"count", api_count, sz_method_flags_m, "Count the occurrences of a substring."}, - {"levenstein", api_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, - {"split", api_split, sz_method_flags_m, "Split a string by a separator."}, - {"startswith", api_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."}, - {"endswith", api_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."}, + {"find", Str_find, sz_method_flags_m, "Find the first occurrence of a substring."}, + {"index", Str_index, sz_method_flags_m, "Find the first occurrence of a substring or raise error if missing."}, + {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."}, + {"count", Str_count, sz_method_flags_m, "Count the occurrences of a substring."}, + {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, + {"split", Str_split, sz_method_flags_m, "Split a string by a separator."}, + {"splitlines", Str_splitlines, sz_method_flags_m, "Split a string by line breaks."}, + {"startswith", Str_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."}, + {"endswith", Str_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."}, {NULL, NULL, 0, NULL}}; static PyModuleDef stringzilla_module = { @@ -1154,10 +1321,4 @@ PyMODINIT_FUNC PyInit_stringzilla(void) { temporary_memory.start = malloc(4096); temporary_memory.length = 4096 * (temporary_memory.start != NULL); return m; - -cleanup: - Py_XDECREF(&FileType); - Py_XDECREF(&StrType); - Py_XDECREF(m); - return NULL; } diff --git a/setup.py b/setup.py index 419d9915..cb136b1d 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,8 @@ compile_args.append("-O3") compile_args.append("-pedantic") compile_args.append("-Wno-unknown-pragmas") + compile_args.append("-Wno-incompatible-function-pointer-types") + compile_args.append("-Wno-incompatible-pointer-types") compile_args.append("-Xpreprocessor -fopenmp") link_args.append("-Xpreprocessor -lomp") From 4f69e6623b3a0ad4eae49afa943dc1587a1f10c2 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 19 Sep 2023 23:50:28 +0400 Subject: [PATCH 25/72] Add: `partition` method in Python --- python/lib.c | 1683 +++++++++++++++++++++++++++----------------------- 1 file changed, 903 insertions(+), 780 deletions(-) diff --git a/python/lib.c b/python/lib.c index 7d4c59ef..8459cc30 100644 --- a/python/lib.c +++ b/python/lib.c @@ -205,976 +205,1097 @@ int export_string_like(PyObject *object, char const **start, size_t *length) { return 0; } -#pragma endregion - -#pragma region Global Functions - -static Py_ssize_t Str_find_(PyObject *self, PyObject *args, PyObject *kwargs) { - int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); - Py_ssize_t nargs = PyTuple_Size(args); - if (nargs < !is_member + 1 || nargs > !is_member + 3) { - PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); - return 0; +int get_string_at_offset( + Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) { + switch (strs->type) { + case STRS_CONSECUTIVE_32: { + uint32_t start_offset = (i == 0) ? 0 : strs->data.consecutive_32bit.end_offsets[i - 1]; + uint32_t end_offset = strs->data.consecutive_32bit.end_offsets[i]; + *start = strs->data.consecutive_32bit.start + start_offset; + *length = end_offset - start_offset - strs->data.consecutive_32bit.separator_length * (i + 1 != count); + *parent = strs->data.consecutive_32bit.parent; + return 1; } - - PyObject *haystack_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); - PyObject *needle_obj = PyTuple_GET_ITEM(args, !is_member + 0); - PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; - PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; - - // Parse keyword arguments - if (kwargs) { - Py_ssize_t pos = 0; - PyObject *key, *value; - while (PyDict_Next(kwargs, &pos, &key, &value)) { - if (PyUnicode_CompareWithASCIIString(key, "start") == 0) { start_obj = value; } - else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) { end_obj = value; } - else { - PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key); - return 0; - } - } + case STRS_CONSECUTIVE_64: { + uint64_t start_offset = (i == 0) ? 0 : strs->data.consecutive_64bit.end_offsets[i - 1]; + uint64_t end_offset = strs->data.consecutive_64bit.end_offsets[i]; + *start = strs->data.consecutive_64bit.start + start_offset; + *length = end_offset - start_offset - strs->data.consecutive_64bit.separator_length * (i + 1 != count); + *parent = strs->data.consecutive_64bit.parent; + return 1; } - - sz_haystack_t haystack; - sz_needle_t needle; - Py_ssize_t start, end; - - // Validate and convert `haystack` and `needle` - needle.anomaly_offset = 0; - if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || - !export_string_like(needle_obj, &needle.start, &needle.length)) { - PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); - return 0; + case STRS_REORDERED: { + // + return 1; } - - // Validate and convert `start` - if (start_obj) { - start = PyLong_AsSsize_t(start_obj); - if (start == -1 && PyErr_Occurred()) { - PyErr_SetString(PyExc_TypeError, "The start argument must be an integer"); - return 0; - } + case STRS_MULTI_SOURCE: { + // + return 1; } - else { start = 0; } - - // Validate and convert `end` - if (end_obj) { - end = PyLong_AsSsize_t(end_obj); - if (end == -1 && PyErr_Occurred()) { - PyErr_SetString(PyExc_TypeError, "The end argument must be an integer"); - return 0; - } + default: + // Unsupported type + PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion"); + return -1; } - else { end = PY_SSIZE_T_MAX; } - - // Limit the `haystack` range - size_t normalized_offset, normalized_length; - slice(haystack.length, start, end, &normalized_offset, &normalized_length); - haystack.start += normalized_offset; - haystack.length = normalized_length; - - // Perform contains operation - size_t offset = sz_neon_find_substr(haystack, needle); - if (offset == haystack.length) return -1; - return (Py_ssize_t)offset; -} - -static PyObject *Str_find(PyObject *self, PyObject *args, PyObject *kwargs) { - Py_ssize_t signed_offset = Str_find_(self, args, kwargs); - if (PyErr_Occurred()) return NULL; - return PyLong_FromSsize_t(signed_offset); } -static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) { - Py_ssize_t signed_offset = Str_find_(self, args, kwargs); - if (PyErr_Occurred()) return NULL; - if (signed_offset == -1) { - PyErr_SetString(PyExc_ValueError, "substring not found"); - return NULL; +int prepare_strings_for_reordering(Strs *strs) { + // Already in reordered form + if (strs->type == STRS_REORDERED) { return 1; } + + // Allocate memory for reordered slices + size_t count = 0; + switch (strs->type) { + case STRS_CONSECUTIVE_32: count = strs->data.consecutive_32bit.count; break; + case STRS_CONSECUTIVE_64: count = strs->data.consecutive_64bit.count; break; + case STRS_REORDERED: return 1; + case STRS_MULTI_SOURCE: return 1; + default: + // Unsupported type + PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion"); + return -1; } - return PyLong_FromSsize_t(signed_offset); -} - -static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs) { - Py_ssize_t signed_offset = Str_find_(self, args, kwargs); - if (PyErr_Occurred()) return NULL; - if (signed_offset == -1) { Py_RETURN_FALSE; } - else { Py_RETURN_TRUE; } -} -static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { - int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); - Py_ssize_t nargs = PyTuple_Size(args); - if (nargs < !is_member + 1 || nargs > !is_member + 4) { - PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); - return NULL; + sz_haystack_t *new_parts = (sz_haystack_t *)malloc(count * sizeof(sz_haystack_t)); + if (new_parts == NULL) { + PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for reordered slices"); + return -1; } - PyObject *haystack_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); - PyObject *needle_obj = PyTuple_GET_ITEM(args, !is_member + 0); - PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; - PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; - PyObject *allowoverlap_obj = nargs > !is_member + 3 ? PyTuple_GET_ITEM(args, !is_member + 3) : NULL; + // Populate the new reordered array using get_string_at_offset + for (Py_ssize_t i = 0; i < count; ++i) { + PyObject *parent; + char const *start; + size_t length; + if (!get_string_at_offset(strs, i, count, &parent, &start, &length)) { + // Handle error + PyErr_SetString(PyExc_RuntimeError, "Failed to get string at offset"); + free(new_parts); + return -1; + } - if (kwargs) { - Py_ssize_t pos = 0; - PyObject *key, *value; - while (PyDict_Next(kwargs, &pos, &key, &value)) - if (PyUnicode_CompareWithASCIIString(key, "start") == 0) { start_obj = value; } - else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) { end_obj = value; } - else if (PyUnicode_CompareWithASCIIString(key, "allowoverlap") == 0) { allowoverlap_obj = value; } - else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) - return NULL; + new_parts[i].start = start; + new_parts[i].length = length; } - sz_haystack_t haystack; - sz_needle_t needle; - Py_ssize_t start = start_obj ? PyLong_AsSsize_t(start_obj) : 0; - Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX; - int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0; - - needle.anomaly_offset = 0; - if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || - !export_string_like(needle_obj, &needle.start, &needle.length)) - return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL; + // Release previous used memory. - if ((start == -1 || end == -1 || allowoverlap == -1) && PyErr_Occurred()) return NULL; - - size_t normalized_offset, normalized_length; - slice(haystack.length, start, end, &normalized_offset, &normalized_length); - haystack.start += normalized_offset; - haystack.length = normalized_length; + // Update the Strs object + strs->type = STRS_REORDERED; + strs->data.reordered.count = count; + strs->data.reordered.parts = new_parts; + strs->data.reordered.parent = NULL; // Assuming the parent is no longer needed - size_t count = needle.length == 1 ? sz_naive_count_char(haystack, *needle.start) : 0; - if (needle.length != 1) { - if (allowoverlap) { - while (haystack.length) { - size_t offset = sz_neon_find_substr(haystack, needle); - int found = offset != haystack.length; - count += found; - haystack.start += offset + found; - haystack.length -= offset + found; - } - } - else { - while (haystack.length) { - size_t offset = sz_neon_find_substr(haystack, needle); - int found = offset != haystack.length; - count += found; - haystack.start += offset + needle.length; - haystack.length -= offset + needle.length * found; - } - } - } - return PyLong_FromSize_t(count); + return 0; } -static PyObject *Str_levenstein(PyObject *self, PyObject *args, PyObject *kwargs) { - int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); - Py_ssize_t nargs = PyTuple_Size(args); - if (nargs < !is_member + 1 || nargs > !is_member + 2) { - PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); - return NULL; - } +#pragma endregion - PyObject *str1_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); - PyObject *str2_obj = PyTuple_GET_ITEM(args, !is_member + 0); - PyObject *bound_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; +#pragma region MemoryMappingFile - if (kwargs) { - PyObject *key, *value; - Py_ssize_t pos = 0; - while (PyDict_Next(kwargs, &pos, &key, &value)) - if (PyUnicode_CompareWithASCIIString(key, "bound") == 0) { - if (bound_obj) { - PyErr_Format(PyExc_TypeError, "Received bound both as positional and keyword argument"); - return NULL; - } - bound_obj = value; - } +static void File_dealloc(File *self) { +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) + if (self->start) { + UnmapViewOfFile(self->start); + self->start = NULL; } - - int bound = 255; // Default value for bound - if (bound_obj && ((bound = PyLong_AsLong(bound_obj)) > 255 || bound < 0)) { - PyErr_Format(PyExc_ValueError, "Bound must be an integer between 0 and 255"); - return NULL; + if (self->mapping_handle) { + CloseHandle(self->mapping_handle); + self->mapping_handle = NULL; } - - sz_haystack_t str1, str2; - if (!export_string_like(str1_obj, &str1.start, &str1.length) || - !export_string_like(str2_obj, &str2.start, &str2.length)) { - PyErr_Format(PyExc_TypeError, "Both arguments must be string-like"); - return NULL; + if (self->file_handle) { + CloseHandle(self->file_handle); + self->file_handle = NULL; } - - size_t memory_needed = sz_levenstein_memory_needed(str1.length, str2.length); - if (temporary_memory.length < memory_needed) { - temporary_memory.start = realloc(temporary_memory.start, memory_needed); - temporary_memory.length = memory_needed; +#else + if (self->start) { + munmap(self->start, self->length); + self->start = NULL; + self->length = 0; } - if (!temporary_memory.start) { - PyErr_Format(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix"); - return NULL; + if (self->file_descriptor != 0) { + close(self->file_descriptor); + self->file_descriptor = 0; } +#endif + Py_TYPE(self)->tp_free((PyObject *)self); +} - levenstein_distance_t small_bound = (levenstein_distance_t)bound; - levenstein_distance_t distance = - sz_levenstein(str1.start, str1.length, str2.start, str2.length, small_bound, temporary_memory.start); +static PyObject *File_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) { + File *self; + self = (File *)type->tp_alloc(type, 0); + if (self == NULL) return NULL; - return PyLong_FromLong(distance); +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) + self->file_handle = NULL; + self->mapping_handle = NULL; +#else + self->file_descriptor = 0; +#endif + self->start = NULL; + self->length = 0; } -static PyObject *Str_startswith(PyObject *self, PyObject *args, PyObject *kwargs) { - int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); - Py_ssize_t nargs = PyTuple_Size(args); - if (nargs < !is_member + 1 || nargs > !is_member + 3) { - PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); - return NULL; - } - - PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); - PyObject *prefix_obj = PyTuple_GET_ITEM(args, !is_member); - PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; - PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; +static int File_init(File *self, PyObject *positional_args, PyObject *named_args) { + const char *path; + if (!PyArg_ParseTuple(positional_args, "s", &path)) return -1; - // Optional start and end arguments - Py_ssize_t start = 0, end = PY_SSIZE_T_MAX; +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) + self->file_handle = CreateFile(path, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); + if (self->file_handle == INVALID_HANDLE_VALUE) { + PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); + return -1; + } - if (start_obj && ((start = PyLong_AsSsize_t(start_obj)) == -1 && PyErr_Occurred())) { - PyErr_SetString(PyExc_TypeError, "start must be an integer"); - return NULL; + self->mapping_handle = CreateFileMapping(self->file_handle, 0, PAGE_READONLY, 0, 0, 0); + if (self->mapping_handle == 0) { + CloseHandle(self->file_handle); + self->file_handle = NULL; + PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); + return -1; } - if (end_obj && ((end = PyLong_AsSsize_t(end_obj)) == -1 && PyErr_Occurred())) { - PyErr_SetString(PyExc_TypeError, "end must be an integer"); - return NULL; + char *file = (char *)MapViewOfFile(self->mapping_handle, FILE_MAP_READ, 0, 0, 0); + if (file == 0) { + CloseHandle(self->mapping_handle); + self->mapping_handle = NULL; + CloseHandle(self->file_handle); + self->file_handle = NULL; + PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); + return -1; } + self->start = file; + self->length = GetFileSize(self->file_handle, 0); +#else + struct stat sb; + self->file_descriptor = open(path, O_RDONLY); + if (fstat(self->file_descriptor, &sb) != 0) { + close(self->file_descriptor); + self->file_descriptor = 0; + PyErr_SetString(PyExc_RuntimeError, "Can't retrieve file size!"); + return -1; + } + size_t file_size = sb.st_size; + void *map = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, self->file_descriptor, 0); + if (map == MAP_FAILED) { + close(self->file_descriptor); + self->file_descriptor = 0; + PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); + return -1; + } + self->start = map; + self->length = file_size; +#endif - sz_haystack_t str, prefix; - if (!export_string_like(str_obj, &str.start, &str.length) || - !export_string_like(prefix_obj, &prefix.start, &prefix.length)) { - PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); - return NULL; + return 0; +} + +static PyMethodDef File_methods[] = { // + {NULL, NULL, 0, NULL}}; + +static PyTypeObject FileType = { + PyObject_HEAD_INIT(NULL).tp_name = "stringzilla.File", + .tp_doc = "Memory mapped file class, that exposes the memory range for low-level access", + .tp_basicsize = sizeof(File), + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_methods = File_methods, + .tp_new = (newfunc)File_new, + .tp_init = (initproc)File_init, + .tp_dealloc = (destructor)File_dealloc, +}; + +#pragma endregion + +#pragma region Str + +static int Str_init(Str *self, PyObject *args, PyObject *kwargs) { + + // Parse all arguments into PyObjects first + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs > 3) { + PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); + return -1; } + PyObject *parent_obj = nargs >= 1 ? PyTuple_GET_ITEM(args, 0) : NULL; + PyObject *from_obj = nargs >= 2 ? PyTuple_GET_ITEM(args, 1) : NULL; + PyObject *to_obj = nargs >= 3 ? PyTuple_GET_ITEM(args, 2) : NULL; - // Apply start and end arguments - str.start += start; - str.length -= start; - if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; } + // Parse keyword arguments, if provided, and ensure no duplicates + if (kwargs) { + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(kwargs, &pos, &key, &value)) { + if (PyUnicode_CompareWithASCIIString(key, "parent") == 0) { + if (parent_obj) { + PyErr_SetString(PyExc_TypeError, "Received `parent` both as positional and keyword argument"); + return -1; + } + parent_obj = value; + } + else if (PyUnicode_CompareWithASCIIString(key, "from") == 0) { + if (from_obj) { + PyErr_SetString(PyExc_TypeError, "Received `from` both as positional and keyword argument"); + return -1; + } + from_obj = value; + } + else if (PyUnicode_CompareWithASCIIString(key, "to") == 0) { + if (to_obj) { + PyErr_SetString(PyExc_TypeError, "Received `to` both as positional and keyword argument"); + return -1; + } + to_obj = value; + } + else { + PyErr_SetString(PyExc_TypeError, "Invalid keyword argument"); + return -1; + } + } + } - if (str.length < prefix.length) { Py_RETURN_FALSE; } - else if (strncmp(str.start, prefix.start, prefix.length) == 0) { Py_RETURN_TRUE; } - else { Py_RETURN_FALSE; } + // Now, type-check and cast each argument + Py_ssize_t from = 0, to = PY_SSIZE_T_MAX; + if (from_obj) { + from = PyLong_AsSsize_t(from_obj); + if (from == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The `from` argument must be an integer"); + return -1; + } + } + if (to_obj) { + to = PyLong_AsSsize_t(to_obj); + if (to == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The `to` argument must be an integer"); + return -1; + } + } + + // Handle empty string + if (parent_obj == NULL) { + self->start = NULL; + self->length = 0; + } + // Increment the reference count of the parent + else if (export_string_like(parent_obj, &self->start, &self->length)) { + self->parent = parent_obj; + Py_INCREF(parent_obj); + } + else { + PyErr_SetString(PyExc_TypeError, "Unsupported parent type"); + return -1; + } + + // Apply slicing + size_t normalized_offset, normalized_length; + slice(self->length, from, to, &normalized_offset, &normalized_length); + self->start = ((char *)self->start) + normalized_offset; + self->length = normalized_length; + return 0; } -static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs) { - int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); - Py_ssize_t nargs = PyTuple_Size(args); - if (nargs < !is_member + 1 || nargs > !is_member + 3) { - PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); +static PyObject *Str_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { + Str *self; + self = (Str *)type->tp_alloc(type, 0); + if (!self) return NULL; + + self->parent = NULL; + self->start = NULL; + self->length = 0; + return (PyObject *)self; +} + +static void Str_dealloc(Str *self) { + if (self->parent) { Py_XDECREF(self->parent); } + else if (self->start) { free(self->start); } + self->parent = NULL; + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static PyObject *Str_str(Str *self) { return PyUnicode_FromStringAndSize(self->start, self->length); } + +static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash_crc32_native(self->start, self->length); } + +static Py_ssize_t Str_len(Str *self) { return self->length; } + +static PyObject *Str_getitem(Str *self, Py_ssize_t i) { + + // Negative indexing + if (i < 0) i += self->length; + + if (i < 0 || (size_t)i >= self->length) { + PyErr_SetString(PyExc_IndexError, "Index out of range"); return NULL; } - PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); - PyObject *suffix_obj = PyTuple_GET_ITEM(args, !is_member); - PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; - PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; + // Assuming the underlying data is UTF-8 encoded + return PyUnicode_FromStringAndSize(self->start + i, 1); +} - // Optional start and end arguments - Py_ssize_t start = 0, end = PY_SSIZE_T_MAX; +static PyObject *Str_subscript(Str *self, PyObject *key) { + if (PySlice_Check(key)) { + // Sanity checks + Py_ssize_t start, stop, step; + if (PySlice_Unpack(key, &start, &stop, &step) < 0) return NULL; + if (PySlice_AdjustIndices(self->length, &start, &stop, step) < 0) return NULL; + if (step != 1) { + PyErr_SetString(PyExc_IndexError, "Efficient step is not supported"); + return NULL; + } - if (start_obj && ((start = PyLong_AsSsize_t(start_obj)) == -1 && PyErr_Occurred())) { - PyErr_SetString(PyExc_TypeError, "start must be an integer"); + // Create a new `Str` object + Str *self_slice = (Str *)StrType.tp_alloc(&StrType, 0); + if (self_slice == NULL && PyErr_NoMemory()) return NULL; + + // Set its properties based on the slice + self_slice->start = self->start + start; + self_slice->length = stop - start; + self_slice->parent = (PyObject *)self; // Set parent to keep it alive + + // Increment the reference count of the parent + Py_INCREF(self); + return (PyObject *)self_slice; + } + else if (PyLong_Check(key)) { return Str_getitem(self, PyLong_AsSsize_t(key)); } + else { + PyErr_SetString(PyExc_TypeError, "Str indices must be integers or slices"); return NULL; } +} + +static int Str_getbuffer(Str *self, Py_buffer *view, int flags) { + if (view == NULL) { + PyErr_SetString(PyExc_ValueError, "NULL view in getbuffer"); + return -1; + } + + static Py_ssize_t itemsize[1] = {1}; + view->obj = (PyObject *)self; + view->buf = self->start; + view->len = self->length; + view->readonly = 1; + view->itemsize = sizeof(char); + view->format = "c"; // https://docs.python.org/3/library/struct.html#format-characters + view->ndim = 1; + view->shape = &self->length; // 1-D array, so shape is just a pointer to the length + view->strides = itemsize; // strides in a 1-D array is just the item size + view->suboffsets = NULL; + view->internal = NULL; + + Py_INCREF(self); + return 0; +} + +static void Str_releasebuffer(PyObject *_, Py_buffer *view) { + // This function MUST NOT decrement view->obj, since that is done automatically + // in PyBuffer_Release() (this scheme is useful for breaking reference cycles). + // https://docs.python.org/3/c-api/typeobj.html#c.PyBufferProcs.bf_releasebuffer +} + +static int Str_in(Str *self, PyObject *arg) { + + sz_needle_t needle_struct; + needle_struct.anomaly_offset = 0; + if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) { + PyErr_SetString(PyExc_TypeError, "Unsupported argument type"); + return -1; + } + + sz_haystack_t haystack; + haystack.start = self->start; + haystack.length = self->length; + size_t position = sz_neon_find_substr(haystack, needle_struct); + return position != haystack.length; +} + +static Py_ssize_t Strs_len(Strs *self) { + switch (self->type) { + case STRS_CONSECUTIVE_32: return self->data.consecutive_32bit.count; + case STRS_CONSECUTIVE_64: return self->data.consecutive_64bit.count; + case STRS_REORDERED: return self->data.reordered.count; + case STRS_MULTI_SOURCE: return self->data.multi_source.count; + default: return 0; + } +} - if (end_obj && ((end = PyLong_AsSsize_t(end_obj)) == -1 && PyErr_Occurred())) { - PyErr_SetString(PyExc_TypeError, "end must be an integer"); +static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) { + // Check for negative index and convert to positive + Py_ssize_t count = Strs_len(self); + if (i < 0) i += count; + if (i < 0 || i >= count) { + PyErr_SetString(PyExc_IndexError, "Index out of range"); return NULL; } - sz_haystack_t str, suffix; - if (!export_string_like(str_obj, &str.start, &str.length) || - !export_string_like(suffix_obj, &suffix.start, &suffix.length)) { - PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); + PyObject *parent = NULL; + char const *start = NULL; + size_t length = 0; + if (!get_string_at_offset(self, i, count, &parent, &start, &length)) { + PyErr_SetString(PyExc_TypeError, "Unknown Strs kind"); return NULL; } - // Apply start and end arguments - str.start += start; - str.length -= start; - if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; } + // Create a new `Str` object + Str *parent_slice = (Str *)StrType.tp_alloc(&StrType, 0); + if (parent_slice == NULL && PyErr_NoMemory()) return NULL; - if (str.length < suffix.length) { Py_RETURN_FALSE; } - else if (strncmp(str.start + (str.length - suffix.length), suffix.start, suffix.length) == 0) { Py_RETURN_TRUE; } - else { Py_RETURN_FALSE; } + parent_slice->start = start; + parent_slice->length = length; + parent_slice->parent = parent; + Py_INCREF(parent); + return parent_slice; } -static Strs *Str_split_( - PyObject *parent, sz_haystack_t text, sz_needle_t separator, int keepseparator, Py_ssize_t maxsplit) { - - // Create Strs object - Strs *result = (Strs *)PyObject_New(Strs, &StrsType); - if (!result) return NULL; - - // Initialize Strs object based on the splitting logic - void *offsets_endings = NULL; - size_t offsets_capacity = 0; - size_t offsets_count = 0; - size_t bytes_per_offset; - if (text.length >= UINT32_MAX) { - bytes_per_offset = 8; - result->type = STRS_CONSECUTIVE_64; - result->data.consecutive_64bit.start = text.start; - result->data.consecutive_64bit.parent = parent; - result->data.consecutive_64bit.separator_length = !keepseparator * separator.length; - } - else { - bytes_per_offset = 4; - result->type = STRS_CONSECUTIVE_32; - result->data.consecutive_32bit.start = text.start; - result->data.consecutive_32bit.parent = parent; - result->data.consecutive_32bit.separator_length = !keepseparator * separator.length; - } +static PyObject *Strs_subscript(Str *self, PyObject *key) { + if (PyLong_Check(key)) return Strs_getitem(self, PyLong_AsSsize_t(key)); + return NULL; +} - // Iterate through string, keeping track of the - sz_size_t last_start = 0; - while (last_start < text.length && offsets_count < maxsplit) { - sz_haystack_t text_remaining; - text_remaining.start = text.start + last_start; - text_remaining.length = text.length - last_start; - sz_size_t offset_in_remaining = sz_neon_find_substr(text_remaining, separator); +// Will be called by the `PySequence_Contains` +static int Strs_contains(Str *self, PyObject *arg) { return 0; } - // Reallocate offsets array if needed - if (offsets_count >= offsets_capacity) { - offsets_capacity = (offsets_capacity + 1) * 2; - void *new_offsets = realloc(offsets_endings, offsets_capacity * bytes_per_offset); - if (!new_offsets) { - if (offsets_endings) free(offsets_endings); - } - offsets_endings = new_offsets; - } +static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) { - // If the memory allocation has failed - discard the response - if (!offsets_endings) { - Py_XDECREF(result); - PyErr_NoMemory(); - return NULL; - } + char const *a_start, *b_start; + size_t a_length, b_length; + if (!export_string_like(self, &a_start, &a_length) || !export_string_like(other, &b_start, &b_length)) + Py_RETURN_NOTIMPLEMENTED; - // Export the offset - size_t will_continue = offset_in_remaining != text_remaining.length; - size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue; - if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; } - else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; } + // Perform byte-wise comparison up to the minimum length + size_t min_length = a_length < b_length ? a_length : b_length; + int cmp_result = memcmp(a_start, b_start, min_length); - // Next time we want to start - last_start = last_start + offset_in_remaining + separator.length; - } + // If the strings are equal up to `min_length`, then the shorter string is smaller + if (cmp_result == 0) cmp_result = (a_length > b_length) - (a_length < b_length); - // Populate the Strs object with the offsets - if (text.length >= UINT32_MAX) { - result->data.consecutive_64bit.end_offsets = offsets_endings; - result->data.consecutive_64bit.count = offsets_count; - } - else { - result->data.consecutive_32bit.end_offsets = offsets_endings; - result->data.consecutive_32bit.count = offsets_count; + switch (op) { + case Py_LT: return PyBool_FromLong(cmp_result < 0); + case Py_LE: return PyBool_FromLong(cmp_result <= 0); + case Py_EQ: return PyBool_FromLong(cmp_result == 0); + case Py_NE: return PyBool_FromLong(cmp_result != 0); + case Py_GT: return PyBool_FromLong(cmp_result > 0); + case Py_GE: return PyBool_FromLong(cmp_result >= 0); + default: Py_RETURN_NOTIMPLEMENTED; } - - Py_INCREF(parent); - return result; } -static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) { +/** + * @return 1 on success, 0 on failure. + */ +static int Str_find_( // + PyObject *self, + PyObject *args, + PyObject *kwargs, + Py_ssize_t *offset_out, + sz_haystack_t *haystack_out, + sz_needle_t *needle_out) { - // Check minimum arguments int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); Py_ssize_t nargs = PyTuple_Size(args); if (nargs < !is_member + 1 || nargs > !is_member + 3) { - PyErr_SetString(PyExc_TypeError, "sz.split() requires at least 1 argument"); - return NULL; + PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); + return 0; } - PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); - PyObject *separator_obj = nargs > !is_member + 0 ? PyTuple_GET_ITEM(args, !is_member + 0) : NULL; - PyObject *maxsplit_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; - PyObject *keepseparator_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; + PyObject *haystack_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *needle_obj = PyTuple_GET_ITEM(args, !is_member + 0); + PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; + PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; + // Parse keyword arguments if (kwargs) { - PyObject *key, *value; Py_ssize_t pos = 0; + PyObject *key, *value; while (PyDict_Next(kwargs, &pos, &key, &value)) { - if (PyUnicode_CompareWithASCIIString(key, "separator") == 0) { separator_obj = value; } - else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) { maxsplit_obj = value; } - else if (PyUnicode_CompareWithASCIIString(key, "keepseparator") == 0) { keepseparator_obj = value; } - else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) - return NULL; + if (PyUnicode_CompareWithASCIIString(key, "start") == 0) { start_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) { end_obj = value; } + else { + PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key); + return 0; + } } } - sz_haystack_t text; - sz_needle_t separator; - int keepseparator; - Py_ssize_t maxsplit; - separator.anomaly_offset = 0; - - // Validate and convert `text` - if (!export_string_like(text_obj, &text.start, &text.length)) { - PyErr_SetString(PyExc_TypeError, "The text argument must be string-like"); - return NULL; - } + sz_haystack_t haystack; + sz_needle_t needle; + Py_ssize_t start, end; - // Validate and convert `separator` - if (separator_obj) { - Py_ssize_t len; - if (!export_string_like(separator_obj, &separator.start, &len)) { - PyErr_SetString(PyExc_TypeError, "The separator argument must be string-like"); - return NULL; - } - separator.length = (size_t)len; - } - else { - separator.start = " "; - separator.length = 1; + // Validate and convert `haystack` and `needle` + needle.anomaly_offset = 0; + if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || + !export_string_like(needle_obj, &needle.start, &needle.length)) { + PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); + return 0; } - // Validate and convert `keepseparator` - if (keepseparator_obj) { - keepseparator = PyObject_IsTrue(keepseparator_obj); - if (keepseparator == -1) { - PyErr_SetString(PyExc_TypeError, "The keepseparator argument must be a boolean"); - return NULL; + // Validate and convert `start` + if (start_obj) { + start = PyLong_AsSsize_t(start_obj); + if (start == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The start argument must be an integer"); + return 0; } } - else { keepseparator = 0; } + else { start = 0; } - // Validate and convert `maxsplit` - if (maxsplit_obj) { - maxsplit = PyLong_AsSsize_t(maxsplit_obj); - if (maxsplit == -1 && PyErr_Occurred()) { - PyErr_SetString(PyExc_TypeError, "The maxsplit argument must be an integer"); - return NULL; + // Validate and convert `end` + if (end_obj) { + end = PyLong_AsSsize_t(end_obj); + if (end == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The end argument must be an integer"); + return 0; } } - else { maxsplit = PY_SSIZE_T_MAX; } + else { end = PY_SSIZE_T_MAX; } - return Str_split_(text_obj, text, separator, keepseparator, maxsplit); -} + // Limit the `haystack` range + size_t normalized_offset, normalized_length; + slice(haystack.length, start, end, &normalized_offset, &normalized_length); + haystack.start += normalized_offset; + haystack.length = normalized_length; -static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs) { + // Perform contains operation + size_t offset = sz_neon_find_substr(haystack, needle); + if (offset == haystack.length) { *offset_out = -1; } + else { *offset_out = (Py_ssize_t)offset; } - // Check minimum arguments - int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); - Py_ssize_t nargs = PyTuple_Size(args); - if (nargs < !is_member || nargs > !is_member + 2) { - PyErr_SetString(PyExc_TypeError, "splitlines() requires at least 1 argument"); - return NULL; - } + *haystack_out = haystack; + *needle_out = needle; + return 1; +} - PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); - PyObject *keeplinebreaks_obj = nargs > !is_member ? PyTuple_GET_ITEM(args, !is_member) : NULL; - PyObject *maxsplit_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; +static PyObject *Str_find(PyObject *self, PyObject *args, PyObject *kwargs) { + Py_ssize_t signed_offset; + sz_haystack_t text; + sz_needle_t separator; + if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL; + return PyLong_FromSsize_t(signed_offset); +} - if (kwargs) { - PyObject *key, *value; - Py_ssize_t pos = 0; - while (PyDict_Next(kwargs, &pos, &key, &value)) { - if (PyUnicode_CompareWithASCIIString(key, "keeplinebreaks") == 0) { keeplinebreaks_obj = value; } - else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) { maxsplit_obj = value; } - else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) { return NULL; } - } +static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) { + Py_ssize_t signed_offset; + sz_haystack_t text; + sz_needle_t separator; + if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL; + if (signed_offset == -1) { + PyErr_SetString(PyExc_ValueError, "substring not found"); + return NULL; } + return PyLong_FromSsize_t(signed_offset); +} +static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs) { + Py_ssize_t signed_offset; sz_haystack_t text; - int keeplinebreaks; - Py_ssize_t maxsplit = PY_SSIZE_T_MAX; // Default value for maxsplit + sz_needle_t separator; + if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL; + if (signed_offset == -1) { Py_RETURN_FALSE; } + else { Py_RETURN_TRUE; } +} - // Validate and convert `text` - if (!export_string_like(text_obj, &text.start, &text.length)) { - PyErr_SetString(PyExc_TypeError, "The text argument must be string-like"); - return NULL; - } +static PyObject *Str_partition(PyObject *self, PyObject *args, PyObject *kwargs) { + Py_ssize_t separator_index; + sz_haystack_t text; + sz_needle_t separator; + PyObject *result_tuple; - // Validate and convert `keeplinebreaks` - if (keeplinebreaks_obj) { - keeplinebreaks = PyObject_IsTrue(keeplinebreaks_obj); - if (keeplinebreaks == -1) { - PyErr_SetString(PyExc_TypeError, "The keeplinebreaks argument must be a boolean"); - return NULL; - } - } - else { keeplinebreaks = 0; } + // Use Str_find_ to get the index of the separator + if (!Str_find_(self, args, kwargs, &separator_index, &text, &separator)) return NULL; - // Validate and convert `maxsplit` - if (maxsplit_obj) { - maxsplit = PyLong_AsSsize_t(maxsplit_obj); - if (maxsplit == -1 && PyErr_Occurred()) { - PyErr_SetString(PyExc_TypeError, "The maxsplit argument must be an integer"); - return NULL; - } + // If separator is not found, return a tuple (self, "", "") + if (separator_index == -1) { + PyObject *empty_str1 = Str_new(&StrType, Py_None, Py_None); + PyObject *empty_str2 = Str_new(&StrType, Py_None, Py_None); + + result_tuple = PyTuple_New(3); + Py_INCREF(self); + PyTuple_SET_ITEM(result_tuple, 0, self); + PyTuple_SET_ITEM(result_tuple, 1, empty_str1); + PyTuple_SET_ITEM(result_tuple, 2, empty_str2); + return result_tuple; } - // TODO: Support arbitrary newline characters: - // https://docs.python.org/3/library/stdtypes.html#str.splitlines - // \n, \r, \r\n, \v or \x0b, \f or \x0c, \x1c, \x1d, \x1e, \x85, \u2028, \u2029 - // https://github.com/ashvardanian/StringZilla/issues/29 - sz_needle_t separator; - separator.start = "\n"; - separator.length = 1; - return Str_split_(text_obj, text, separator, keeplinebreaks, maxsplit); -} + // Create the three parts manually + Str *before = Str_new(&StrType, NULL, NULL); + Str *middle = Str_new(&StrType, NULL, NULL); + Str *after = Str_new(&StrType, NULL, NULL); -static PyObject *Str_concat(PyObject *self, PyObject *other) { - struct sz_haystack_t self_str, other_str; + before->parent = self, before->start = text.start, before->length = separator_index; + middle->parent = self, middle->start = text.start + separator_index, middle->length = separator.length; + after->parent = self, after->start = text.start + separator_index + separator.length, + after->length = text.length - separator_index - separator.length; - // Validate and convert `self` - if (!export_string_like(self, &self_str.start, &self_str.length)) { - PyErr_SetString(PyExc_TypeError, "The self object must be string-like"); - return NULL; - } + // All parts reference the same parent + Py_INCREF(self); + Py_INCREF(self); + Py_INCREF(self); - // Validate and convert `other` - if (!export_string_like(other, &other_str.start, &other_str.length)) { - PyErr_SetString(PyExc_TypeError, "The other object must be string-like"); + // Build the result tuple + result_tuple = PyTuple_New(3); + PyTuple_SET_ITEM(result_tuple, 0, before); + PyTuple_SET_ITEM(result_tuple, 1, middle); + PyTuple_SET_ITEM(result_tuple, 2, after); + + return result_tuple; +} + +static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member + 1 || nargs > !is_member + 4) { + PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); return NULL; } - // Allocate a new Str instance - Str *result_str = PyObject_New(Str, &StrType); - if (result_str == NULL) { return NULL; } - - // Calculate the total length of the new string - result_str->parent = NULL; - result_str->length = self_str.length + other_str.length; + PyObject *haystack_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *needle_obj = PyTuple_GET_ITEM(args, !is_member + 0); + PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; + PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; + PyObject *allowoverlap_obj = nargs > !is_member + 3 ? PyTuple_GET_ITEM(args, !is_member + 3) : NULL; - // Allocate memory for the new string - result_str->start = malloc(result_str->length); - if (result_str->start == NULL) { - PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for string concatenation"); - return NULL; + if (kwargs) { + Py_ssize_t pos = 0; + PyObject *key, *value; + while (PyDict_Next(kwargs, &pos, &key, &value)) + if (PyUnicode_CompareWithASCIIString(key, "start") == 0) { start_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "end") == 0) { end_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "allowoverlap") == 0) { allowoverlap_obj = value; } + else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) + return NULL; } - // Perform the string concatenation - memcpy(result_str->start, self_str.start, self_str.length); - memcpy(result_str->start + self_str.length, other_str.start, other_str.length); + sz_haystack_t haystack; + sz_needle_t needle; + Py_ssize_t start = start_obj ? PyLong_AsSsize_t(start_obj) : 0; + Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX; + int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0; - return (PyObject *)result_str; -} + needle.anomaly_offset = 0; + if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || + !export_string_like(needle_obj, &needle.start, &needle.length)) + return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL; -#pragma endregion + if ((start == -1 || end == -1 || allowoverlap == -1) && PyErr_Occurred()) return NULL; -#pragma region MemoryMappingFile + size_t normalized_offset, normalized_length; + slice(haystack.length, start, end, &normalized_offset, &normalized_length); + haystack.start += normalized_offset; + haystack.length = normalized_length; -static void File_dealloc(File *self) { -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) - if (self->start) { - UnmapViewOfFile(self->start); - self->start = NULL; - } - if (self->mapping_handle) { - CloseHandle(self->mapping_handle); - self->mapping_handle = NULL; - } - if (self->file_handle) { - CloseHandle(self->file_handle); - self->file_handle = NULL; - } -#else - if (self->start) { - munmap(self->start, self->length); - self->start = NULL; - self->length = 0; - } - if (self->file_descriptor != 0) { - close(self->file_descriptor); - self->file_descriptor = 0; + size_t count = needle.length == 1 ? sz_naive_count_char(haystack, *needle.start) : 0; + if (needle.length != 1) { + if (allowoverlap) { + while (haystack.length) { + size_t offset = sz_neon_find_substr(haystack, needle); + int found = offset != haystack.length; + count += found; + haystack.start += offset + found; + haystack.length -= offset + found; + } + } + else { + while (haystack.length) { + size_t offset = sz_neon_find_substr(haystack, needle); + int found = offset != haystack.length; + count += found; + haystack.start += offset + needle.length; + haystack.length -= offset + needle.length * found; + } + } } -#endif - Py_TYPE(self)->tp_free((PyObject *)self); + return PyLong_FromSize_t(count); } -static PyObject *File_new(PyTypeObject *type, PyObject *positional_args, PyObject *named_args) { - File *self; - self = (File *)type->tp_alloc(type, 0); - if (self == NULL) return NULL; - -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) - self->file_handle = NULL; - self->mapping_handle = NULL; -#else - self->file_descriptor = 0; -#endif - self->start = NULL; - self->length = 0; -} +static PyObject *Str_levenstein(PyObject *self, PyObject *args, PyObject *kwargs) { + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member + 1 || nargs > !is_member + 2) { + PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); + return NULL; + } -static int File_init(File *self, PyObject *positional_args, PyObject *named_args) { - const char *path; - if (!PyArg_ParseTuple(positional_args, "s", &path)) return -1; + PyObject *str1_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *str2_obj = PyTuple_GET_ITEM(args, !is_member + 0); + PyObject *bound_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) - self->file_handle = CreateFile(path, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); - if (self->file_handle == INVALID_HANDLE_VALUE) { - PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); - return -1; + if (kwargs) { + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(kwargs, &pos, &key, &value)) + if (PyUnicode_CompareWithASCIIString(key, "bound") == 0) { + if (bound_obj) { + PyErr_Format(PyExc_TypeError, "Received bound both as positional and keyword argument"); + return NULL; + } + bound_obj = value; + } } - self->mapping_handle = CreateFileMapping(self->file_handle, 0, PAGE_READONLY, 0, 0, 0); - if (self->mapping_handle == 0) { - CloseHandle(self->file_handle); - self->file_handle = NULL; - PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); - return -1; + int bound = 255; // Default value for bound + if (bound_obj && ((bound = PyLong_AsLong(bound_obj)) > 255 || bound < 0)) { + PyErr_Format(PyExc_ValueError, "Bound must be an integer between 0 and 255"); + return NULL; } - char *file = (char *)MapViewOfFile(self->mapping_handle, FILE_MAP_READ, 0, 0, 0); - if (file == 0) { - CloseHandle(self->mapping_handle); - self->mapping_handle = NULL; - CloseHandle(self->file_handle); - self->file_handle = NULL; - PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); - return -1; + sz_haystack_t str1, str2; + if (!export_string_like(str1_obj, &str1.start, &str1.length) || + !export_string_like(str2_obj, &str2.start, &str2.length)) { + PyErr_Format(PyExc_TypeError, "Both arguments must be string-like"); + return NULL; } - self->start = file; - self->length = GetFileSize(self->file_handle, 0); -#else - struct stat sb; - self->file_descriptor = open(path, O_RDONLY); - if (fstat(self->file_descriptor, &sb) != 0) { - close(self->file_descriptor); - self->file_descriptor = 0; - PyErr_SetString(PyExc_RuntimeError, "Can't retrieve file size!"); - return -1; + + size_t memory_needed = sz_levenstein_memory_needed(str1.length, str2.length); + if (temporary_memory.length < memory_needed) { + temporary_memory.start = realloc(temporary_memory.start, memory_needed); + temporary_memory.length = memory_needed; } - size_t file_size = sb.st_size; - void *map = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, self->file_descriptor, 0); - if (map == MAP_FAILED) { - close(self->file_descriptor); - self->file_descriptor = 0; - PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); - return -1; + if (!temporary_memory.start) { + PyErr_Format(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix"); + return NULL; } - self->start = map; - self->length = file_size; -#endif - return 0; + levenstein_distance_t small_bound = (levenstein_distance_t)bound; + levenstein_distance_t distance = + sz_levenstein(str1.start, str1.length, str2.start, str2.length, small_bound, temporary_memory.start); + + return PyLong_FromLong(distance); } -static PyMethodDef File_methods[] = { // - {NULL, NULL, 0, NULL}}; +static PyObject *Str_startswith(PyObject *self, PyObject *args, PyObject *kwargs) { + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member + 1 || nargs > !is_member + 3) { + PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); + return NULL; + } -static PyTypeObject FileType = { - PyObject_HEAD_INIT(NULL).tp_name = "stringzilla.File", - .tp_doc = "Memory mapped file class, that exposes the memory range for low-level access", - .tp_basicsize = sizeof(File), - .tp_flags = Py_TPFLAGS_DEFAULT, - .tp_methods = File_methods, - .tp_new = (newfunc)File_new, - .tp_init = (initproc)File_init, - .tp_dealloc = (destructor)File_dealloc, -}; + PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *prefix_obj = PyTuple_GET_ITEM(args, !is_member); + PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; + PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; -#pragma endregion + // Optional start and end arguments + Py_ssize_t start = 0, end = PY_SSIZE_T_MAX; -#pragma region Str + if (start_obj && ((start = PyLong_AsSsize_t(start_obj)) == -1 && PyErr_Occurred())) { + PyErr_SetString(PyExc_TypeError, "start must be an integer"); + return NULL; + } -static int Str_init(Str *self, PyObject *args, PyObject *kwargs) { + if (end_obj && ((end = PyLong_AsSsize_t(end_obj)) == -1 && PyErr_Occurred())) { + PyErr_SetString(PyExc_TypeError, "end must be an integer"); + return NULL; + } - // Parse all arguments into PyObjects first - Py_ssize_t nargs = PyTuple_Size(args); - if (nargs > 3) { - PyErr_SetString(PyExc_TypeError, "Invalid number of arguments"); - return -1; + sz_haystack_t str, prefix; + if (!export_string_like(str_obj, &str.start, &str.length) || + !export_string_like(prefix_obj, &prefix.start, &prefix.length)) { + PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); + return NULL; } - PyObject *parent_obj = nargs >= 1 ? PyTuple_GET_ITEM(args, 0) : NULL; - PyObject *from_obj = nargs >= 2 ? PyTuple_GET_ITEM(args, 1) : NULL; - PyObject *to_obj = nargs >= 3 ? PyTuple_GET_ITEM(args, 2) : NULL; - // Parse keyword arguments, if provided, and ensure no duplicates - if (kwargs) { - PyObject *key, *value; - Py_ssize_t pos = 0; - while (PyDict_Next(kwargs, &pos, &key, &value)) { - if (PyUnicode_CompareWithASCIIString(key, "parent") == 0) { - if (parent_obj) { - PyErr_SetString(PyExc_TypeError, "Received `parent` both as positional and keyword argument"); - return -1; - } - parent_obj = value; - } - else if (PyUnicode_CompareWithASCIIString(key, "from") == 0) { - if (from_obj) { - PyErr_SetString(PyExc_TypeError, "Received `from` both as positional and keyword argument"); - return -1; - } - from_obj = value; - } - else if (PyUnicode_CompareWithASCIIString(key, "to") == 0) { - if (to_obj) { - PyErr_SetString(PyExc_TypeError, "Received `to` both as positional and keyword argument"); - return -1; - } - to_obj = value; - } - else { - PyErr_SetString(PyExc_TypeError, "Invalid keyword argument"); - return -1; - } - } + // Apply start and end arguments + str.start += start; + str.length -= start; + if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; } + + if (str.length < prefix.length) { Py_RETURN_FALSE; } + else if (strncmp(str.start, prefix.start, prefix.length) == 0) { Py_RETURN_TRUE; } + else { Py_RETURN_FALSE; } +} + +static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs) { + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member + 1 || nargs > !is_member + 3) { + PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); + return NULL; } - // Now, type-check and cast each argument - Py_ssize_t from = 0, to = PY_SSIZE_T_MAX; - if (from_obj) { - from = PyLong_AsSsize_t(from_obj); - if (from == -1 && PyErr_Occurred()) { - PyErr_SetString(PyExc_TypeError, "The `from` argument must be an integer"); - return -1; - } + PyObject *str_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *suffix_obj = PyTuple_GET_ITEM(args, !is_member); + PyObject *start_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; + PyObject *end_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; + + // Optional start and end arguments + Py_ssize_t start = 0, end = PY_SSIZE_T_MAX; + + if (start_obj && ((start = PyLong_AsSsize_t(start_obj)) == -1 && PyErr_Occurred())) { + PyErr_SetString(PyExc_TypeError, "start must be an integer"); + return NULL; } - if (to_obj) { - to = PyLong_AsSsize_t(to_obj); - if (to == -1 && PyErr_Occurred()) { - PyErr_SetString(PyExc_TypeError, "The `to` argument must be an integer"); - return -1; - } + + if (end_obj && ((end = PyLong_AsSsize_t(end_obj)) == -1 && PyErr_Occurred())) { + PyErr_SetString(PyExc_TypeError, "end must be an integer"); + return NULL; } - // Handle empty string - if (parent_obj == NULL) { - self->start = NULL; - self->length = 0; + sz_haystack_t str, suffix; + if (!export_string_like(str_obj, &str.start, &str.length) || + !export_string_like(suffix_obj, &suffix.start, &suffix.length)) { + PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); + return NULL; } - // Increment the reference count of the parent - else if (export_string_like(parent_obj, &self->start, &self->length)) { - self->parent = parent_obj; - Py_INCREF(parent_obj); + + // Apply start and end arguments + str.start += start; + str.length -= start; + if (end != PY_SSIZE_T_MAX && end - start < str.length) { str.length = end - start; } + + if (str.length < suffix.length) { Py_RETURN_FALSE; } + else if (strncmp(str.start + (str.length - suffix.length), suffix.start, suffix.length) == 0) { Py_RETURN_TRUE; } + else { Py_RETURN_FALSE; } +} + +static Strs *Str_split_( + PyObject *parent, sz_haystack_t text, sz_needle_t separator, int keepseparator, Py_ssize_t maxsplit) { + + // Create Strs object + Strs *result = (Strs *)PyObject_New(Strs, &StrsType); + if (!result) return NULL; + + // Initialize Strs object based on the splitting logic + void *offsets_endings = NULL; + size_t offsets_capacity = 0; + size_t offsets_count = 0; + size_t bytes_per_offset; + if (text.length >= UINT32_MAX) { + bytes_per_offset = 8; + result->type = STRS_CONSECUTIVE_64; + result->data.consecutive_64bit.start = text.start; + result->data.consecutive_64bit.parent = parent; + result->data.consecutive_64bit.separator_length = !keepseparator * separator.length; } else { - PyErr_SetString(PyExc_TypeError, "Unsupported parent type"); - return -1; + bytes_per_offset = 4; + result->type = STRS_CONSECUTIVE_32; + result->data.consecutive_32bit.start = text.start; + result->data.consecutive_32bit.parent = parent; + result->data.consecutive_32bit.separator_length = !keepseparator * separator.length; } - // Apply slicing - size_t normalized_offset, normalized_length; - slice(self->length, from, to, &normalized_offset, &normalized_length); - self->start = ((char *)self->start) + normalized_offset; - self->length = normalized_length; - return 0; -} - -static PyObject *Str_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { - Str *self; - self = (Str *)type->tp_alloc(type, 0); - if (!self) return NULL; + // Iterate through string, keeping track of the + sz_size_t last_start = 0; + while (last_start < text.length && offsets_count < maxsplit) { + sz_haystack_t text_remaining; + text_remaining.start = text.start + last_start; + text_remaining.length = text.length - last_start; + sz_size_t offset_in_remaining = sz_neon_find_substr(text_remaining, separator); - self->parent = NULL; - self->start = NULL; - self->length = 0; - return (PyObject *)self; -} + // Reallocate offsets array if needed + if (offsets_count >= offsets_capacity) { + offsets_capacity = (offsets_capacity + 1) * 2; + void *new_offsets = realloc(offsets_endings, offsets_capacity * bytes_per_offset); + if (!new_offsets) { + if (offsets_endings) free(offsets_endings); + } + offsets_endings = new_offsets; + } -static void Str_dealloc(Str *self) { - if (self->parent) { Py_XDECREF(self->parent); } - else if (self->start) { free(self->start); } - self->parent = NULL; - Py_TYPE(self)->tp_free((PyObject *)self); -} + // If the memory allocation has failed - discard the response + if (!offsets_endings) { + Py_XDECREF(result); + PyErr_NoMemory(); + return NULL; + } -static PyObject *Str_str(Str *self) { return PyUnicode_FromStringAndSize(self->start, self->length); } + // Export the offset + size_t will_continue = offset_in_remaining != text_remaining.length; + size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue; + if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; } + else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; } -static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash_crc32_native(self->start, self->length); } + // Next time we want to start + last_start = last_start + offset_in_remaining + separator.length; + } -static Py_ssize_t Str_len(Str *self) { return self->length; } + // Populate the Strs object with the offsets + if (text.length >= UINT32_MAX) { + result->data.consecutive_64bit.end_offsets = offsets_endings; + result->data.consecutive_64bit.count = offsets_count; + } + else { + result->data.consecutive_32bit.end_offsets = offsets_endings; + result->data.consecutive_32bit.count = offsets_count; + } -static PyObject *Str_getitem(Str *self, Py_ssize_t i) { + Py_INCREF(parent); + return result; +} - // Negative indexing - if (i < 0) i += self->length; +static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) { - if (i < 0 || (size_t)i >= self->length) { - PyErr_SetString(PyExc_IndexError, "Index out of range"); + // Check minimum arguments + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member + 1 || nargs > !is_member + 3) { + PyErr_SetString(PyExc_TypeError, "sz.split() requires at least 1 argument"); return NULL; } - // Assuming the underlying data is UTF-8 encoded - return PyUnicode_FromStringAndSize(self->start + i, 1); -} + PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *separator_obj = nargs > !is_member + 0 ? PyTuple_GET_ITEM(args, !is_member + 0) : NULL; + PyObject *maxsplit_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; + PyObject *keepseparator_obj = nargs > !is_member + 2 ? PyTuple_GET_ITEM(args, !is_member + 2) : NULL; -static PyObject *Str_subscript(Str *self, PyObject *key) { - if (PySlice_Check(key)) { - // Sanity checks - Py_ssize_t start, stop, step; - if (PySlice_Unpack(key, &start, &stop, &step) < 0) return NULL; - if (PySlice_AdjustIndices(self->length, &start, &stop, step) < 0) return NULL; - if (step != 1) { - PyErr_SetString(PyExc_IndexError, "Efficient step is not supported"); - return NULL; + if (kwargs) { + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(kwargs, &pos, &key, &value)) { + if (PyUnicode_CompareWithASCIIString(key, "separator") == 0) { separator_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) { maxsplit_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "keepseparator") == 0) { keepseparator_obj = value; } + else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) + return NULL; } + } - // Create a new `Str` object - Str *self_slice = (Str *)StrType.tp_alloc(&StrType, 0); - if (self_slice == NULL && PyErr_NoMemory()) return NULL; + sz_haystack_t text; + sz_needle_t separator; + int keepseparator; + Py_ssize_t maxsplit; + separator.anomaly_offset = 0; - // Set its properties based on the slice - self_slice->start = self->start + start; - self_slice->length = stop - start; - self_slice->parent = (PyObject *)self; // Set parent to keep it alive + // Validate and convert `text` + if (!export_string_like(text_obj, &text.start, &text.length)) { + PyErr_SetString(PyExc_TypeError, "The text argument must be string-like"); + return NULL; + } - // Increment the reference count of the parent - Py_INCREF(self); - return (PyObject *)self_slice; + // Validate and convert `separator` + if (separator_obj) { + Py_ssize_t len; + if (!export_string_like(separator_obj, &separator.start, &len)) { + PyErr_SetString(PyExc_TypeError, "The separator argument must be string-like"); + return NULL; + } + separator.length = (size_t)len; } - else if (PyLong_Check(key)) { return Str_getitem(self, PyLong_AsSsize_t(key)); } else { - PyErr_SetString(PyExc_TypeError, "Str indices must be integers or slices"); - return NULL; + separator.start = " "; + separator.length = 1; } -} -static int Str_getbuffer(Str *self, Py_buffer *view, int flags) { - if (view == NULL) { - PyErr_SetString(PyExc_ValueError, "NULL view in getbuffer"); - return -1; + // Validate and convert `keepseparator` + if (keepseparator_obj) { + keepseparator = PyObject_IsTrue(keepseparator_obj); + if (keepseparator == -1) { + PyErr_SetString(PyExc_TypeError, "The keepseparator argument must be a boolean"); + return NULL; + } } + else { keepseparator = 0; } - static Py_ssize_t itemsize[1] = {1}; - view->obj = (PyObject *)self; - view->buf = self->start; - view->len = self->length; - view->readonly = 1; - view->itemsize = sizeof(char); - view->format = "c"; // https://docs.python.org/3/library/struct.html#format-characters - view->ndim = 1; - view->shape = &self->length; // 1-D array, so shape is just a pointer to the length - view->strides = itemsize; // strides in a 1-D array is just the item size - view->suboffsets = NULL; - view->internal = NULL; - - Py_INCREF(self); - return 0; -} + // Validate and convert `maxsplit` + if (maxsplit_obj) { + maxsplit = PyLong_AsSsize_t(maxsplit_obj); + if (maxsplit == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The maxsplit argument must be an integer"); + return NULL; + } + } + else { maxsplit = PY_SSIZE_T_MAX; } -static void Str_releasebuffer(PyObject *_, Py_buffer *view) { - // This function MUST NOT decrement view->obj, since that is done automatically - // in PyBuffer_Release() (this scheme is useful for breaking reference cycles). - // https://docs.python.org/3/c-api/typeobj.html#c.PyBufferProcs.bf_releasebuffer + return Str_split_(text_obj, text, separator, keepseparator, maxsplit); } -static int Str_in(Str *self, PyObject *arg) { +static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs) { - sz_needle_t needle_struct; - needle_struct.anomaly_offset = 0; - if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) { - PyErr_SetString(PyExc_TypeError, "Unsupported argument type"); - return -1; + // Check minimum arguments + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member || nargs > !is_member + 2) { + PyErr_SetString(PyExc_TypeError, "splitlines() requires at least 1 argument"); + return NULL; } - sz_haystack_t haystack; - haystack.start = self->start; - haystack.length = self->length; - size_t position = sz_neon_find_substr(haystack, needle_struct); - return position != haystack.length; -} + PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *keeplinebreaks_obj = nargs > !is_member ? PyTuple_GET_ITEM(args, !is_member) : NULL; + PyObject *maxsplit_obj = nargs > !is_member + 1 ? PyTuple_GET_ITEM(args, !is_member + 1) : NULL; -static Py_ssize_t Strs_len(Strs *self) { - switch (self->type) { - case STRS_CONSECUTIVE_32: return self->data.consecutive_32bit.count; - case STRS_CONSECUTIVE_64: return self->data.consecutive_64bit.count; - case STRS_REORDERED: return self->data.reordered.count; - case STRS_MULTI_SOURCE: return self->data.multi_source.count; - default: return 0; + if (kwargs) { + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(kwargs, &pos, &key, &value)) { + if (PyUnicode_CompareWithASCIIString(key, "keeplinebreaks") == 0) { keeplinebreaks_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "maxsplit") == 0) { maxsplit_obj = value; } + else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) { return NULL; } + } } -} -static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) { - // Check for negative index and convert to positive - Py_ssize_t count = Strs_len(self); - if (i < 0) i += count; - if (i < 0 || i >= count) { - PyErr_SetString(PyExc_IndexError, "Index out of range"); + sz_haystack_t text; + int keeplinebreaks; + Py_ssize_t maxsplit = PY_SSIZE_T_MAX; // Default value for maxsplit + + // Validate and convert `text` + if (!export_string_like(text_obj, &text.start, &text.length)) { + PyErr_SetString(PyExc_TypeError, "The text argument must be string-like"); return NULL; } - PyObject *parent = NULL; - char const *start = NULL; - size_t length = 0; - - // Extract a member element based on - switch (self->type) { - case STRS_CONSECUTIVE_32: { - uint32_t start_offset = (i == 0) ? 0 : self->data.consecutive_32bit.end_offsets[i - 1]; - uint32_t end_offset = self->data.consecutive_32bit.end_offsets[i]; - start = self->data.consecutive_32bit.start + start_offset; - length = end_offset - start_offset - self->data.consecutive_32bit.separator_length * (i + 1 != count); - parent = self->data.consecutive_32bit.parent; - break; - } - case STRS_CONSECUTIVE_64: { - uint64_t start_offset = (i == 0) ? 0 : self->data.consecutive_64bit.end_offsets[i - 1]; - uint64_t end_offset = self->data.consecutive_64bit.end_offsets[i]; - start = self->data.consecutive_64bit.start + start_offset; - length = end_offset - start_offset - self->data.consecutive_64bit.separator_length * (i + 1 != count); - parent = self->data.consecutive_64bit.parent; - break; - } - case STRS_REORDERED: { - // - break; - } - case STRS_MULTI_SOURCE: { - // - break; - } - default: PyErr_SetString(PyExc_TypeError, "Unknown Strs kind"); return NULL; + // Validate and convert `keeplinebreaks` + if (keeplinebreaks_obj) { + keeplinebreaks = PyObject_IsTrue(keeplinebreaks_obj); + if (keeplinebreaks == -1) { + PyErr_SetString(PyExc_TypeError, "The keeplinebreaks argument must be a boolean"); + return NULL; + } } + else { keeplinebreaks = 0; } - // Create a new `Str` object - Str *parent_slice = (Str *)StrType.tp_alloc(&StrType, 0); - if (parent_slice == NULL && PyErr_NoMemory()) return NULL; - - parent_slice->start = start; - parent_slice->length = length; - parent_slice->parent = parent; - Py_INCREF(parent); - return parent_slice; -} + // Validate and convert `maxsplit` + if (maxsplit_obj) { + maxsplit = PyLong_AsSsize_t(maxsplit_obj); + if (maxsplit == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, "The maxsplit argument must be an integer"); + return NULL; + } + } -static PyObject *Strs_subscript(Str *self, PyObject *key) { - if (PyLong_Check(key)) return Strs_getitem(self, PyLong_AsSsize_t(key)); - return NULL; + // TODO: Support arbitrary newline characters: + // https://docs.python.org/3/library/stdtypes.html#str.splitlines + // \n, \r, \r\n, \v or \x0b, \f or \x0c, \x1c, \x1d, \x1e, \x85, \u2028, \u2029 + // https://github.com/ashvardanian/StringZilla/issues/29 + sz_needle_t separator; + separator.start = "\n"; + separator.length = 1; + return Str_split_(text_obj, text, separator, keeplinebreaks, maxsplit); } -// Will be called by the `PySequence_Contains` -static int Strs_contains(Str *self, PyObject *arg) { return 0; } +static PyObject *Str_concat(PyObject *self, PyObject *other) { + struct sz_haystack_t self_str, other_str; -static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) { + // Validate and convert `self` + if (!export_string_like(self, &self_str.start, &self_str.length)) { + PyErr_SetString(PyExc_TypeError, "The self object must be string-like"); + return NULL; + } - char const *a_start, *b_start; - size_t a_length, b_length; - if (!export_string_like(self, &a_start, &a_length) || !export_string_like(other, &b_start, &b_length)) - Py_RETURN_NOTIMPLEMENTED; + // Validate and convert `other` + if (!export_string_like(other, &other_str.start, &other_str.length)) { + PyErr_SetString(PyExc_TypeError, "The other object must be string-like"); + return NULL; + } - // Perform byte-wise comparison up to the minimum length - size_t min_length = a_length < b_length ? a_length : b_length; - int cmp_result = memcmp(a_start, b_start, min_length); + // Allocate a new Str instance + Str *result_str = PyObject_New(Str, &StrType); + if (result_str == NULL) { return NULL; } - // If the strings are equal up to `min_length`, then the shorter string is smaller - if (cmp_result == 0) cmp_result = (a_length > b_length) - (a_length < b_length); + // Calculate the total length of the new string + result_str->parent = NULL; + result_str->length = self_str.length + other_str.length; - switch (op) { - case Py_LT: return PyBool_FromLong(cmp_result < 0); - case Py_LE: return PyBool_FromLong(cmp_result <= 0); - case Py_EQ: return PyBool_FromLong(cmp_result == 0); - case Py_NE: return PyBool_FromLong(cmp_result != 0); - case Py_GT: return PyBool_FromLong(cmp_result > 0); - case Py_GE: return PyBool_FromLong(cmp_result >= 0); - default: Py_RETURN_NOTIMPLEMENTED; + // Allocate memory for the new string + result_str->start = malloc(result_str->length); + if (result_str->start == NULL) { + PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for string concatenation"); + return NULL; } + + // Perform the string concatenation + memcpy(result_str->start, self_str.start, self_str.length); + memcpy(result_str->start + self_str.length, other_str.start, other_str.length); + + return (PyObject *)result_str; } static PySequenceMethods Str_as_sequence = { @@ -1203,6 +1324,7 @@ static PyMethodDef Str_methods[] = { // {"find", Str_find, sz_method_flags_m, "Find the first occurrence of a substring."}, {"index", Str_index, sz_method_flags_m, "Find the first occurrence of a substring or raise error if missing."}, {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."}, + {"partition", Str_partition, sz_method_flags_m, "Splits string into 3-tuple: before, match, after."}, {"count", Str_count, sz_method_flags_m, "Count the occurrences of a substring."}, {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, {"split", Str_split, sz_method_flags_m, "Split a string by a separator."}, @@ -1263,6 +1385,7 @@ static PyMethodDef stringzilla_methods[] = { {"find", Str_find, sz_method_flags_m, "Find the first occurrence of a substring."}, {"index", Str_index, sz_method_flags_m, "Find the first occurrence of a substring or raise error if missing."}, {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."}, + {"partition", Str_partition, sz_method_flags_m, "Splits string into 3-tuple: before, match, after."}, {"count", Str_count, sz_method_flags_m, "Count the occurrences of a substring."}, {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, {"split", Str_split, sz_method_flags_m, "Split a string by a separator."}, From d7f1f374f441d0d78750d24f555688e2e9afc93d Mon Sep 17 00:00:00 2001 From: Nairi Harutyunyan Date: Thu, 21 Sep 2023 13:50:38 +0300 Subject: [PATCH 26/72] Draft verison of CountSubstrAPI --- javascript/lib.c | 98 ++++++++++++++++++++++++++++++++++++- javascript/stringzilla.d.ts | 10 +++- javascript/test.js | 8 ++- package-lock.json | 4 +- 4 files changed, 113 insertions(+), 7 deletions(-) diff --git a/javascript/lib.c b/javascript/lib.c index d00bf2cd..b2e097ff 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -52,10 +52,104 @@ napi_value FindAPI(napi_env env, napi_callback_info info) { return js_result; } +napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { + size_t argc = 3; + napi_value args[3]; + napi_get_cb_info(env, info, &argc, args, NULL, NULL); + + // Extract the C string from the JavaScript string for haystack and needle + size_t str_size; + size_t str_len; + + // For haystack + napi_get_value_string_utf8(env, args[0], NULL, 0, &str_size); + char *haystack = malloc(str_size + 1); + napi_get_value_string_utf8(env, args[0], haystack, str_size + 1, &str_len); + struct strzl_haystack_t strzl_haystack = {haystack, str_len}; + + + // For needle + napi_get_value_string_utf8(env, args[1], NULL, 0, &str_size); + char *needle = malloc(str_size + 1); + napi_get_value_string_utf8(env, args[1], needle, str_size + 1, &str_len); + struct strzl_needle_t strzl_needle = {needle, str_len, 0}; + + bool overlap = false; + napi_get_value_bool(env, args[2], &overlap); + + size_t haystack_l = strlen(haystack); + size_t needle_l = strlen(needle); + + size_t result = 0; + + if (haystack_l == 1) + result = count_char(haystack, *needle); + else if (haystack_l < needle_l) + result = 0; + else if (overlap) { + while (strlen(haystack)) { + #if defined(__AVX2__) + size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle); + #elif defined(__ARM_NEON) + size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle); + #else + size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle); + #endif + + + bool found = offset != haystack_l; + result += found; + haystack += offset + found; + haystack_l -= offset + found; + } + } + + else { + while (haystack_l) { + #if defined(__AVX2__) + size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle); + #elif defined(__ARM_NEON) + size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle); + #else + size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle); + #endif + + bool found = offset != haystack_l; + result += found; + haystack += offset + needle_l; + haystack_l -= offset + needle_l * found; + } + } + + // Cleanup + free(haystack); + free(needle); + + // Convert result to JavaScript BigInt and return + napi_value js_result; + napi_create_bigint_uint64(env, result, &js_result); + + return js_result; +} + napi_value Init(napi_env env, napi_value exports) { - napi_property_descriptor desc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0}; - napi_define_properties(env, exports, 1, &desc); + // Define the "find" property + napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0}; + + // Define the "countSubstr" property + napi_property_descriptor countSubstrDesc = {"countSubstr", 0, CountSubstrAPI, 0, 0, 0, napi_default, 0}; + + // Define an array of property descriptors + napi_property_descriptor properties[] = {findDesc, countSubstrDesc}; + + // Define the number of properties in the array + size_t propertyCount = sizeof(properties) / sizeof(properties[0]); + + // Define the properties on the exports object + napi_define_properties(env, exports, propertyCount, properties); + return exports; } NAPI_MODULE(NODE_GYP_MODULE_NAME, Init) + diff --git a/javascript/stringzilla.d.ts b/javascript/stringzilla.d.ts index 657e666f..57eff05b 100644 --- a/javascript/stringzilla.d.ts +++ b/javascript/stringzilla.d.ts @@ -6,4 +6,12 @@ * @param {string} needle */ export function find(haystack: string, needle: string): bigint; - \ No newline at end of file + +/** + * Searches for a substring in a larger string. + * + * @param {string} haystack + * @param {string} needle + * @param {boolean} overlap + */ +export function countSubstr(haystack: string, needle: string, overlap: boolean): bigint; diff --git a/javascript/test.js b/javascript/test.js index 084d55cd..04ea7280 100644 --- a/javascript/test.js +++ b/javascript/test.js @@ -1,7 +1,11 @@ var assert = require('assert'); var stringzilla = require('bindings')('stringzilla'); -const result = stringzilla.find("hello world", "world"); -console.log(result); // Output will depend on the result of your findOperation function. +const findResult = stringzilla.find("hello world", "world"); +console.log(findResult); // Output will depend on the result of your findOperation function. + +const countResult = stringzilla.countSubstr("hello world", "world"); +console.log(countResult); // Output will depend on the result of your countSubstr function. + console.log('JavaScript tests passed!'); diff --git a/package-lock.json b/package-lock.json index e577ab31..38555f5c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "stringzilla", - "version": "1.2.0", + "version": "1.2.2", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "stringzilla", - "version": "1.2.0", + "version": "1.2.2", "license": "Apache 2.0", "dependencies": { "@types/node": "^20.4.5", From ca77b0acfe0246f4a286972dfac0b0d2c71d0e17 Mon Sep 17 00:00:00 2001 From: Nairi Harutyunyan Date: Thu, 21 Sep 2023 13:57:50 +0300 Subject: [PATCH 27/72] Add count_char function --- javascript/lib.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/javascript/lib.c b/javascript/lib.c index b2e097ff..75b3548a 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -52,6 +52,11 @@ napi_value FindAPI(napi_env env, napi_callback_info info) { return js_result; } +size_t count_char(strzl_haystack_t strzl_haystack, char needle) { + size_t result = strzl_naive_count_char(strzl_haystack, needle); + return result; +} + napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { size_t argc = 3; napi_value args[3]; @@ -83,7 +88,7 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { size_t result = 0; if (haystack_l == 1) - result = count_char(haystack, *needle); + result = count_char(strzl_haystack, needle); else if (haystack_l < needle_l) result = 0; else if (overlap) { From 8abb7624ffceea013ef9e59411dfaf65748ae13c Mon Sep 17 00:00:00 2001 From: Nairi Harutyunyan Date: Fri, 22 Sep 2023 00:46:15 +0300 Subject: [PATCH 28/72] Fix issues --- javascript/lib.c | 63 ++++++++++++++++++++++------------------------ javascript/test.js | 2 +- 2 files changed, 31 insertions(+), 34 deletions(-) diff --git a/javascript/lib.c b/javascript/lib.c index 75b3548a..b06e48f3 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -8,6 +8,7 @@ * * @see NodeJS docs: https://nodejs.org/api/n-api.html */ +#include #include #include @@ -17,20 +18,20 @@ napi_value FindAPI(napi_env env, napi_callback_info info) { napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - size_t str_size; - size_t str_len; + size_t haystack_l; + size_t needle_l; // For haystack - napi_get_value_string_utf8(env, args[0], NULL, 0, &str_size); - char *haystack = malloc(str_size + 1); - napi_get_value_string_utf8(env, args[0], haystack, str_size + 1, &str_len); - struct strzl_haystack_t strzl_haystack = {haystack, str_len}; + napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_l); + char *haystack = malloc(haystack_l + 1); + napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &needle_l); + struct strzl_haystack_t strzl_haystack = {haystack, needle_l}; // For needle - napi_get_value_string_utf8(env, args[1], NULL, 0, &str_size); - char *needle = malloc(str_size + 1); - napi_get_value_string_utf8(env, args[1], needle, str_size + 1, &str_len); - struct strzl_needle_t strzl_needle = {needle, str_len, 0}; + napi_get_value_string_utf8(env, args[1], NULL, 0, &haystack_l); + char *needle = malloc(haystack_l + 1); + napi_get_value_string_utf8(env, args[1], needle, haystack_l + 1, &needle_l); + struct strzl_needle_t strzl_needle = {needle, needle_l, 0}; // Perform the find operation #if defined(__AVX2__) @@ -54,6 +55,7 @@ napi_value FindAPI(napi_env env, napi_callback_info info) { size_t count_char(strzl_haystack_t strzl_haystack, char needle) { size_t result = strzl_naive_count_char(strzl_haystack, needle); + return result; } @@ -63,28 +65,24 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - size_t str_size; - size_t str_len; + size_t haystack_l; + size_t needle_l; // For haystack - napi_get_value_string_utf8(env, args[0], NULL, 0, &str_size); - char *haystack = malloc(str_size + 1); - napi_get_value_string_utf8(env, args[0], haystack, str_size + 1, &str_len); - struct strzl_haystack_t strzl_haystack = {haystack, str_len}; - + napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_l); + char *haystack = malloc(haystack_l + 1); + napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &needle_l); + struct strzl_haystack_t strzl_haystack = {haystack, needle_l}; // For needle - napi_get_value_string_utf8(env, args[1], NULL, 0, &str_size); - char *needle = malloc(str_size + 1); - napi_get_value_string_utf8(env, args[1], needle, str_size + 1, &str_len); - struct strzl_needle_t strzl_needle = {needle, str_len, 0}; + napi_get_value_string_utf8(env, args[1], NULL, 0, &haystack_l); + char *needle = malloc(haystack_l + 1); + napi_get_value_string_utf8(env, args[1], needle, haystack_l + 1, &needle_l); + struct strzl_needle_t strzl_needle = {needle, needle_l, 0}; bool overlap = false; napi_get_value_bool(env, args[2], &overlap); - size_t haystack_l = strlen(haystack); - size_t needle_l = strlen(needle); - size_t result = 0; if (haystack_l == 1) @@ -92,7 +90,7 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { else if (haystack_l < needle_l) result = 0; else if (overlap) { - while (strlen(haystack)) { + while (strzl_haystack.len) { #if defined(__AVX2__) size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle); #elif defined(__ARM_NEON) @@ -101,16 +99,15 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle); #endif - - bool found = offset != haystack_l; + bool found = offset != strzl_haystack.len; result += found; - haystack += offset + found; - haystack_l -= offset + found; + strzl_haystack.ptr += offset + found; + strzl_haystack.len -= offset + found; } } else { - while (haystack_l) { + while (strzl_haystack.len) { #if defined(__AVX2__) size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle); #elif defined(__ARM_NEON) @@ -119,10 +116,10 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle); #endif - bool found = offset != haystack_l; + bool found = offset != strzl_haystack.len; result += found; - haystack += offset + needle_l; - haystack_l -= offset + needle_l * found; + strzl_haystack.ptr += offset + needle_l; + strzl_haystack.len -= offset + needle_l * found; } } diff --git a/javascript/test.js b/javascript/test.js index 04ea7280..18ea11b2 100644 --- a/javascript/test.js +++ b/javascript/test.js @@ -4,7 +4,7 @@ var stringzilla = require('bindings')('stringzilla'); const findResult = stringzilla.find("hello world", "world"); console.log(findResult); // Output will depend on the result of your findOperation function. -const countResult = stringzilla.countSubstr("hello world", "world"); +const countResult = stringzilla.countSubstr("abababab", "aba", true); console.log(countResult); // Output will depend on the result of your countSubstr function. From c1953fade9563b8d6627d05662ee1025773b4558 Mon Sep 17 00:00:00 2001 From: Nairi Harutyunyan Date: Fri, 22 Sep 2023 00:53:29 +0300 Subject: [PATCH 29/72] remove stdio --- javascript/lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/javascript/lib.c b/javascript/lib.c index b06e48f3..830b577e 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -8,7 +8,7 @@ * * @see NodeJS docs: https://nodejs.org/api/n-api.html */ -#include + #include #include From 2758c3c15e42f89c3bfd202144690f44fd6b320c Mon Sep 17 00:00:00 2001 From: Nairi Harutyunyan Date: Fri, 22 Sep 2023 01:12:57 +0300 Subject: [PATCH 30/72] In JavaScript if find unable to find the specified value then it should return -1 --- javascript/lib.c | 52 +++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/javascript/lib.c b/javascript/lib.c index d00bf2cd..a092e242 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -8,38 +8,50 @@ * * @see NodeJS docs: https://nodejs.org/api/n-api.html */ + #include #include napi_value FindAPI(napi_env env, napi_callback_info info) { - size_t argc = 2; +size_t argc = 2; napi_value args[2]; napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - size_t str_size; - size_t str_len; + size_t haystack_l; + size_t needle_l; // For haystack - napi_get_value_string_utf8(env, args[0], NULL, 0, &str_size); - char *haystack = malloc(str_size + 1); - napi_get_value_string_utf8(env, args[0], haystack, str_size + 1, &str_len); - struct strzl_haystack_t strzl_haystack = {haystack, str_len}; + napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_l); + char *haystack = malloc(haystack_l + 1); + napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &needle_l); + struct strzl_haystack_t strzl_haystack = {haystack, needle_l}; // For needle - napi_get_value_string_utf8(env, args[1], NULL, 0, &str_size); - char *needle = malloc(str_size + 1); - napi_get_value_string_utf8(env, args[1], needle, str_size + 1, &str_len); - struct strzl_needle_t strzl_needle = {needle, str_len, 0}; - -// Perform the find operation -#if defined(__AVX2__) - uint64_t result = strzl_avx2_find_substr(strzl_haystack, strzl_needle); -#elif defined(__ARM_NEON) - uint64_t result = strzl_neon_find_substr(strzl_haystack, strzl_needle); -#else - uint64_t result = strzl_naive_find_substr(strzl_haystack, strzl_needle); -#endif + napi_get_value_string_utf8(env, args[1], NULL, 0, &haystack_l); + char *needle = malloc(haystack_l + 1); + napi_get_value_string_utf8(env, args[1], needle, haystack_l + 1, &needle_l); + struct strzl_needle_t strzl_needle = {needle, needle_l, 0}; + + // Perform the find operation + #if defined(__AVX2__) + uint64_t result = strzl_avx2_find_substr(strzl_haystack, strzl_needle); + #elif defined(__ARM_NEON) + uint64_t result = strzl_neon_find_substr(strzl_haystack, strzl_needle); + #else + uint64_t result = strzl_naive_find_substr(strzl_haystack, strzl_needle); + #endif + + // Restore length of haystack as it's lost + haystack_l = strlen(haystack); + + // In JavaScript if find unable to find the specified value then it should return -1 + if (haystack_l == (size_t)result) { + napi_value js_result; + napi_create_int32(env, -1, &js_result); + + return js_result; + } // Cleanup free(haystack); From c7eb66868603b3c0ac21a724b485d9987a01667a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 22 Sep 2023 09:57:36 +0400 Subject: [PATCH 31/72] Add: Shuffling method in Python --- python/lib.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/python/lib.c b/python/lib.c index 8459cc30..dbf41114 100644 --- a/python/lib.c +++ b/python/lib.c @@ -1351,6 +1351,74 @@ static PyTypeObject StrType = { .tp_as_number = &Str_as_number, }; +#pragma endregion + +#pragma regions Strs + +static PyObject *Strs_shuffle(Strs *self, PyObject *args, PyObject *kwargs) { + unsigned int seed = time(NULL); // Default seed + + // Check for positional arguments + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs > 1) { + PyErr_SetString(PyExc_TypeError, "shuffle() takes at most 1 positional argument"); + return NULL; + } + else if (nargs == 1) { + PyObject *seed_obj = PyTuple_GET_ITEM(args, 0); + if (!PyLong_Check(seed_obj)) { + PyErr_SetString(PyExc_TypeError, "The seed must be an integer"); + return NULL; + } + seed = PyLong_AsUnsignedLong(seed_obj); + } + + // Check for keyword arguments + if (kwargs) { + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(kwargs, &pos, &key, &value)) { + if (PyUnicode_CompareWithASCIIString(key, "seed") == 0) { + if (nargs == 1) { + PyErr_SetString(PyExc_TypeError, "Received seed both as positional and keyword argument"); + return NULL; + } + if (!PyLong_Check(value)) { + PyErr_SetString(PyExc_TypeError, "The seed must be an integer"); + return NULL; + } + seed = PyLong_AsUnsignedLong(value); + } + else { + PyErr_Format(PyExc_TypeError, "Received an unexpected keyword argument '%U'", key); + return NULL; + } + } + } + + // Change the layout + if (!prepare_strings_for_reordering(self)) { + PyErr_Format(PyExc_TypeError, "Failed to prepare the sequence for shuffling"); + return NULL; + } + + // Get the parts and their count + struct reordered_slices_t *reordered = &self->data.reordered; + sz_haystack_t *parts = reordered->parts; + size_t count = reordered->count; + + // Fisher-Yates Shuffle Algorithm + for (size_t i = count - 1; i > 0; --i) { + size_t j = rand() % (i + 1); + // Swap parts[i] and parts[j] + sz_haystack_t temp = parts[i]; + parts[i] = parts[j]; + parts[j] = temp; + } + + Py_RETURN_NONE; +} + static PySequenceMethods Strs_as_sequence = { .sq_length = Strs_len, // .sq_item = Strs_getitem, // @@ -1362,6 +1430,10 @@ static PyMappingMethods Strs_as_mapping = { .mp_subscript = Strs_subscript, // Is used to implement slices in Python }; +static PyMethodDef Strs_methods[] = { + {"shuffle", Strs_shuffle, sz_method_flags_m, "Shuffle the elements of the Strs object."}, // + {NULL, NULL, 0, NULL}}; + static PyTypeObject StrsType = { PyVarObject_HEAD_INIT(NULL, 0).tp_name = "stringzilla.Strs", .tp_doc = "Space-efficient container for large collections of strings and their slices", @@ -1369,6 +1441,7 @@ static PyTypeObject StrsType = { .tp_itemsize = 0, .tp_flags = Py_TPFLAGS_DEFAULT, .tp_new = PyType_GenericNew, + .tp_methods = Strs_methods, .tp_as_sequence = &Strs_as_sequence, .tp_as_mapping = &Strs_as_mapping, }; From 5a2f72d2267433c3696863d82dd93dd3a34dd0d4 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 22 Sep 2023 09:57:48 +0400 Subject: [PATCH 32/72] Make: Colorful diagnostics --- .vscode/settings.json | 1 + setup.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.vscode/settings.json b/.vscode/settings.json index 97c0113c..c32a469d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -149,6 +149,7 @@ "NOMINMAX", "NOTIMPLEMENTED", "pytest", + "Pythonic", "quadgram", "readlines", "releasebuffer", diff --git a/setup.py b/setup.py index cb136b1d..f667c0f2 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ compile_args.append("-O3") compile_args.append("-pedantic") compile_args.append("-Wno-unknown-pragmas") + compile_args.append("-fdiagnostics-color=always") compile_args.append("-fopenmp") link_args.append("-lgomp") @@ -40,6 +41,7 @@ compile_args.append("-Wno-unknown-pragmas") compile_args.append("-Wno-incompatible-function-pointer-types") compile_args.append("-Wno-incompatible-pointer-types") + compile_args.append("-fcolor-diagnostics") compile_args.append("-Xpreprocessor -fopenmp") link_args.append("-Xpreprocessor -lomp") From 577554cc818025c53c67fb2a7acacdaa031d5a3a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 22 Sep 2023 10:07:06 +0400 Subject: [PATCH 33/72] Make: `AlwaysBreakBeforeMultilineStrings` --- .clang-format | 1 - 1 file changed, 1 deletion(-) diff --git a/.clang-format b/.clang-format index ab9f350a..11877ff7 100644 --- a/.clang-format +++ b/.clang-format @@ -24,7 +24,6 @@ AllowShortLambdasOnASingleLine: true AllowShortLoopsOnASingleLine: false AlwaysBreakBeforeMultilineStrings: true AlwaysBreakTemplateDeclarations: Yes -AlwaysBreakBeforeMultilineStrings: true AlwaysBreakAfterReturnType: None PenaltyReturnTypeOnItsOwnLine: 200 From 021b29830911d50692500fd3fdcf77c634170514 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 22 Sep 2023 10:07:30 +0400 Subject: [PATCH 34/72] Fix: Memory leak and extra `strlen` calls --- javascript/lib.c | 93 ++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 50 deletions(-) diff --git a/javascript/lib.c b/javascript/lib.c index 282d7066..8991b37a 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -12,45 +12,34 @@ #include napi_value FindAPI(napi_env env, napi_callback_info info) { -size_t argc = 2; + size_t argc = 2; napi_value args[2]; napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - size_t haystack_l; - size_t needle_l; + struct strzl_haystack_t strzl_haystack = {NULL, 0}; + struct strzl_needle_t strzl_needle = {NULL, 0, 0}; // For haystack - napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_l); - char *haystack = malloc(haystack_l + 1); - napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &needle_l); - struct strzl_haystack_t strzl_haystack = {haystack, needle_l}; + napi_get_value_string_utf8(env, args[0], NULL, 0, &strzl_haystack.len); + char *haystack = malloc(strzl_haystack.len); + napi_get_value_string_utf8(env, args[0], haystack, strzl_haystack.len, &strzl_haystack.len); + strzl_haystack.ptr = haystack; // For needle - napi_get_value_string_utf8(env, args[1], NULL, 0, &haystack_l); - char *needle = malloc(haystack_l + 1); - napi_get_value_string_utf8(env, args[1], needle, haystack_l + 1, &needle_l); - struct strzl_needle_t strzl_needle = {needle, needle_l, 0}; - - // Perform the find operation - #if defined(__AVX2__) - uint64_t result = strzl_avx2_find_substr(strzl_haystack, strzl_needle); - #elif defined(__ARM_NEON) - uint64_t result = strzl_neon_find_substr(strzl_haystack, strzl_needle); - #else - uint64_t result = strzl_naive_find_substr(strzl_haystack, strzl_needle); - #endif - - // Restore length of haystack as it's lost - haystack_l = strlen(haystack); - - // In JavaScript if find unable to find the specified value then it should return -1 - if (haystack_l == (size_t)result) { - napi_value js_result; - napi_create_int32(env, -1, &js_result); - - return js_result; - } + napi_get_value_string_utf8(env, args[1], NULL, 0, &strzl_needle.len); + char *needle = malloc(strzl_needle.len); + napi_get_value_string_utf8(env, args[1], needle, strzl_needle.len, &strzl_needle.len); + strzl_needle.ptr = needle; + +// Perform the find operation +#if defined(__AVX2__) + uint64_t result = strzl_avx2_find_substr(strzl_haystack, strzl_needle); +#elif defined(__ARM_NEON) + uint64_t result = strzl_neon_find_substr(strzl_haystack, strzl_needle); +#else + uint64_t result = strzl_naive_find_substr(strzl_haystack, strzl_needle); +#endif // Cleanup free(haystack); @@ -58,7 +47,12 @@ size_t argc = 2; // Convert result to JavaScript BigInt and return napi_value js_result; - napi_create_bigint_uint64(env, result, &js_result); + + // In JavaScript if find unable to find the specified value then it should return -1 + if (result = strzl_haystack.len) + napi_create_bigint_int64(env, -1, &js_result); + else + napi_create_bigint_uint64(env, result, &js_result); return js_result; } @@ -101,13 +95,13 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { result = 0; else if (overlap) { while (strzl_haystack.len) { - #if defined(__AVX2__) - size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle); - #elif defined(__ARM_NEON) - size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle); - #else - size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle); - #endif +#if defined(__AVX2__) + size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle); +#elif defined(__ARM_NEON) + size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle); +#else + size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle); +#endif bool found = offset != strzl_haystack.len; result += found; @@ -118,13 +112,13 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { else { while (strzl_haystack.len) { - #if defined(__AVX2__) - size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle); - #elif defined(__ARM_NEON) - size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle); - #else - size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle); - #endif +#if defined(__AVX2__) + size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle); +#elif defined(__ARM_NEON) + size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle); +#else + size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle); +#endif bool found = offset != strzl_haystack.len; result += found; @@ -147,13 +141,13 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { napi_value Init(napi_env env, napi_value exports) { // Define the "find" property napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0}; - + // Define the "countSubstr" property napi_property_descriptor countSubstrDesc = {"countSubstr", 0, CountSubstrAPI, 0, 0, 0, napi_default, 0}; - + // Define an array of property descriptors napi_property_descriptor properties[] = {findDesc, countSubstrDesc}; - + // Define the number of properties in the array size_t propertyCount = sizeof(properties) / sizeof(properties[0]); @@ -164,4 +158,3 @@ napi_value Init(napi_env env, napi_value exports) { } NAPI_MODULE(NODE_GYP_MODULE_NAME, Init) - From 880f4371fccc6c925e42b514c302696496638862 Mon Sep 17 00:00:00 2001 From: Nairi Harutyunyan Date: Sat, 23 Sep 2023 00:51:45 +0300 Subject: [PATCH 35/72] Improvements, add test cases --- .vscode/settings.json | 3 ++- javascript/lib.c | 11 +++++---- javascript/test.js | 11 --------- javascript/test/countSubstr.js | 44 ++++++++++++++++++++++++++++++++++ javascript/test/find.js | 28 ++++++++++++++++++++++ package.json | 3 ++- 6 files changed, 82 insertions(+), 18 deletions(-) delete mode 100644 javascript/test.js create mode 100644 javascript/test/countSubstr.js create mode 100644 javascript/test/find.js diff --git a/.vscode/settings.json b/.vscode/settings.json index 3ebc8b24..5bb0127a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -116,7 +116,8 @@ "stop_token": "cpp", "__verbose_abort": "cpp", "strstream": "cpp", - "filesystem": "cpp" + "filesystem": "cpp", + "stringzilla.h": "c" }, "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", "cSpell.words": [ diff --git a/javascript/lib.c b/javascript/lib.c index 8991b37a..0a2ee655 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -8,6 +8,7 @@ * * @see NodeJS docs: https://nodejs.org/api/n-api.html */ + #include #include @@ -49,7 +50,7 @@ napi_value FindAPI(napi_env env, napi_callback_info info) { napi_value js_result; // In JavaScript if find unable to find the specified value then it should return -1 - if (result = strzl_haystack.len) + if (result == 0) napi_create_bigint_int64(env, -1, &js_result); else napi_create_bigint_uint64(env, result, &js_result); @@ -75,8 +76,8 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { // For haystack napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_l); char *haystack = malloc(haystack_l + 1); - napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &needle_l); - struct strzl_haystack_t strzl_haystack = {haystack, needle_l}; + napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &haystack_l); + struct strzl_haystack_t strzl_haystack = {haystack, haystack_l}; // For needle napi_get_value_string_utf8(env, args[1], NULL, 0, &haystack_l); @@ -89,8 +90,8 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { size_t result = 0; - if (haystack_l == 1) - result = count_char(strzl_haystack, needle); + if (needle_l == 1 || needle_l == 0) + result = count_char(strzl_haystack, needle[0]); else if (haystack_l < needle_l) result = 0; else if (overlap) { diff --git a/javascript/test.js b/javascript/test.js deleted file mode 100644 index 18ea11b2..00000000 --- a/javascript/test.js +++ /dev/null @@ -1,11 +0,0 @@ -var assert = require('assert'); -var stringzilla = require('bindings')('stringzilla'); - -const findResult = stringzilla.find("hello world", "world"); -console.log(findResult); // Output will depend on the result of your findOperation function. - -const countResult = stringzilla.countSubstr("abababab", "aba", true); -console.log(countResult); // Output will depend on the result of your countSubstr function. - - -console.log('JavaScript tests passed!'); diff --git a/javascript/test/countSubstr.js b/javascript/test/countSubstr.js new file mode 100644 index 00000000..973ba541 --- /dev/null +++ b/javascript/test/countSubstr.js @@ -0,0 +1,44 @@ +import test from 'node:test'; +import bindings from 'bindings'; +import assert from 'node:assert'; + +const stringzilla = bindings('stringzilla'); + +test('Count Words - Single Occurrence', () => { + const result = stringzilla.countSubstr('hello world', 'world'); + + assert.strictEqual(result, 1n); +}); + +test('Count Words - Multiple Occurrence', () => { + const result = stringzilla.countSubstr('hello world, hello John', 'hello'); + + assert.strictEqual(result, 2n); +}); + +test('Count Words - Multiple Occurrences with Overlap Test', () => { + const result_1 = stringzilla.countSubstr('abababab', 'aba'); + + assert.strictEqual(result_1, 2n); + + const result_2 = stringzilla.countSubstr('abababab', 'aba', true); + + assert.strictEqual(result_2, 3n); +}); + +test('Count Words - No Occurrence', () => { + const result = stringzilla.countSubstr('hello world', 'hi'); + + assert.strictEqual(result, 0n); +}); + +test('Count Words - Empty String Inputs', () => { + const result_1 = stringzilla.countSubstr('hello world', ''); + assert.strictEqual(result_1, 0n); + + const result_2 = stringzilla.countSubstr('', 'hi'); + assert.strictEqual(result_2, 0n); + + const result_3 = stringzilla.countSubstr('', ''); + assert.strictEqual(result_3, 0n); +}); diff --git a/javascript/test/find.js b/javascript/test/find.js new file mode 100644 index 00000000..f0f1ea45 --- /dev/null +++ b/javascript/test/find.js @@ -0,0 +1,28 @@ +import test from 'node:test'; +import bindings from 'bindings'; +import assert from 'node:assert'; + +const stringzilla = bindings('stringzilla'); + +test('Find Word in Text - Positive Case', () => { + const result = stringzilla.find('hello world, hello john', 'world'); + + assert.strictEqual(result, 6n); +}); + +test('Find Word in Text - Negative Case (Word Not Found)', () => { + const result = stringzilla.find('hello world', 'hi'); + + assert.strictEqual(result, -1n); +}); + +test('Find Word in Text - Negative Case (Empty String Inputs)', () => { + const result_1 = stringzilla.find('hello world', ''); + assert.strictEqual(result_1, -1n); + + const result_2 = stringzilla.find('', 'a'); + assert.strictEqual(result_2, -1n); + + const result_3 = stringzilla.find('', ''); + assert.strictEqual(result_2, -1n); +}); diff --git a/package.json b/package.json index a1bab16c..e7ff8597 100644 --- a/package.json +++ b/package.json @@ -5,6 +5,7 @@ "author": "Ash Vardanian", "license": "Apache 2.0", "main": "javascript/stringzilla.js", + "type": "module", "repository": { "type": "git", "url": "https://github.com/ashvardanian/stringzilla.git" @@ -19,7 +20,7 @@ "node-addon-api": "^3.0.0" }, "scripts": { - "test": "node javascript/test.js" + "test": "node --test ./javascript/test" }, "devDependencies": { "@semantic-release/exec": "^6.0.3", From 5b395968bbedbd8e2f65cf3139a4d3c7a2545027 Mon Sep 17 00:00:00 2001 From: Nairi Harutyunyan Date: Sat, 23 Sep 2023 01:10:20 +0300 Subject: [PATCH 36/72] Fix condition in find function --- javascript/lib.c | 2 +- javascript/test/find.js | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/javascript/lib.c b/javascript/lib.c index 0a2ee655..616a3e81 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -50,7 +50,7 @@ napi_value FindAPI(napi_env env, napi_callback_info info) { napi_value js_result; // In JavaScript if find unable to find the specified value then it should return -1 - if (result == 0) + if (result == strzl_haystack.len) napi_create_bigint_int64(env, -1, &js_result); else napi_create_bigint_uint64(env, result, &js_result); diff --git a/javascript/test/find.js b/javascript/test/find.js index f0f1ea45..cd2a800d 100644 --- a/javascript/test/find.js +++ b/javascript/test/find.js @@ -5,20 +5,22 @@ import assert from 'node:assert'; const stringzilla = bindings('stringzilla'); test('Find Word in Text - Positive Case', () => { - const result = stringzilla.find('hello world, hello john', 'world'); + const result = stringzilla.find('hello world, hello john', 'hello'); - assert.strictEqual(result, 6n); + assert.strictEqual(result, 0n); }); test('Find Word in Text - Negative Case (Word Not Found)', () => { - const result = stringzilla.find('hello world', 'hi'); + const result_1 = stringzilla.find('ha', 'aaa'); + assert.strictEqual(result_1, -1n); - assert.strictEqual(result, -1n); + const result_2 = stringzilla.find('g', 'a'); + assert.strictEqual(result_2, -1n); }); test('Find Word in Text - Negative Case (Empty String Inputs)', () => { const result_1 = stringzilla.find('hello world', ''); - assert.strictEqual(result_1, -1n); + assert.strictEqual(result_1, 0n); const result_2 = stringzilla.find('', 'a'); assert.strictEqual(result_2, -1n); From 1a6a8e496f446a0f450c5f955967b0ef64c225b2 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 23 Sep 2023 08:44:07 +0100 Subject: [PATCH 37/72] Improve: Use less temp. variables to count matches --- javascript/lib.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/javascript/lib.c b/javascript/lib.c index 616a3e81..c2098a08 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -46,10 +46,10 @@ napi_value FindAPI(napi_env env, napi_callback_info info) { free(haystack); free(needle); - // Convert result to JavaScript BigInt and return + // Convert the result to JavaScript BigInt and return napi_value js_result; - // In JavaScript if find unable to find the specified value then it should return -1 + // In JavaScript, if `find` is unable to find the specified value, then it should return -1 if (result == strzl_haystack.len) napi_create_bigint_int64(env, -1, &js_result); else @@ -70,30 +70,30 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - size_t haystack_l; - size_t needle_l; + struct strzl_haystack_t strzl_haystack = {NULL, 0}; + struct strzl_needle_t strzl_needle = {NULL, 0, 0}; // For haystack - napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_l); - char *haystack = malloc(haystack_l + 1); - napi_get_value_string_utf8(env, args[0], haystack, haystack_l + 1, &haystack_l); - struct strzl_haystack_t strzl_haystack = {haystack, haystack_l}; + napi_get_value_string_utf8(env, args[0], NULL, 0, &strzl_haystack.len); + char *haystack = malloc(strzl_haystack.len); + napi_get_value_string_utf8(env, args[0], haystack, strzl_haystack.len, &strzl_haystack.len); + strzl_haystack.ptr = haystack; // For needle - napi_get_value_string_utf8(env, args[1], NULL, 0, &haystack_l); - char *needle = malloc(haystack_l + 1); - napi_get_value_string_utf8(env, args[1], needle, haystack_l + 1, &needle_l); - struct strzl_needle_t strzl_needle = {needle, needle_l, 0}; + napi_get_value_string_utf8(env, args[1], NULL, 0, &strzl_needle.len); + char *needle = malloc(strzl_needle.len); + napi_get_value_string_utf8(env, args[1], needle, strzl_needle.len, &strzl_needle.len); + strzl_needle.ptr = needle; bool overlap = false; napi_get_value_bool(env, args[2], &overlap); - size_t result = 0; + size_t result; - if (needle_l == 1 || needle_l == 0) - result = count_char(strzl_haystack, needle[0]); - else if (haystack_l < needle_l) + if (strzl_needle.len == 0 || strzl_haystack.len == 0 || strzl_haystack.len < strzl_needle.len) { result = 0; + else if (strzl_needle.len == 1) + result = count_char(strzl_haystack, strzl_needle.ptr[0]); else if (overlap) { while (strzl_haystack.len) { #if defined(__AVX2__) @@ -123,8 +123,8 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { bool found = offset != strzl_haystack.len; result += found; - strzl_haystack.ptr += offset + needle_l; - strzl_haystack.len -= offset + needle_l * found; + strzl_haystack.ptr += offset + strzl_needle.len; + strzl_haystack.len -= offset + strzl_needle.len * found; } } @@ -132,7 +132,7 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { free(haystack); free(needle); - // Convert result to JavaScript BigInt and return + // Convert the result to JavaScript `BigInt` and return napi_value js_result; napi_create_bigint_uint64(env, result, &js_result); @@ -152,7 +152,7 @@ napi_value Init(napi_env env, napi_value exports) { // Define the number of properties in the array size_t propertyCount = sizeof(properties) / sizeof(properties[0]); - // Define the properties on the exports object + // Define the properties on the `exports` object napi_define_properties(env, exports, propertyCount, properties); return exports; From 98fb43b4d5074cb104e0dd55e285485b2f3ba69d Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 23 Sep 2023 23:40:35 +0400 Subject: [PATCH 38/72] Make: Deduplicate `.clang-format` settings --- .clang-format | 1 - 1 file changed, 1 deletion(-) diff --git a/.clang-format b/.clang-format index b1adf3b0..e0f25893 100644 --- a/.clang-format +++ b/.clang-format @@ -22,7 +22,6 @@ AllowShortFunctionsOnASingleLine: true AllowShortIfStatementsOnASingleLine: Always AllowShortLambdasOnASingleLine: true AllowShortLoopsOnASingleLine: true -AlwaysBreakBeforeMultilineStrings: true AlwaysBreakTemplateDeclarations: Yes AlwaysBreakBeforeMultilineStrings: true AlwaysBreakAfterReturnType: None From 48213a85cb595184da30ecbddccf899bc58ee625 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 23 Sep 2023 23:41:00 +0400 Subject: [PATCH 39/72] Make: Add NumPy dependency --- .vscode/settings.json | 2 ++ pyproject.toml | 2 +- setup.py | 4 +++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index c32a469d..36d0a490 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -131,6 +131,7 @@ "getitem", "getslice", "initproc", + "intp", "itemsize", "keeplinebreaks", "keepseparator", @@ -148,6 +149,7 @@ "NOARGS", "NOMINMAX", "NOTIMPLEMENTED", + "numpy", "pytest", "Pythonic", "quadgram", diff --git a/pyproject.toml b/pyproject.toml index fe8221c7..e12df96a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=42", "wheel", "cmake>=3.22"] +requires = ["setuptools>=42", "wheel", "cmake>=3.22", "numpy"] build-backend = "setuptools.build_meta" [tool.pytest.ini_options] diff --git a/setup.py b/setup.py index f667c0f2..1b8d83ce 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,7 @@ import platform from setuptools import setup, Extension +import numpy as np compile_args = [] link_args = [] @@ -54,7 +55,7 @@ Extension( "stringzilla", ["python/lib.c"], - include_dirs=["stringzilla"], + include_dirs=["stringzilla", np.get_include()], extra_compile_args=compile_args, extra_link_args=link_args, define_macros=macros_args, @@ -98,5 +99,6 @@ "Topic :: Text Processing :: Indexing", ], include_dirs=[], + setup_requires=["numpy"], ext_modules=ext_modules, ) From d67a00554da060bf5dfa04048a8d3269a2e3bcfe Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 23 Sep 2023 23:41:58 +0400 Subject: [PATCH 40/72] Add: `sort()` and `order()` efficient Py methods --- README.md | 6 +- python/lib.c | 360 ++++++++++++++++++++++++++++------ scripts/test.cpp | 12 +- scripts/test.py | 395 +++++++++++++++++++------------------- stringzilla/stringzilla.h | 57 ++++-- 5 files changed, 543 insertions(+), 287 deletions(-) diff --git a/README.md b/README.md index 57197cb7..818c8879 100644 --- a/README.md +++ b/README.md @@ -119,9 +119,9 @@ sz_haystack_t haystack = {your_text, your_text_length}; sz_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset}; // Perform string-level operations -size_t character_count = sz_naive_count_char(haystack, 'a'); -size_t character_position = sz_naive_find_char(haystack, 'a'); -size_t substring_position = sz_naive_find_substr(haystack, needle); +size_t character_count = sz_count_char_swar(haystack, 'a'); +size_t character_position = sz_find_char_swar(haystack, 'a'); +size_t substring_position = sz_find_substr_swar(haystack, needle); // Perform collection level operations sz_sequence_t array = {your_order, your_count, your_get_start, your_get_length, your_handle}; diff --git a/python/lib.c b/python/lib.c index dbf41114..c1e27113 100644 --- a/python/lib.c +++ b/python/lib.c @@ -22,7 +22,9 @@ typedef SSIZE_T ssize_t; #include // `ssize_t` #endif -#include +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include // Core CPython interfaces +#include // NumPy #include @@ -151,6 +153,47 @@ typedef struct { #pragma region Helpers +typedef int boolean_t; + +inline static char const *haystacks_get_start(sz_haystack_t const *parts, sz_size_t i) { return parts[i].start; } +inline static size_t haystacks_get_length(sz_haystack_t const *parts, sz_size_t i) { return parts[i].length; } + +void reverse_offsets(sz_size_t *array, size_t length) { + size_t i, j; + // Swap array[i] and array[j] + for (i = 0, j = length - 1; i < j; i++, j--) { + sz_size_t temp = array[i]; + array[i] = array[j]; + array[j] = temp; + } +} + +void reverse_haystacks(sz_haystack_t *array, size_t length) { + size_t i, j; + // Swap array[i] and array[j] + for (i = 0, j = length - 1; i < j; i++, j--) { + sz_haystack_t temp = array[i]; + array[i] = array[j]; + array[j] = temp; + } +} + +void apply_order(sz_haystack_t *array, sz_size_t *order, size_t length) { + for (size_t i = 0; i < length; ++i) { + while (order[i] != i) { + // Swap array[i] and array[order[i]] + sz_haystack_t temp = array[i]; + array[i] = array[order[i]]; + array[order[i]] = temp; + + // Also update the order array to reflect the swap + size_t temp_idx = order[i]; + order[i] = order[temp_idx]; + order[temp_idx] = temp_idx; + } + } +} + void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset, size_t *normalized_length) { // clang-format off @@ -172,7 +215,7 @@ void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset, *normalized_length = end - start; } -int export_string_like(PyObject *object, char const **start, size_t *length) { +boolean_t export_string_like(PyObject *object, char const **start, size_t *length) { if (PyUnicode_Check(object)) { // Handle Python str Py_ssize_t signed_length; @@ -205,61 +248,86 @@ int export_string_like(PyObject *object, char const **start, size_t *length) { return 0; } -int get_string_at_offset( +typedef void (*get_string_at_offset_t)(Strs *, Py_ssize_t, Py_ssize_t, PyObject **, char const **, size_t *); + +void str_at_offset_consecutive_32bit( Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) { + uint32_t start_offset = (i == 0) ? 0 : strs->data.consecutive_32bit.end_offsets[i - 1]; + uint32_t end_offset = strs->data.consecutive_32bit.end_offsets[i]; + *start = strs->data.consecutive_32bit.start + start_offset; + *length = end_offset - start_offset - strs->data.consecutive_32bit.separator_length * (i + 1 != count); + *parent = strs->data.consecutive_32bit.parent; +} + +void str_at_offset_consecutive_64bit( + Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) { + uint64_t start_offset = (i == 0) ? 0 : strs->data.consecutive_64bit.end_offsets[i - 1]; + uint64_t end_offset = strs->data.consecutive_64bit.end_offsets[i]; + *start = strs->data.consecutive_64bit.start + start_offset; + *length = end_offset - start_offset - strs->data.consecutive_64bit.separator_length * (i + 1 != count); + *parent = strs->data.consecutive_64bit.parent; +} + +void str_at_offset_reordered( + Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) { + *start = strs->data.reordered.parts[i].start; + *length = strs->data.reordered.parts[i].length; + *parent = strs->data.reordered.parent; +} + +void str_at_offset_multi_source( + Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) { + *start = strs->data.multi_source.parts[i].start; + *length = strs->data.multi_source.parts[i].length; + *parent = NULL; // TODO: +} + +get_string_at_offset_t str_at_offset_getter(Strs *strs) { switch (strs->type) { - case STRS_CONSECUTIVE_32: { - uint32_t start_offset = (i == 0) ? 0 : strs->data.consecutive_32bit.end_offsets[i - 1]; - uint32_t end_offset = strs->data.consecutive_32bit.end_offsets[i]; - *start = strs->data.consecutive_32bit.start + start_offset; - *length = end_offset - start_offset - strs->data.consecutive_32bit.separator_length * (i + 1 != count); - *parent = strs->data.consecutive_32bit.parent; - return 1; - } - case STRS_CONSECUTIVE_64: { - uint64_t start_offset = (i == 0) ? 0 : strs->data.consecutive_64bit.end_offsets[i - 1]; - uint64_t end_offset = strs->data.consecutive_64bit.end_offsets[i]; - *start = strs->data.consecutive_64bit.start + start_offset; - *length = end_offset - start_offset - strs->data.consecutive_64bit.separator_length * (i + 1 != count); - *parent = strs->data.consecutive_64bit.parent; - return 1; - } - case STRS_REORDERED: { - // - return 1; - } - case STRS_MULTI_SOURCE: { - // - return 1; - } + case STRS_CONSECUTIVE_32: return str_at_offset_consecutive_32bit; + case STRS_CONSECUTIVE_64: return str_at_offset_consecutive_64bit; + case STRS_REORDERED: return str_at_offset_reordered; + case STRS_MULTI_SOURCE: return str_at_offset_multi_source; default: // Unsupported type PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion"); - return -1; + return NULL; } } -int prepare_strings_for_reordering(Strs *strs) { - // Already in reordered form - if (strs->type == STRS_REORDERED) { return 1; } +boolean_t prepare_strings_for_reordering(Strs *strs) { // Allocate memory for reordered slices size_t count = 0; + void *old_buffer = NULL; + get_string_at_offset_t getter = NULL; + PyObject *parent = NULL; switch (strs->type) { - case STRS_CONSECUTIVE_32: count = strs->data.consecutive_32bit.count; break; - case STRS_CONSECUTIVE_64: count = strs->data.consecutive_64bit.count; break; + case STRS_CONSECUTIVE_32: + count = strs->data.consecutive_32bit.count; + old_buffer = strs->data.consecutive_32bit.end_offsets; + parent = strs->data.consecutive_32bit.parent; + getter = str_at_offset_consecutive_32bit; + break; + case STRS_CONSECUTIVE_64: + count = strs->data.consecutive_64bit.count; + old_buffer = strs->data.consecutive_64bit.end_offsets; + parent = strs->data.consecutive_64bit.parent; + getter = str_at_offset_consecutive_64bit; + break; + // Already in reordered form case STRS_REORDERED: return 1; case STRS_MULTI_SOURCE: return 1; default: // Unsupported type PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion"); - return -1; + return 0; } sz_haystack_t *new_parts = (sz_haystack_t *)malloc(count * sizeof(sz_haystack_t)); if (new_parts == NULL) { PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for reordered slices"); - return -1; + return 0; } // Populate the new reordered array using get_string_at_offset @@ -267,26 +335,20 @@ int prepare_strings_for_reordering(Strs *strs) { PyObject *parent; char const *start; size_t length; - if (!get_string_at_offset(strs, i, count, &parent, &start, &length)) { - // Handle error - PyErr_SetString(PyExc_RuntimeError, "Failed to get string at offset"); - free(new_parts); - return -1; - } - + getter(strs, i, count, &parent, &start, &length); new_parts[i].start = start; new_parts[i].length = length; } // Release previous used memory. + if (old_buffer) free(old_buffer); // Update the Strs object strs->type = STRS_REORDERED; strs->data.reordered.count = count; strs->data.reordered.parts = new_parts; - strs->data.reordered.parent = NULL; // Assuming the parent is no longer needed - - return 0; + strs->data.reordered.parent = parent; + return 1; } #pragma endregion @@ -603,7 +665,7 @@ static int Str_in(Str *self, PyObject *arg) { sz_haystack_t haystack; haystack.start = self->start; haystack.length = self->length; - size_t position = sz_neon_find_substr(haystack, needle_struct); + size_t position = sz_find_substr_auto(haystack, needle_struct); return position != haystack.length; } @@ -629,20 +691,23 @@ static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) { PyObject *parent = NULL; char const *start = NULL; size_t length = 0; - if (!get_string_at_offset(self, i, count, &parent, &start, &length)) { + get_string_at_offset_t getter = str_at_offset_getter(self); + if (!getter) { PyErr_SetString(PyExc_TypeError, "Unknown Strs kind"); return NULL; } + else + getter(self, i, count, &parent, &start, &length); // Create a new `Str` object - Str *parent_slice = (Str *)StrType.tp_alloc(&StrType, 0); - if (parent_slice == NULL && PyErr_NoMemory()) return NULL; + Str *view_copy = (Str *)StrType.tp_alloc(&StrType, 0); + if (view_copy == NULL && PyErr_NoMemory()) return NULL; - parent_slice->start = start; - parent_slice->length = length; - parent_slice->parent = parent; + view_copy->start = start; + view_copy->length = length; + view_copy->parent = parent; Py_INCREF(parent); - return parent_slice; + return view_copy; } static PyObject *Strs_subscript(Str *self, PyObject *key) { @@ -754,7 +819,7 @@ static int Str_find_( // haystack.length = normalized_length; // Perform contains operation - size_t offset = sz_neon_find_substr(haystack, needle); + size_t offset = sz_find_substr_auto(haystack, needle); if (offset == haystack.length) { *offset_out = -1; } else { *offset_out = (Py_ssize_t)offset; } @@ -881,11 +946,11 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { haystack.start += normalized_offset; haystack.length = normalized_length; - size_t count = needle.length == 1 ? sz_naive_count_char(haystack, *needle.start) : 0; + size_t count = needle.length == 1 ? sz_count_char_swar(haystack, *needle.start) : 0; if (needle.length != 1) { if (allowoverlap) { while (haystack.length) { - size_t offset = sz_neon_find_substr(haystack, needle); + size_t offset = sz_find_substr_auto(haystack, needle); int found = offset != haystack.length; count += found; haystack.start += offset + found; @@ -894,7 +959,7 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { } else { while (haystack.length) { - size_t offset = sz_neon_find_substr(haystack, needle); + size_t offset = sz_find_substr_auto(haystack, needle); int found = offset != haystack.length; count += found; haystack.start += offset + needle.length; @@ -943,6 +1008,7 @@ static PyObject *Str_levenstein(PyObject *self, PyObject *args, PyObject *kwargs return NULL; } + // Allocate memory for the Levenstein matrix size_t memory_needed = sz_levenstein_memory_needed(str1.length, str2.length); if (temporary_memory.length < memory_needed) { temporary_memory.start = realloc(temporary_memory.start, memory_needed); @@ -1075,11 +1141,11 @@ static Strs *Str_split_( // Iterate through string, keeping track of the sz_size_t last_start = 0; - while (last_start < text.length && offsets_count < maxsplit) { + while (last_start <= text.length && offsets_count < maxsplit) { sz_haystack_t text_remaining; text_remaining.start = text.start + last_start; text_remaining.length = text.length - last_start; - sz_size_t offset_in_remaining = sz_neon_find_substr(text_remaining, separator); + sz_size_t offset_in_remaining = sz_find_substr_auto(text_remaining, separator); // Reallocate offsets array if needed if (offsets_count >= offsets_capacity) { @@ -1419,6 +1485,176 @@ static PyObject *Strs_shuffle(Strs *self, PyObject *args, PyObject *kwargs) { Py_RETURN_NONE; } +static boolean_t Strs_sort_(Strs *self, + sz_haystack_t **parts_output, + sz_size_t **order_output, + sz_size_t *count_output) { + + // Change the layout + if (!prepare_strings_for_reordering(self)) { + PyErr_Format(PyExc_TypeError, "Failed to prepare the sequence for sorting"); + return 0; + } + + // Get the parts and their count + sz_haystack_t *parts = NULL; + size_t count = 0; + switch (self->type) { + case STRS_REORDERED: + parts = self->data.reordered.parts; + count = self->data.reordered.count; + break; + + case STRS_MULTI_SOURCE: + parts = self->data.multi_source.parts; + count = self->data.multi_source.count; + break; + } + + // Allocate temporary memory to store the ordering offsets + size_t memory_needed = sizeof(sz_size_t) * count; + if (temporary_memory.length < memory_needed) { + temporary_memory.start = realloc(temporary_memory.start, memory_needed); + temporary_memory.length = memory_needed; + } + if (!temporary_memory.start) { + PyErr_Format(PyExc_MemoryError, "Unable to allocate memory for the Levenshtein matrix"); + return 0; + } + + // Call our sorting algorithm + sz_sequence_t sequence = {}; + sz_sort_config_t sort_config = {}; + sequence.order = (sz_size_t *)temporary_memory.start; + sequence.count = count; + sequence.handle = parts; + sequence.get_start = haystacks_get_start; + sequence.get_length = haystacks_get_length; + for (sz_size_t i = 0; i != sequence.count; ++i) sequence.order[i] = i; + sz_sort(&sequence, &sort_config); + + // Export results + *parts_output = parts; + *order_output = sequence.order; + *count_output = sequence.count; + return 1; +} + +static PyObject *Strs_sort(Strs *self, PyObject *args, PyObject *kwargs) { + PyObject *reverse_obj = NULL; // Default is not reversed + + // Check for positional arguments + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs > 1) { + PyErr_SetString(PyExc_TypeError, "sort() takes at most 1 positional argument"); + return NULL; + } + else if (nargs == 1) { reverse_obj = PyTuple_GET_ITEM(args, 0); } + + // Check for keyword arguments + if (kwargs) { + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(kwargs, &pos, &key, &value)) { + if (PyUnicode_CompareWithASCIIString(key, "reverse") == 0) { + if (reverse_obj) { + PyErr_SetString(PyExc_TypeError, "Received reverse both as positional and keyword argument"); + return NULL; + } + reverse_obj = value; + } + else { + PyErr_Format(PyExc_TypeError, "Received an unexpected keyword argument '%U'", key); + return NULL; + } + } + } + + boolean_t reverse = 0; // Default is False + if (reverse_obj) { + if (!PyBool_Check(reverse_obj)) { + PyErr_SetString(PyExc_TypeError, "The reverse must be a boolean"); + return NULL; + } + reverse = PyObject_IsTrue(reverse_obj); + } + + sz_haystack_t *parts = NULL; + sz_size_t *order = NULL; + sz_size_t *count = NULL; + if (!Strs_sort_(self, &parts, &order, &count)) return NULL; + + // Apply the sorting algorithm here, considering the `reverse` value + if (reverse) reverse_offsets(order, count); + + // Apply the new order. + apply_order(parts, order, count); + + Py_RETURN_NONE; +} + +static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) { + PyObject *reverse_obj = NULL; // Default is not reversed + + // Check for positional arguments + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs > 1) { + PyErr_SetString(PyExc_TypeError, "order() takes at most 1 positional argument"); + return NULL; + } + else if (nargs == 1) { reverse_obj = PyTuple_GET_ITEM(args, 0); } + + // Check for keyword arguments + if (kwargs) { + PyObject *key, *value; + Py_ssize_t pos = 0; + while (PyDict_Next(kwargs, &pos, &key, &value)) { + if (PyUnicode_CompareWithASCIIString(key, "reverse") == 0) { + if (reverse_obj) { + PyErr_SetString(PyExc_TypeError, "Received reverse both as positional and keyword argument"); + return NULL; + } + reverse_obj = value; + } + else { + PyErr_Format(PyExc_TypeError, "Received an unexpected keyword argument '%U'", key); + return NULL; + } + } + } + + boolean_t reverse = 0; // Default is False + if (reverse_obj) { + if (!PyBool_Check(reverse_obj)) { + PyErr_SetString(PyExc_TypeError, "The reverse must be a boolean"); + return NULL; + } + reverse = PyObject_IsTrue(reverse_obj); + } + + sz_haystack_t *parts = NULL; + sz_size_t *order = NULL; + sz_size_t count = NULL; + if (!Strs_sort_(self, &parts, &order, &count)) return NULL; + + // Apply the sorting algorithm here, considering the `reverse` value + if (reverse) reverse_offsets(order, count); + + // Here, instead of applying the order, we want to return the copy of the + // order as a NumPy array of 64-bit unsigned integers. + npy_intp numpy_size = count; + PyObject *array = PyArray_SimpleNew(1, &numpy_size, NPY_UINT64); + if (!array) { + PyErr_SetString(PyExc_RuntimeError, "Failed to create a NumPy array"); + return NULL; + } + + // Copy the data from the order array to the newly created NumPy array + sz_size_t *numpy_data_ptr = (sz_size_t *)PyArray_DATA((PyArrayObject *)array); + memcpy(numpy_data_ptr, order, count * sizeof(sz_size_t)); + return array; +} + static PySequenceMethods Strs_as_sequence = { .sq_length = Strs_len, // .sq_item = Strs_getitem, // @@ -1431,7 +1667,9 @@ static PyMappingMethods Strs_as_mapping = { }; static PyMethodDef Strs_methods[] = { - {"shuffle", Strs_shuffle, sz_method_flags_m, "Shuffle the elements of the Strs object."}, // + {"shuffle", Strs_shuffle, sz_method_flags_m, "Shuffle the elements of the Strs object."}, // + {"sort", Strs_sort, sz_method_flags_m, "Sort the elements of the Strs object."}, // + {"order", Strs_order, sz_method_flags_m, "Provides the indexes to achieve sorted order."}, // {NULL, NULL, 0, NULL}}; static PyTypeObject StrsType = { @@ -1482,6 +1720,8 @@ static PyModuleDef stringzilla_module = { PyMODINIT_FUNC PyInit_stringzilla(void) { PyObject *m; + import_array(); + if (PyType_Ready(&StrType) < 0) return NULL; if (PyType_Ready(&FileType) < 0) return NULL; if (PyType_Ready(&StrsType) < 0) return NULL; diff --git a/scripts/test.cpp b/scripts/test.cpp index 1cf34bb2..e2c83d1b 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -238,23 +238,23 @@ int main(int, char const **) { bench_search("std::search", full_text, [&]() { return std::search(full_text.begin(), full_text.end(), needle.begin(), needle.end()) - full_text.begin(); }); - bench_search("sz_naive_find_substr", full_text, [&]() { + bench_search("sz_find_substr_swar", full_text, [&]() { sz_haystack_t h {full_text.data(), full_text.size()}; sz_needle_t n {needle.data(), needle.size()}; - return sz_naive_find_substr(h, n); + return sz_find_substr_swar(h, n); }); #if defined(__ARM_NEON) - bench_search("sz_neon_find_substr", full_text, [&]() { + bench_search("sz_find_substr_neon", full_text, [&]() { sz_haystack_t h {full_text.data(), full_text.size()}; sz_needle_t n {needle.data(), needle.size()}; - return sz_neon_find_substr(h, n); + return sz_find_substr_neon(h, n); }); #endif #if defined(__AVX2__) - bench_search("sz_avx2_find_substr", full_text, [&]() { + bench_search("sz_find_substr_avx2", full_text, [&]() { sz_haystack_t h {full_text.data(), full_text.size()}; sz_needle_t n {needle.data(), needle.size()}; - return sz_avx2_find_substr(h, n); + return sz_find_substr_avx2(h, n); }); #endif } diff --git a/scripts/test.py b/scripts/test.py index b9083ea6..14b6e9e7 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -6,55 +6,23 @@ import pytest import stringzilla as sz -from stringzilla import Str +from stringzilla import Str, Strs -def test_globals(): - assert sz.find("abcdef", "bcdef") == 1 - assert sz.find("abcdef", "x") == -1 - - assert sz.count("abcdef", "x") == 0 - assert sz.count("aaaaa", "a") == 5 - assert sz.count("aaaaa", "aa") == 2 - assert sz.count("aaaaa", "aa", allowoverlap=True) == 4 - - assert sz.levenstein("aaa", "aaa") == 0 - assert sz.levenstein("aaa", "bbb") == 3 - assert sz.levenstein("abababab", "aaaaaaaa") == 4 - assert sz.levenstein("abababab", "aaaaaaaa", 2) == 2 - assert sz.levenstein("abababab", "aaaaaaaa", bound=2) == 2 - - -def test_split_keepseparator(): - native = "word1_word2_word3" - big = Str(native) - - words = sz.split(big, "_") - assert len(words) == 3 - - parts = sz.split(big, "_", keepseparator=True) - assert len(parts) == 3 - - assert str(words[0]) == "word1" - assert str(parts[0]) == "word1_" - assert str(words[2]) == "word3" - assert str(parts[2]) == "word3" - - -def test_construct(): +def test_unit_construct(): native = "aaaaa" big = Str(native) assert len(big) == len(native) -def test_indexing(): +def test_unit_indexing(): native = "abcdef" big = Str(native) for i in range(len(native)): assert big[i] == native[i] -def test_count(): +def test_unit_count(): native = "aaaaa" big = Str(native) assert big.count("a") == 5 @@ -62,20 +30,20 @@ def test_count(): assert big.count("aa", allowoverlap=True) == 4 -def test_contains(): +def test_unit_contains(): big = Str("abcdef") assert "a" in big assert "ab" in big assert "xxx" not in big -def test_rich_comparisons(): +def test_unit_rich_comparisons(): assert Str("aa") == "aa" assert Str("aa") < "b" assert Str("abb")[1:] == "bb" -def test_buffer_protocol(): +def test_unit_buffer_protocol(): import numpy as np my_str = Str("hello") @@ -85,6 +53,77 @@ def test_buffer_protocol(): assert "".join([c.decode("utf-8") for c in arr.tolist()]) == "hello" +def test_unit_split(): + native = "token1\ntoken2\ntoken3" + big = Str(native) + assert native.splitlines() == list(big.splitlines()) + assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True)) + assert native.split("token3") == list(big.split("token3")) + + words = sz.split(big, "\n") + assert len(words) == 3 + assert str(words[0]) == "token1" + assert str(words[2]) == "token3" + + parts = sz.split(big, "\n", keepseparator=True) + assert len(parts) == 3 + assert str(parts[0]) == "token1\n" + assert str(parts[2]) == "token3" + + +def test_unit_sequence(): + native = "line3\nline2\nline1" + big = Str(native) + + lines = big.splitlines() + assert [2, 1, 0] == list(lines.order()) + + lines.sort() + assert [0, 1, 2] == list(lines.order()) + assert ["line1", "line2", "line3"] == list(lines) + + shuffled_copy = lines.shuffled(seed=42) + assert set(lines) == set(shuffled_copy) + + lines.append("line4") + assert 4 == len(lines) + lines.extend(["line5", "line6"]) + assert 6 == len(lines) + + lines.append(lines[0]) + assert 7 == len(lines) + assert lines[6] == "line1" + + lines.extend(lines) + assert 14 == len(lines) + assert lines[7] == "line1" + assert lines[8] == "line2" + assert lines[12] == "line6" + + # Test that shuffles are reproducible with the same `seed` + a = [str(s) for s in lines.shuffled(seed=42)] + b = [str(s) for s in lines.shuffled(seed=42)] + assert a == b + + +def test_unit_globals(): + """Validates that the previously unit-tested member methods are also visible as global functions.""" + + assert sz.find("abcdef", "bcdef") == 1 + assert sz.find("abcdef", "x") == -1 + + assert sz.count("abcdef", "x") == 0 + assert sz.count("aaaaa", "a") == 5 + assert sz.count("aaaaa", "aa") == 2 + assert sz.count("aaaaa", "aa", allowoverlap=True) == 4 + + assert sz.levenstein("aaa", "aaa") == 0 + assert sz.levenstein("aaa", "bbb") == 3 + assert sz.levenstein("abababab", "aaaaaaaa") == 4 + assert sz.levenstein("abababab", "aaaaaaaa", 2) == 2 + assert sz.levenstein("abababab", "aaaaaaaa", bound=2) == 2 + + def get_random_string( length: Optional[int] = None, variability: Optional[int] = None ) -> str: @@ -100,169 +139,125 @@ def is_equal_strings(native_strings, big_strings): assert native_slice == big_slice -# def check_identical( -# native: str, -# big: Union[Str, File], -# needle: Optional[str] = None, -# check_iterators: bool = False, -# ): -# if needle is None: -# part_offset = randint(0, len(native) - 1) -# part_length = randint(1, len(native) - part_offset) -# needle = native[part_offset:part_length] - -# present_in_native: bool = needle in native -# present_in_big = needle in big -# assert present_in_native == present_in_big -# assert native.find(needle) == big.find(needle) -# assert native.count(needle) == big.count(needle) +def check_identical( + native: str, + big: Str, + needle: Optional[str] = None, + check_iterators: bool = False, +): + if needle is None: + part_offset = randint(0, len(native) - 1) + part_length = randint(1, len(native) - part_offset) + needle = native[part_offset:part_length] + + present_in_native: bool = needle in native + present_in_big = needle in big + assert present_in_native == present_in_big + assert native.find(needle) == big.find(needle) + assert native.count(needle) == big.count(needle) + + native_strings = native.split(needle) + big_strings: Strs = big.split(needle) + assert len(native_strings) == len(big_strings) + + if check_iterators: + for i in range(len(native_strings)): + assert len(native_strings[i]) == len(big_strings[i]) + assert native_strings[i] == big_strings[i] + assert [c for c in native_strings[i]] == [c for c in big_strings[i]] + + is_equal_strings(native_strings, big_strings) + + +@pytest.mark.parametrize("haystack_length", range(1, 65)) +@pytest.mark.parametrize("variability", range(1, 25)) +def test_fuzzy_substrings(haystack_length: int, variability: int): + native = get_random_string(variability=variability, length=haystack_length) + big = Str(native) + pattern = get_random_string(variability=variability, length=randint(1, 5)) + assert (pattern in native) == big.contains(pattern) + assert native.find(pattern) == big.find(pattern) + + +@pytest.mark.parametrize("repetitions", range(1, 10)) +def test_basic(repetitions: int): + native = "abcd" * repetitions + big = Str(native) + + check_identical(native, big, "a", True) + check_identical(native, big, "ab", True) + check_identical(native, big, "abc", True) + check_identical(native, big, "abcd", True) + check_identical(native, big, "abcde", True) # Missing pattern + -# native_strings = native.split(needle) -# big_strings: Strs = big.split(needle) -# assert len(native_strings) == len(big_strings) +@pytest.mark.parametrize("pattern_length", [1, 2, 4, 5]) +@pytest.mark.parametrize("haystack_length", range(1, 69, 3)) +@pytest.mark.parametrize("variability", range(1, 27, 3)) +def test_fuzzy(pattern_length: int, haystack_length: int, variability: int): + native = get_random_string(variability=variability, length=haystack_length) + big = Str(native) + + # Start by matching the prefix and the suffix + check_identical(native, big, native[:pattern_length]) + check_identical(native, big, native[-pattern_length:]) + + # Continue with random strs + for _ in range(haystack_length // pattern_length): + pattern = get_random_string(variability=variability, length=pattern_length) + check_identical(native, big, pattern) + + +def test_strs(): + native = get_random_string(length=10) + big = Str(native) + + assert native[0:5] == big.sub(0, 5) and native[0:5] == big[0:5] + assert native[5:10] == big.sub(5, 10) and native[5:10] == big[5:10] + + assert native[5:5] == big.sub(5, 5) and native[5:5] == big[5:5] + assert native[-5:-5] == big.sub(-5, -5) and native[-5:-5] == big[-5:-5] + assert native[2:-2] == big.sub(2, -2) and native[2:-2] == big[2:-2] + assert native[7:-7] == big.sub(7, -7) and native[7:-7] == big[7:-7] + + assert native[5:3] == big.sub(5, 3) and native[5:3] == big[5:3] + assert native[5:7] == big.sub(5, 7) and native[5:7] == big[5:7] + assert native[5:-3] == big.sub(5, -3) and native[5:-3] == big[5:-3] + assert native[5:-7] == big.sub(5, -7) and native[5:-7] == big[5:-7] + + assert native[-5:3] == big.sub(-5, 3) and native[-5:3] == big[-5:3] + assert native[-5:7] == big.sub(-5, 7) and native[-5:7] == big[-5:7] + assert native[-5:-3] == big.sub(-5, -3) and native[-5:-3] == big[-5:-3] + assert native[-5:-7] == big.sub(-5, -7) and native[-5:-7] == big[-5:-7] + + assert native[2:] == big.sub(2) and native[2:] == big[2:] + assert native[:7] == big.sub(end=7) and native[:7] == big[:7] + assert native[-2:] == big.sub(-2) and native[-2:] == big[-2:] + assert native[:-7] == big.sub(end=-7) and native[:-7] == big[:-7] + assert native[:-10] == big.sub(end=-10) and native[:-10] == big[:-10] + assert native[:-1] == big.sub(end=-1) and native[:-1] == big[:-1] + + length = 1000 + native = get_random_string(length=length) + big = Str(native) -# if check_iterators: -# for i in range(len(native_strings)): -# assert len(native_strings[i]) == len(big_strings[i]) -# assert native_strings[i] == big_strings[i] -# assert [c for c in native_strings[i]] == [c for c in big_strings[i]] - -# is_equal_strings(native_strings, big_strings) - - -# @pytest.mark.parametrize("haystack_length", range(1, 65)) -# @pytest.mark.parametrize("variability", range(1, 25)) -# def test_contains(haystack_length: int, variability: int): -# native = get_random_string(variability=variability, length=haystack_length) -# big = Str(native) -# pattern = get_random_string(variability=variability, length=randint(1, 5)) -# assert (pattern in native) == big.contains(pattern) - - -# def test_count_overlap(): -# native = "aaaaa" -# big = Str(native) -# assert native.count("aa") == big.count("aa") -# assert 4 == big.count("aa", allowoverlap=True) - - -# def test_splitlines(): -# native = "line1\nline2\nline3" -# big = Str(native) -# assert native.splitlines() == list(big.splitlines()) -# assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True)) - - -# def test_strs_operations(): -# native = "line1\nline2\nline3" -# big = Str(native) -# lines = big.splitlines() -# lines.sort() -# assert ["line1", "line2", "line3"] == list(lines) - -# shuffled_copy = lines.shuffled(seed=42) -# assert set(lines) == set(shuffled_copy) - -# lines.append("line4") -# assert 4 == len(lines) -# lines.extend(["line5", "line6"]) -# assert 6 == len(lines) - -# lines.append(lines[0]) -# assert 7 == len(lines) -# assert lines[6] == "line1" - -# lines.extend(lines) -# assert 14 == len(lines) -# assert lines[7] == "line1" -# assert lines[8] == "line2" -# assert lines[12] == "line6" - -# # Test that shuffles are reproducible with the same `seed` -# a = [str(s) for s in lines.shuffled(seed=42)] -# b = [str(s) for s in lines.shuffled(seed=42)] -# assert a == b - - -# @pytest.mark.parametrize("repetitions", range(1, 10)) -# def test_basic(repetitions: int): -# native = "abcd" * repetitions -# big = Str(native) - -# check_identical(native, big, "a", True) -# check_identical(native, big, "ab", True) -# check_identical(native, big, "abc", True) -# check_identical(native, big, "abcd", True) -# check_identical(native, big, "abcde", True) # Missing pattern - - -# @pytest.mark.parametrize("pattern_length", [1, 2, 4, 5]) -# @pytest.mark.parametrize("haystack_length", range(1, 69, 3)) -# @pytest.mark.parametrize("variability", range(1, 27, 3)) -# def test_fuzzy(pattern_length: int, haystack_length: int, variability: int): -# native = get_random_string(variability=variability, length=haystack_length) -# big = Str(native) - -# # Start by matching the prefix and the suffix -# check_identical(native, big, native[:pattern_length]) -# check_identical(native, big, native[-pattern_length:]) - -# # Continue with random strs -# for _ in range(haystack_length // pattern_length): -# pattern = get_random_string(variability=variability, length=pattern_length) -# check_identical(native, big, pattern) - - -# def test_strs(): -# native = get_random_string(length=10) -# big = Str(native) - -# assert native[0:5] == big.sub(0, 5) and native[0:5] == big[0:5] -# assert native[5:10] == big.sub(5, 10) and native[5:10] == big[5:10] - -# assert native[5:5] == big.sub(5, 5) and native[5:5] == big[5:5] -# assert native[-5:-5] == big.sub(-5, -5) and native[-5:-5] == big[-5:-5] -# assert native[2:-2] == big.sub(2, -2) and native[2:-2] == big[2:-2] -# assert native[7:-7] == big.sub(7, -7) and native[7:-7] == big[7:-7] - -# assert native[5:3] == big.sub(5, 3) and native[5:3] == big[5:3] -# assert native[5:7] == big.sub(5, 7) and native[5:7] == big[5:7] -# assert native[5:-3] == big.sub(5, -3) and native[5:-3] == big[5:-3] -# assert native[5:-7] == big.sub(5, -7) and native[5:-7] == big[5:-7] - -# assert native[-5:3] == big.sub(-5, 3) and native[-5:3] == big[-5:3] -# assert native[-5:7] == big.sub(-5, 7) and native[-5:7] == big[-5:7] -# assert native[-5:-3] == big.sub(-5, -3) and native[-5:-3] == big[-5:-3] -# assert native[-5:-7] == big.sub(-5, -7) and native[-5:-7] == big[-5:-7] - -# assert native[2:] == big.sub(2) and native[2:] == big[2:] -# assert native[:7] == big.sub(end=7) and native[:7] == big[:7] -# assert native[-2:] == big.sub(-2) and native[-2:] == big[-2:] -# assert native[:-7] == big.sub(end=-7) and native[:-7] == big[:-7] -# assert native[:-10] == big.sub(end=-10) and native[:-10] == big[:-10] -# assert native[:-1] == big.sub(end=-1) and native[:-1] == big[:-1] - -# length = 1000 -# native = get_random_string(length=length) -# big = Str(native) - -# needle = native[0 : randint(2, 5)] -# native_strings = native.split(needle) -# big_strings: Strs = big.split(needle) - -# length = len(native_strings) -# for i in range(length): -# start = randint(1 - length, length - 1) -# stop = randint(1 - length, length - 1) -# step = 0 -# while step == 0: -# step = randint(-int(math.sqrt(length)), int(math.sqrt(length))) - -# is_equal_strings(native_strings[start:stop:step], big_strings[start:stop:step]) -# is_equal_strings( -# native_strings[start:stop:step], -# big_strings.sub(start, stop, step), -# ) + needle = native[0 : randint(2, 5)] + native_strings = native.split(needle) + big_strings: Strs = big.split(needle) + + length = len(native_strings) + for i in range(length): + start = randint(1 - length, length - 1) + stop = randint(1 - length, length - 1) + step = 0 + while step == 0: + step = randint(-int(math.sqrt(length)), int(math.sqrt(length))) + + is_equal_strings(native_strings[start:stop:step], big_strings[start:stop:step]) + is_equal_strings( + native_strings[start:stop:step], + big_strings.sub(start, stop, step), + ) def test_levenstein(): diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 8bd32fa1..52cc4ec6 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -59,7 +59,7 @@ typedef struct sz_needle_t { /** * @brief SWAR single-character counting procedure, jumping 8 bytes at a time. */ -inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) { +inline static sz_size_t sz_count_char_swar(sz_haystack_t h, char n) { sz_size_t result = 0; char const *text = h.start; @@ -89,7 +89,7 @@ inline static sz_size_t sz_naive_count_char(sz_haystack_t h, char n) { /** * @brief SWAR single-character search in string, jumping 8 bytes at a time. */ -inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) { +inline static sz_size_t sz_find_char_swar(sz_haystack_t h, char n) { char const *text = h.start; char const *end = h.start + h.length; @@ -121,7 +121,7 @@ inline static sz_size_t sz_naive_find_char(sz_haystack_t h, char n) { /** * @brief SWAR character-bigram search in string, jumping 8 bytes at a time. */ -inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) { +inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) { char const *text = h.start; char const *end = h.start + h.length; @@ -162,7 +162,7 @@ inline static sz_size_t sz_naive_find_2chars(sz_haystack_t h, char const *n) { /** * @brief SWAR character-trigram search in string, jumping 8 bytes at a time. */ -inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) { +inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) { char const *text = h.start; char const *end = h.start + h.length; @@ -215,7 +215,7 @@ inline static sz_size_t sz_naive_find_3chars(sz_haystack_t h, char const *n) { /** * @brief SWAR character-quadgram search in string, jumping 8 bytes at a time. */ -inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) { +inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) { char const *text = h.start; char const *end = h.start + h.length; @@ -230,7 +230,7 @@ inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) { lookup[0b0100] = lookup[0b1100] = 2; lookup[0b1000] = 3; - // We can perform 5 comparisons per load, but it's easir to perform 4, minimizing the size of the lookup table. + // We can perform 5 comparisons per load, but it's easier to perform 4, minimizing the size of the lookup table. for (; text + 8 <= end; text += 4) { uint64_t text_slice; memcpy(&text_slice, text, 8); @@ -275,19 +275,20 @@ inline static sz_size_t sz_naive_find_4chars(sz_haystack_t h, char const *n) { * it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper. * Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core. */ -inline static sz_size_t sz_naive_find_substr(sz_haystack_t h, sz_needle_t n) { +inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) { if (h.length < n.length) return h.length; - char const *text = h.start; - char const *const end = h.start + h.length; switch (n.length) { case 0: return 0; - case 1: return sz_naive_find_char(h, *n.start); - case 2: return sz_naive_find_2chars(h, n.start); - case 3: return sz_naive_find_3chars(h, n.start); - case 4: return sz_naive_find_4chars(h, n.start); + case 1: return sz_find_char_swar(h, *n.start); + case 2: return sz_find_2chars_swar(h, n.start); + case 3: return sz_find_3chars_swar(h, n.start); + case 4: return sz_find_4chars_swar(h, n.start); default: { + char const *text = h.start; + char const *const end = h.start + h.length; + sz_anomaly_t n_anomaly, h_anomaly; sz_size_t const n_suffix_len = n.length - 4 - n.anomaly_offset; char const *n_suffix_ptr = n.start + 4 + n.anomaly_offset; @@ -314,7 +315,7 @@ inline static sz_size_t sz_naive_find_substr(sz_haystack_t h, sz_needle_t n) { * was practically more efficient than loading once and shifting around, as introduces * less data dependencies. */ -sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) { +inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { // Precomputed constants char const *const end = h.start + h.length; @@ -363,7 +364,7 @@ sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) { sz_haystack_t tail; tail.start = text; tail.length = end - text; - size_t tail_match = sz_naive_find_substr(tail, n); + size_t tail_match = sz_find_substr_swar(tail, n); return text + tail_match - h.start; } @@ -377,7 +378,7 @@ sz_size_t sz_avx2_find_substr(sz_haystack_t h, sz_needle_t n) { * was practically more efficient than loading once and shifting around, as introduces * less data dependencies. */ -inline static sz_size_t sz_neon_find_substr(sz_haystack_t h, sz_needle_t n) { +inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { // Precomputed constants char const *const end = h.start + h.length; @@ -420,12 +421,33 @@ inline static sz_size_t sz_neon_find_substr(sz_haystack_t h, sz_needle_t n) { sz_haystack_t tail; tail.start = text; tail.length = end - text; - size_t tail_match = sz_naive_find_substr(tail, n); + size_t tail_match = sz_find_substr_swar(tail, n); return text + tail_match - h.start; } #endif // Arm Neon +inline static sz_size_t sz_find_substr_auto(sz_haystack_t h, sz_needle_t n) { + if (h.length < n.length) return h.length; + + switch (n.length) { + case 0: return 0; + case 1: return sz_find_char_swar(h, *n.start); + case 2: return sz_find_2chars_swar(h, n.start); + case 3: return sz_find_3chars_swar(h, n.start); + case 4: + return sz_find_4chars_swar(h, n.start); + // #if defined(__ARM_NEON) + // default: return sz_find_substr_neon(h, n); + // #elif defined(__AVX2__) + // default: return sz_find_substr_avx2(h, n); + // #else + default: + return sz_find_substr_swar(h, n); + // #endif + } +} + inline static void sz_swap(sz_size_t *a, sz_size_t *b) { sz_size_t t = *a; *a = *b; @@ -517,7 +539,6 @@ inline static void _sz_sort_recursion( // { sz_size_t mask = (1ul << 63) >> bit_idx; while (split != sequence->count && !(sequence->order[split] & mask)) ++split; - for (sz_size_t i = split + 1; i < sequence->count; ++i) if (!(sequence->order[i] & mask)) sz_swap(sequence->order + i, sequence->order + split), ++split; } From 1265fce66acf3f1b2b2a35ac2782eddbf40875cf Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 24 Sep 2023 12:29:49 +0400 Subject: [PATCH 41/72] Add: Subscript methods --- CMakeLists.txt | 2 +- python/lib.c | 218 +++++++++++++++++++++++++++++++++++++++++++++-- scripts/test.c | 60 +++++++++++++ scripts/test.cpp | 18 ++-- scripts/test.py | 51 ++++++----- 5 files changed, 300 insertions(+), 49 deletions(-) create mode 100644 scripts/test.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 6909c838..df569329 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,7 +89,7 @@ if(STRINGZILLA_INSTALL) endif() if(${STRINGZILLA_BUILD_TEST} OR ${STRINGZILLA_BUILD_BENCHMARK}) - add_executable(stringzilla_test scripts/test.cpp) + add_executable(stringzilla_test scripts/test.c) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") diff --git a/python/lib.c b/python/lib.c index c1e27113..5b576c67 100644 --- a/python/lib.c +++ b/python/lib.c @@ -136,11 +136,13 @@ typedef struct { /** * Complex structure with two variable length chunks inside - for the parents and their slices. * The parents are sorted in ascending order of their memory ranges, to let us rapidly locate the source - * with a binary search. The slices are preserved + * with a binary search. */ - struct multi_source_strings_t { + struct multi_source_slices_t { size_t count; + size_t capacity; size_t parents_count; + size_t parents_capacity; PyObject **parents; sz_haystack_t *parts; @@ -279,7 +281,24 @@ void str_at_offset_multi_source( Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) { *start = strs->data.multi_source.parts[i].start; *length = strs->data.multi_source.parts[i].length; - *parent = NULL; // TODO: + + PyObject **parents = strs->data.multi_source.parents; + size_t parents_count = strs->data.multi_source.parents_count; + for (size_t j = 0; j < parents_count; ++j) { + PyObject *current_parent = parents[j]; + char *parent_start; + Py_ssize_t parent_length; + export_string_like(current_parent, &parent_start, &parent_length); + + // Check if the string at offset `i` is within the range of the current parent. + if (*start >= parent_start && *start + *length <= parent_start + parent_length) { + *parent = current_parent; + return; + } + } + + // If no parent is found, set *parent to NULL. + *parent = NULL; } get_string_at_offset_t str_at_offset_getter(Strs *strs) { @@ -331,11 +350,11 @@ boolean_t prepare_strings_for_reordering(Strs *strs) { } // Populate the new reordered array using get_string_at_offset - for (Py_ssize_t i = 0; i < count; ++i) { + for (size_t i = 0; i < count; ++i) { PyObject *parent; char const *start; size_t length; - getter(strs, i, count, &parent, &start, &length); + getter(strs, (Py_ssize_t)i, count, &parent, &start, &length); new_parts[i].start = start; new_parts[i].length = length; } @@ -351,6 +370,8 @@ boolean_t prepare_strings_for_reordering(Strs *strs) { return 1; } +boolean_t prepare_strings_for_extension(Strs *strs, size_t new_parents, size_t new_parts) { return 1; } + #pragma endregion #pragma region MemoryMappingFile @@ -679,6 +700,16 @@ static Py_ssize_t Strs_len(Strs *self) { } } +static Py_ssize_t Strs_parents_count(Strs *self) { + switch (self->type) { + case STRS_CONSECUTIVE_32: return 1; + case STRS_CONSECUTIVE_64: return 1; + case STRS_REORDERED: return 1; + case STRS_MULTI_SOURCE: return self->data.multi_source.parents_count; + default: return 0; + } +} + static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) { // Check for negative index and convert to positive Py_ssize_t count = Strs_len(self); @@ -710,9 +741,176 @@ static PyObject *Strs_getitem(Strs *self, Py_ssize_t i) { return view_copy; } -static PyObject *Strs_subscript(Str *self, PyObject *key) { - if (PyLong_Check(key)) return Strs_getitem(self, PyLong_AsSsize_t(key)); - return NULL; +static PyObject *Strs_subscript(Strs *self, PyObject *key) { + if (PySlice_Check(key)) { + // Sanity checks + Py_ssize_t count = Strs_len(self); + Py_ssize_t start, stop, step; + if (PySlice_Unpack(key, &start, &stop, &step) < 0) return NULL; + if (PySlice_AdjustIndices(count, &start, &stop, step) < 0) return NULL; + if (step != 1) { + PyErr_SetString(PyExc_IndexError, "Efficient step is not supported"); + return NULL; + } + + // Create a new `Strs` object + Strs *self_slice = (Strs *)StrsType.tp_alloc(&StrsType, 0); + if (self_slice == NULL && PyErr_NoMemory()) return NULL; + + // Depending on the layout, the procedure will be different. + self_slice->type = self->type; + switch (self->type) { + case STRS_CONSECUTIVE_32: { + struct consecutive_slices_32bit_t *from = &self->data.consecutive_32bit; + struct consecutive_slices_32bit_t *to = &self_slice->data.consecutive_32bit; + to->count = stop - start; + to->separator_length = from->sepa rator_length; + to->parent = from->parent; + + size_t first_length; + str_at_offset_consecutive_32bit(self, start, count, &to->parent, &to->start, &first_length); + uint32_t first_offset = to->start - from->start; + to->end_offsets = malloc(sizeof(uint32_t) * to->count); + if (to->end_offsets == NULL && PyErr_NoMemory()) { + Py_XDECREF(self_slice); + return NULL; + } + for (size_t i = 0; i != to->count; ++i) to->end_offsets[i] = from->end_offsets[i] - first_offset; + Py_INCREF(to->parent); + break; + } + case STRS_CONSECUTIVE_64: { + struct consecutive_slices_64bit_t *from = &self->data.consecutive_64bit; + struct consecutive_slices_64bit_t *to = &self_slice->data.consecutive_64bit; + to->count = stop - start; + to->separator_length = from->separator_length; + to->parent = from->parent; + + size_t first_length; + str_at_offset_consecutive_64bit(self, start, count, &to->parent, &to->start, &first_length); + uint64_t first_offset = to->start - from->start; + to->end_offsets = malloc(sizeof(uint64_t) * to->count); + if (to->end_offsets == NULL && PyErr_NoMemory()) { + Py_XDECREF(self_slice); + return NULL; + } + for (size_t i = 0; i != to->count; ++i) to->end_offsets[i] = from->end_offsets[i] - first_offset; + Py_INCREF(to->parent); + break; + } + case STRS_REORDERED: { + struct reordered_slices_t *from = &self->data.reordered; + struct reordered_slices_t *to = &self_slice->data.reordered; + to->count = stop - start; + to->parent = from->parent; + + to->parts = malloc(sizeof(sz_haystack_t) * to->count); + if (to->parts == NULL && PyErr_NoMemory()) { + Py_XDECREF(self_slice); + return NULL; + } + memcpy(to->parts, from->parts + start, sizeof(sz_haystack_t) * to->count); + Py_INCREF(to->parent); + break; + } + case STRS_MULTI_SOURCE: { + struct multi_source_slices_t *from = &self->data.multi_source; + struct multi_source_slices_t *to = &self_slice->data.multi_source; + to->count = stop - start; + to->capacity = to->count; + to->parents_count = 0; + to->parents_capacity = from->parents_capacity; + + // Allocate memory for both `parts` and `parents` references + to->parts = malloc(sizeof(sz_haystack_t) * to->capacity); + if (to->parts == NULL && PyErr_NoMemory()) { + Py_XDECREF(self_slice); + return NULL; + } + to->parents = malloc(sizeof(PyObject *) * to->parents_capacity); + if (to->parents == NULL && PyErr_NoMemory()) { + free(to->parts); + Py_XDECREF(self_slice); + return NULL; + } + + // Iterate through the `parts` of this slice, detect the `parent` + // of each exported entry in `from->parents`, and upsert it into the `to->parents` + for (Py_ssize_t i = start; i < stop; ++i) { + PyObject *detected_parent; + char const *part_start; + size_t part_length; + + // Find the parent of the part at the offset `i` + str_at_offset_multi_source(self, i, count, &detected_parent, &part_start, &part_length); + Py_INCREF(detected_parent); + + // Upsert the detected parent into to->parents + // As the to->parents array is meant to be sorted, + // we insert in a way that maintains the sorting + size_t j = 0; + while (j < to->parents_count && to->parents[j] != detected_parent) ++j; + + // If the parent is not already in to->parents, insert it. + if (j == to->parents_count) { + to->parents[j] = detected_parent; + ++to->parents_count; + } + + // Populate the to->parts array + to->parts[i - start].start = part_start; + to->parts[i - start].length = part_length; + } + + break; + } + default: + // Unsupported type + PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion"); + return NULL; + } + + return (PyObject *)self_slice; + } + else if (PyLong_Check(key)) { return Strs_getitem(self, PyLong_AsSsize_t(key)); } + else { + PyErr_SetString(PyExc_TypeError, "Strs indices must be integers or slices"); + return NULL; + } +} + +static PyObject *Strs_extend(Strs *self, PyObject *seq) { + // Check if seq is an instance of Strs + if (PyObject_IsInstance(seq, (PyObject *)&StrsType)) { + Strs *other = (Strs *)seq; + size_t other_parents = Strs_len(other); + size_t other_parts = Strs_parents_count(other); + if (!prepare_strings_for_extension(self, other_parents, other_parts)) { + PyErr_Format(PyExc_TypeError, "Failed to prepare the sequence for extension"); + return NULL; + } + + // TODO: + } + else if (PySequence_Check(seq)) { + // Check if seq is a sequence + Py_ssize_t length = PySequence_Size(seq); + // Validate that every item in the sequence is string-like with `export_string_like` + // TODO: + + for (Py_ssize_t i = 0; i < length; i++) { + PyObject *item = PySequence_ITEM(seq, i); + if (!item) return NULL; // Error getting item from sequence + + // TODO: + } + } + else { + PyErr_SetString(PyExc_TypeError, "Parameter must be a sequence or an instance of Strs"); + return NULL; + } + + Py_RETURN_NONE; } // Will be called by the `PySequence_Contains` @@ -1670,6 +1868,8 @@ static PyMethodDef Strs_methods[] = { {"shuffle", Strs_shuffle, sz_method_flags_m, "Shuffle the elements of the Strs object."}, // {"sort", Strs_sort, sz_method_flags_m, "Sort the elements of the Strs object."}, // {"order", Strs_order, sz_method_flags_m, "Provides the indexes to achieve sorted order."}, // + {"append", Strs_append, sz_method_flags_m, "Append the sequence with a new string."}, // + {"extend", Strs_extend, sz_method_flags_m, "Extend the sequence with new strings."}, // {NULL, NULL, 0, NULL}}; static PyTypeObject StrsType = { @@ -1698,11 +1898,11 @@ static PyMethodDef stringzilla_methods[] = { {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."}, {"partition", Str_partition, sz_method_flags_m, "Splits string into 3-tuple: before, match, after."}, {"count", Str_count, sz_method_flags_m, "Count the occurrences of a substring."}, - {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, {"split", Str_split, sz_method_flags_m, "Split a string by a separator."}, {"splitlines", Str_splitlines, sz_method_flags_m, "Split a string by line breaks."}, {"startswith", Str_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."}, {"endswith", Str_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."}, + {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, {NULL, NULL, 0, NULL}}; static PyModuleDef stringzilla_module = { diff --git a/scripts/test.c b/scripts/test.c new file mode 100644 index 00000000..f50d7e62 --- /dev/null +++ b/scripts/test.c @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include +#include + +#include + +#define MAX_LENGTH 300 +#define MIN_LENGTH 3 +#define ASCII_LOWERCASE "abcdefghijklmnopqrstuvwxyz" +#define VARIABILITY 25 + +// Utility function to populate random string in a buffer +void populate_random_string(char *buffer, int length, int variability) { + for (int i = 0; i < length; i++) { buffer[i] = ASCII_LOWERCASE[rand() % variability]; } + buffer[length] = '\0'; +} + +// Test function for sz_find_substr_auto +void test_sz_find_substr_auto() { + char buffer[MAX_LENGTH + 1]; + char pattern[6]; // Maximum length of 5 + 1 for '\0' + + for (int length = MIN_LENGTH; length < MAX_LENGTH; length++) { + for (int variability = 1; variability < VARIABILITY; variability++) { + populate_random_string(buffer, length, variability); + + struct sz_haystack_t haystack; + haystack.start = buffer; + haystack.length = length; + + int pattern_length = rand() % 5 + 1; + populate_random_string(pattern, pattern_length, variability); + + struct sz_needle_t needle; + needle.start = pattern; + needle.length = pattern_length; + + // Comparing the result of your function with the standard library function. + const char *result_libc = strstr(buffer, pattern); + uint64_t result_stringzilla = sz_find_substr_auto(haystack, needle); + + assert(((result_libc && result_stringzilla == (uint64_t)(result_libc - buffer)) || + (!result_libc && result_stringzilla == (uint64_t)-1)) && + "Test failed for sz_find_substr_auto"); + } + } +} + +int main() { + srand((unsigned int)time(NULL)); + + test_sz_find_substr_auto(); + // Add calls to other test functions as you implement them + + printf("All tests passed!\n"); + return 0; +} diff --git a/scripts/test.cpp b/scripts/test.cpp index e2c83d1b..ddef4e82 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -46,8 +46,7 @@ void populate_from_file( // std::ifstream f(path, std::ios::in); std::string s; - while (strings.size() < limit && std::getline(f, s, ' ')) - strings.push_back(s); + while (strings.size() < limit && std::getline(f, s, ' ')) strings.push_back(s); } void populate_with_test(strings_t &strings) { @@ -79,8 +78,7 @@ inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) { return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word); }); - for (size_t i = 0; i != strings.size(); ++i) - std::memset((char *)&order[i] + offset_in_word, 0, 4ul); + for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul); std::sort(order, order + strings.size(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; }); @@ -144,8 +142,7 @@ inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *orde return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word); }); - for (size_t i = 0; i != strings.size(); ++i) - std::memset((char *)&order[i] + offset_in_word, 0, 4ul); + for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul); std::stable_sort(order, order + strings.size(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; }); @@ -198,8 +195,7 @@ void bench_search(char const *name, std::string_view full_text, algo_at &&algo) // Run multiple iterations std::size_t bytes_passed = 0; - for (std::size_t i = 0; i != iterations; ++i) - bytes_passed += algo(); + for (std::size_t i = 0; i != iterations; ++i) bytes_passed += algo(); // Measure elapsed time stdcc::time_point t2 = stdcc::now(); @@ -215,15 +211,13 @@ int main(int, char const **) { strings_t strings; populate_from_file("leipzig1M.txt", strings, 10000000); std::size_t mean_bytes = 0; - for (std::string const &str : strings) - mean_bytes += str.size(); + for (std::string const &str : strings) mean_bytes += str.size(); mean_bytes /= strings.size(); std::printf("Parsed the file with %zu words of %zu mean length!\n", strings.size(), mean_bytes); std::string full_text; full_text.reserve(mean_bytes + strings.size() * 2); - for (std::string const &str : strings) - full_text.append(str), full_text.push_back(' '); + for (std::string const &str : strings) full_text.append(str), full_text.push_back(' '); auto make_random_needle = [](std::string_view full_text) { std::size_t length = std::rand() % 6 + 2; diff --git a/scripts/test.py b/scripts/test.py index 14b6e9e7..ea6aae8a 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -82,9 +82,6 @@ def test_unit_sequence(): assert [0, 1, 2] == list(lines.order()) assert ["line1", "line2", "line3"] == list(lines) - shuffled_copy = lines.shuffled(seed=42) - assert set(lines) == set(shuffled_copy) - lines.append("line4") assert 4 == len(lines) lines.extend(["line5", "line6"]) @@ -212,30 +209,30 @@ def test_strs(): native = get_random_string(length=10) big = Str(native) - assert native[0:5] == big.sub(0, 5) and native[0:5] == big[0:5] - assert native[5:10] == big.sub(5, 10) and native[5:10] == big[5:10] - - assert native[5:5] == big.sub(5, 5) and native[5:5] == big[5:5] - assert native[-5:-5] == big.sub(-5, -5) and native[-5:-5] == big[-5:-5] - assert native[2:-2] == big.sub(2, -2) and native[2:-2] == big[2:-2] - assert native[7:-7] == big.sub(7, -7) and native[7:-7] == big[7:-7] - - assert native[5:3] == big.sub(5, 3) and native[5:3] == big[5:3] - assert native[5:7] == big.sub(5, 7) and native[5:7] == big[5:7] - assert native[5:-3] == big.sub(5, -3) and native[5:-3] == big[5:-3] - assert native[5:-7] == big.sub(5, -7) and native[5:-7] == big[5:-7] - - assert native[-5:3] == big.sub(-5, 3) and native[-5:3] == big[-5:3] - assert native[-5:7] == big.sub(-5, 7) and native[-5:7] == big[-5:7] - assert native[-5:-3] == big.sub(-5, -3) and native[-5:-3] == big[-5:-3] - assert native[-5:-7] == big.sub(-5, -7) and native[-5:-7] == big[-5:-7] - - assert native[2:] == big.sub(2) and native[2:] == big[2:] - assert native[:7] == big.sub(end=7) and native[:7] == big[:7] - assert native[-2:] == big.sub(-2) and native[-2:] == big[-2:] - assert native[:-7] == big.sub(end=-7) and native[:-7] == big[:-7] - assert native[:-10] == big.sub(end=-10) and native[:-10] == big[:-10] - assert native[:-1] == big.sub(end=-1) and native[:-1] == big[:-1] + assert native[0:5] == big[0:5] + assert native[5:10] == big[5:10] + + assert native[5:5] == big[5:5] + assert native[-5:-5] == big[-5:-5] + assert native[2:-2] == big[2:-2] + assert native[7:-7] == big[7:-7] + + assert native[5:3] == big[5:3] + assert native[5:7] == big[5:7] + assert native[5:-3] == big[5:-3] + assert native[5:-7] == big[5:-7] + + assert native[-5:3] == big[-5:3] + assert native[-5:7] == big[-5:7] + assert native[-5:-3] == big[-5:-3] + assert native[-5:-7] == big[-5:-7] + + assert native[2:] == big[2:] + assert native[:7] == big[:7] + assert native[-2:] == big[-2:] + assert native[:-7] == big[:-7] + assert native[:-10] == big[:-10] + assert native[:-1] == big[:-1] length = 1000 native = get_random_string(length=length) From 876a726626dcbc4dd59dd3813241f70a33e9d665 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 24 Sep 2023 12:46:26 +0400 Subject: [PATCH 42/72] Break: Deprecate multi-source `Strs`; split tests --- .github/workflows/prerelease.yml | 4 +- README.md | 2 +- pyproject.toml | 2 +- python/lib.c | 146 +---------------- scripts/test.py | 272 ------------------------------- scripts/test_fuzzy.py | 113 +++++++++++++ scripts/test_units.py | 104 ++++++++++++ scripts/wc.py | 11 -- 8 files changed, 223 insertions(+), 431 deletions(-) delete mode 100644 scripts/test.py create mode 100644 scripts/test_fuzzy.py create mode 100644 scripts/test_units.py delete mode 100644 scripts/wc.py diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index 0f00d8fc..9c6bdab9 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -39,7 +39,7 @@ jobs: - name: Build locally run: python -m pip install . - name: Test with PyTest - run: pytest scripts/test.py + run: pytest scripts/ test_python_37: @@ -68,6 +68,6 @@ jobs: run: python -m pip install . - name: Test with PyTest - run: pytest scripts/test.py + run: pytest scripts/ diff --git a/README.md b/README.md index 818c8879..f774df35 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,7 @@ CPython: ```sh # Clean up and install -rm -rf build && pip install -e . && pytest scripts/test.py -s -x +rm -rf build && pip install -e . && pytest scripts/ -s -x # Install without dependencies pip install -e . --no-index --no-deps diff --git a/pyproject.toml b/pyproject.toml index e12df96a..5260630a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ filterwarnings = ["error"] [tool.cibuildwheel] test-requires = ["pytest"] -test-command = "pytest {project}/scripts/test.py -x" +test-command = "pytest {project}/scripts/ -x" build-verbosity = 0 skip = ["*musllinux*", "*i686*", "pp*"] diff --git a/python/lib.c b/python/lib.c index 5b576c67..679c4f30 100644 --- a/python/lib.c +++ b/python/lib.c @@ -133,20 +133,6 @@ typedef struct { sz_haystack_t *parts; } reordered; - /** - * Complex structure with two variable length chunks inside - for the parents and their slices. - * The parents are sorted in ascending order of their memory ranges, to let us rapidly locate the source - * with a binary search. - */ - struct multi_source_slices_t { - size_t count; - size_t capacity; - size_t parents_count; - size_t parents_capacity; - - PyObject **parents; - sz_haystack_t *parts; - } multi_source; } data; } Strs; @@ -277,36 +263,11 @@ void str_at_offset_reordered( *parent = strs->data.reordered.parent; } -void str_at_offset_multi_source( - Strs *strs, Py_ssize_t i, Py_ssize_t count, PyObject **parent, char const **start, size_t *length) { - *start = strs->data.multi_source.parts[i].start; - *length = strs->data.multi_source.parts[i].length; - - PyObject **parents = strs->data.multi_source.parents; - size_t parents_count = strs->data.multi_source.parents_count; - for (size_t j = 0; j < parents_count; ++j) { - PyObject *current_parent = parents[j]; - char *parent_start; - Py_ssize_t parent_length; - export_string_like(current_parent, &parent_start, &parent_length); - - // Check if the string at offset `i` is within the range of the current parent. - if (*start >= parent_start && *start + *length <= parent_start + parent_length) { - *parent = current_parent; - return; - } - } - - // If no parent is found, set *parent to NULL. - *parent = NULL; -} - get_string_at_offset_t str_at_offset_getter(Strs *strs) { switch (strs->type) { case STRS_CONSECUTIVE_32: return str_at_offset_consecutive_32bit; case STRS_CONSECUTIVE_64: return str_at_offset_consecutive_64bit; case STRS_REORDERED: return str_at_offset_reordered; - case STRS_MULTI_SOURCE: return str_at_offset_multi_source; default: // Unsupported type PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion"); @@ -695,17 +656,6 @@ static Py_ssize_t Strs_len(Strs *self) { case STRS_CONSECUTIVE_32: return self->data.consecutive_32bit.count; case STRS_CONSECUTIVE_64: return self->data.consecutive_64bit.count; case STRS_REORDERED: return self->data.reordered.count; - case STRS_MULTI_SOURCE: return self->data.multi_source.count; - default: return 0; - } -} - -static Py_ssize_t Strs_parents_count(Strs *self) { - switch (self->type) { - case STRS_CONSECUTIVE_32: return 1; - case STRS_CONSECUTIVE_64: return 1; - case STRS_REORDERED: return 1; - case STRS_MULTI_SOURCE: return self->data.multi_source.parents_count; default: return 0; } } @@ -764,7 +714,7 @@ static PyObject *Strs_subscript(Strs *self, PyObject *key) { struct consecutive_slices_32bit_t *from = &self->data.consecutive_32bit; struct consecutive_slices_32bit_t *to = &self_slice->data.consecutive_32bit; to->count = stop - start; - to->separator_length = from->sepa rator_length; + to->separator_length = from->separator_length; to->parent = from->parent; size_t first_length; @@ -813,57 +763,6 @@ static PyObject *Strs_subscript(Strs *self, PyObject *key) { Py_INCREF(to->parent); break; } - case STRS_MULTI_SOURCE: { - struct multi_source_slices_t *from = &self->data.multi_source; - struct multi_source_slices_t *to = &self_slice->data.multi_source; - to->count = stop - start; - to->capacity = to->count; - to->parents_count = 0; - to->parents_capacity = from->parents_capacity; - - // Allocate memory for both `parts` and `parents` references - to->parts = malloc(sizeof(sz_haystack_t) * to->capacity); - if (to->parts == NULL && PyErr_NoMemory()) { - Py_XDECREF(self_slice); - return NULL; - } - to->parents = malloc(sizeof(PyObject *) * to->parents_capacity); - if (to->parents == NULL && PyErr_NoMemory()) { - free(to->parts); - Py_XDECREF(self_slice); - return NULL; - } - - // Iterate through the `parts` of this slice, detect the `parent` - // of each exported entry in `from->parents`, and upsert it into the `to->parents` - for (Py_ssize_t i = start; i < stop; ++i) { - PyObject *detected_parent; - char const *part_start; - size_t part_length; - - // Find the parent of the part at the offset `i` - str_at_offset_multi_source(self, i, count, &detected_parent, &part_start, &part_length); - Py_INCREF(detected_parent); - - // Upsert the detected parent into to->parents - // As the to->parents array is meant to be sorted, - // we insert in a way that maintains the sorting - size_t j = 0; - while (j < to->parents_count && to->parents[j] != detected_parent) ++j; - - // If the parent is not already in to->parents, insert it. - if (j == to->parents_count) { - to->parents[j] = detected_parent; - ++to->parents_count; - } - - // Populate the to->parts array - to->parts[i - start].start = part_start; - to->parts[i - start].length = part_length; - } - - break; - } default: // Unsupported type PyErr_SetString(PyExc_TypeError, "Unsupported type for conversion"); @@ -879,40 +778,6 @@ static PyObject *Strs_subscript(Strs *self, PyObject *key) { } } -static PyObject *Strs_extend(Strs *self, PyObject *seq) { - // Check if seq is an instance of Strs - if (PyObject_IsInstance(seq, (PyObject *)&StrsType)) { - Strs *other = (Strs *)seq; - size_t other_parents = Strs_len(other); - size_t other_parts = Strs_parents_count(other); - if (!prepare_strings_for_extension(self, other_parents, other_parts)) { - PyErr_Format(PyExc_TypeError, "Failed to prepare the sequence for extension"); - return NULL; - } - - // TODO: - } - else if (PySequence_Check(seq)) { - // Check if seq is a sequence - Py_ssize_t length = PySequence_Size(seq); - // Validate that every item in the sequence is string-like with `export_string_like` - // TODO: - - for (Py_ssize_t i = 0; i < length; i++) { - PyObject *item = PySequence_ITEM(seq, i); - if (!item) return NULL; // Error getting item from sequence - - // TODO: - } - } - else { - PyErr_SetString(PyExc_TypeError, "Parameter must be a sequence or an instance of Strs"); - return NULL; - } - - Py_RETURN_NONE; -} - // Will be called by the `PySequence_Contains` static int Strs_contains(Str *self, PyObject *arg) { return 0; } @@ -1590,11 +1455,11 @@ static PyMethodDef Str_methods[] = { // {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."}, {"partition", Str_partition, sz_method_flags_m, "Splits string into 3-tuple: before, match, after."}, {"count", Str_count, sz_method_flags_m, "Count the occurrences of a substring."}, - {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, {"split", Str_split, sz_method_flags_m, "Split a string by a separator."}, {"splitlines", Str_splitlines, sz_method_flags_m, "Split a string by line breaks."}, {"startswith", Str_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."}, {"endswith", Str_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."}, + {"levenstein", Str_levenstein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."}, {NULL, NULL, 0, NULL}}; static PyTypeObject StrType = { @@ -1702,11 +1567,6 @@ static boolean_t Strs_sort_(Strs *self, parts = self->data.reordered.parts; count = self->data.reordered.count; break; - - case STRS_MULTI_SOURCE: - parts = self->data.multi_source.parts; - count = self->data.multi_source.count; - break; } // Allocate temporary memory to store the ordering offsets @@ -1868,8 +1728,6 @@ static PyMethodDef Strs_methods[] = { {"shuffle", Strs_shuffle, sz_method_flags_m, "Shuffle the elements of the Strs object."}, // {"sort", Strs_sort, sz_method_flags_m, "Sort the elements of the Strs object."}, // {"order", Strs_order, sz_method_flags_m, "Provides the indexes to achieve sorted order."}, // - {"append", Strs_append, sz_method_flags_m, "Append the sequence with a new string."}, // - {"extend", Strs_extend, sz_method_flags_m, "Extend the sequence with new strings."}, // {NULL, NULL, 0, NULL}}; static PyTypeObject StrsType = { diff --git a/scripts/test.py b/scripts/test.py deleted file mode 100644 index ea6aae8a..00000000 --- a/scripts/test.py +++ /dev/null @@ -1,272 +0,0 @@ -from typing import Union, Optional -from random import choice, randint -from string import ascii_lowercase -import math - -import pytest - -import stringzilla as sz -from stringzilla import Str, Strs - - -def test_unit_construct(): - native = "aaaaa" - big = Str(native) - assert len(big) == len(native) - - -def test_unit_indexing(): - native = "abcdef" - big = Str(native) - for i in range(len(native)): - assert big[i] == native[i] - - -def test_unit_count(): - native = "aaaaa" - big = Str(native) - assert big.count("a") == 5 - assert big.count("aa") == 2 - assert big.count("aa", allowoverlap=True) == 4 - - -def test_unit_contains(): - big = Str("abcdef") - assert "a" in big - assert "ab" in big - assert "xxx" not in big - - -def test_unit_rich_comparisons(): - assert Str("aa") == "aa" - assert Str("aa") < "b" - assert Str("abb")[1:] == "bb" - - -def test_unit_buffer_protocol(): - import numpy as np - - my_str = Str("hello") - arr = np.array(my_str) - assert arr.dtype == np.dtype("c") - assert arr.shape == (len("hello"),) - assert "".join([c.decode("utf-8") for c in arr.tolist()]) == "hello" - - -def test_unit_split(): - native = "token1\ntoken2\ntoken3" - big = Str(native) - assert native.splitlines() == list(big.splitlines()) - assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True)) - assert native.split("token3") == list(big.split("token3")) - - words = sz.split(big, "\n") - assert len(words) == 3 - assert str(words[0]) == "token1" - assert str(words[2]) == "token3" - - parts = sz.split(big, "\n", keepseparator=True) - assert len(parts) == 3 - assert str(parts[0]) == "token1\n" - assert str(parts[2]) == "token3" - - -def test_unit_sequence(): - native = "line3\nline2\nline1" - big = Str(native) - - lines = big.splitlines() - assert [2, 1, 0] == list(lines.order()) - - lines.sort() - assert [0, 1, 2] == list(lines.order()) - assert ["line1", "line2", "line3"] == list(lines) - - lines.append("line4") - assert 4 == len(lines) - lines.extend(["line5", "line6"]) - assert 6 == len(lines) - - lines.append(lines[0]) - assert 7 == len(lines) - assert lines[6] == "line1" - - lines.extend(lines) - assert 14 == len(lines) - assert lines[7] == "line1" - assert lines[8] == "line2" - assert lines[12] == "line6" - - # Test that shuffles are reproducible with the same `seed` - a = [str(s) for s in lines.shuffled(seed=42)] - b = [str(s) for s in lines.shuffled(seed=42)] - assert a == b - - -def test_unit_globals(): - """Validates that the previously unit-tested member methods are also visible as global functions.""" - - assert sz.find("abcdef", "bcdef") == 1 - assert sz.find("abcdef", "x") == -1 - - assert sz.count("abcdef", "x") == 0 - assert sz.count("aaaaa", "a") == 5 - assert sz.count("aaaaa", "aa") == 2 - assert sz.count("aaaaa", "aa", allowoverlap=True) == 4 - - assert sz.levenstein("aaa", "aaa") == 0 - assert sz.levenstein("aaa", "bbb") == 3 - assert sz.levenstein("abababab", "aaaaaaaa") == 4 - assert sz.levenstein("abababab", "aaaaaaaa", 2) == 2 - assert sz.levenstein("abababab", "aaaaaaaa", bound=2) == 2 - - -def get_random_string( - length: Optional[int] = None, variability: Optional[int] = None -) -> str: - if length is None: - length = randint(3, 300) - if variability is None: - variability = len(ascii_lowercase) - return "".join(choice(ascii_lowercase[:variability]) for _ in range(length)) - - -def is_equal_strings(native_strings, big_strings): - for native_slice, big_slice in zip(native_strings, big_strings): - assert native_slice == big_slice - - -def check_identical( - native: str, - big: Str, - needle: Optional[str] = None, - check_iterators: bool = False, -): - if needle is None: - part_offset = randint(0, len(native) - 1) - part_length = randint(1, len(native) - part_offset) - needle = native[part_offset:part_length] - - present_in_native: bool = needle in native - present_in_big = needle in big - assert present_in_native == present_in_big - assert native.find(needle) == big.find(needle) - assert native.count(needle) == big.count(needle) - - native_strings = native.split(needle) - big_strings: Strs = big.split(needle) - assert len(native_strings) == len(big_strings) - - if check_iterators: - for i in range(len(native_strings)): - assert len(native_strings[i]) == len(big_strings[i]) - assert native_strings[i] == big_strings[i] - assert [c for c in native_strings[i]] == [c for c in big_strings[i]] - - is_equal_strings(native_strings, big_strings) - - -@pytest.mark.parametrize("haystack_length", range(1, 65)) -@pytest.mark.parametrize("variability", range(1, 25)) -def test_fuzzy_substrings(haystack_length: int, variability: int): - native = get_random_string(variability=variability, length=haystack_length) - big = Str(native) - pattern = get_random_string(variability=variability, length=randint(1, 5)) - assert (pattern in native) == big.contains(pattern) - assert native.find(pattern) == big.find(pattern) - - -@pytest.mark.parametrize("repetitions", range(1, 10)) -def test_basic(repetitions: int): - native = "abcd" * repetitions - big = Str(native) - - check_identical(native, big, "a", True) - check_identical(native, big, "ab", True) - check_identical(native, big, "abc", True) - check_identical(native, big, "abcd", True) - check_identical(native, big, "abcde", True) # Missing pattern - - -@pytest.mark.parametrize("pattern_length", [1, 2, 4, 5]) -@pytest.mark.parametrize("haystack_length", range(1, 69, 3)) -@pytest.mark.parametrize("variability", range(1, 27, 3)) -def test_fuzzy(pattern_length: int, haystack_length: int, variability: int): - native = get_random_string(variability=variability, length=haystack_length) - big = Str(native) - - # Start by matching the prefix and the suffix - check_identical(native, big, native[:pattern_length]) - check_identical(native, big, native[-pattern_length:]) - - # Continue with random strs - for _ in range(haystack_length // pattern_length): - pattern = get_random_string(variability=variability, length=pattern_length) - check_identical(native, big, pattern) - - -def test_strs(): - native = get_random_string(length=10) - big = Str(native) - - assert native[0:5] == big[0:5] - assert native[5:10] == big[5:10] - - assert native[5:5] == big[5:5] - assert native[-5:-5] == big[-5:-5] - assert native[2:-2] == big[2:-2] - assert native[7:-7] == big[7:-7] - - assert native[5:3] == big[5:3] - assert native[5:7] == big[5:7] - assert native[5:-3] == big[5:-3] - assert native[5:-7] == big[5:-7] - - assert native[-5:3] == big[-5:3] - assert native[-5:7] == big[-5:7] - assert native[-5:-3] == big[-5:-3] - assert native[-5:-7] == big[-5:-7] - - assert native[2:] == big[2:] - assert native[:7] == big[:7] - assert native[-2:] == big[-2:] - assert native[:-7] == big[:-7] - assert native[:-10] == big[:-10] - assert native[:-1] == big[:-1] - - length = 1000 - native = get_random_string(length=length) - big = Str(native) - - needle = native[0 : randint(2, 5)] - native_strings = native.split(needle) - big_strings: Strs = big.split(needle) - - length = len(native_strings) - for i in range(length): - start = randint(1 - length, length - 1) - stop = randint(1 - length, length - 1) - step = 0 - while step == 0: - step = randint(-int(math.sqrt(length)), int(math.sqrt(length))) - - is_equal_strings(native_strings[start:stop:step], big_strings[start:stop:step]) - is_equal_strings( - native_strings[start:stop:step], - big_strings.sub(start, stop, step), - ) - - -def test_levenstein(): - # Create a new string by slicing and concatenating - def insert_char_at(s, char_to_insert, index): - return s[:index] + char_to_insert + s[index:] - - for _ in range(100): - a = get_random_string(length=20) - b = a - for i in range(150): - source_offset = randint(0, len(ascii_lowercase) - 1) - target_offset = randint(0, len(b) - 1) - b = insert_char_at(b, ascii_lowercase[source_offset], target_offset) - assert sz.levenstein(a, b, 200) == i + 1 diff --git a/scripts/test_fuzzy.py b/scripts/test_fuzzy.py new file mode 100644 index 00000000..7249e93b --- /dev/null +++ b/scripts/test_fuzzy.py @@ -0,0 +1,113 @@ +from typing import Union, Optional +from random import choice, randint +from string import ascii_lowercase + +import pytest + +import stringzilla as sz +from stringzilla import Str, Strs + + +def get_random_string( + length: Optional[int] = None, variability: Optional[int] = None +) -> str: + if length is None: + length = randint(3, 300) + if variability is None: + variability = len(ascii_lowercase) + return "".join(choice(ascii_lowercase[:variability]) for _ in range(length)) + + +def is_equal_strings(native_strings, big_strings): + for native_slice, big_slice in zip(native_strings, big_strings): + assert native_slice == big_slice + + +def check_identical( + native: str, + big: Str, + needle: Optional[str] = None, + check_iterators: bool = False, +): + if needle is None: + part_offset = randint(0, len(native) - 1) + part_length = randint(1, len(native) - part_offset) + needle = native[part_offset:part_length] + + present_in_native: bool = needle in native + present_in_big = needle in big + assert present_in_native == present_in_big + assert native.find(needle) == big.find(needle) + assert native.count(needle) == big.count(needle) + + native_strings = native.split(needle) + big_strings: Strs = big.split(needle) + assert len(native_strings) == len(big_strings) + + if check_iterators: + for i in range(len(native_strings)): + assert len(native_strings[i]) == len(big_strings[i]) + assert native_strings[i] == big_strings[i] + assert [c for c in native_strings[i]] == [c for c in big_strings[i]] + + is_equal_strings(native_strings, big_strings) + + +@pytest.mark.parametrize("repetitions", range(1, 10)) +def test_fuzzy_repetitions(repetitions: int): + native = "abcd" * repetitions + big = Str(native) + + check_identical(native, big, "a", True) + check_identical(native, big, "ab", True) + check_identical(native, big, "abc", True) + check_identical(native, big, "abcd", True) + check_identical(native, big, "abcde", True) # Missing pattern + + +@pytest.mark.parametrize("pattern_length", [1, 2, 4, 5]) +@pytest.mark.parametrize("haystack_length", range(1, 65)) +@pytest.mark.parametrize("variability", range(1, 25)) +def test_fuzzy_substrings(pattern_length: int, haystack_length: int, variability: int): + native = get_random_string(variability=variability, length=haystack_length) + big = Str(native) + pattern = get_random_string(variability=variability, length=pattern_length) + assert (pattern in native) == big.contains(pattern) + assert native.find(pattern) == big.find(pattern) + + +@pytest.mark.parametrize("iterations", range(100)) +@pytest.mark.parametrize("max_edit_distance", [150]) +def test_levenstein(iterations: int, max_edit_distance: int): + # Create a new string by slicing and concatenating + def insert_char_at(s, char_to_insert, index): + return s[:index] + char_to_insert + s[index:] + + for _ in range(iterations): + a = get_random_string(length=20) + b = a + for i in range(max_edit_distance): + source_offset = randint(0, len(ascii_lowercase) - 1) + target_offset = randint(0, len(b) - 1) + b = insert_char_at(b, ascii_lowercase[source_offset], target_offset) + assert sz.levenstein(a, b, 200) == i + 1 + + +@pytest.mark.parametrize("list_length", [10, 20, 30, 40, 50]) +@pytest.mark.parametrize("part_length", [5, 10]) +@pytest.mark.parametrize("variability", [2, 3]) +def test_fuzzy_sorting(list_length: int, part_length: int, variability: int): + native_list = [ + get_random_string(variability=variability, length=part_length) + for _ in range(list_length) + ] + native_joined = ".".join(native_list) + big_joined = Str(native_joined) + big_list = big_joined.split(".") + + native_list.sort() + big_list.sort() + + assert len(native_list) == len(big_list) + for native_str, big_str in zip(native_list, big_list): + assert native_str == str(big_str) diff --git a/scripts/test_units.py b/scripts/test_units.py new file mode 100644 index 00000000..a7c622e3 --- /dev/null +++ b/scripts/test_units.py @@ -0,0 +1,104 @@ +from typing import Union, Optional +from random import choice, randint +from string import ascii_lowercase + +import pytest + +import stringzilla as sz +from stringzilla import Str, Strs + + +def test_unit_construct(): + native = "aaaaa" + big = Str(native) + assert len(big) == len(native) + + +def test_unit_indexing(): + native = "abcdef" + big = Str(native) + for i in range(len(native)): + assert big[i] == native[i] + + +def test_unit_count(): + native = "aaaaa" + big = Str(native) + assert big.count("a") == 5 + assert big.count("aa") == 2 + assert big.count("aa", allowoverlap=True) == 4 + + +def test_unit_contains(): + big = Str("abcdef") + assert "a" in big + assert "ab" in big + assert "xxx" not in big + + +def test_unit_rich_comparisons(): + assert Str("aa") == "aa" + assert Str("aa") < "b" + assert Str("abb")[1:] == "bb" + + +def test_unit_buffer_protocol(): + import numpy as np + + my_str = Str("hello") + arr = np.array(my_str) + assert arr.dtype == np.dtype("c") + assert arr.shape == (len("hello"),) + assert "".join([c.decode("utf-8") for c in arr.tolist()]) == "hello" + + +def test_unit_split(): + native = "token1\ntoken2\ntoken3" + big = Str(native) + assert native.splitlines() == list(big.splitlines()) + assert native.splitlines(True) == list(big.splitlines(keeplinebreaks=True)) + assert native.split("token3") == list(big.split("token3")) + + words = sz.split(big, "\n") + assert len(words) == 3 + assert str(words[0]) == "token1" + assert str(words[2]) == "token3" + + parts = sz.split(big, "\n", keepseparator=True) + assert len(parts) == 3 + assert str(parts[0]) == "token1\n" + assert str(parts[2]) == "token3" + + +def test_unit_sequence(): + native = "line3\nline2\nline1" + big = Str(native) + + lines = big.splitlines() + assert [2, 1, 0] == list(lines.order()) + + lines.sort() + assert [0, 1, 2] == list(lines.order()) + assert ["line1", "line2", "line3"] == list(lines) + + lines.sort(reverse=True) + assert [2, 1, 0] == list(lines.order()) + assert ["line3", "line2", "line1"] == list(lines) + + +def test_unit_globals(): + """Validates that the previously unit-tested member methods are also visible as global functions.""" + + assert sz.find("abcdef", "bcdef") == 1 + assert sz.find("abcdef", "x") == -1 + + assert sz.count("abcdef", "x") == 0 + assert sz.count("aaaaa", "a") == 5 + assert sz.count("aaaaa", "aa") == 2 + assert sz.count("aaaaa", "aa", allowoverlap=True) == 4 + + assert sz.levenstein("aaa", "aaa") == 0 + assert sz.levenstein("aaa", "bbb") == 3 + assert sz.levenstein("abababab", "aaaaaaaa") == 4 + assert sz.levenstein("abababab", "aaaaaaaa", 2) == 2 + assert sz.levenstein("abababab", "aaaaaaaa", bound=2) == 2 diff --git a/scripts/wc.py b/scripts/wc.py deleted file mode 100644 index 60204345..00000000 --- a/scripts/wc.py +++ /dev/null @@ -1,11 +0,0 @@ -import argparse -from stringzilla import Str, File, Strs - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-l", "--lines", nargs="*", help="Count lines in files") - args = parser.parse_args() - if args.lines: - for filename in args.lines: - print(File(filename).count("\n")) From ccaa1d249d4091db0e19985300605d18431296e9 Mon Sep 17 00:00:00 2001 From: Nairi Harutyunyan Date: Wed, 27 Sep 2023 16:14:07 +0300 Subject: [PATCH 43/72] Minor fixes --- javascript/lib.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/javascript/lib.c b/javascript/lib.c index c2098a08..6330d90b 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -86,11 +86,13 @@ napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { strzl_needle.ptr = needle; bool overlap = false; - napi_get_value_bool(env, args[2], &overlap); + if (argc > 2) { + napi_get_value_bool(env, args[2], &overlap); + } size_t result; - if (strzl_needle.len == 0 || strzl_haystack.len == 0 || strzl_haystack.len < strzl_needle.len) { + if (strzl_needle.len == 0 || strzl_haystack.len == 0 || strzl_haystack.len < strzl_needle.len) result = 0; else if (strzl_needle.len == 1) result = count_char(strzl_haystack, strzl_needle.ptr[0]); From 730948a1f63975445e207d14ab08d16d5807be55 Mon Sep 17 00:00:00 2001 From: Nairi Harutyunyan Date: Wed, 27 Sep 2023 16:19:41 +0300 Subject: [PATCH 44/72] Add CI for JavaScript --- .github/workflows/javascript-ci.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/javascript-ci.yml diff --git a/.github/workflows/javascript-ci.yml b/.github/workflows/javascript-ci.yml new file mode 100644 index 00000000..674fa882 --- /dev/null +++ b/.github/workflows/javascript-ci.yml @@ -0,0 +1,21 @@ +name: CI +on: + pull_request: + branches: '*' + push: + branches: '*' + +jobs: + tests: + runs-on: ubuntu-latest + strategy: + matrix: + node-version: [18.x] + steps: + - uses: actions/checkout@v4 + - name: Use Node.js + uses: actions/setup-node@v3 + with: + node-version: '18.x' + - run: npm i + - run: npm test \ No newline at end of file From 35051f34164b0fb08409df89158cf875add1d81c Mon Sep 17 00:00:00 2001 From: Nairi Harutyunyan Date: Fri, 29 Sep 2023 23:41:58 +0300 Subject: [PATCH 45/72] Rename countSubstr to count --- javascript/lib.c | 8 ++++---- javascript/stringzilla.d.ts | 2 +- javascript/test/{countSubstr.js => count.js} | 16 ++++++++-------- 3 files changed, 13 insertions(+), 13 deletions(-) rename javascript/test/{countSubstr.js => count.js} (58%) diff --git a/javascript/lib.c b/javascript/lib.c index 6330d90b..2c46224c 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -64,7 +64,7 @@ size_t count_char(strzl_haystack_t strzl_haystack, char needle) { return result; } -napi_value CountSubstrAPI(napi_env env, napi_callback_info info) { +napi_value CountAPI(napi_env env, napi_callback_info info) { size_t argc = 3; napi_value args[3]; napi_get_cb_info(env, info, &argc, args, NULL, NULL); @@ -145,11 +145,11 @@ napi_value Init(napi_env env, napi_value exports) { // Define the "find" property napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0}; - // Define the "countSubstr" property - napi_property_descriptor countSubstrDesc = {"countSubstr", 0, CountSubstrAPI, 0, 0, 0, napi_default, 0}; + // Define the "count" property + napi_property_descriptor countDesc = {"count", 0, CountAPI, 0, 0, 0, napi_default, 0}; // Define an array of property descriptors - napi_property_descriptor properties[] = {findDesc, countSubstrDesc}; + napi_property_descriptor properties[] = {findDesc, countDesc}; // Define the number of properties in the array size_t propertyCount = sizeof(properties) / sizeof(properties[0]); diff --git a/javascript/stringzilla.d.ts b/javascript/stringzilla.d.ts index 57eff05b..5521c152 100644 --- a/javascript/stringzilla.d.ts +++ b/javascript/stringzilla.d.ts @@ -14,4 +14,4 @@ export function find(haystack: string, needle: string): bigint; * @param {string} needle * @param {boolean} overlap */ -export function countSubstr(haystack: string, needle: string, overlap: boolean): bigint; +export function count(haystack: string, needle: string, overlap: boolean): bigint; diff --git a/javascript/test/countSubstr.js b/javascript/test/count.js similarity index 58% rename from javascript/test/countSubstr.js rename to javascript/test/count.js index 973ba541..a1c44d16 100644 --- a/javascript/test/countSubstr.js +++ b/javascript/test/count.js @@ -5,40 +5,40 @@ import assert from 'node:assert'; const stringzilla = bindings('stringzilla'); test('Count Words - Single Occurrence', () => { - const result = stringzilla.countSubstr('hello world', 'world'); + const result = stringzilla.count('hello world', 'world'); assert.strictEqual(result, 1n); }); test('Count Words - Multiple Occurrence', () => { - const result = stringzilla.countSubstr('hello world, hello John', 'hello'); + const result = stringzilla.count('hello world, hello John', 'hello'); assert.strictEqual(result, 2n); }); test('Count Words - Multiple Occurrences with Overlap Test', () => { - const result_1 = stringzilla.countSubstr('abababab', 'aba'); + const result_1 = stringzilla.count('abababab', 'aba'); assert.strictEqual(result_1, 2n); - const result_2 = stringzilla.countSubstr('abababab', 'aba', true); + const result_2 = stringzilla.count('abababab', 'aba', true); assert.strictEqual(result_2, 3n); }); test('Count Words - No Occurrence', () => { - const result = stringzilla.countSubstr('hello world', 'hi'); + const result = stringzilla.count('hello world', 'hi'); assert.strictEqual(result, 0n); }); test('Count Words - Empty String Inputs', () => { - const result_1 = stringzilla.countSubstr('hello world', ''); + const result_1 = stringzilla.count('hello world', ''); assert.strictEqual(result_1, 0n); - const result_2 = stringzilla.countSubstr('', 'hi'); + const result_2 = stringzilla.count('', 'hi'); assert.strictEqual(result_2, 0n); - const result_3 = stringzilla.countSubstr('', ''); + const result_3 = stringzilla.count('', ''); assert.strictEqual(result_3, 0n); }); From ff76eaf815abb2bb39f815eed042f0e52964f4be Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 5 Oct 2023 11:54:44 -0700 Subject: [PATCH 46/72] Fix: Applying sort order in Python --- python/lib.c | 17 ++++++++--------- scripts/test_fuzzy.py | 10 +++++++++- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/python/lib.c b/python/lib.c index 679c4f30..17324ba6 100644 --- a/python/lib.c +++ b/python/lib.c @@ -168,17 +168,16 @@ void reverse_haystacks(sz_haystack_t *array, size_t length) { void apply_order(sz_haystack_t *array, sz_size_t *order, size_t length) { for (size_t i = 0; i < length; ++i) { - while (order[i] != i) { - // Swap array[i] and array[order[i]] + if (i == order[i]) continue; sz_haystack_t temp = array[i]; - array[i] = array[order[i]]; - array[order[i]] = temp; - - // Also update the order array to reflect the swap - size_t temp_idx = order[i]; - order[i] = order[temp_idx]; - order[temp_idx] = temp_idx; + size_t k = i, j; + while (i != (j = order[k])) { + array[k] = array[j]; + order[k] = k; + k = j; } + array[k] = temp; + order[k] = k; } } diff --git a/scripts/test_fuzzy.py b/scripts/test_fuzzy.py index 7249e93b..ba4aca42 100644 --- a/scripts/test_fuzzy.py +++ b/scripts/test_fuzzy.py @@ -105,9 +105,17 @@ def test_fuzzy_sorting(list_length: int, part_length: int, variability: int): big_joined = Str(native_joined) big_list = big_joined.split(".") + native_ordered = sorted(native_list) + native_order = big_list.order() + for i in range(list_length): + assert native_ordered[i] == native_list[native_order[i]], "Order is wrong" + assert native_ordered[i] == str( + big_list[int(native_order[i])] + ), "Split is wrong?!" + native_list.sort() big_list.sort() assert len(native_list) == len(big_list) for native_str, big_str in zip(native_list, big_list): - assert native_str == str(big_str) + assert native_str == str(big_str), "Order is wrong" From 69636f004fdebcce368deab264741ff33c45a6f6 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 5 Oct 2023 12:49:59 -0700 Subject: [PATCH 47/72] Fix: Reverse order --- .clang-format | 4 ++-- python/lib.c | 12 ++++++------ scripts/bench.py | 4 ++-- scripts/test_units.py | 9 +++++---- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.clang-format b/.clang-format index e0f25893..bf4937c3 100644 --- a/.clang-format +++ b/.clang-format @@ -17,9 +17,9 @@ AllowAllArgumentsOnNextLine: false AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: Always +AllowShortIfStatementsOnASingleLine: Always AllowShortCaseLabelsOnASingleLine: true AllowShortFunctionsOnASingleLine: true -AllowShortIfStatementsOnASingleLine: Always AllowShortLambdasOnASingleLine: true AllowShortLoopsOnASingleLine: true AlwaysBreakTemplateDeclarations: Yes @@ -46,7 +46,7 @@ BraceWrapping: IndentBraces: false -SortIncludes: false +SortIncludes: true SortUsingDeclarations: true SpaceAfterCStyleCast: false diff --git a/python/lib.c b/python/lib.c index 17324ba6..cef4a751 100644 --- a/python/lib.c +++ b/python/lib.c @@ -9,10 +9,10 @@ #define NOMINMAX #include #else -#include -#include // `stat` -#include // `mmap` #include // `O_RDNLY` +#include // `mmap` +#include // `stat` +#include #endif #ifdef _MSC_VER @@ -169,7 +169,7 @@ void reverse_haystacks(sz_haystack_t *array, size_t length) { void apply_order(sz_haystack_t *array, sz_size_t *order, size_t length) { for (size_t i = 0; i < length; ++i) { if (i == order[i]) continue; - sz_haystack_t temp = array[i]; + sz_haystack_t temp = array[i]; size_t k = i, j; while (i != (j = order[k])) { array[k] = array[j]; @@ -1638,7 +1638,7 @@ static PyObject *Strs_sort(Strs *self, PyObject *args, PyObject *kwargs) { sz_haystack_t *parts = NULL; sz_size_t *order = NULL; - sz_size_t *count = NULL; + sz_size_t count = 0; if (!Strs_sort_(self, &parts, &order, &count)) return NULL; // Apply the sorting algorithm here, considering the `reverse` value @@ -1691,7 +1691,7 @@ static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) { sz_haystack_t *parts = NULL; sz_size_t *order = NULL; - sz_size_t count = NULL; + sz_size_t count = 0; if (!Strs_sort_(self, &parts, &order, &count)) return NULL; // Apply the sorting algorithm here, considering the `reverse` value diff --git a/scripts/bench.py b/scripts/bench.py index 5522031a..a7c864fb 100644 --- a/scripts/bench.py +++ b/scripts/bench.py @@ -53,8 +53,8 @@ def bench( stringzilla_file = File(haystack_path) else: haystack_length = int(haystack_length) - repretitions = haystack_length // len(haystack_pattern) - pythonic_str: str = haystack_pattern * repretitions + repetitions = haystack_length // len(haystack_pattern) + pythonic_str: str = haystack_pattern * repetitions stringzilla_file = None stringzilla_str = Str(pythonic_str) diff --git a/scripts/test_units.py b/scripts/test_units.py index a7c622e3..a2f985a7 100644 --- a/scripts/test_units.py +++ b/scripts/test_units.py @@ -71,7 +71,7 @@ def test_unit_split(): def test_unit_sequence(): - native = "line3\nline2\nline1" + native = "p3\np2\np1" big = Str(native) lines = big.splitlines() @@ -79,11 +79,12 @@ def test_unit_sequence(): lines.sort() assert [0, 1, 2] == list(lines.order()) - assert ["line1", "line2", "line3"] == list(lines) + assert ["p1", "p2", "p3"] == list(lines) + # Reverse order + assert [2, 1, 0] == list(lines.order(reverse=True)) lines.sort(reverse=True) - assert [2, 1, 0] == list(lines.order()) - assert ["line3", "line2", "line1"] == list(lines) + assert ["p3", "p2", "p1"] == list(lines) def test_unit_globals(): From b1cf5e5c29ca8a58cd39ba86aec932db6d9a2cba Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 5 Oct 2023 14:16:52 -0700 Subject: [PATCH 48/72] Fix: Buffer width for NodeJS bindings --- javascript/lib.c | 148 ++++++++++++++---------------------- javascript/stringzilla.d.ts | 17 ----- javascript/stringzilla.js | 24 +++++- python/lib.c | 24 +++--- scripts/test.c | 14 ++-- stringzilla/stringzilla.h | 12 +-- 6 files changed, 107 insertions(+), 132 deletions(-) delete mode 100644 javascript/stringzilla.d.ts diff --git a/javascript/lib.c b/javascript/lib.c index 2c46224c..3644468a 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -1,12 +1,11 @@ /** - * @file lib.c - * @author Ash Vardanian - * @brief JavaScript bindings for StringZilla. - * @date 2023-09-18 + * @file lib.c + * @brief JavaScript bindings for StringZilla. + * @author Ash Vardanian + * @date September 18, 2023 * - * @copyright Copyright (c) 2023 - * - * @see NodeJS docs: https://nodejs.org/api/n-api.html + * @copyright Copyright (c) 2023 + * @see NodeJS docs: https://nodejs.org/api/n-api.html */ #include @@ -18,49 +17,39 @@ napi_value FindAPI(napi_env env, napi_callback_info info) { napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - struct strzl_haystack_t strzl_haystack = {NULL, 0}; - struct strzl_needle_t strzl_needle = {NULL, 0, 0}; + sz_haystack_t haystack_sz = {NULL, 0}; + sz_needle_t needle_sz = {NULL, 0, 0}; // For haystack - napi_get_value_string_utf8(env, args[0], NULL, 0, &strzl_haystack.len); - char *haystack = malloc(strzl_haystack.len); - napi_get_value_string_utf8(env, args[0], haystack, strzl_haystack.len, &strzl_haystack.len); - strzl_haystack.ptr = haystack; + napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_sz.length); + haystack_sz.start = malloc(haystack_sz.length + 1); + napi_get_value_string_utf8(env, args[0], haystack_sz.start, haystack_sz.length + 1, &haystack_sz.length); // For needle - napi_get_value_string_utf8(env, args[1], NULL, 0, &strzl_needle.len); - char *needle = malloc(strzl_needle.len); - napi_get_value_string_utf8(env, args[1], needle, strzl_needle.len, &strzl_needle.len); - strzl_needle.ptr = needle; - -// Perform the find operation -#if defined(__AVX2__) - uint64_t result = strzl_avx2_find_substr(strzl_haystack, strzl_needle); -#elif defined(__ARM_NEON) - uint64_t result = strzl_neon_find_substr(strzl_haystack, strzl_needle); -#else - uint64_t result = strzl_naive_find_substr(strzl_haystack, strzl_needle); -#endif + napi_get_value_string_utf8(env, args[1], NULL, 0, &needle_sz.length); + needle_sz.start = malloc(needle_sz.length + 1); + napi_get_value_string_utf8(env, args[1], needle_sz.start, needle_sz.length + 1, &needle_sz.length); + + // Perform the find operation + sz_size_t result = sz_find_substr(haystack_sz, needle_sz); // Cleanup - free(haystack); - free(needle); + free(haystack_sz.start); + free(needle_sz.start); // Convert the result to JavaScript BigInt and return napi_value js_result; // In JavaScript, if `find` is unable to find the specified value, then it should return -1 - if (result == strzl_haystack.len) - napi_create_bigint_int64(env, -1, &js_result); + if (result == haystack_sz.length) napi_create_bigint_int64(env, -1, &js_result); else napi_create_bigint_uint64(env, result, &js_result); return js_result; } -size_t count_char(strzl_haystack_t strzl_haystack, char needle) { - size_t result = strzl_naive_count_char(strzl_haystack, needle); - +size_t count_char(sz_haystack_t haystack_sz, char needle) { + size_t result = sz_count_char(haystack_sz, needle); return result; } @@ -70,91 +59,66 @@ napi_value CountAPI(napi_env env, napi_callback_info info) { napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - struct strzl_haystack_t strzl_haystack = {NULL, 0}; - struct strzl_needle_t strzl_needle = {NULL, 0, 0}; + sz_haystack_t haystack_sz = {NULL, 0}; + sz_needle_t needle_sz = {NULL, 0, 0}; // For haystack - napi_get_value_string_utf8(env, args[0], NULL, 0, &strzl_haystack.len); - char *haystack = malloc(strzl_haystack.len); - napi_get_value_string_utf8(env, args[0], haystack, strzl_haystack.len, &strzl_haystack.len); - strzl_haystack.ptr = haystack; + napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_sz.length); + haystack_sz.start = malloc(haystack_sz.length + 1); + napi_get_value_string_utf8(env, args[0], haystack_sz.start, haystack_sz.length + 1, &haystack_sz.length); // For needle - napi_get_value_string_utf8(env, args[1], NULL, 0, &strzl_needle.len); - char *needle = malloc(strzl_needle.len); - napi_get_value_string_utf8(env, args[1], needle, strzl_needle.len, &strzl_needle.len); - strzl_needle.ptr = needle; + napi_get_value_string_utf8(env, args[1], NULL, 0, &needle_sz.length); + needle_sz.start = malloc(needle_sz.length + 1); + napi_get_value_string_utf8(env, args[1], needle_sz.start, needle_sz.length + 1, &needle_sz.length); bool overlap = false; - if (argc > 2) { - napi_get_value_bool(env, args[2], &overlap); - } + if (argc > 2) { napi_get_value_bool(env, args[2], &overlap); } - size_t result; + void const *haystack_start = haystack_sz.start, *needle_start = needle_sz.start; - if (strzl_needle.len == 0 || strzl_haystack.len == 0 || strzl_haystack.len < strzl_needle.len) - result = 0; - else if (strzl_needle.len == 1) - result = count_char(strzl_haystack, strzl_needle.ptr[0]); + size_t count = 0; + if (needle_sz.length == 0 || haystack_sz.length == 0 || haystack_sz.length < needle_sz.length) { count = 0; } + else if (needle_sz.length == 1) { count = count_char(haystack_sz, needle_sz.start[0]); } else if (overlap) { - while (strzl_haystack.len) { -#if defined(__AVX2__) - size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle); -#elif defined(__ARM_NEON) - size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle); -#else - size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle); -#endif - - bool found = offset != strzl_haystack.len; - result += found; - strzl_haystack.ptr += offset + found; - strzl_haystack.len -= offset + found; + while (haystack_sz.length) { + sz_size_t offset = sz_find_substr(haystack_sz, needle_sz); + int found = offset != haystack_sz.length; + count += found; + haystack_sz.start += offset + found; + haystack_sz.length -= offset + found; } } - else { - while (strzl_haystack.len) { -#if defined(__AVX2__) - size_t offset = strzl_avx2_find_substr(strzl_haystack, strzl_needle); -#elif defined(__ARM_NEON) - size_t offset = strzl_neon_find_substr(strzl_haystack, strzl_needle); -#else - size_t offset = strzl_naive_find_substr(strzl_haystack, strzl_needle); -#endif - - bool found = offset != strzl_haystack.len; - result += found; - strzl_haystack.ptr += offset + strzl_needle.len; - strzl_haystack.len -= offset + strzl_needle.len * found; + while (haystack_sz.length) { + sz_size_t offset = sz_find_substr(haystack_sz, needle_sz); + int found = offset != haystack_sz.length; + count += found; + haystack_sz.start += offset + needle_sz.length; + haystack_sz.length -= offset + needle_sz.length * found; } } // Cleanup - free(haystack); - free(needle); + free(haystack_start); + free(needle_start); - // Convert the result to JavaScript `BigInt` and return - napi_value js_result; - napi_create_bigint_uint64(env, result, &js_result); + // Convert the `count` to JavaScript `BigInt` and return + napi_value js_count; + napi_create_bigint_uint64(env, count, &js_count); - return js_result; + return js_count; } napi_value Init(napi_env env, napi_value exports) { - // Define the "find" property - napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0}; - - // Define the "count" property - napi_property_descriptor countDesc = {"count", 0, CountAPI, 0, 0, 0, napi_default, 0}; // Define an array of property descriptors + napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0}; + napi_property_descriptor countDesc = {"count", 0, CountAPI, 0, 0, 0, napi_default, 0}; napi_property_descriptor properties[] = {findDesc, countDesc}; - // Define the number of properties in the array - size_t propertyCount = sizeof(properties) / sizeof(properties[0]); - // Define the properties on the `exports` object + size_t propertyCount = sizeof(properties) / sizeof(properties[0]); napi_define_properties(env, exports, propertyCount, properties); return exports; diff --git a/javascript/stringzilla.d.ts b/javascript/stringzilla.d.ts deleted file mode 100644 index 5521c152..00000000 --- a/javascript/stringzilla.d.ts +++ /dev/null @@ -1,17 +0,0 @@ - -/** - * Searches for a short string in a long one. - * - * @param {string} haystack - * @param {string} needle - */ -export function find(haystack: string, needle: string): bigint; - -/** - * Searches for a substring in a larger string. - * - * @param {string} haystack - * @param {string} needle - * @param {boolean} overlap - */ -export function count(haystack: string, needle: string, overlap: boolean): bigint; diff --git a/javascript/stringzilla.js b/javascript/stringzilla.js index d163bee8..24b78e24 100644 --- a/javascript/stringzilla.js +++ b/javascript/stringzilla.js @@ -1,2 +1,22 @@ -const stringzilla = require('bindings')('stringzilla'); -module.exports = stringzilla; \ No newline at end of file +const compiled = require('bindings')('stringzilla'); + +module.exports = { + /** + * Searches for a short string in a long one. + * + * @param {string} haystack + * @param {string} needle + * @returns {bigint} + */ + find: compiled.find, + + /** + * Searches for a substring in a larger string. + * + * @param {string} haystack + * @param {string} needle + * @param {boolean} overlap + * @returns {bigint} + */ + count: compiled.count +}; diff --git a/python/lib.c b/python/lib.c index cef4a751..ad10f196 100644 --- a/python/lib.c +++ b/python/lib.c @@ -1,6 +1,10 @@ /** - * @brief Very light-weight CPython wrapper for StringZilla, with support for memory-mapping, - * native Python strings, Apache Arrow collections, and more. + * @file lib.c + * @brief Very light-weight CPython wrapper for StringZilla, with support for memory-mapping, + * native Python strings, Apache Arrow collections, and more. + * @author Ash Vardanian + * @date July 10, 2023 + * @copyright Copyright (c) 2023 * * - Doesn't use PyBind11, NanoBind, Boost.Python, or any other high-level libs, only CPython API. * - To minimize latency this implementation avoids `PyArg_ParseTupleAndKeywords` calls. @@ -646,7 +650,7 @@ static int Str_in(Str *self, PyObject *arg) { sz_haystack_t haystack; haystack.start = self->start; haystack.length = self->length; - size_t position = sz_find_substr_auto(haystack, needle_struct); + size_t position = sz_find_substr(haystack, needle_struct); return position != haystack.length; } @@ -881,7 +885,7 @@ static int Str_find_( // haystack.length = normalized_length; // Perform contains operation - size_t offset = sz_find_substr_auto(haystack, needle); + size_t offset = sz_find_substr(haystack, needle); if (offset == haystack.length) { *offset_out = -1; } else { *offset_out = (Py_ssize_t)offset; } @@ -1008,11 +1012,13 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { haystack.start += normalized_offset; haystack.length = normalized_length; - size_t count = needle.length == 1 ? sz_count_char_swar(haystack, *needle.start) : 0; - if (needle.length != 1) { + size_t count = 0; + if (needle.length == 0 || haystack.length == 0 || haystack.length < needle.length) { count = 0; } + else if (needle.length == 1) { count = sz_count_char(haystack, needle.start[0]); } + else if (needle.length != 1) { if (allowoverlap) { while (haystack.length) { - size_t offset = sz_find_substr_auto(haystack, needle); + sz_size_t offset = sz_find_substr(haystack, needle); int found = offset != haystack.length; count += found; haystack.start += offset + found; @@ -1021,7 +1027,7 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { } else { while (haystack.length) { - size_t offset = sz_find_substr_auto(haystack, needle); + sz_size_t offset = sz_find_substr(haystack, needle); int found = offset != haystack.length; count += found; haystack.start += offset + needle.length; @@ -1207,7 +1213,7 @@ static Strs *Str_split_( sz_haystack_t text_remaining; text_remaining.start = text.start + last_start; text_remaining.length = text.length - last_start; - sz_size_t offset_in_remaining = sz_find_substr_auto(text_remaining, separator); + sz_size_t offset_in_remaining = sz_find_substr(text_remaining, separator); // Reallocate offsets array if needed if (offsets_count >= offsets_capacity) { diff --git a/scripts/test.c b/scripts/test.c index f50d7e62..a921e76d 100644 --- a/scripts/test.c +++ b/scripts/test.c @@ -1,9 +1,9 @@ +#include +#include #include #include -#include #include #include -#include #include @@ -18,8 +18,8 @@ void populate_random_string(char *buffer, int length, int variability) { buffer[length] = '\0'; } -// Test function for sz_find_substr_auto -void test_sz_find_substr_auto() { +// Test function for sz_find_substr +void test_sz_find_substr() { char buffer[MAX_LENGTH + 1]; char pattern[6]; // Maximum length of 5 + 1 for '\0' @@ -40,11 +40,11 @@ void test_sz_find_substr_auto() { // Comparing the result of your function with the standard library function. const char *result_libc = strstr(buffer, pattern); - uint64_t result_stringzilla = sz_find_substr_auto(haystack, needle); + uint64_t result_stringzilla = sz_find_substr(haystack, needle); assert(((result_libc && result_stringzilla == (uint64_t)(result_libc - buffer)) || (!result_libc && result_stringzilla == (uint64_t)-1)) && - "Test failed for sz_find_substr_auto"); + "Test failed for sz_find_substr"); } } } @@ -52,7 +52,7 @@ void test_sz_find_substr_auto() { int main() { srand((unsigned int)time(NULL)); - test_sz_find_substr_auto(); + test_sz_find_substr(); // Add calls to other test functions as you implement them printf("All tests passed!\n"); diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 52cc4ec6..7e2957a3 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -1,12 +1,12 @@ #ifndef STRINGZILLA_H_ #define STRINGZILLA_H_ -#include // `uint8_t` +#include // `tolower` +#include // `qsort_s` #include // `sz_size_t` -#include // `memcpy` +#include // `uint8_t` #include // `qsort_r` -#include // `qsort_s` -#include // `tolower` +#include // `memcpy` #if defined(__AVX2__) #include @@ -427,7 +427,9 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { #endif // Arm Neon -inline static sz_size_t sz_find_substr_auto(sz_haystack_t h, sz_needle_t n) { +inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_char_swar(h, n); } + +inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) { if (h.length < n.length) return h.length; switch (n.length) { From 7a317c072072ab431e38ae1f7dedad13490f7722 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 5 Oct 2023 14:23:43 -0700 Subject: [PATCH 49/72] Make: Shift JavaScript CI --- .github/workflows/javascript-ci.yml | 21 --------------------- .github/workflows/prerelease.yml | 17 +++++++++++++++++ .github/workflows/release.yml | 2 +- README.md | 10 +++++----- stringzilla/stringzilla.h | 1 + 5 files changed, 24 insertions(+), 27 deletions(-) delete mode 100644 .github/workflows/javascript-ci.yml diff --git a/.github/workflows/javascript-ci.yml b/.github/workflows/javascript-ci.yml deleted file mode 100644 index 674fa882..00000000 --- a/.github/workflows/javascript-ci.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: CI -on: - pull_request: - branches: '*' - push: - branches: '*' - -jobs: - tests: - runs-on: ubuntu-latest - strategy: - matrix: - node-version: [18.x] - steps: - - uses: actions/checkout@v4 - - name: Use Node.js - uses: actions/setup-node@v3 - with: - node-version: '18.x' - - run: npm i - - run: npm test \ No newline at end of file diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index 9c6bdab9..9940829f 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -70,4 +70,21 @@ jobs: - name: Test with PyTest run: pytest scripts/ + test_javascript: + runs-on: ubuntu-latest + strategy: + matrix: + node-version: [18.x] + steps: + + - uses: actions/checkout@v4 + - name: Set up Node.js + uses: actions/setup-node@v3 + with: + node-version: '18.x' + + - name: Build locally + run: npm i + - name: Test + run: npm test diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f583fc05..b96529a7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -127,4 +127,4 @@ jobs: - run: npm test - uses: JS-DevTools/npm-publish@v2 with: - token: ${{ secrets.NPM_TOKEN }} \ No newline at end of file + token: ${{ secrets.NPM_TOKEN }} diff --git a/README.md b/README.md index f774df35..3c04c219 100644 --- a/README.md +++ b/README.md @@ -119,9 +119,9 @@ sz_haystack_t haystack = {your_text, your_text_length}; sz_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset}; // Perform string-level operations -size_t character_count = sz_count_char_swar(haystack, 'a'); -size_t character_position = sz_find_char_swar(haystack, 'a'); -size_t substring_position = sz_find_substr_swar(haystack, needle); +size_t character_count = sz_count_char(haystack, 'a'); +size_t character_position = sz_find_char(haystack, 'a'); +size_t substring_position = sz_find_substr(haystack, needle); // Perform collection level operations sz_sequence_t array = {your_order, your_count, your_get_start, your_get_length, your_handle}; @@ -148,7 +148,7 @@ Here's how to set up your dev environment and run some tests. CPython: ```sh -# Clean up and install +# Clean up, install, and test! rm -rf build && pip install -e . && pytest scripts/ -s -x # Install without dependencies @@ -158,7 +158,7 @@ pip install -e . --no-index --no-deps NodeJS: ```sh -npm install && node javascript/test.js +npm install && npm test ``` ### Benchmarking diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 7e2957a3..e1425729 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -428,6 +428,7 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { #endif // Arm Neon inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_char_swar(h, n); } +inline static sz_size_t sz_find_char(sz_haystack_t h, char n) { return sz_find_char_swar(h, n); } inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) { if (h.length < n.length) return h.length; From 61588cc81f21f0e3b539dcd56a08cecc04677c05 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 5 Oct 2023 14:26:40 -0700 Subject: [PATCH 50/72] Improve: Silence type-casting warnings --- javascript/lib.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/javascript/lib.c b/javascript/lib.c index 3644468a..fe1f5f68 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -21,21 +21,29 @@ napi_value FindAPI(napi_env env, napi_callback_info info) { sz_needle_t needle_sz = {NULL, 0, 0}; // For haystack - napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_sz.length); + napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length); haystack_sz.start = malloc(haystack_sz.length + 1); - napi_get_value_string_utf8(env, args[0], haystack_sz.start, haystack_sz.length + 1, &haystack_sz.length); + napi_get_value_string_utf8(env, + args[0], + (char *)haystack_sz.start, + haystack_sz.length + 1, + (size_t *)&haystack_sz.length); // For needle - napi_get_value_string_utf8(env, args[1], NULL, 0, &needle_sz.length); + napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle_sz.length); needle_sz.start = malloc(needle_sz.length + 1); - napi_get_value_string_utf8(env, args[1], needle_sz.start, needle_sz.length + 1, &needle_sz.length); + napi_get_value_string_utf8(env, + args[1], + (char *)needle_sz.start, + needle_sz.length + 1, + (size_t *)&needle_sz.length); // Perform the find operation sz_size_t result = sz_find_substr(haystack_sz, needle_sz); // Cleanup - free(haystack_sz.start); - free(needle_sz.start); + free((void *)haystack_sz.start); + free((void *)needle_sz.start); // Convert the result to JavaScript BigInt and return napi_value js_result; @@ -63,14 +71,22 @@ napi_value CountAPI(napi_env env, napi_callback_info info) { sz_needle_t needle_sz = {NULL, 0, 0}; // For haystack - napi_get_value_string_utf8(env, args[0], NULL, 0, &haystack_sz.length); + napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length); haystack_sz.start = malloc(haystack_sz.length + 1); - napi_get_value_string_utf8(env, args[0], haystack_sz.start, haystack_sz.length + 1, &haystack_sz.length); + napi_get_value_string_utf8(env, + args[0], + (char *)haystack_sz.start, + haystack_sz.length + 1, + (size_t *)&haystack_sz.length); // For needle - napi_get_value_string_utf8(env, args[1], NULL, 0, &needle_sz.length); + napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle_sz.length); needle_sz.start = malloc(needle_sz.length + 1); - napi_get_value_string_utf8(env, args[1], needle_sz.start, needle_sz.length + 1, &needle_sz.length); + napi_get_value_string_utf8(env, + args[1], + (char *)needle_sz.start, + needle_sz.length + 1, + (size_t *)&needle_sz.length); bool overlap = false; if (argc > 2) { napi_get_value_bool(env, args[2], &overlap); } @@ -100,8 +116,8 @@ napi_value CountAPI(napi_env env, napi_callback_info info) { } // Cleanup - free(haystack_start); - free(needle_start); + free((void *)haystack_start); + free((void *)needle_start); // Convert the `count` to JavaScript `BigInt` and return napi_value js_count; From b5a0ccf3fe0e4401da06d89461e53b124fa9e034 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 5 Oct 2023 15:13:17 -0700 Subject: [PATCH 51/72] Fix: Overflow bug in Arm NEON --- stringzilla/stringzilla.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index e1425729..c1e6adc3 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -408,7 +408,7 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { // vorrq_u32 (all) uint32x4_t matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3)); uint64x2_t matches64x2 = vreinterpretq_u64_u32(matches); - int has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1); + uint64_t has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1); if (has_match) { for (sz_size_t i = 0; i < 16; i++) { @@ -439,15 +439,13 @@ inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) { case 2: return sz_find_2chars_swar(h, n.start); case 3: return sz_find_3chars_swar(h, n.start); case 4: - return sz_find_4chars_swar(h, n.start); - // #if defined(__ARM_NEON) - // default: return sz_find_substr_neon(h, n); - // #elif defined(__AVX2__) - // default: return sz_find_substr_avx2(h, n); - // #else - default: - return sz_find_substr_swar(h, n); - // #endif +#if defined(__ARM_NEON) + default: return sz_find_substr_neon(h, n); +#elif defined(__AVX2__) + default: return sz_find_substr_avx2(h, n); +#else + default: return sz_find_substr_swar(h, n); +#endif } } From e278b454c05a7598b13cf05e03ed8f32ca3c6622 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 5 Oct 2023 15:15:34 -0700 Subject: [PATCH 52/72] Make: Dependencies for testing --- .github/workflows/prerelease.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index 9940829f..7fb45e98 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -61,7 +61,7 @@ jobs: - name: Install dependencies run: | - python -m pip install --no-cache-dir --upgrade pip + python -m pip install --no-cache-dir --upgrade pip numpy pip install --no-cache-dir pytest - name: Build locally @@ -71,6 +71,7 @@ jobs: run: pytest scripts/ test_javascript: + name: Test JavaScript runs-on: ubuntu-latest strategy: matrix: From b281bf0302ff1b1f8092e5933d80b5663d6a9d26 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 5 Oct 2023 15:55:49 -0700 Subject: [PATCH 53/72] Improve: Avoid inner `for`-loop on Arm NEON --- scripts/test_fuzzy.py | 10 ++++-- stringzilla/stringzilla.h | 70 +++++++++++++++++++++++---------------- 2 files changed, 49 insertions(+), 31 deletions(-) diff --git a/scripts/test_fuzzy.py b/scripts/test_fuzzy.py index ba4aca42..dbefd485 100644 --- a/scripts/test_fuzzy.py +++ b/scripts/test_fuzzy.py @@ -65,15 +65,19 @@ def test_fuzzy_repetitions(repetitions: int): check_identical(native, big, "abcde", True) # Missing pattern -@pytest.mark.parametrize("pattern_length", [1, 2, 4, 5]) +@pytest.mark.parametrize("pattern_length", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("haystack_length", range(1, 65)) @pytest.mark.parametrize("variability", range(1, 25)) def test_fuzzy_substrings(pattern_length: int, haystack_length: int, variability: int): native = get_random_string(variability=variability, length=haystack_length) big = Str(native) pattern = get_random_string(variability=variability, length=pattern_length) - assert (pattern in native) == big.contains(pattern) - assert native.find(pattern) == big.find(pattern) + assert (pattern in native) == big.contains( + pattern + ), f"Failed to check if {pattern} at offset {native.find(pattern)} is present in {native}" + assert native.find(pattern) == big.find( + pattern + ), f"Failed to locate {pattern} at offset {native.find(pattern)} in {native}" @pytest.mark.parametrize("iterations", range(100)) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index c1e6adc3..7b664ca6 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -393,28 +393,49 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { uint32x4_t const anomalies = vld1q_dup_u32(&anomaly); uint32x4_t const masks = vld1q_dup_u32(&mask); + uint32x4_t matches, matches0, matches1, matches2, matches3; char const *text = h.start; - for (; (text + n.length + 16) <= end; text += 16) { - - uint32x4_t matches0 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 0)), masks), anomalies); - uint32x4_t matches1 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 1)), masks), anomalies); - uint32x4_t matches2 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 2)), masks), anomalies); - uint32x4_t matches3 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 3)), masks), anomalies); - - // Extracting matches from matches: - // vmaxvq_u32 (only a64) - // vgetq_lane_u32 (all) - // vorrq_u32 (all) - uint32x4_t matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3)); - uint64x2_t matches64x2 = vreinterpretq_u64_u32(matches); - uint64_t has_match = vgetq_lane_u64(matches64x2, 0) | vgetq_lane_u64(matches64x2, 1); - - if (has_match) { - for (sz_size_t i = 0; i < 16; i++) { - if (sz_equal(text + i, n.start, n.length)) return i + (text - h.start); + while (text + n.length + 16 <= end) { + + // Each of the following `matchesX` contains only 4 relevant bits - one per word. + // Each signifies a match at the given offset. + matches0 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 0)), masks), anomalies); + matches1 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 1)), masks), anomalies); + matches2 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 2)), masks), anomalies); + matches3 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 3)), masks), anomalies); + matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3)); + + if (vmaxvq_u32(matches)) { + // Let's isolate the match from every word + matches0 = vandq_u32(matches0, vdupq_n_u32(0x00000001)); + matches1 = vandq_u32(matches1, vdupq_n_u32(0x00000002)); + matches2 = vandq_u32(matches2, vdupq_n_u32(0x00000004)); + matches3 = vandq_u32(matches3, vdupq_n_u32(0x00000008)); + matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3)); + + // By now, every 32-bit word of `matches` no more than 4 set bits. + // Meaning that we can narrow it down to a single 16-bit word. + uint16x4_t matches_u16x4 = vmovn_u32(matches); + uint16_t matches_u16 = // + (vget_lane_u16(matches_u16x4, 0) << 0) | // + (vget_lane_u16(matches_u16x4, 1) << 4) | // + (vget_lane_u16(matches_u16x4, 2) << 8) | // + (vget_lane_u16(matches_u16x4, 3) << 12); + + // Find the first match + size_t first_match_offset = __builtin_ctz(matches_u16); + if (n.length > 4) { + if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4)) + return text + first_match_offset - h.start; + else + text += first_match_offset + 1; } + else + return text + first_match_offset - h.start; } + else + text += 16; } // Don't forget the last (up to 16+3=19) characters. @@ -433,20 +454,13 @@ inline static sz_size_t sz_find_char(sz_haystack_t h, char n) { return sz_find_c inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) { if (h.length < n.length) return h.length; - switch (n.length) { - case 0: return 0; - case 1: return sz_find_char_swar(h, *n.start); - case 2: return sz_find_2chars_swar(h, n.start); - case 3: return sz_find_3chars_swar(h, n.start); - case 4: #if defined(__ARM_NEON) - default: return sz_find_substr_neon(h, n); + return sz_find_substr_neon(h, n); #elif defined(__AVX2__) - default: return sz_find_substr_avx2(h, n); + return sz_find_substr_avx2(h, n); #else - default: return sz_find_substr_swar(h, n); + return sz_find_substr_swar(h, n); #endif - } } inline static void sz_swap(sz_size_t *a, sz_size_t *b) { From 305cec84d79429a8e9263c6b4b0a9e0c6dadcc8d Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 5 Oct 2023 16:06:14 -0700 Subject: [PATCH 54/72] Add: Micro-benchmarking notebook --- scripts/bench.ipynb | 185 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 scripts/bench.ipynb diff --git a/scripts/bench.ipynb b/scripts/bench.ipynb new file mode 100644 index 00000000..b69d2f8f --- /dev/null +++ b/scripts/bench.ipynb @@ -0,0 +1,185 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import stringzilla as sz" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "129,644,797\n" + ] + } + ], + "source": [ + "pythonic_str: str = open(\"../leipzig1M.txt\", \"r\").read()\n", + "print(f\"{len(pythonic_str):,}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "sz_str = sz.Str(pythonic_str)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "pattern = \"the\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1,456,488\n" + ] + } + ], + "source": [ + "print(f\"{pythonic_str.count(pattern):,}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1,456,488\n" + ] + } + ], + "source": [ + "print(f\"{sz_str.count(pattern):,}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "150 ms ± 2.01 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit -n 1 -r 100\n", + "pythonic_str.count(pattern)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "37.8 ms ± 286 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit -n 1 -r 100\n", + "sz_str.count(pattern)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "173 ns ± 23.7 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit -n 1 -r 1000\n", + "pythonic_str.find(pattern)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The slowest run took 82.51 times longer than the fastest. This could mean that an intermediate result is being cached.\n", + "94.3 ns ± 108 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit -n 1 -r 1000\n", + "sz_str.find(pattern)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From b09cb3d64644d6d007b6f118abcbaccd0f9eab3f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Oct 2023 15:43:57 -0700 Subject: [PATCH 55/72] Make: `numpy` dependency --- .github/workflows/prerelease.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index 7fb45e98..dd09096b 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -34,7 +34,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --no-cache-dir --upgrade pip + python -m pip install --no-cache-dir --upgrade pip numpy pip install --no-cache-dir pytest - name: Build locally run: python -m pip install . From 779dded5c653bf9be56ec270ba0d8e9ee1a26052 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Oct 2023 18:12:38 -0700 Subject: [PATCH 56/72] Improve: drop `ctype`, `stddef`, `stdint` headers --- .vscode/settings.json | 2 + README.md | 4 +- python/lib.c | 42 +++--- scripts/bench.ipynb | 2 +- stringzilla/stringzilla.h | 274 +++++++++++++++++++++++++------------- 5 files changed, 203 insertions(+), 121 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 886d1d22..08c5bb65 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -151,6 +151,7 @@ "NOMINMAX", "NOTIMPLEMENTED", "numpy", + "octogram", "pytest", "Pythonic", "quadgram", @@ -166,6 +167,7 @@ "substr", "SWAR", "TPFLAGS", + "unigram", "Vardanian", "vectorcallfunc", "XDECREF", diff --git a/README.md b/README.md index 3c04c219..85032c34 100644 --- a/README.md +++ b/README.md @@ -116,11 +116,11 @@ There is an ABI-stable C 99 interface, in case you have a database, an operating // Initialize your haystack and needle sz_haystack_t haystack = {your_text, your_text_length}; -sz_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset}; +sz_needle_t needle = {your_subtext, your_subtext_length, your_quadgram_offset}; // Perform string-level operations size_t character_count = sz_count_char(haystack, 'a'); -size_t character_position = sz_find_char(haystack, 'a'); +size_t character_position = sz_find_unigram(haystack, 'a'); size_t substring_position = sz_find_substr(haystack, needle); // Perform collection level operations diff --git a/python/lib.c b/python/lib.c index ad10f196..a0f6caca 100644 --- a/python/lib.c +++ b/python/lib.c @@ -48,12 +48,12 @@ static struct { * native `mmap` module, as it exposes the address of the mapping in memory. */ typedef struct { - PyObject_HEAD; + PyObject_HEAD #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) - HANDLE file_handle; + HANDLE file_handle; HANDLE mapping_handle; #else - int file_descriptor; + int file_descriptor; #endif void *start; size_t length; @@ -72,8 +72,7 @@ typedef struct { * - Str(File("some-path.txt"), from=0, to=sys.maxint) */ typedef struct { - PyObject_HEAD; - PyObject *parent; + PyObject_HEAD PyObject *parent; char const *start; size_t length; } Str; @@ -83,14 +82,14 @@ typedef struct { * for faster sorting, shuffling, joins, and lookups. */ typedef struct { - PyObject_HEAD; + PyObject_HEAD - enum { - STRS_CONSECUTIVE_32, - STRS_CONSECUTIVE_64, - STRS_REORDERED, - STRS_MULTI_SOURCE, - } type; + enum { + STRS_CONSECUTIVE_32, + STRS_CONSECUTIVE_64, + STRS_REORDERED, + STRS_MULTI_SOURCE, + } type; union { /** @@ -641,7 +640,7 @@ static void Str_releasebuffer(PyObject *_, Py_buffer *view) { static int Str_in(Str *self, PyObject *arg) { sz_needle_t needle_struct; - needle_struct.anomaly_offset = 0; + needle_struct.quadgram_offset = 0; if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) { PyErr_SetString(PyExc_TypeError, "Unsupported argument type"); return -1; @@ -851,7 +850,7 @@ static int Str_find_( // Py_ssize_t start, end; // Validate and convert `haystack` and `needle` - needle.anomaly_offset = 0; + needle.quadgram_offset = 0; if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || !export_string_like(needle_obj, &needle.start, &needle.length)) { PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); @@ -1000,7 +999,7 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX; int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0; - needle.anomaly_offset = 0; + needle.quadgram_offset = 0; if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || !export_string_like(needle_obj, &needle.start, &needle.length)) return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL; @@ -1287,7 +1286,7 @@ static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) { sz_needle_t separator; int keepseparator; Py_ssize_t maxsplit; - separator.anomaly_offset = 0; + separator.quadgram_offset = 0; // Validate and convert `text` if (!export_string_like(text_obj, &text.start, &text.length)) { @@ -1565,14 +1564,9 @@ static boolean_t Strs_sort_(Strs *self, } // Get the parts and their count - sz_haystack_t *parts = NULL; - size_t count = 0; - switch (self->type) { - case STRS_REORDERED: - parts = self->data.reordered.parts; - count = self->data.reordered.count; - break; - } + // The only possible `self->type` by now is the `STRS_REORDERED` + sz_haystack_t *parts = self->data.reordered.parts; + size_t count = self->data.reordered.count; // Allocate temporary memory to store the ordering offsets size_t memory_needed = sizeof(sz_size_t) * count; diff --git a/scripts/bench.ipynb b/scripts/bench.ipynb index b69d2f8f..b3bc4392 100644 --- a/scripts/bench.ipynb +++ b/scripts/bench.ipynb @@ -176,7 +176,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.10.13" }, "orig_nbformat": 4 }, diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 7b664ca6..51319f01 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -1,10 +1,7 @@ #ifndef STRINGZILLA_H_ #define STRINGZILLA_H_ -#include // `tolower` #include // `qsort_s` -#include // `sz_size_t` -#include // `uint8_t` #include // `qsort_r` #include // `memcpy` @@ -30,11 +27,71 @@ extern "C" { #endif -typedef uint32_t sz_anomaly_t; -typedef uint64_t sz_size_t; +#if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) +typedef unsigned long sz_size_t; // 64-bit on most platforms when pointers are 64-bit +#else +typedef unsigned sz_size_t; // 32-bit on most platforms when pointers are 32-bit +#endif + +typedef unsigned sz_u32_t; // Always 32 bits +typedef unsigned long long sz_u64_t; // Always 64 bits + +typedef union sz_quadgram_t { + unsigned u32; + unsigned char u8s[4]; +} sz_quadgram_t; // Always 32-bit unsigned integer, representing 8 bytes/characters + +typedef union sz_octogram_t { + unsigned long long u64; + unsigned char u8s[8]; +} sz_octogram_t; // Always 64-bit unsigned integer, representing 8 bytes/characters inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { return (x + (divisor - 1)) / divisor; } +inline static sz_size_t sz_tolower_ascii(char c) { + static char lowered[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // + 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, // + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // + }; + return lowered[(int)c]; +} + +inline static sz_size_t sz_toupper_ascii(char c) { + static char upped[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // + 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // + 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // + }; + return upped[(int)c]; +} + /** * @brief This is a faster alternative to `strncmp(a, b, length) == 0`. * @return 1 for `true`, and 0 for `false`. @@ -53,28 +110,29 @@ typedef struct sz_haystack_t { typedef struct sz_needle_t { char const *start; sz_size_t length; - sz_size_t anomaly_offset; + sz_size_t quadgram_offset; } sz_needle_t; /** * @brief SWAR single-character counting procedure, jumping 8 bytes at a time. */ -inline static sz_size_t sz_count_char_swar(sz_haystack_t h, char n) { +inline static sz_size_t sz_count_unigram_swar(sz_haystack_t h, char n) { sz_size_t result = 0; char const *text = h.start; char const *end = h.start + h.length; - for (; (uint64_t)text % 8 != 0 && text < end; ++text) result += *text == n; + // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == n; // This code simulates hyper-scalar execution, comparing 8 characters at a time. - uint64_t nnnnnnnn = n; + sz_u64_t nnnnnnnn = n; nnnnnnnn |= nnnnnnnn << 8; nnnnnnnn |= nnnnnnnn << 16; nnnnnnnn |= nnnnnnnn << 32; for (; text + 8 <= end; text += 8) { - uint64_t text_slice = *(uint64_t const *)text; - uint64_t match_indicators = ~(text_slice ^ nnnnnnnn); + sz_u64_t text_slice = *(sz_u64_t const *)text; + sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn); match_indicators &= match_indicators >> 1; match_indicators &= match_indicators >> 2; match_indicators &= match_indicators >> 4; @@ -89,22 +147,23 @@ inline static sz_size_t sz_count_char_swar(sz_haystack_t h, char n) { /** * @brief SWAR single-character search in string, jumping 8 bytes at a time. */ -inline static sz_size_t sz_find_char_swar(sz_haystack_t h, char n) { +inline static sz_size_t sz_find_unigram_swar(sz_haystack_t h, char n) { char const *text = h.start; char const *end = h.start + h.length; - for (; (uint64_t)text % 8 != 0 && text < end; ++text) + // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((unsigned long)text & 7ul) && text < end; ++text) if (*text == n) return text - h.start; // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. - uint64_t nnnnnnnn = n; + sz_u64_t nnnnnnnn = n; nnnnnnnn |= nnnnnnnn << 8; // broadcast `n` into `nnnnnnnn` nnnnnnnn |= nnnnnnnn << 16; // broadcast `n` into `nnnnnnnn` nnnnnnnn |= nnnnnnnn << 32; // broadcast `n` into `nnnnnnnn` for (; text + 8 <= end; text += 8) { - uint64_t text_slice = *(uint64_t const *)text; - uint64_t match_indicators = ~(text_slice ^ nnnnnnnn); + sz_u64_t text_slice = *(sz_u64_t const *)text; + sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn); match_indicators &= match_indicators >> 1; match_indicators &= match_indicators >> 2; match_indicators &= match_indicators >> 4; @@ -121,26 +180,31 @@ inline static sz_size_t sz_find_char_swar(sz_haystack_t h, char n) { /** * @brief SWAR character-bigram search in string, jumping 8 bytes at a time. */ -inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) { +inline static sz_size_t sz_find_bigram_swar(sz_haystack_t h, char const *n) { char const *text = h.start; char const *end = h.start + h.length; + // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text) + if (text[0] == n[0] && text[1] == n[1]) return text - h.start; + // This code simulates hyper-scalar execution, analyzing 7 offsets at a time. - uint64_t nnnn = ((uint64_t)(n[0]) << 0) | ((uint64_t)(n[1]) << 8); // broadcast `n` into `nnnn` + sz_u64_t nnnn = ((sz_u64_t)(n[0]) << 0) | ((sz_u64_t)(n[1]) << 8); // broadcast `n` into `nnnn` nnnn |= nnnn << 16; // broadcast `n` into `nnnn` nnnn |= nnnn << 32; // broadcast `n` into `nnnn` - uint64_t text_slice; for (; text + 8 <= end; text += 7) { - memcpy(&text_slice, text, 8); - uint64_t even_indicators = ~(text_slice ^ nnnn); - uint64_t odd_indicators = ~((text_slice << 8) ^ nnnn); + sz_u64_t text_slice = *(sz_u64_t const *)text; + sz_u64_t even_indicators = ~(text_slice ^ nnnn); + sz_u64_t odd_indicators = ~((text_slice << 8) ^ nnnn); + // For every even match - 2 char (16 bits) must be identical. even_indicators &= even_indicators >> 1; even_indicators &= even_indicators >> 2; even_indicators &= even_indicators >> 4; even_indicators &= even_indicators >> 8; even_indicators &= 0x0001000100010001; + // For every odd match - 2 char (16 bits) must be identical. odd_indicators &= odd_indicators >> 1; odd_indicators &= odd_indicators >> 2; @@ -149,7 +213,7 @@ inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) { odd_indicators &= 0x0001000100010000; if (even_indicators + odd_indicators) { - uint64_t match_indicators = even_indicators | (odd_indicators >> 8); + sz_u64_t match_indicators = even_indicators | (odd_indicators >> 8); return text - h.start + ctz64(match_indicators) / 8; } } @@ -162,23 +226,26 @@ inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) { /** * @brief SWAR character-trigram search in string, jumping 8 bytes at a time. */ -inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) { +inline static sz_size_t sz_find_trigram_swar(sz_haystack_t h, char const *n) { char const *text = h.start; char const *end = h.start + h.length; + // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text) + if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start; + // This code simulates hyper-scalar execution, analyzing 6 offsets at a time. // We have two unused bytes at the end. - uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16); // broadcast `n` into `nn` + sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16); // broadcast `n` into `nn` nn |= nn << 24; // broadcast `n` into `nn` nn <<= 16; // broadcast `n` into `nn` for (; text + 8 <= end; text += 6) { - uint64_t text_slice; - memcpy(&text_slice, text, 8); - uint64_t first_indicators = ~(text_slice ^ nn); - uint64_t second_indicators = ~((text_slice << 8) ^ nn); - uint64_t third_indicators = ~((text_slice << 16) ^ nn); + sz_u64_t text_slice = *(sz_u64_t const *)text; + sz_u64_t first_indicators = ~(text_slice ^ nn); + sz_u64_t second_indicators = ~((text_slice << 8) ^ nn); + sz_u64_t third_indicators = ~((text_slice << 16) ^ nn); // For every first match - 3 chars (24 bits) must be identical. // For that merge every byte state and then combine those three-way. first_indicators &= first_indicators >> 1; @@ -203,7 +270,7 @@ inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) { third_indicators = (third_indicators >> 16) & (third_indicators >> 8) & (third_indicators >> 0) & 0x0000010000010000; - uint64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16); + sz_u64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16); if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8; } @@ -215,29 +282,32 @@ inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) { /** * @brief SWAR character-quadgram search in string, jumping 8 bytes at a time. */ -inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) { +inline static sz_size_t sz_find_quadgram_swar(sz_haystack_t h, char const *n) { char const *text = h.start; char const *end = h.start + h.length; + // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text) + if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start; + // This code simulates hyper-scalar execution, analyzing 4 offsets at a time. - uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16) | ((uint64_t)(n[3]) << 24); + sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16) | ((sz_u64_t)(n[3]) << 24); nn |= nn << 32; // - uint8_t lookup[16] = {0}; - lookup[0b0010] = lookup[0b0110] = lookup[0b1010] = lookup[0b1110] = 1; - lookup[0b0100] = lookup[0b1100] = 2; - lookup[0b1000] = 3; + unsigned char lookup[16] = {0}; + lookup[0x2] = lookup[0x6] = lookup[0xA] = lookup[0xE] = 1; + lookup[0x4] = lookup[0xC] = 2; + lookup[0x8] = 3; // We can perform 5 comparisons per load, but it's easier to perform 4, minimizing the size of the lookup table. for (; text + 8 <= end; text += 4) { - uint64_t text_slice; - memcpy(&text_slice, text, 8); - uint64_t text01 = (text_slice & 0x00000000FFFFFFFF) | ((text_slice & 0x000000FFFFFFFF00) << 24); - uint64_t text23 = ((text_slice & 0x0000FFFFFFFF0000) >> 16) | ((text_slice & 0x00FFFFFFFF000000) << 8); - uint64_t text01_indicators = ~(text01 ^ nn); - uint64_t text23_indicators = ~(text23 ^ nn); + sz_u64_t text_slice = *(sz_u64_t const *)text; + sz_u64_t text01 = (text_slice & 0x00000000FFFFFFFF) | ((text_slice & 0x000000FFFFFFFF00) << 24); + sz_u64_t text23 = ((text_slice & 0x0000FFFFFFFF0000) >> 16) | ((text_slice & 0x00FFFFFFFF000000) << 8); + sz_u64_t text01_indicators = ~(text01 ^ nn); + sz_u64_t text23_indicators = ~(text23 ^ nn); // For every first match - 4 chars (32 bits) must be identical. text01_indicators &= text01_indicators >> 1; @@ -258,7 +328,7 @@ inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) { if (text01_indicators + text23_indicators) { // Assuming we have performed 4 comparisons, we can only have 2^4=16 outcomes. // Which is small enough for a lookup table. - uint8_t match_indicators = (uint8_t)( // + unsigned char match_indicators = (unsigned char)( // (text01_indicators >> 31) | (text01_indicators << 0) | // (text23_indicators >> 29) | (text23_indicators << 2)); return text - h.start + lookup[match_indicators]; @@ -272,7 +342,7 @@ inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) { /** * @brief Trivial substring search with scalar code. Instead of comparing characters one-by-one - * it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper. + * it compares 4-byte quadgrams first, most commonly prefixes. It's computationally cheaper. * Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core. */ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) { @@ -281,26 +351,36 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) { switch (n.length) { case 0: return 0; - case 1: return sz_find_char_swar(h, *n.start); - case 2: return sz_find_2chars_swar(h, n.start); - case 3: return sz_find_3chars_swar(h, n.start); - case 4: return sz_find_4chars_swar(h, n.start); + case 1: return sz_find_unigram_swar(h, *n.start); + case 2: return sz_find_bigram_swar(h, n.start); + case 3: return sz_find_trigram_swar(h, n.start); + case 4: return sz_find_quadgram_swar(h, n.start); default: { char const *text = h.start; char const *const end = h.start + h.length; - sz_anomaly_t n_anomaly, h_anomaly; - sz_size_t const n_suffix_len = n.length - 4 - n.anomaly_offset; - char const *n_suffix_ptr = n.start + 4 + n.anomaly_offset; - memcpy(&n_anomaly, n.start + n.anomaly_offset, 4); - - text += n.anomaly_offset; - for (; text + n.length <= end; text++) { - memcpy(&h_anomaly, text, 4); - if (h_anomaly == n_anomaly) // Match anomaly. - if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. - if (sz_equal(text - n.anomaly_offset, n.start, n.anomaly_offset)) // Match prefix. - return text - h.start - n.anomaly_offset; + sz_quadgram_t n_quadgram, h_quadgram; + sz_size_t const n_suffix_len = n.length - 4 - n.quadgram_offset; + char const *n_suffix_ptr = n.start + 4 + n.quadgram_offset; + n_quadgram.u8s[0] = n.start[n.quadgram_offset]; + n_quadgram.u8s[1] = n.start[n.quadgram_offset + 1]; + n_quadgram.u8s[2] = n.start[n.quadgram_offset + 2]; + n_quadgram.u8s[3] = n.start[n.quadgram_offset + 3]; + h_quadgram.u8s[0] = h.start[0]; + h_quadgram.u8s[1] = h.start[1]; + h_quadgram.u8s[2] = h.start[2]; + h_quadgram.u8s[3] = h.start[3]; + + text += n.quadgram_offset; + while (text + n.length <= end) { + if (h_quadgram.u32 == n_quadgram.u32) // Match quadgram. + if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. + if (sz_equal(text - n.quadgram_offset, n.start, n.quadgram_offset)) // Match prefix. + return text - h.start - n.quadgram_offset; + + h_quadgram.u32 <<= 8; + h_quadgram.u8s[3] = *text; + ++text; } return h.length; } @@ -319,17 +399,17 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { // Precomputed constants char const *const end = h.start + h.length; - uint32_t anomaly = 0; - uint32_t mask = 0; + sz_quadgram_t quadgram = 0; + sz_quadgram_t mask = 0; switch (n.length) { - case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.start, 1); break; - case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.start, 2); break; - case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.start, 3); break; - default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.start, 4); break; + case 1: memset(&mask, 0xFF, 1), memcpy(&quadgram, n.start, 1); break; + case 2: memset(&mask, 0xFF, 2), memcpy(&quadgram, n.start, 2); break; + case 3: memset(&mask, 0xFF, 3), memcpy(&quadgram, n.start, 3); break; + default: memset(&mask, 0xFF, 4), memcpy(&quadgram, n.start, 4); break; } - __m256i const anomalies = _mm256_set1_epi32(*(uint32_t const *)&anomaly); - __m256i const masks = _mm256_set1_epi32(*(uint32_t const *)&mask); + __m256i const quadgrams = _mm256_set1_epi32(quadgram.u32); + __m256i const masks = _mm256_set1_epi32(mask.u32); // Top level for-loop changes dramatically. // In sequential computing model for 32 offsets we would do: @@ -345,13 +425,13 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { // Performing many unaligned loads ends up being faster than loading once and shuffling around. __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks); - int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, anomalies)); + int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, quadgrams)); __m256i texts1 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 1)), masks); - int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, anomalies)); + int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, quadgrams)); __m256i text2 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 2)), masks); - int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, anomalies)); + int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, quadgrams)); __m256i texts3 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 3)), masks); - int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies)); + int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, quadgrams)); if (matches0 | matches1 | matches2 | matches3) { for (sz_size_t i = 0; i < 32; i++) { @@ -382,16 +462,22 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { // Precomputed constants char const *const end = h.start + h.length; - uint32_t anomaly = 0; - uint32_t mask = 0; + sz_quadgram_t quadgram = {}; + sz_quadgram_t mask = {}; switch (n.length) { - case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.start, 1); break; - case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.start, 2); break; - case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.start, 3); break; - default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.start, 4); break; + case 1: mask.u8s[0] = 0xFF, quadgram.u8s[0] = n.start[0]; break; + case 2: mask.u8s[0] = mask.u8s[1] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1]; break; + case 3: + mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], + quadgram.u8s[2] = n.start[2]; + break; + default: + mask.u32 = 0xFFFFFFFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], + quadgram.u8s[3] = n.start[3]; + break; } - uint32x4_t const anomalies = vld1q_dup_u32(&anomaly); + uint32x4_t const quadgrams = vld1q_dup_u32(&quadgram.u32); uint32x4_t const masks = vld1q_dup_u32(&mask); uint32x4_t matches, matches0, matches1, matches2, matches3; @@ -400,10 +486,10 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { // Each of the following `matchesX` contains only 4 relevant bits - one per word. // Each signifies a match at the given offset. - matches0 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 0)), masks), anomalies); - matches1 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 1)), masks), anomalies); - matches2 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 2)), masks), anomalies); - matches3 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 3)), masks), anomalies); + matches0 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 0)), masks), quadgrams); + matches1 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 1)), masks), quadgrams); + matches2 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 2)), masks), quadgrams); + matches3 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 3)), masks), quadgrams); matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3)); if (vmaxvq_u32(matches)) { @@ -448,8 +534,8 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { #endif // Arm Neon -inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_char_swar(h, n); } -inline static sz_size_t sz_find_char(sz_haystack_t h, char n) { return sz_find_char_swar(h, n); } +inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_unigram_swar(h, n); } +inline static sz_size_t sz_find_unigram(sz_haystack_t h, char n) { return sz_find_unigram_swar(h, n); } inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) { if (h.length < n.length) return h.length; @@ -665,10 +751,10 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf char *prefix = (char *)&sequence->order[i]; for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; if (case_insensitive) { - prefix[0] = tolower(prefix[0]); - prefix[1] = tolower(prefix[1]); - prefix[2] = tolower(prefix[2]); - prefix[3] = tolower(prefix[3]); + prefix[0] = sz_tolower_ascii(prefix[0]); + prefix[1] = sz_tolower_ascii(prefix[1]); + prefix[2] = sz_tolower_ascii(prefix[2]); + prefix[3] = sz_tolower_ascii(prefix[3]); } } @@ -679,7 +765,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf _sz_sort_recursion(sequence, 0, 32, comparator); } -typedef uint8_t levenstein_distance_t; +typedef unsigned char levenstein_distance_t; /** * @return Amount of temporary memory (in bytes) needed to efficiently compute @@ -758,11 +844,11 @@ inline static levenstein_distance_t sz_levenstein( // /** * @brief Hashes provided string using hardware-accelerated CRC32 instructions. */ -inline static uint32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; } -inline static uint32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; } -inline static uint32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; } #ifdef __cplusplus } From d102bdf87078d9a6a8b3064759db3dbf7dc4e331 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Oct 2023 19:42:11 -0700 Subject: [PATCH 57/72] Fix: SWAR search bug --- stringzilla/stringzilla.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 51319f01..7353024a 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -373,13 +373,13 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) { text += n.quadgram_offset; while (text + n.length <= end) { + h_quadgram.u8s[3] = text[3]; if (h_quadgram.u32 == n_quadgram.u32) // Match quadgram. if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. if (sz_equal(text - n.quadgram_offset, n.start, n.quadgram_offset)) // Match prefix. return text - h.start - n.quadgram_offset; - h_quadgram.u32 <<= 8; - h_quadgram.u8s[3] = *text; + h_quadgram.u32 >>= 8; ++text; } return h.length; From 9b3c63d951461cd0dcccc3993ab1f7e18a2589c8 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Oct 2023 20:39:44 -0700 Subject: [PATCH 58/72] Improve: avoiding nested loop in AVX2 --- stringzilla/stringzilla.h | 93 +++++++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 32 deletions(-) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 7353024a..6b481dda 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -387,6 +387,40 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) { } } +/** + * Helper function, used in substring search operations. + */ +inline static void _sz_find_substr_populate_quadgram( // + sz_haystack_t h, + sz_needle_t n, + sz_quadgram_t *quadgram_out, + sz_quadgram_t *mask_out) { + + sz_quadgram_t quadgram; + sz_quadgram_t mask; + switch (n.length) { + case 1: + mask.u8s[0] = 0xFF, mask.u8s[1] = mask.u8s[2] = mask.u8s[3] = 0; + quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = quadgram.u8s[2] = quadgram.u8s[3] = 0; + break; + case 2: + mask.u8s[0] = mask.u8s[1] = 0xFF, mask.u8s[2] = mask.u8s[3] = 0; + quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = quadgram.u8s[3] = 0; + break; + case 3: + mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, mask.u8s[3] = 0; + quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], quadgram.u8s[3] = 0; + break; + default: + mask.u32 = 0xFFFFFFFF; + quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], + quadgram.u8s[3] = n.start[3]; + break; + } + *quadgram_out = quadgram; + *mask_out = mask; +} + #if defined(__AVX2__) /** @@ -399,15 +433,9 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { // Precomputed constants char const *const end = h.start + h.length; - sz_quadgram_t quadgram = 0; - sz_quadgram_t mask = 0; - switch (n.length) { - case 1: memset(&mask, 0xFF, 1), memcpy(&quadgram, n.start, 1); break; - case 2: memset(&mask, 0xFF, 2), memcpy(&quadgram, n.start, 2); break; - case 3: memset(&mask, 0xFF, 3), memcpy(&quadgram, n.start, 3); break; - default: memset(&mask, 0xFF, 4), memcpy(&quadgram, n.start, 4); break; - } - + sz_quadgram_t quadgram; + sz_quadgram_t mask; + _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask); __m256i const quadgrams = _mm256_set1_epi32(quadgram.u32); __m256i const masks = _mm256_set1_epi32(mask.u32); @@ -421,7 +449,7 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { // + 3 bitwise ANDs. // + 1 heavy (but very unlikely) branch. char const *text = h.start; - for (; (text + n.length + 32) <= end; text += 32) { + while (text + n.length + 32 <= end) { // Performing many unaligned loads ends up being faster than loading once and shuffling around. __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks); @@ -434,10 +462,23 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, quadgrams)); if (matches0 | matches1 | matches2 | matches3) { - for (sz_size_t i = 0; i < 32; i++) { - if (sz_equal(text + i, n.start, n.length)) return i + (text - h.start); + int matches = // + (matches0 & 0x1111'1111u) | // + (matches1 & 0x2222'2222u) | // + (matches2 & 0x4444'4444u) | // + (matches3 & 0x8888'8888u); + size_t first_match_offset = _tzcnt_u32(matches); + if (n.length > 4) { + if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4)) + return text + first_match_offset - h.start; + else + text += first_match_offset + 1; } - } + else + return text + first_match_offset - h.start; + } + else + text += 32; } // Don't forget the last (up to 35) characters. @@ -462,21 +503,9 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { // Precomputed constants char const *const end = h.start + h.length; - sz_quadgram_t quadgram = {}; - sz_quadgram_t mask = {}; - switch (n.length) { - case 1: mask.u8s[0] = 0xFF, quadgram.u8s[0] = n.start[0]; break; - case 2: mask.u8s[0] = mask.u8s[1] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1]; break; - case 3: - mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], - quadgram.u8s[2] = n.start[2]; - break; - default: - mask.u32 = 0xFFFFFFFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], - quadgram.u8s[3] = n.start[3]; - break; - } - + sz_quadgram_t quadgram; + sz_quadgram_t mask; + _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask); uint32x4_t const quadgrams = vld1q_dup_u32(&quadgram.u32); uint32x4_t const masks = vld1q_dup_u32(&mask); uint32x4_t matches, matches0, matches1, matches2, matches3; @@ -486,10 +515,10 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { // Each of the following `matchesX` contains only 4 relevant bits - one per word. // Each signifies a match at the given offset. - matches0 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 0)), masks), quadgrams); - matches1 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 1)), masks), quadgrams); - matches2 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 2)), masks), quadgrams); - matches3 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 3)), masks), quadgrams); + matches0 = vceqq_u32(vandq_u32(vld1q_u32(text + 0), masks), quadgrams); + matches1 = vceqq_u32(vandq_u32(vld1q_u32(text + 1), masks), quadgrams); + matches2 = vceqq_u32(vandq_u32(vld1q_u32(text + 2), masks), quadgrams); + matches3 = vceqq_u32(vandq_u32(vld1q_u32(text + 3), masks), quadgrams); matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3)); if (vmaxvq_u32(matches)) { From fa7984a9f0b70d387cecbc4f4c4442443cf0150f Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 13:01:13 -0700 Subject: [PATCH 59/72] Break: Avoiding LibC and new API --- stringzilla/stringzilla.h | 789 +++++++++++++++++++++----------------- 1 file changed, 446 insertions(+), 343 deletions(-) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 6b481dda..0aa8774b 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -1,13 +1,10 @@ #ifndef STRINGZILLA_H_ #define STRINGZILLA_H_ -#include // `qsort_s` -#include // `qsort_r` -#include // `memcpy` - #if defined(__AVX2__) #include #endif + #if defined(__ARM_NEON) #include #endif @@ -16,117 +13,88 @@ #include #define popcount64 __popcnt64 #define ctz64 _tzcnt_u64 +#define clz64 _lzcnt_u64 #define strncasecmp _strnicmp #define strcasecmp _stricmp #else #define popcount64 __builtin_popcountll #define ctz64 __builtin_ctzll +#define clz64 __builtin_clzll +#endif + +/** + * Generally `NULL` is coming from locale.h, stddef.h, stdio.h, stdlib.h, string.h, time.h, and wchar.h, + * according to the C standard. + */ +#ifndef NULL +#define NULL ((void *)0) #endif #ifdef __cplusplus extern "C" { #endif +/** + * @brief Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size. + * 64-bit on most platforms where pointers are 64-bit. + * 32-bit on platforms where pointers are 32-bit. + */ #if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) -typedef unsigned long sz_size_t; // 64-bit on most platforms when pointers are 64-bit +typedef unsigned long sz_size_t; #else -typedef unsigned sz_size_t; // 32-bit on most platforms when pointers are 32-bit +typedef unsigned sz_size_t; #endif +typedef int sz_bool_t; // Only one relevant bit typedef unsigned sz_u32_t; // Always 32 bits typedef unsigned long long sz_u64_t; // Always 64 bits +typedef char const *sz_string_ptr_t; // A type alias for `char const * ` + +/** + * @brief Helper construct for higher-level bindings. + */ +typedef struct sz_string_view_t { + sz_string_ptr_t start; + sz_size_t length; +} sz_string_view_t; -typedef union sz_quadgram_t { +/** + * @brief Internal data-structure, used to address "anomalies" (often prefixes), + * during substring search. Always a 32-bit unsigned integer, containing 4 chars. + */ +typedef union _sz_anomaly_t { unsigned u32; unsigned char u8s[4]; -} sz_quadgram_t; // Always 32-bit unsigned integer, representing 8 bytes/characters - -typedef union sz_octogram_t { - unsigned long long u64; - unsigned char u8s[8]; -} sz_octogram_t; // Always 64-bit unsigned integer, representing 8 bytes/characters - -inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { return (x + (divisor - 1)) / divisor; } - -inline static sz_size_t sz_tolower_ascii(char c) { - static char lowered[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return lowered[(int)c]; -} - -inline static sz_size_t sz_toupper_ascii(char c) { - static char upped[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return upped[(int)c]; -} +} _sz_anomaly_t; /** - * @brief This is a faster alternative to `strncmp(a, b, length) == 0`. + * @brief This is a slightly faster alternative to `strncmp(a, b, length) == 0`. + * Doesn't provide major performance improvements, but helps avoid the LibC dependency. * @return 1 for `true`, and 0 for `false`. */ -inline static int sz_equal(char const *a, char const *b, sz_size_t length) { - char const *const a_end = a + length; +inline static sz_bool_t sz_equal(sz_string_ptr_t a, sz_string_ptr_t b, sz_size_t length) { + sz_string_ptr_t const a_end = a + length; while (a != a_end && *a == *b) a++, b++; return a_end == a; } -typedef struct sz_haystack_t { - char const *start; - sz_size_t length; -} sz_haystack_t; - -typedef struct sz_needle_t { - char const *start; - sz_size_t length; - sz_size_t quadgram_offset; -} sz_needle_t; - /** - * @brief SWAR single-character counting procedure, jumping 8 bytes at a time. + * @brief Count the number of occurrences of a @b single-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_size_t sz_count_unigram_swar(sz_haystack_t h, char n) { +inline static sz_size_t sz_count_char_swar(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { sz_size_t result = 0; - char const *text = h.start; - char const *end = h.start + h.length; + sz_string_ptr_t text = haystack; + sz_string_ptr_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == n; + for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == *needle; // This code simulates hyper-scalar execution, comparing 8 characters at a time. - sz_u64_t nnnnnnnn = n; + sz_u64_t nnnnnnnn = *needle; nnnnnnnn |= nnnnnnnn << 8; nnnnnnnn |= nnnnnnnn << 16; nnnnnnnn |= nnnnnnnn << 32; @@ -140,27 +108,31 @@ inline static sz_size_t sz_count_unigram_swar(sz_haystack_t h, char n) { result += popcount64(match_indicators); } - for (; text < end; ++text) result += *text == n; + for (; text < end; ++text) result += *text == *needle; return result; } /** - * @brief SWAR single-character search in string, jumping 8 bytes at a time. + * @brief Find the first occurrence of a @b single-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. + * Identical to `memchr(haystack, needle[0], haystack_length)`. */ -inline static sz_size_t sz_find_unigram_swar(sz_haystack_t h, char n) { +inline static sz_string_ptr_t sz_find_1char_swar(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { - char const *text = h.start; - char const *end = h.start + h.length; + sz_string_ptr_t text = haystack; + sz_string_ptr_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text < end; ++text) - if (*text == n) return text - h.start; + if (*text == *needle) return text; // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. - sz_u64_t nnnnnnnn = n; - nnnnnnnn |= nnnnnnnn << 8; // broadcast `n` into `nnnnnnnn` - nnnnnnnn |= nnnnnnnn << 16; // broadcast `n` into `nnnnnnnn` - nnnnnnnn |= nnnnnnnn << 32; // broadcast `n` into `nnnnnnnn` + sz_u64_t nnnnnnnn = *needle; + nnnnnnnn |= nnnnnnnn << 8; // broadcast `needle` into `nnnnnnnn` + nnnnnnnn |= nnnnnnnn << 16; // broadcast `needle` into `nnnnnnnn` + nnnnnnnn |= nnnnnnnn << 32; // broadcast `needle` into `nnnnnnnn` for (; text + 8 <= end; text += 8) { sz_u64_t text_slice = *(sz_u64_t const *)text; sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn); @@ -169,30 +141,70 @@ inline static sz_size_t sz_find_unigram_swar(sz_haystack_t h, char n) { match_indicators &= match_indicators >> 4; match_indicators &= 0x0101010101010101; - if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8; + if (match_indicators != 0) return text + ctz64(match_indicators) / 8; } for (; text < end; ++text) - if (*text == n) return text - h.start; - return h.length; + if (*text == *needle) return text; + return NULL; +} + +/** + * @brief Find the last occurrence of a @b single-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. + * Identical to `memrchr(haystack, needle[0], haystack_length)`. + */ +inline static sz_string_ptr_t sz_rfind_1char_swar(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { + + sz_string_ptr_t const end = haystack + haystack_length; + sz_string_ptr_t text = end - 1; + + // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((unsigned long)text & 7ul) && text >= haystack; --text) + if (*text == *needle) return text; + + // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. + sz_u64_t nnnnnnnn = *needle; + nnnnnnnn |= nnnnnnnn << 8; // broadcast `needle` into `nnnnnnnn` + nnnnnnnn |= nnnnnnnn << 16; // broadcast `needle` into `nnnnnnnn` + nnnnnnnn |= nnnnnnnn << 32; // broadcast `needle` into `nnnnnnnn` + for (; text - 8 >= haystack; text -= 8) { + sz_u64_t text_slice = *(sz_u64_t const *)text; + sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn); + match_indicators &= match_indicators >> 1; + match_indicators &= match_indicators >> 2; + match_indicators &= match_indicators >> 4; + match_indicators &= 0x0101010101010101; + + if (match_indicators != 0) return text - 8 + clz64(match_indicators) / 8; + } + + for (; text >= haystack; --text) + if (*text == *needle) return text; + return NULL; } /** - * @brief SWAR character-bigram search in string, jumping 8 bytes at a time. + * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_size_t sz_find_bigram_swar(sz_haystack_t h, char const *n) { +inline static sz_string_ptr_t sz_find_2char_swar(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { - char const *text = h.start; - char const *end = h.start + h.length; + sz_string_ptr_t text = haystack; + sz_string_ptr_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1]) return text - h.start; + if (text[0] == needle[0] && text[1] == needle[1]) return text; // This code simulates hyper-scalar execution, analyzing 7 offsets at a time. - sz_u64_t nnnn = ((sz_u64_t)(n[0]) << 0) | ((sz_u64_t)(n[1]) << 8); // broadcast `n` into `nnnn` - nnnn |= nnnn << 16; // broadcast `n` into `nnnn` - nnnn |= nnnn << 32; // broadcast `n` into `nnnn` + sz_u64_t nnnn = ((sz_u64_t)(needle[0]) << 0) | ((sz_u64_t)(needle[1]) << 8); // broadcast `needle` into `nnnn` + nnnn |= nnnn << 16; // broadcast `needle` into `nnnn` + nnnn |= nnnn << 32; // broadcast `needle` into `nnnn` for (; text + 8 <= end; text += 7) { sz_u64_t text_slice = *(sz_u64_t const *)text; sz_u64_t even_indicators = ~(text_slice ^ nnnn); @@ -214,32 +226,38 @@ inline static sz_size_t sz_find_bigram_swar(sz_haystack_t h, char const *n) { if (even_indicators + odd_indicators) { sz_u64_t match_indicators = even_indicators | (odd_indicators >> 8); - return text - h.start + ctz64(match_indicators) / 8; + return text + ctz64(match_indicators) / 8; } } for (; text + 2 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1]) return text - h.start; - return h.length; + if (text[0] == needle[0] && text[1] == needle[1]) return text; + return NULL; } /** - * @brief SWAR character-trigram search in string, jumping 8 bytes at a time. + * @brief Find the first occurrence of a three-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_size_t sz_find_trigram_swar(sz_haystack_t h, char const *n) { +inline static sz_string_ptr_t sz_find_3char_swar(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { - char const *text = h.start; - char const *end = h.start + h.length; + sz_string_ptr_t text = haystack; + sz_string_ptr_t end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start; + if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2]) return text; // This code simulates hyper-scalar execution, analyzing 6 offsets at a time. // We have two unused bytes at the end. - sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16); // broadcast `n` into `nn` - nn |= nn << 24; // broadcast `n` into `nn` - nn <<= 16; // broadcast `n` into `nn` + sz_u64_t nn = // broadcast `needle` into `nn` + (sz_u64_t)(needle[0] << 0) | // broadcast `needle` into `nn` + ((sz_u64_t)(needle[1]) << 8) | // broadcast `needle` into `nn` + ((sz_u64_t)(needle[2]) << 16); // broadcast `needle` into `nn` + nn |= nn << 24; // broadcast `needle` into `nn` + nn <<= 16; // broadcast `needle` into `nn` for (; text + 8 <= end; text += 6) { sz_u64_t text_slice = *(sz_u64_t const *)text; @@ -271,35 +289,39 @@ inline static sz_size_t sz_find_trigram_swar(sz_haystack_t h, char const *n) { (third_indicators >> 16) & (third_indicators >> 8) & (third_indicators >> 0) & 0x0000010000010000; sz_u64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16); - if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8; + if (match_indicators != 0) return text + ctz64(match_indicators) / 8; } for (; text + 3 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start; - return h.length; + if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2]) return text; + return NULL; } /** - * @brief SWAR character-quadgram search in string, jumping 8 bytes at a time. + * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_size_t sz_find_quadgram_swar(sz_haystack_t h, char const *n) { +inline static sz_string_ptr_t sz_find_4char_swar(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { - char const *text = h.start; - char const *end = h.start + h.length; + sz_string_ptr_t text = haystack; + sz_string_ptr_t end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start; + if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2] && text[3] == needle[3]) return text; // This code simulates hyper-scalar execution, analyzing 4 offsets at a time. - sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16) | ((sz_u64_t)(n[3]) << 24); + sz_u64_t nn = (sz_u64_t)(needle[0] << 0) | ((sz_u64_t)(needle[1]) << 8) | ((sz_u64_t)(needle[2]) << 16) | + ((sz_u64_t)(needle[3]) << 24); nn |= nn << 32; // - unsigned char lookup[16] = {0}; - lookup[0x2] = lookup[0x6] = lookup[0xA] = lookup[0xE] = 1; - lookup[0x4] = lookup[0xC] = 2; - lookup[0x8] = 3; + unsigned char offset_in_slice[16] = {0}; + offset_in_slice[0x2] = offset_in_slice[0x6] = offset_in_slice[0xA] = offset_in_slice[0xE] = 1; + offset_in_slice[0x4] = offset_in_slice[0xC] = 2; + offset_in_slice[0x8] = 3; // We can perform 5 comparisons per load, but it's easier to perform 4, minimizing the size of the lookup table. for (; text + 8 <= end; text += 4) { @@ -331,58 +353,63 @@ inline static sz_size_t sz_find_quadgram_swar(sz_haystack_t h, char const *n) { unsigned char match_indicators = (unsigned char)( // (text01_indicators >> 31) | (text01_indicators << 0) | // (text23_indicators >> 29) | (text23_indicators << 2)); - return text - h.start + lookup[match_indicators]; + return text + offset_in_slice[match_indicators]; } } for (; text + 4 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start; - return h.length; + if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2] && text[3] == needle[3]) return text; + return NULL; } /** - * @brief Trivial substring search with scalar code. Instead of comparing characters one-by-one - * it compares 4-byte quadgrams first, most commonly prefixes. It's computationally cheaper. + * @brief Trivial substring search with scalar SWAR code. Instead of comparing characters one-by-one + * it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper. * Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core. */ -inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) { - - if (h.length < n.length) return h.length; - - switch (n.length) { - case 0: return 0; - case 1: return sz_find_unigram_swar(h, *n.start); - case 2: return sz_find_bigram_swar(h, n.start); - case 3: return sz_find_trigram_swar(h, n.start); - case 4: return sz_find_quadgram_swar(h, n.start); +inline static sz_string_ptr_t sz_find_substr_swar( // + sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle, + sz_size_t const needle_length) { + + if (haystack_length < needle_length) return NULL; + + sz_size_t anomaly_offset = 0; + switch (needle_length) { + case 0: return NULL; + case 1: return sz_find_1char_swar(haystack, haystack_length, needle); + case 2: return sz_find_2char_swar(haystack, haystack_length, needle); + case 3: return sz_find_3char_swar(haystack, haystack_length, needle); + case 4: return sz_find_4char_swar(haystack, haystack_length, needle); default: { - char const *text = h.start; - char const *const end = h.start + h.length; - - sz_quadgram_t n_quadgram, h_quadgram; - sz_size_t const n_suffix_len = n.length - 4 - n.quadgram_offset; - char const *n_suffix_ptr = n.start + 4 + n.quadgram_offset; - n_quadgram.u8s[0] = n.start[n.quadgram_offset]; - n_quadgram.u8s[1] = n.start[n.quadgram_offset + 1]; - n_quadgram.u8s[2] = n.start[n.quadgram_offset + 2]; - n_quadgram.u8s[3] = n.start[n.quadgram_offset + 3]; - h_quadgram.u8s[0] = h.start[0]; - h_quadgram.u8s[1] = h.start[1]; - h_quadgram.u8s[2] = h.start[2]; - h_quadgram.u8s[3] = h.start[3]; - - text += n.quadgram_offset; - while (text + n.length <= end) { - h_quadgram.u8s[3] = text[3]; - if (h_quadgram.u32 == n_quadgram.u32) // Match quadgram. - if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. - if (sz_equal(text - n.quadgram_offset, n.start, n.quadgram_offset)) // Match prefix. - return text - h.start - n.quadgram_offset; - - h_quadgram.u32 >>= 8; + sz_string_ptr_t text = haystack; + sz_string_ptr_t const end = haystack + haystack_length; + + _sz_anomaly_t n_anomaly, h_anomaly; + sz_size_t const n_suffix_len = needle_length - 4 - anomaly_offset; + sz_string_ptr_t n_suffix_ptr = needle + 4 + anomaly_offset; + n_anomaly.u8s[0] = needle[anomaly_offset]; + n_anomaly.u8s[1] = needle[anomaly_offset + 1]; + n_anomaly.u8s[2] = needle[anomaly_offset + 2]; + n_anomaly.u8s[3] = needle[anomaly_offset + 3]; + h_anomaly.u8s[0] = haystack[0]; + h_anomaly.u8s[1] = haystack[1]; + h_anomaly.u8s[2] = haystack[2]; + h_anomaly.u8s[3] = haystack[3]; + + text += anomaly_offset; + while (text + needle_length <= end) { + h_anomaly.u8s[3] = text[3]; + if (h_anomaly.u32 == n_anomaly.u32) // Match anomaly. + if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. + if (sz_equal(text - anomaly_offset, needle, anomaly_offset)) // Match prefix - optimized out. + return text - anomaly_offset; + + h_anomaly.u32 >>= 8; ++text; } - return h.length; + return NULL; } } } @@ -390,34 +417,33 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) { /** * Helper function, used in substring search operations. */ -inline static void _sz_find_substr_populate_quadgram( // - sz_haystack_t h, - sz_needle_t n, - sz_quadgram_t *quadgram_out, - sz_quadgram_t *mask_out) { - - sz_quadgram_t quadgram; - sz_quadgram_t mask; - switch (n.length) { +inline static void _sz_find_substr_populate_anomaly( // + sz_string_ptr_t const needle, + sz_size_t const needle_length, + _sz_anomaly_t *anomaly_out, + _sz_anomaly_t *mask_out) { + + _sz_anomaly_t anomaly; + _sz_anomaly_t mask; + switch (needle_length) { case 1: mask.u8s[0] = 0xFF, mask.u8s[1] = mask.u8s[2] = mask.u8s[3] = 0; - quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = quadgram.u8s[2] = quadgram.u8s[3] = 0; + anomaly.u8s[0] = needle[0], anomaly.u8s[1] = anomaly.u8s[2] = anomaly.u8s[3] = 0; break; case 2: mask.u8s[0] = mask.u8s[1] = 0xFF, mask.u8s[2] = mask.u8s[3] = 0; - quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = quadgram.u8s[3] = 0; + anomaly.u8s[0] = needle[0], anomaly.u8s[1] = needle[1], anomaly.u8s[2] = anomaly.u8s[3] = 0; break; case 3: mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, mask.u8s[3] = 0; - quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], quadgram.u8s[3] = 0; + anomaly.u8s[0] = needle[0], anomaly.u8s[1] = needle[1], anomaly.u8s[2] = needle[2], anomaly.u8s[3] = 0; break; default: mask.u32 = 0xFFFFFFFF; - quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], - quadgram.u8s[3] = n.start[3]; + anomaly.u8s[0] = needle[0], anomaly.u8s[1] = needle[1], anomaly.u8s[2] = needle[2], anomaly.u8s[3] = needle[3]; break; } - *quadgram_out = quadgram; + *anomaly_out = anomaly; *mask_out = mask; } @@ -429,14 +455,17 @@ inline static void _sz_find_substr_populate_quadgram( // * was practically more efficient than loading once and shifting around, as introduces * less data dependencies. */ -inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { +inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle, + sz_size_t const needle_length) { // Precomputed constants - char const *const end = h.start + h.length; - sz_quadgram_t quadgram; - sz_quadgram_t mask; - _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask); - __m256i const quadgrams = _mm256_set1_epi32(quadgram.u32); + sz_string_ptr_t const end = haystack + haystack_length; + _sz_anomaly_t anomaly; + _sz_anomaly_t mask; + _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask); + __m256i const anomalies = _mm256_set1_epi32(anomaly.u32); __m256i const masks = _mm256_set1_epi32(mask.u32); // Top level for-loop changes dramatically. @@ -448,18 +477,18 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { // + 4 movemasks. // + 3 bitwise ANDs. // + 1 heavy (but very unlikely) branch. - char const *text = h.start; - while (text + n.length + 32 <= end) { + sz_string_ptr_t text = haystack; + while (text + needle_length + 32 <= end) { // Performing many unaligned loads ends up being faster than loading once and shuffling around. __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks); - int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, quadgrams)); + int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, anomalies)); __m256i texts1 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 1)), masks); - int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, quadgrams)); + int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, anomalies)); __m256i text2 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 2)), masks); - int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, quadgrams)); + int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, anomalies)); __m256i texts3 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 3)), masks); - int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, quadgrams)); + int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies)); if (matches0 | matches1 | matches2 | matches3) { int matches = // @@ -468,25 +497,21 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { (matches2 & 0x4444'4444u) | // (matches3 & 0x8888'8888u); size_t first_match_offset = _tzcnt_u32(matches); - if (n.length > 4) { - if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4)) - return text + first_match_offset - h.start; + if (needle_length > 4) { + if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4)) + return text + first_match_offset; else text += first_match_offset + 1; } else - return text + first_match_offset - h.start; - } + return text + first_match_offset; + } else text += 32; } // Don't forget the last (up to 35) characters. - sz_haystack_t tail; - tail.start = text; - tail.length = end - text; - size_t tail_match = sz_find_substr_swar(tail, n); - return text + tail_match - h.start; + return sz_find_substr_swar(text, end - text, needle, needle_length); } #endif // x86 AVX2 @@ -499,26 +524,29 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { * was practically more efficient than loading once and shifting around, as introduces * less data dependencies. */ -inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { +inline static sz_string_ptr_t sz_find_substr_neon(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle, + sz_size_t const needle_length) { // Precomputed constants - char const *const end = h.start + h.length; - sz_quadgram_t quadgram; - sz_quadgram_t mask; - _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask); - uint32x4_t const quadgrams = vld1q_dup_u32(&quadgram.u32); - uint32x4_t const masks = vld1q_dup_u32(&mask); + sz_string_ptr_t const end = haystack + haystack_length; + _sz_anomaly_t anomaly; + _sz_anomaly_t mask; + _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask); + uint32x4_t const anomalies = vld1q_dup_u32(&anomaly.u32); + uint32x4_t const masks = vld1q_dup_u32(&mask.u32); uint32x4_t matches, matches0, matches1, matches2, matches3; - char const *text = h.start; - while (text + n.length + 16 <= end) { + sz_string_ptr_t text = haystack; + while (text + needle_length + 16 <= end) { // Each of the following `matchesX` contains only 4 relevant bits - one per word. // Each signifies a match at the given offset. - matches0 = vceqq_u32(vandq_u32(vld1q_u32(text + 0), masks), quadgrams); - matches1 = vceqq_u32(vandq_u32(vld1q_u32(text + 1), masks), quadgrams); - matches2 = vceqq_u32(vandq_u32(vld1q_u32(text + 2), masks), quadgrams); - matches3 = vceqq_u32(vandq_u32(vld1q_u32(text + 3), masks), quadgrams); + matches0 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 0)), masks), anomalies); + matches1 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 1)), masks), anomalies); + matches2 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 2)), masks), anomalies); + matches3 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 3)), masks), anomalies); matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3)); if (vmaxvq_u32(matches)) { @@ -540,73 +568,172 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { // Find the first match size_t first_match_offset = __builtin_ctz(matches_u16); - if (n.length > 4) { - if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4)) - return text + first_match_offset - h.start; + if (needle_length > 4) { + if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4)) + return text + first_match_offset; else text += first_match_offset + 1; } else - return text + first_match_offset - h.start; + return text + first_match_offset; } else text += 16; } // Don't forget the last (up to 16+3=19) characters. - sz_haystack_t tail; - tail.start = text; - tail.length = end - text; - size_t tail_match = sz_find_substr_swar(tail, n); - return text + tail_match - h.start; + return sz_find_substr_swar(text, end - text, needle, needle_length); } #endif // Arm Neon -inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_unigram_swar(h, n); } -inline static sz_size_t sz_find_unigram(sz_haystack_t h, char n) { return sz_find_unigram_swar(h, n); } +inline static sz_size_t sz_count_char(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { + return sz_count_char_swar(haystack, haystack_length, needle); +} -inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) { - if (h.length < n.length) return h.length; +inline static sz_string_ptr_t sz_find_1char(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { + return sz_find_1char_swar(haystack, haystack_length, needle); +} +inline static sz_string_ptr_t sz_rfind_1char(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { + return sz_rfind_1char_swar(haystack, haystack_length, needle); +} + +inline static sz_string_ptr_t sz_find_substr(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle, + sz_size_t const needle_length) { + if (haystack_length < needle_length) return NULL; #if defined(__ARM_NEON) - return sz_find_substr_neon(h, n); + return sz_find_substr_neon(haystack, haystack_length, needle, needle_length); #elif defined(__AVX2__) - return sz_find_substr_avx2(h, n); + return sz_find_substr_avx2(haystack, haystack_length, needle, needle_length); #else - return sz_find_substr_swar(h, n); + return sz_find_substr_swar(haystack, haystack_length, needle, needle_length); #endif } -inline static void sz_swap(sz_size_t *a, sz_size_t *b) { - sz_size_t t = *a; +/** + * @brief Maps any ASCII character to itself, or the lowercase variant, if available. + */ +inline static char sz_tolower_ascii(char c) { + static unsigned char lowered[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // + 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, // + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // + }; + return *(char *)&lowered[(int)c]; +} + +/** + * @brief Maps any ASCII character to itself, or the uppercase variant, if available. + */ +inline static char sz_toupper_ascii(char c) { + static unsigned char upped[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // + 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // + 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // + }; + return *(char *)&upped[(int)c]; +} + +/** + * @brief Char-level lexicographic comparison of two strings. + * Doesn't provide major performance improvements, but helps avoid the LibC dependency. + */ +inline static sz_bool_t sz_is_less_ascii(sz_string_ptr_t const a, + sz_size_t const a_length, + sz_string_ptr_t const b, + sz_size_t const b_length) { + + sz_size_t min_length = (a_length < b_length) ? a_length : b_length; + for (sz_size_t i = 0; i < min_length; ++i) { + if (a[i] < b[i]) return 1; + if (a[i] > b[i]) return 0; + } + return a_length < b_length; +} + +/** + * @brief Char-level lexicographic comparison of two strings, insensitive to the case of ASCII symbols. + * Doesn't provide major performance improvements, but helps avoid the LibC dependency. + */ +inline static sz_bool_t sz_is_less_uncased_ascii(sz_string_ptr_t const a, + sz_size_t const a_length, + sz_string_ptr_t const b, + sz_size_t const b_length) { + + sz_size_t min_length = (a_length < b_length) ? a_length : b_length; + for (sz_size_t i = 0; i < min_length; ++i) { + char a_lower = sz_tolower_ascii(a[i]); + char b_lower = sz_tolower_ascii(b[i]); + if (a_lower < b_lower) return 1; + if (a_lower > b_lower) return 0; + } + return a_length < b_length; +} + +/** + * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. + */ +inline static void _sz_swap_order(sz_u64_t *a, sz_u64_t *b) { + sz_u64_t t = *a; *a = *b; *b = t; } -typedef char const *(*sz_sequence_get_start_t)(void const *, sz_size_t); -typedef sz_size_t (*sz_sequence_get_length_t)(void const *, sz_size_t); -typedef int (*sz_sequence_predicate_t)(void const *, sz_size_t); -typedef int (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t); +struct sz_sequence_s; -// Define a type for the comparison function, depending on the platform. -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || defined(__APPLE__) -typedef int (*sz_qsort_comparison_func_t)(void *, void const *, void const *); -#else -typedef int (*sz_qsort_comparison_func_t)(void const *, void const *, void *); -#endif +typedef sz_string_ptr_t (*sz_sequence_member_start_t)(void const *, sz_size_t); +typedef sz_size_t (*sz_sequence_member_length_t)(void const *, sz_size_t); +typedef sz_bool_t (*sz_sequence_predicate_t)(void const *, sz_size_t); +typedef sz_bool_t (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t); +typedef sz_bool_t (*sz_string_is_less_t)(sz_string_ptr_t, sz_size_t, sz_string_ptr_t, sz_size_t); -typedef struct sz_sequence_t { - sz_size_t *order; +typedef struct sz_sequence_s { + sz_u64_t *order; sz_size_t count; - sz_sequence_get_start_t get_start; - sz_sequence_get_length_t get_length; + sz_sequence_member_start_t get_start; + sz_sequence_member_length_t get_length; void const *handle; } sz_sequence_t; /** - * @brief Similar to `std::partition`, given a predicate splits the - * sequence into two parts. + * @brief Similar to `std::partition`, given a predicate splits the sequence into two parts. + * The algorithm is unstable, meaning that elements may change relative order, as long + * as they are in the right partition. This is the simpler algorithm for partitioning. */ inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { @@ -615,14 +742,16 @@ inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predic for (sz_size_t i = matches + 1; i < sequence->count; ++i) if (predicate(sequence->handle, sequence->order[i])) - sz_swap(sequence->order + i, sequence->order + matches), ++matches; + _sz_swap_order(sequence->order + i, sequence->order + matches), ++matches; return matches; } /** - * @brief Inplace `std::set_union` for two consecutive chunks forming - * the same continuous sequence. + * @brief Inplace `std::set_union` for two consecutive chunks forming the same continuous `sequence`. + * + * @param partition The number of elements in the first sub-sequence in `sequence`. + * @param less Comparison function, to determine the lexicographic ordering. */ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) { @@ -642,10 +771,7 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq // Shift all the elements between element 1 // element 2, right by 1. - while (index != start_a) { - sequence->order[index] = sequence->order[index - 1]; - index--; - } + while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; } sequence->order[start_a] = value; // Update all the pointers @@ -656,112 +782,86 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq } } +inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t comparator) { + sz_u64_t *keys = sequence->order; + sz_size_t keys_count = sequence->count; + for (sz_size_t i = 1; i < keys_count; i++) { + sz_u64_t i_key = keys[i]; + // Move elements of arr[0..i-1] that are greater than key to one position ahead of their current position + sz_size_t j = i; + for (; j > 0 && comparator(sequence, keys[j - 1], i) != 0; --j) keys[j] = keys[j - 1]; + keys[j] = i_key; + } +} + +/** + * @brief Internal Radix sorting procedure. + */ inline static void _sz_sort_recursion( // sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, - sz_qsort_comparison_func_t qsort_comparator) { + sz_sequence_comparator_t comparator, + sz_size_t partial_order_length) { if (!sequence->count) return; // Partition a range of integers according to a specific bit value sz_size_t split = 0; { - sz_size_t mask = (1ul << 63) >> bit_idx; + sz_u64_t mask = (1ul << 63) >> bit_idx; while (split != sequence->count && !(sequence->order[split] & mask)) ++split; for (sz_size_t i = split + 1; i < sequence->count; ++i) - if (!(sequence->order[i] & mask)) sz_swap(sequence->order + i, sequence->order + split), ++split; + if (!(sequence->order[i] & mask)) _sz_swap_order(sequence->order + i, sequence->order + split), ++split; } // Go down recursively if (bit_idx < bit_max) { sz_sequence_t a = *sequence; a.count = split; - _sz_sort_recursion(&a, bit_idx + 1, bit_max, qsort_comparator); + _sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length); sz_sequence_t b = *sequence; b.order += split; b.count -= split; - _sz_sort_recursion(&b, bit_idx + 1, bit_max, qsort_comparator); + _sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length); } // Reached the end of recursion else { // Discard the prefixes - for (sz_size_t i = 0; i != sequence->count; ++i) { memset((char *)(&sequence->order[i]) + 4, 0, 4ul); } - - // Perform sorts on smaller chunks instead of the whole handle -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) - // https://stackoverflow.com/a/39561369 - // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/qsort-s?view=msvc-170 - qsort_s(sequence->order, split, sizeof(sz_size_t), qsort_comparator, (void *)sequence); - qsort_s(sequence->order + split, - sequence->count - split, - sizeof(sz_size_t), - qsort_comparator, - (void *)sequence); -#elif __APPLE__ - qsort_r(sequence->order, split, sizeof(sz_size_t), (void *)sequence, qsort_comparator); - qsort_r(sequence->order + split, - sequence->count - split, - sizeof(sz_size_t), - (void *)sequence, - qsort_comparator); -#else - // https://linux.die.net/man/3/qsort_r - qsort_r(sequence->order, split, sizeof(sz_size_t), qsort_comparator, (void *)sequence); - qsort_r(sequence->order + split, - sequence->count - split, - sizeof(sz_size_t), - qsort_comparator, - (void *)sequence); -#endif + sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; + for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; } + + sz_sequence_t a = *sequence; + a.count = split; + sz_sort_insertion(&a, comparator); + + sz_sequence_t b = *sequence; + b.order += split; + b.count -= split; + sz_sort_insertion(&b, comparator); } } -inline static int _sz_sort_sequence_strncmp( -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__ - void *sequence_raw, void const *a_raw, void const *b_raw -#else - void const *a_raw, void const *b_raw, void *sequence_raw -#endif -) { - // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1 - // https://www.man7.org/linux/man-pages/man3/strcmp.3.html - sz_sequence_t *sequence = (sz_sequence_t *)sequence_raw; - sz_size_t a = *(sz_size_t *)a_raw; - sz_size_t b = *(sz_size_t *)b_raw; - sz_size_t a_len = sequence->get_length(sequence->handle, a); - sz_size_t b_len = sequence->get_length(sequence->handle, b); - int res = strncmp( // - sequence->get_start(sequence->handle, a), - sequence->get_start(sequence->handle, b), - a_len > b_len ? b_len : a_len); - return res ? res : a_len - b_len; +inline static sz_bool_t _sz_sort_compare_less_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { + sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key); + sz_size_t i_len = sequence->get_length(sequence->handle, i_key); + sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key); + sz_size_t j_len = sequence->get_length(sequence->handle, j_key); + return sz_is_less_ascii(i_str, i_len, j_str, j_len); } -inline static int _sz_sort_sequence_strncasecmp( -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__ - void *sequence_raw, void const *a_raw, void const *b_raw -#else - void const *a_raw, void const *b_raw, void *sequence_raw -#endif -) { - // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1 - // https://www.man7.org/linux/man-pages/man3/strcmp.3.html - sz_sequence_t *sequence = (sz_sequence_t *)sequence_raw; - sz_size_t a = *(sz_size_t *)a_raw; - sz_size_t b = *(sz_size_t *)b_raw; - sz_size_t a_len = sequence->get_length(sequence->handle, a); - sz_size_t b_len = sequence->get_length(sequence->handle, b); - int res = strncasecmp( // - sequence->get_start(sequence->handle, a), - sequence->get_start(sequence->handle, b), - a_len > b_len ? b_len : a_len); - return res ? res : a_len - b_len; +inline static sz_bool_t _sz_sort_compare_less_uncased_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { + sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key); + sz_size_t i_len = sequence->get_length(sequence->handle, i_key); + sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key); + sz_size_t j_len = sequence->get_length(sequence->handle, j_key); + return sz_is_less_uncased_ascii(i_str, i_len, j_str, j_len); } typedef struct sz_sort_config_t { - int case_insensitive; + sz_bool_t case_insensitive; + sz_size_t partial_order_length; } sz_sort_config_t; /** @@ -770,11 +870,13 @@ typedef struct sz_sort_config_t { */ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *config) { - int case_insensitive = config && config->case_insensitive; + sz_bool_t case_insensitive = config && config->case_insensitive; + sz_size_t partial_order_length = + config && config->partial_order_length ? config->partial_order_length : sequence->count; // Export up to 4 bytes into the `sequence` bits themselves for (sz_size_t i = 0; i != sequence->count; ++i) { - char const *begin = sequence->get_start(sequence->handle, sequence->order[i]); + sz_string_ptr_t begin = sequence->get_start(sequence->handle, sequence->order[i]); sz_size_t length = sequence->get_length(sequence->handle, sequence->order[i]); length = length > 4ul ? 4ul : length; char *prefix = (char *)&sequence->order[i]; @@ -787,11 +889,11 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf } } - sz_qsort_comparison_func_t comparator = _sz_sort_sequence_strncmp; - if (case_insensitive) comparator = _sz_sort_sequence_strncasecmp; + sz_sequence_comparator_t comparator = (sz_sequence_comparator_t)_sz_sort_compare_less_ascii; + if (case_insensitive) comparator = (sz_sequence_comparator_t)_sz_sort_compare_less_uncased_ascii; // Perform optionally-parallel radix sort on them - _sz_sort_recursion(sequence, 0, 32, comparator); + _sz_sort_recursion(sequence, 0, 32, comparator, partial_order_length); } typedef unsigned char levenstein_distance_t; @@ -806,9 +908,9 @@ inline static sz_size_t sz_levenstein_memory_needed(sz_size_t _, sz_size_t b_len * @brief Auxiliary function, that computes the minimum of three values. */ inline static levenstein_distance_t _sz_levenstein_minimum( // - levenstein_distance_t a, - levenstein_distance_t b, - levenstein_distance_t c) { + levenstein_distance_t const a, + levenstein_distance_t const b, + levenstein_distance_t const c) { return (a < b ? (a < c ? a : c) : (b < c ? b : c)); } @@ -818,11 +920,11 @@ inline static levenstein_distance_t _sz_levenstein_minimum( // * It accepts an upper bound on the possible error. Quadratic complexity in time, linear in space. */ inline static levenstein_distance_t sz_levenstein( // - char const *a, - sz_size_t a_length, - char const *b, - sz_size_t b_length, - levenstein_distance_t bound, + sz_string_ptr_t const a, + sz_size_t const a_length, + sz_string_ptr_t const b, + sz_size_t const b_length, + levenstein_distance_t const bound, void *buffer) { // If one of the strings is empty - the edit distance is equal to the length of the other one @@ -873,11 +975,11 @@ inline static levenstein_distance_t sz_levenstein( // /** * @brief Hashes provided string using hardware-accelerated CRC32 instructions. */ -inline static sz_u32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_native(sz_string_ptr_t start, sz_size_t length) { return 0; } -inline static sz_u32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_neon(sz_string_ptr_t start, sz_size_t length) { return 0; } -inline static sz_u32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_sse(sz_string_ptr_t start, sz_size_t length) { return 0; } #ifdef __cplusplus } @@ -889,5 +991,6 @@ inline static sz_u32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { #endif #undef popcount64 #undef ctz64 +#undef clz64 #endif // STRINGZILLA_H_ From a7796a13eb365ff7a0d044576840abb42001db63 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 14:38:44 -0700 Subject: [PATCH 60/72] Improve: Intro-sort --- stringzilla/stringzilla.h | 167 ++++++++++++++++++++++++++++++++------ 1 file changed, 142 insertions(+), 25 deletions(-) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 0aa8774b..84e864cf 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -714,15 +714,15 @@ inline static void _sz_swap_order(sz_u64_t *a, sz_u64_t *b) { *b = t; } -struct sz_sequence_s; +struct sz_sequence_t; -typedef sz_string_ptr_t (*sz_sequence_member_start_t)(void const *, sz_size_t); -typedef sz_size_t (*sz_sequence_member_length_t)(void const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_predicate_t)(void const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t); +typedef sz_string_ptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); +typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); +typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t); +typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t); typedef sz_bool_t (*sz_string_is_less_t)(sz_string_ptr_t, sz_size_t, sz_string_ptr_t, sz_size_t); -typedef struct sz_sequence_s { +typedef struct sz_sequence_t { sz_u64_t *order; sz_size_t count; sz_sequence_member_start_t get_start; @@ -738,10 +738,10 @@ typedef struct sz_sequence_s { inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { sz_size_t matches = 0; - while (matches != sequence->count && predicate(sequence->handle, sequence->order[matches])) ++matches; + while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches; for (sz_size_t i = matches + 1; i < sequence->count; ++i) - if (predicate(sequence->handle, sequence->order[i])) + if (predicate(sequence, sequence->order[i])) _sz_swap_order(sequence->order + i, sequence->order + matches), ++matches; return matches; @@ -758,13 +758,13 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq sz_size_t start_b = partition + 1; // If the direct merge is already sorted - if (!less(sequence->handle, sequence->order[start_b], sequence->order[partition])) return; + if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return; sz_size_t start_a = 0; while (start_a <= partition && start_b <= sequence->count) { // If element 1 is in right place - if (!less(sequence->handle, sequence->order[start_b], sequence->order[start_a])) { start_a++; } + if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; } else { sz_size_t value = sequence->order[start_b]; sz_size_t index = start_b; @@ -782,18 +782,135 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq } } -inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t comparator) { +inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) { sz_u64_t *keys = sequence->order; sz_size_t keys_count = sequence->count; for (sz_size_t i = 1; i < keys_count; i++) { sz_u64_t i_key = keys[i]; - // Move elements of arr[0..i-1] that are greater than key to one position ahead of their current position sz_size_t j = i; - for (; j > 0 && comparator(sequence, keys[j - 1], i) != 0; --j) keys[j] = keys[j - 1]; + for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1]; keys[j] = i_key; } } +// Utility functions +inline static sz_size_t _sz_log2i(sz_size_t n) { + sz_size_t log2 = 0; + while (n >>= 1) ++log2; + return log2; +} + +inline static void _sz_sift_down( + sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, sz_size_t end) { + sz_size_t root = start; + while (2 * root + 1 <= end) { + sz_size_t child = 2 * root + 1; + if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; } + if (!less(sequence, order[root], order[child])) { return; } + _sz_swap_order(order + root, order + child); + root = child; + } +} + +inline static void _sz_heapify(sz_sequence_t *sequence, + sz_sequence_comparator_t less, + sz_u64_t *order, + sz_size_t count) { + sz_size_t start = (count - 2) / 2; + while (1) { + _sz_sift_down(sequence, less, order, start, count - 1); + if (start == 0) return; + start--; + } +} + +inline static void _sz_heapsort(sz_sequence_t *sequence, + sz_sequence_comparator_t less, + sz_size_t first, + sz_size_t last) { + sz_u64_t *order = sequence->order; + sz_size_t count = last - first; + _sz_heapify(sequence, less, order + first, count); + sz_size_t end = count - 1; + while (end > 0) { + _sz_swap_order(order + first, order + first + end); + end--; + _sz_sift_down(sequence, less, order + first, 0, end); + } +} + +inline static void _sz_introsort( + sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last, sz_size_t depth) { + + sz_size_t length = last - first; + switch (length) { + case 0: + case 1: return; + case 2: + if (less(sequence, sequence->order[first + 1], sequence->order[first])) + _sz_swap_order(&sequence->order[first], &sequence->order[first + 1]); + return; + case 3: + sz_u64_t a = sequence->order[first]; + sz_u64_t b = sequence->order[first + 1]; + sz_u64_t c = sequence->order[first + 2]; + if (less(sequence, b, a)) _sz_swap_order(&a, &b); + if (less(sequence, c, b)) _sz_swap_order(&c, &b); + if (less(sequence, b, a)) _sz_swap_order(&a, &b); + sequence->order[first] = a; + sequence->order[first + 1] = b; + sequence->order[first + 2] = c; + return; + } + // Until a certain length, the quadratic-complexity insertion-sort is fine + if (length <= 16) { + sz_sequence_t sub_seq = *sequence; + sub_seq.order += first; + sub_seq.count = length; + sz_sort_insertion(&sub_seq, less); + return; + } + + // Fallback to N-logN-complexity heap-sort + if (depth == 0) { + _sz_heapsort(sequence, less, first, last); + return; + } + + --depth; + + // Median-of-three logic to choose pivot + sz_size_t median = first + length / 2; + if (less(sequence, sequence->order[median], sequence->order[first])) + _sz_swap_order(&sequence->order[first], &sequence->order[median]); + if (less(sequence, sequence->order[last - 1], sequence->order[first])) + _sz_swap_order(&sequence->order[first], &sequence->order[last - 1]); + if (less(sequence, sequence->order[median], sequence->order[last - 1])) + _sz_swap_order(&sequence->order[median], &sequence->order[last - 1]); + + // Partition using the median-of-three as the pivot + sz_u64_t pivot = sequence->order[median]; + sz_size_t left = first; + sz_size_t right = last - 1; + while (true) { + while (less(sequence, sequence->order[left], pivot)) left++; + while (less(sequence, pivot, sequence->order[right])) right--; + if (left >= right) break; + _sz_swap_order(&sequence->order[left], &sequence->order[right]); + left++; + right--; + } + + // Recursively sort the partitions + _sz_introsort(sequence, less, first, left, depth); + _sz_introsort(sequence, less, right + 1, last, depth); +} + +inline static void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) { + sz_size_t depth_limit = 2 * _sz_log2i(sequence->count); + _sz_introsort(sequence, less, 0, sequence->count, depth_limit); +} + /** * @brief Internal Radix sorting procedure. */ @@ -834,28 +951,28 @@ inline static void _sz_sort_recursion( // sz_sequence_t a = *sequence; a.count = split; - sz_sort_insertion(&a, comparator); + sz_sort_introsort(&a, comparator); sz_sequence_t b = *sequence; b.order += split; b.count -= split; - sz_sort_insertion(&b, comparator); + sz_sort_introsort(&b, comparator); } } inline static sz_bool_t _sz_sort_compare_less_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key); - sz_size_t i_len = sequence->get_length(sequence->handle, i_key); - sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key); - sz_size_t j_len = sequence->get_length(sequence->handle, j_key); + sz_string_ptr_t i_str = sequence->get_start(sequence, i_key); + sz_size_t i_len = sequence->get_length(sequence, i_key); + sz_string_ptr_t j_str = sequence->get_start(sequence, j_key); + sz_size_t j_len = sequence->get_length(sequence, j_key); return sz_is_less_ascii(i_str, i_len, j_str, j_len); } inline static sz_bool_t _sz_sort_compare_less_uncased_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key); - sz_size_t i_len = sequence->get_length(sequence->handle, i_key); - sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key); - sz_size_t j_len = sequence->get_length(sequence->handle, j_key); + sz_string_ptr_t i_str = sequence->get_start(sequence, i_key); + sz_size_t i_len = sequence->get_length(sequence, i_key); + sz_string_ptr_t j_str = sequence->get_start(sequence, j_key); + sz_size_t j_len = sequence->get_length(sequence, j_key); return sz_is_less_uncased_ascii(i_str, i_len, j_str, j_len); } @@ -876,8 +993,8 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf // Export up to 4 bytes into the `sequence` bits themselves for (sz_size_t i = 0; i != sequence->count; ++i) { - sz_string_ptr_t begin = sequence->get_start(sequence->handle, sequence->order[i]); - sz_size_t length = sequence->get_length(sequence->handle, sequence->order[i]); + sz_string_ptr_t begin = sequence->get_start(sequence, sequence->order[i]); + sz_size_t length = sequence->get_length(sequence, sequence->order[i]); length = length > 4ul ? 4ul : length; char *prefix = (char *)&sequence->order[i]; for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; From 51c47fdbc489340d13eb0c8a8879f4bee47d340d Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 16:30:38 -0700 Subject: [PATCH 61/72] Refactor: New C API for JS --- javascript/lib.c | 62 +++++++++++++------------- javascript/test/find.js | 14 +++--- scripts/test.c | 13 +++--- scripts/test.cpp | 92 +++++++++++++++++++++------------------ stringzilla/stringzilla.h | 9 ++-- 5 files changed, 97 insertions(+), 93 deletions(-) diff --git a/javascript/lib.c b/javascript/lib.c index fe1f5f68..18e36a1b 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -8,17 +8,18 @@ * @see NodeJS docs: https://nodejs.org/api/n-api.html */ -#include -#include +#include // `napi_*` functions +#include // `malloc` +#include // `sz_*` functions -napi_value FindAPI(napi_env env, napi_callback_info info) { +napi_value indexOfAPI(napi_env env, napi_callback_info info) { size_t argc = 2; napi_value args[2]; napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - sz_haystack_t haystack_sz = {NULL, 0}; - sz_needle_t needle_sz = {NULL, 0, 0}; + sz_string_view_t haystack_sz = {NULL, 0}; + sz_string_view_t needle_sz = {NULL, 0}; // For haystack napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length); @@ -38,37 +39,32 @@ napi_value FindAPI(napi_env env, napi_callback_info info) { needle_sz.length + 1, (size_t *)&needle_sz.length); - // Perform the find operation - sz_size_t result = sz_find_substr(haystack_sz, needle_sz); - - // Cleanup - free((void *)haystack_sz.start); - free((void *)needle_sz.start); - // Convert the result to JavaScript BigInt and return napi_value js_result; + if (needle_sz.length == 0) { napi_create_bigint_int64(env, 0, &js_result); } + else { + sz_string_ptr_t result = + sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length); - // In JavaScript, if `find` is unable to find the specified value, then it should return -1 - if (result == haystack_sz.length) napi_create_bigint_int64(env, -1, &js_result); - else - napi_create_bigint_uint64(env, result, &js_result); + // In JavaScript, if `indexOf` is unable to indexOf the specified value, then it should return -1 + if (result == NULL) { napi_create_bigint_int64(env, -1, &js_result); } + else { napi_create_bigint_uint64(env, result - haystack_sz.start, &js_result); } + } + // Cleanup + free((void *)haystack_sz.start); + free((void *)needle_sz.start); return js_result; } -size_t count_char(sz_haystack_t haystack_sz, char needle) { - size_t result = sz_count_char(haystack_sz, needle); - return result; -} - -napi_value CountAPI(napi_env env, napi_callback_info info) { +napi_value countAPI(napi_env env, napi_callback_info info) { size_t argc = 3; napi_value args[3]; napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - sz_haystack_t haystack_sz = {NULL, 0}; - sz_needle_t needle_sz = {NULL, 0, 0}; + sz_string_view_t haystack_sz = {NULL, 0}; + sz_string_view_t needle_sz = {NULL, 0}; // For haystack napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length); @@ -95,11 +91,13 @@ napi_value CountAPI(napi_env env, napi_callback_info info) { size_t count = 0; if (needle_sz.length == 0 || haystack_sz.length == 0 || haystack_sz.length < needle_sz.length) { count = 0; } - else if (needle_sz.length == 1) { count = count_char(haystack_sz, needle_sz.start[0]); } + else if (needle_sz.length == 1) { count = sz_count_char(haystack_sz.start, haystack_sz.length, needle_sz.start); } else if (overlap) { while (haystack_sz.length) { - sz_size_t offset = sz_find_substr(haystack_sz, needle_sz); - int found = offset != haystack_sz.length; + sz_string_ptr_t ptr = + sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length); + sz_bool_t found = ptr != NULL; + sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length; count += found; haystack_sz.start += offset + found; haystack_sz.length -= offset + found; @@ -107,8 +105,10 @@ napi_value CountAPI(napi_env env, napi_callback_info info) { } else { while (haystack_sz.length) { - sz_size_t offset = sz_find_substr(haystack_sz, needle_sz); - int found = offset != haystack_sz.length; + sz_string_ptr_t ptr = + sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length); + sz_bool_t found = ptr != NULL; + sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length; count += found; haystack_sz.start += offset + needle_sz.length; haystack_sz.length -= offset + needle_sz.length * found; @@ -129,8 +129,8 @@ napi_value CountAPI(napi_env env, napi_callback_info info) { napi_value Init(napi_env env, napi_value exports) { // Define an array of property descriptors - napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0}; - napi_property_descriptor countDesc = {"count", 0, CountAPI, 0, 0, 0, napi_default, 0}; + napi_property_descriptor findDesc = {"indexOf", 0, indexOfAPI, 0, 0, 0, napi_default, 0}; + napi_property_descriptor countDesc = {"count", 0, countAPI, 0, 0, 0, napi_default, 0}; napi_property_descriptor properties[] = {findDesc, countDesc}; // Define the properties on the `exports` object diff --git a/javascript/test/find.js b/javascript/test/find.js index cd2a800d..9fe4e5b7 100644 --- a/javascript/test/find.js +++ b/javascript/test/find.js @@ -5,26 +5,26 @@ import assert from 'node:assert'; const stringzilla = bindings('stringzilla'); test('Find Word in Text - Positive Case', () => { - const result = stringzilla.find('hello world, hello john', 'hello'); + const result = stringzilla.indexOf('hello world, hello john', 'hello'); assert.strictEqual(result, 0n); }); test('Find Word in Text - Negative Case (Word Not Found)', () => { - const result_1 = stringzilla.find('ha', 'aaa'); + const result_1 = stringzilla.indexOf('ha', 'aaa'); assert.strictEqual(result_1, -1n); - const result_2 = stringzilla.find('g', 'a'); + const result_2 = stringzilla.indexOf('g', 'a'); assert.strictEqual(result_2, -1n); }); test('Find Word in Text - Negative Case (Empty String Inputs)', () => { - const result_1 = stringzilla.find('hello world', ''); + const result_1 = stringzilla.indexOf('hello world', ''); assert.strictEqual(result_1, 0n); - const result_2 = stringzilla.find('', 'a'); + const result_2 = stringzilla.indexOf('', 'a'); assert.strictEqual(result_2, -1n); - const result_3 = stringzilla.find('', ''); - assert.strictEqual(result_2, -1n); + const result_3 = stringzilla.indexOf('', ''); + assert.strictEqual(result_3, 0n); }); diff --git a/scripts/test.c b/scripts/test.c index a921e76d..127975b0 100644 --- a/scripts/test.c +++ b/scripts/test.c @@ -27,24 +27,23 @@ void test_sz_find_substr() { for (int variability = 1; variability < VARIABILITY; variability++) { populate_random_string(buffer, length, variability); - struct sz_haystack_t haystack; + sz_string_view_t haystack; haystack.start = buffer; haystack.length = length; int pattern_length = rand() % 5 + 1; populate_random_string(pattern, pattern_length, variability); - struct sz_needle_t needle; + sz_string_view_t needle; needle.start = pattern; needle.length = pattern_length; // Comparing the result of your function with the standard library function. - const char *result_libc = strstr(buffer, pattern); - uint64_t result_stringzilla = sz_find_substr(haystack, needle); + sz_string_ptr_t result_libc = strstr(buffer, pattern); + sz_string_ptr_t result_stringzilla = + sz_find_substr(haystack.start, haystack.length, needle.start, needle.length); - assert(((result_libc && result_stringzilla == (uint64_t)(result_libc - buffer)) || - (!result_libc && result_stringzilla == (uint64_t)-1)) && - "Test failed for sz_find_substr"); + assert(((result_libc == NULL) ^ (result_stringzilla == NULL)) && "Test failed for sz_find_substr"); } } } diff --git a/scripts/test.cpp b/scripts/test.cpp index ddef4e82..8dc1a4d2 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1,39 +1,39 @@ -#include +#include #include -#include +#include #include -#include -#include -#include +#include #include -#include +#include +#include #include +#include #include using strings_t = std::vector; using idx_t = sz_size_t; -using permute_t = std::vector; +using permute_t = std::vector; #pragma region - C callbacks -static char const *get_start(void const *array_c, sz_size_t i) { - strings_t const &array = *reinterpret_cast(array_c); +static char const *get_start(sz_sequence_t const *array_c, sz_size_t i) { + strings_t const &array = *reinterpret_cast(array_c->handle); return array[i].c_str(); } -static sz_size_t get_length(void const *array_c, sz_size_t i) { - strings_t const &array = *reinterpret_cast(array_c); +static sz_size_t get_length(sz_sequence_t const *array_c, sz_size_t i) { + strings_t const &array = *reinterpret_cast(array_c->handle); return array[i].size(); } -static int is_less(void const *array_c, sz_size_t i, sz_size_t j) { - strings_t const &array = *reinterpret_cast(array_c); +static int is_less(sz_sequence_t const *array_c, sz_size_t i, sz_size_t j) { + strings_t const &array = *reinterpret_cast(array_c->handle); return array[i] < array[j]; } -static int has_under_four_chars(void const *array_c, sz_size_t i) { - strings_t const &array = *reinterpret_cast(array_c); +static int has_under_four_chars(sz_sequence_t const *array_c, sz_size_t i) { + strings_t const &array = *reinterpret_cast(array_c->handle); return array[i].size() < 4; } @@ -64,7 +64,7 @@ void populate_with_test(strings_t &strings) { constexpr size_t offset_in_word = 0; -inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) { +inline static idx_t hybrid_sort_cpp(strings_t const &strings, sz_u64_t *order) { // What if we take up-to 4 first characters and the index for (size_t i = 0; i != strings.size(); ++i) @@ -72,7 +72,7 @@ inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) { strings[order[i]].c_str(), std::min(strings[order[i]].size(), 4ul)); - std::sort(order, order + strings.size(), [&](idx_t i, idx_t j) { + std::sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { char *i_bytes = (char *)&i; char *j_bytes = (char *)&j; return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word); @@ -80,7 +80,7 @@ inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) { for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul); - std::sort(order, order + strings.size(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; }); + std::sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { return strings[i] < strings[j]; }); return strings.size(); } @@ -92,14 +92,14 @@ int hybrid_sort_c_compare_uint32_t(const void *a, const void *b) { } int hybrid_sort_c_compare_strings(void *arg, const void *a, const void *b) { - sz_sequence_t *seq = (sz_sequence_t *)arg; + sz_sequence_t *sequence = (sz_sequence_t *)arg; sz_size_t idx_a = *(sz_size_t *)a; sz_size_t idx_b = *(sz_size_t *)b; - const char *str_a = seq->get_start(seq->handle, idx_a); - const char *str_b = seq->get_start(seq->handle, idx_b); - sz_size_t len_a = seq->get_length(seq->handle, idx_a); - sz_size_t len_b = seq->get_length(seq->handle, idx_b); + const char *str_a = sequence->get_start(sequence, idx_a); + const char *str_b = sequence->get_start(sequence, idx_b); + sz_size_t len_a = sequence->get_length(sequence, idx_a); + sz_size_t len_b = sequence->get_length(sequence, idx_b); int res = strncmp(str_a, str_b, len_a < len_b ? len_a : len_b); return res ? res : (int)(len_a - len_b); @@ -108,8 +108,8 @@ int hybrid_sort_c_compare_strings(void *arg, const void *a, const void *b) { sz_size_t hybrid_sort_c(sz_sequence_t *sequence) { // Copy up to 4 first characters into the 'order' array. for (sz_size_t i = 0; i < sequence->count; ++i) { - const char *str = sequence->get_start(sequence->handle, sequence->order[i]); - sz_size_t len = sequence->get_length(sequence->handle, sequence->order[i]); + const char *str = sequence->get_start(sequence, sequence->order[i]); + sz_size_t len = sequence->get_length(sequence, sequence->order[i]); len = len > 4 ? 4 : len; memcpy((char *)&sequence->order[i] + sizeof(sz_size_t) - 4, str, len); } @@ -128,7 +128,7 @@ sz_size_t hybrid_sort_c(sz_sequence_t *sequence) { return sequence->count; } -inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *order) { +inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, sz_u64_t *order) { // What if we take up-to 4 first characters and the index for (size_t i = 0; i != strings.size(); ++i) @@ -136,7 +136,7 @@ inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *orde strings[order[i]].c_str(), std::min(strings[order[i]].size(), 4ul)); - std::stable_sort(order, order + strings.size(), [&](idx_t i, idx_t j) { + std::stable_sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { char *i_bytes = (char *)&i; char *j_bytes = (char *)&j; return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word); @@ -144,7 +144,7 @@ inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *orde for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul); - std::stable_sort(order, order + strings.size(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; }); + std::stable_sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { return strings[i] < strings[j]; }); return strings.size(); } @@ -209,7 +209,7 @@ int main(int, char const **) { std::printf("Hey, Ash!\n"); strings_t strings; - populate_from_file("leipzig1M.txt", strings, 10000000); + populate_from_file("leipzig1M.txt", strings, 1000000); std::size_t mean_bytes = 0; for (std::string const &str : strings) mean_bytes += str.size(); mean_bytes /= strings.size(); @@ -229,26 +229,23 @@ int main(int, char const **) { for (std::size_t needle_len = 1; needle_len <= 0; ++needle_len) { std::string needle(needle_len, '\4'); std::printf("---- Needle length: %zu\n", needle_len); - bench_search("std::search", full_text, [&]() { + bench_search("std::search", full_text, [&]() mutable { return std::search(full_text.begin(), full_text.end(), needle.begin(), needle.end()) - full_text.begin(); }); - bench_search("sz_find_substr_swar", full_text, [&]() { - sz_haystack_t h {full_text.data(), full_text.size()}; - sz_needle_t n {needle.data(), needle.size()}; - return sz_find_substr_swar(h, n); + bench_search("sz_find_substr_swar", full_text, [&]() mutable { + sz_string_ptr_t ptr = sz_find_substr_swar(full_text.data(), full_text.size(), needle.data(), needle.size()); + return ptr ? ptr - full_text.data() : full_text.size(); }); #if defined(__ARM_NEON) - bench_search("sz_find_substr_neon", full_text, [&]() { - sz_haystack_t h {full_text.data(), full_text.size()}; - sz_needle_t n {needle.data(), needle.size()}; - return sz_find_substr_neon(h, n); + bench_search("sz_find_substr_neon", full_text, [&]() mutable { + sz_string_ptr_t ptr = sz_find_substr_neon(full_text.data(), full_text.size(), needle.data(), needle.size()); + return ptr ? ptr - full_text.data() : full_text.size(); }); #endif #if defined(__AVX2__) - bench_search("sz_find_substr_avx2", full_text, [&]() { - sz_haystack_t h {full_text.data(), full_text.size()}; - sz_needle_t n {needle.data(), needle.size()}; - return sz_find_substr_avx2(h, n); + bench_search("sz_find_substr_avx2", full_text, [&]() mutable { + sz_string_ptr_t ptr = sz_find_substr_avx2(full_text.data(), full_text.size(), needle.data(), needle.size()); + return ptr ? ptr - full_text.data() : full_text.size(); }); #endif } @@ -300,6 +297,17 @@ int main(int, char const **) { }); expect_sorted(strings, permute_new); + bench_permute("sz_sort_introsort", strings, permute_new, [](strings_t const &strings, permute_t &permute) { + sz_sequence_t array; + array.order = permute.data(); + array.count = strings.size(); + array.handle = &strings; + array.get_start = get_start; + array.get_length = get_length; + sz_sort_introsort(&array, (sz_sequence_comparator_t)_sz_sort_compare_less_ascii); + }); + expect_sorted(strings, permute_new); + bench_permute("hybrid_sort_c", strings, permute_new, [](strings_t const &strings, permute_t &permute) { sz_sequence_t array; array.order = permute.data(); diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 84e864cf..ba7f5f39 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -609,7 +609,7 @@ inline static sz_string_ptr_t sz_find_substr(sz_string_ptr_t const haystack, sz_size_t const haystack_length, sz_string_ptr_t const needle, sz_size_t const needle_length) { - if (haystack_length < needle_length) return NULL; + if (haystack_length < needle_length || needle_length == 0) return NULL; #if defined(__ARM_NEON) return sz_find_substr_neon(haystack, haystack_length, needle, needle_length); #elif defined(__AVX2__) @@ -850,7 +850,7 @@ inline static void _sz_introsort( if (less(sequence, sequence->order[first + 1], sequence->order[first])) _sz_swap_order(&sequence->order[first], &sequence->order[first + 1]); return; - case 3: + case 3: { sz_u64_t a = sequence->order[first]; sz_u64_t b = sequence->order[first + 1]; sz_u64_t c = sequence->order[first + 2]; @@ -862,6 +862,7 @@ inline static void _sz_introsort( sequence->order[first + 2] = c; return; } + } // Until a certain length, the quadratic-complexity insertion-sort is fine if (length <= 16) { sz_sequence_t sub_seq = *sequence; @@ -1102,10 +1103,6 @@ inline static sz_u32_t sz_hash_crc32_sse(sz_string_ptr_t start, sz_size_t length } #endif -#ifdef _MSC_VER -#undef strncasecmp -#undef strcasecmp -#endif #undef popcount64 #undef ctz64 #undef clz64 From eadad4ed1007f4233d54d0a767d7925bc9713382 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 16:50:13 -0700 Subject: [PATCH 62/72] Refactor: Sync up Py and JS bindings --- javascript/lib.c | 87 ++++++--------- python/lib.c | 183 +++++++++++++++--------------- scripts/test.cpp | 15 ++- stringzilla/stringzilla.h | 226 ++++++++++++++++++++------------------ 4 files changed, 253 insertions(+), 258 deletions(-) diff --git a/javascript/lib.c b/javascript/lib.c index 18e36a1b..8ebe72eb 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -18,42 +18,33 @@ napi_value indexOfAPI(napi_env env, napi_callback_info info) { napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - sz_string_view_t haystack_sz = {NULL, 0}; - sz_string_view_t needle_sz = {NULL, 0}; + sz_string_view_t haystack = {NULL, 0}; + sz_string_view_t needle = {NULL, 0}; // For haystack - napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length); - haystack_sz.start = malloc(haystack_sz.length + 1); - napi_get_value_string_utf8(env, - args[0], - (char *)haystack_sz.start, - haystack_sz.length + 1, - (size_t *)&haystack_sz.length); + napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack.length); + haystack.start = malloc(haystack.length + 1); + napi_get_value_string_utf8(env, args[0], (char *)haystack.start, haystack.length + 1, (size_t *)&haystack.length); // For needle - napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle_sz.length); - needle_sz.start = malloc(needle_sz.length + 1); - napi_get_value_string_utf8(env, - args[1], - (char *)needle_sz.start, - needle_sz.length + 1, - (size_t *)&needle_sz.length); + napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle.length); + needle.start = malloc(needle.length + 1); + napi_get_value_string_utf8(env, args[1], (char *)needle.start, needle.length + 1, (size_t *)&needle.length); // Convert the result to JavaScript BigInt and return napi_value js_result; - if (needle_sz.length == 0) { napi_create_bigint_int64(env, 0, &js_result); } + if (needle.length == 0) { napi_create_bigint_int64(env, 0, &js_result); } else { - sz_string_ptr_t result = - sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length); + sz_string_start_t result = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); // In JavaScript, if `indexOf` is unable to indexOf the specified value, then it should return -1 if (result == NULL) { napi_create_bigint_int64(env, -1, &js_result); } - else { napi_create_bigint_uint64(env, result - haystack_sz.start, &js_result); } + else { napi_create_bigint_uint64(env, result - haystack.start, &js_result); } } // Cleanup - free((void *)haystack_sz.start); - free((void *)needle_sz.start); + free((void *)haystack.start); + free((void *)needle.start); return js_result; } @@ -63,55 +54,45 @@ napi_value countAPI(napi_env env, napi_callback_info info) { napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - sz_string_view_t haystack_sz = {NULL, 0}; - sz_string_view_t needle_sz = {NULL, 0}; + sz_string_view_t haystack = {NULL, 0}; + sz_string_view_t needle = {NULL, 0}; // For haystack - napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length); - haystack_sz.start = malloc(haystack_sz.length + 1); - napi_get_value_string_utf8(env, - args[0], - (char *)haystack_sz.start, - haystack_sz.length + 1, - (size_t *)&haystack_sz.length); + napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack.length); + haystack.start = malloc(haystack.length + 1); + napi_get_value_string_utf8(env, args[0], (char *)haystack.start, haystack.length + 1, (size_t *)&haystack.length); // For needle - napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle_sz.length); - needle_sz.start = malloc(needle_sz.length + 1); - napi_get_value_string_utf8(env, - args[1], - (char *)needle_sz.start, - needle_sz.length + 1, - (size_t *)&needle_sz.length); + napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle.length); + needle.start = malloc(needle.length + 1); + napi_get_value_string_utf8(env, args[1], (char *)needle.start, needle.length + 1, (size_t *)&needle.length); bool overlap = false; if (argc > 2) { napi_get_value_bool(env, args[2], &overlap); } - void const *haystack_start = haystack_sz.start, *needle_start = needle_sz.start; + void const *haystack_start = haystack.start, *needle_start = needle.start; size_t count = 0; - if (needle_sz.length == 0 || haystack_sz.length == 0 || haystack_sz.length < needle_sz.length) { count = 0; } - else if (needle_sz.length == 1) { count = sz_count_char(haystack_sz.start, haystack_sz.length, needle_sz.start); } + if (needle.length == 0 || haystack.length == 0 || haystack.length < needle.length) { count = 0; } + else if (needle.length == 1) { count = sz_count_char(haystack.start, haystack.length, needle.start); } else if (overlap) { - while (haystack_sz.length) { - sz_string_ptr_t ptr = - sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length); + while (haystack.length) { + sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); sz_bool_t found = ptr != NULL; - sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length; + sz_size_t offset = found ? ptr - haystack.start : haystack.length; count += found; - haystack_sz.start += offset + found; - haystack_sz.length -= offset + found; + haystack.start += offset + found; + haystack.length -= offset + found; } } else { - while (haystack_sz.length) { - sz_string_ptr_t ptr = - sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length); + while (haystack.length) { + sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); sz_bool_t found = ptr != NULL; - sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length; + sz_size_t offset = found ? ptr - haystack.start : haystack.length; count += found; - haystack_sz.start += offset + needle_sz.length; - haystack_sz.length -= offset + needle_sz.length * found; + haystack.start += offset + needle.length; + haystack.length -= offset + needle.length * found; } } diff --git a/python/lib.c b/python/lib.c index a0f6caca..c0ad69d4 100644 --- a/python/lib.c +++ b/python/lib.c @@ -38,10 +38,7 @@ static PyTypeObject FileType; static PyTypeObject StrType; static PyTypeObject StrsType; -static struct { - void *start; - size_t length; -} temporary_memory = {NULL, 0}; +static sz_string_view_t temporary_memory = {NULL, 0}; /** * @brief Describes an on-disk file mapped into RAM, which is different from Python's @@ -55,8 +52,8 @@ typedef struct { #else int file_descriptor; #endif - void *start; - size_t length; + sz_string_start_t start; + sz_size_t length; } File; /** @@ -73,8 +70,8 @@ typedef struct { */ typedef struct { PyObject_HEAD PyObject *parent; - char const *start; - size_t length; + sz_string_start_t start; + sz_size_t length; } Str; /** @@ -133,7 +130,7 @@ typedef struct { struct reordered_slices_t { size_t count; PyObject *parent; - sz_haystack_t *parts; + sz_string_view_t *parts; } reordered; } data; @@ -144,10 +141,13 @@ typedef struct { #pragma region Helpers -typedef int boolean_t; +inline static sz_string_start_t haystacks_get_start(sz_sequence_t *seq, sz_size_t i) { + return ((sz_string_view_t const *)seq->handle)[i].start; +} -inline static char const *haystacks_get_start(sz_haystack_t const *parts, sz_size_t i) { return parts[i].start; } -inline static size_t haystacks_get_length(sz_haystack_t const *parts, sz_size_t i) { return parts[i].length; } +inline static sz_size_t haystacks_get_length(sz_sequence_t *seq, sz_size_t i) { + return ((sz_string_view_t const *)seq->handle)[i].length; +} void reverse_offsets(sz_size_t *array, size_t length) { size_t i, j; @@ -159,21 +159,21 @@ void reverse_offsets(sz_size_t *array, size_t length) { } } -void reverse_haystacks(sz_haystack_t *array, size_t length) { +void reverse_haystacks(sz_string_view_t *array, size_t length) { size_t i, j; // Swap array[i] and array[j] for (i = 0, j = length - 1; i < j; i++, j--) { - sz_haystack_t temp = array[i]; + sz_string_view_t temp = array[i]; array[i] = array[j]; array[j] = temp; } } -void apply_order(sz_haystack_t *array, sz_size_t *order, size_t length) { - for (size_t i = 0; i < length; ++i) { +void apply_order(sz_string_view_t *array, sz_u64_t *order, size_t length) { + for (sz_u64_t i = 0; i < length; ++i) { if (i == order[i]) continue; - sz_haystack_t temp = array[i]; - size_t k = i, j; + sz_string_view_t temp = array[i]; + sz_u64_t k = i, j; while (i != (j = order[k])) { array[k] = array[j]; order[k] = k; @@ -205,7 +205,7 @@ void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset, *normalized_length = end - start; } -boolean_t export_string_like(PyObject *object, char const **start, size_t *length) { +sz_bool_t export_string_like(PyObject *object, sz_string_start_t **start, sz_size_t *length) { if (PyUnicode_Check(object)) { // Handle Python str Py_ssize_t signed_length; @@ -277,7 +277,7 @@ get_string_at_offset_t str_at_offset_getter(Strs *strs) { } } -boolean_t prepare_strings_for_reordering(Strs *strs) { +sz_bool_t prepare_strings_for_reordering(Strs *strs) { // Allocate memory for reordered slices size_t count = 0; @@ -306,7 +306,7 @@ boolean_t prepare_strings_for_reordering(Strs *strs) { return 0; } - sz_haystack_t *new_parts = (sz_haystack_t *)malloc(count * sizeof(sz_haystack_t)); + sz_string_view_t *new_parts = (sz_string_view_t *)malloc(count * sizeof(sz_string_view_t)); if (new_parts == NULL) { PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for reordered slices"); return 0; @@ -333,7 +333,7 @@ boolean_t prepare_strings_for_reordering(Strs *strs) { return 1; } -boolean_t prepare_strings_for_extension(Strs *strs, size_t new_parents, size_t new_parts) { return 1; } +sz_bool_t prepare_strings_for_extension(Strs *strs, size_t new_parents, size_t new_parts) { return 1; } #pragma endregion @@ -622,8 +622,8 @@ static int Str_getbuffer(Str *self, Py_buffer *view, int flags) { view->itemsize = sizeof(char); view->format = "c"; // https://docs.python.org/3/library/struct.html#format-characters view->ndim = 1; - view->shape = &self->length; // 1-D array, so shape is just a pointer to the length - view->strides = itemsize; // strides in a 1-D array is just the item size + view->shape = (Py_ssize_t *)&self->length; // 1-D array, so shape is just a pointer to the length + view->strides = itemsize; // strides in a 1-D array is just the item size view->suboffsets = NULL; view->internal = NULL; @@ -639,18 +639,13 @@ static void Str_releasebuffer(PyObject *_, Py_buffer *view) { static int Str_in(Str *self, PyObject *arg) { - sz_needle_t needle_struct; - needle_struct.quadgram_offset = 0; + sz_string_view_t needle_struct; if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) { PyErr_SetString(PyExc_TypeError, "Unsupported argument type"); return -1; } - sz_haystack_t haystack; - haystack.start = self->start; - haystack.length = self->length; - size_t position = sz_find_substr(haystack, needle_struct); - return position != haystack.length; + return sz_find_substring(self->start, self->length, needle_struct.start, needle_struct.length) != NULL; } static Py_ssize_t Strs_len(Strs *self) { @@ -756,12 +751,12 @@ static PyObject *Strs_subscript(Strs *self, PyObject *key) { to->count = stop - start; to->parent = from->parent; - to->parts = malloc(sizeof(sz_haystack_t) * to->count); + to->parts = malloc(sizeof(sz_string_view_t) * to->count); if (to->parts == NULL && PyErr_NoMemory()) { Py_XDECREF(self_slice); return NULL; } - memcpy(to->parts, from->parts + start, sizeof(sz_haystack_t) * to->count); + memcpy(to->parts, from->parts + start, sizeof(sz_string_view_t) * to->count); Py_INCREF(to->parent); break; } @@ -816,8 +811,8 @@ static int Str_find_( // PyObject *args, PyObject *kwargs, Py_ssize_t *offset_out, - sz_haystack_t *haystack_out, - sz_needle_t *needle_out) { + sz_string_view_t *haystack_out, + sz_string_view_t *needle_out) { int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); Py_ssize_t nargs = PyTuple_Size(args); @@ -845,12 +840,11 @@ static int Str_find_( // } } - sz_haystack_t haystack; - sz_needle_t needle; + sz_string_view_t haystack; + sz_string_view_t needle; Py_ssize_t start, end; // Validate and convert `haystack` and `needle` - needle.quadgram_offset = 0; if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || !export_string_like(needle_obj, &needle.start, &needle.length)) { PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); @@ -884,9 +878,9 @@ static int Str_find_( // haystack.length = normalized_length; // Perform contains operation - size_t offset = sz_find_substr(haystack, needle); - if (offset == haystack.length) { *offset_out = -1; } - else { *offset_out = (Py_ssize_t)offset; } + sz_string_start_t match = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); + if (match == NULL) { *offset_out = -1; } + else { *offset_out = (Py_ssize_t)(match - haystack.start); } *haystack_out = haystack; *needle_out = needle; @@ -895,16 +889,16 @@ static int Str_find_( // static PyObject *Str_find(PyObject *self, PyObject *args, PyObject *kwargs) { Py_ssize_t signed_offset; - sz_haystack_t text; - sz_needle_t separator; + sz_string_view_t text; + sz_string_view_t separator; if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL; return PyLong_FromSsize_t(signed_offset); } static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) { Py_ssize_t signed_offset; - sz_haystack_t text; - sz_needle_t separator; + sz_string_view_t text; + sz_string_view_t separator; if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL; if (signed_offset == -1) { PyErr_SetString(PyExc_ValueError, "substring not found"); @@ -915,8 +909,8 @@ static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) { static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs) { Py_ssize_t signed_offset; - sz_haystack_t text; - sz_needle_t separator; + sz_string_view_t text; + sz_string_view_t separator; if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL; if (signed_offset == -1) { Py_RETURN_FALSE; } else { Py_RETURN_TRUE; } @@ -924,8 +918,8 @@ static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs) static PyObject *Str_partition(PyObject *self, PyObject *args, PyObject *kwargs) { Py_ssize_t separator_index; - sz_haystack_t text; - sz_needle_t separator; + sz_string_view_t text; + sz_string_view_t separator; PyObject *result_tuple; // Use Str_find_ to get the index of the separator @@ -993,13 +987,12 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { return NULL; } - sz_haystack_t haystack; - sz_needle_t needle; + sz_string_view_t haystack; + sz_string_view_t needle; Py_ssize_t start = start_obj ? PyLong_AsSsize_t(start_obj) : 0; Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX; int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0; - needle.quadgram_offset = 0; if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || !export_string_like(needle_obj, &needle.start, &needle.length)) return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL; @@ -1013,27 +1006,28 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { size_t count = 0; if (needle.length == 0 || haystack.length == 0 || haystack.length < needle.length) { count = 0; } - else if (needle.length == 1) { count = sz_count_char(haystack, needle.start[0]); } - else if (needle.length != 1) { - if (allowoverlap) { - while (haystack.length) { - sz_size_t offset = sz_find_substr(haystack, needle); - int found = offset != haystack.length; - count += found; - haystack.start += offset + found; - haystack.length -= offset + found; - } + else if (needle.length == 1) { count = sz_count_char(haystack.start, haystack.length, needle.start); } + else if (allowoverlap) { + while (haystack.length) { + sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); + sz_bool_t found = ptr != NULL; + sz_size_t offset = found ? ptr - haystack.start : haystack.length; + count += found; + haystack.start += offset + found; + haystack.length -= offset + found; } - else { - while (haystack.length) { - sz_size_t offset = sz_find_substr(haystack, needle); - int found = offset != haystack.length; - count += found; - haystack.start += offset + needle.length; - haystack.length -= offset + needle.length * found; - } + } + else { + while (haystack.length) { + sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); + sz_bool_t found = ptr != NULL; + sz_size_t offset = found ? ptr - haystack.start : haystack.length; + count += found; + haystack.start += offset + needle.length; + haystack.length -= offset + needle.length * found; } } + return PyLong_FromSize_t(count); } @@ -1068,7 +1062,7 @@ static PyObject *Str_levenstein(PyObject *self, PyObject *args, PyObject *kwargs return NULL; } - sz_haystack_t str1, str2; + sz_string_view_t str1, str2; if (!export_string_like(str1_obj, &str1.start, &str1.length) || !export_string_like(str2_obj, &str2.start, &str2.length)) { PyErr_Format(PyExc_TypeError, "Both arguments must be string-like"); @@ -1119,7 +1113,7 @@ static PyObject *Str_startswith(PyObject *self, PyObject *args, PyObject *kwargs return NULL; } - sz_haystack_t str, prefix; + sz_string_view_t str, prefix; if (!export_string_like(str_obj, &str.start, &str.length) || !export_string_like(prefix_obj, &prefix.start, &prefix.length)) { PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); @@ -1162,7 +1156,7 @@ static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs) return NULL; } - sz_haystack_t str, suffix; + sz_string_view_t str, suffix; if (!export_string_like(str_obj, &str.start, &str.length) || !export_string_like(suffix_obj, &suffix.start, &suffix.length)) { PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); @@ -1180,7 +1174,7 @@ static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs) } static Strs *Str_split_( - PyObject *parent, sz_haystack_t text, sz_needle_t separator, int keepseparator, Py_ssize_t maxsplit) { + PyObject *parent, sz_string_view_t text, sz_string_view_t separator, int keepseparator, Py_ssize_t maxsplit) { // Create Strs object Strs *result = (Strs *)PyObject_New(Strs, &StrsType); @@ -1209,10 +1203,9 @@ static Strs *Str_split_( // Iterate through string, keeping track of the sz_size_t last_start = 0; while (last_start <= text.length && offsets_count < maxsplit) { - sz_haystack_t text_remaining; - text_remaining.start = text.start + last_start; - text_remaining.length = text.length - last_start; - sz_size_t offset_in_remaining = sz_find_substr(text_remaining, separator); + sz_string_start_t match = + sz_find_substring(text.start + last_start, text.length - last_start, separator.start, separator.length); + sz_size_t offset_in_remaining = match ? match - text.start - last_start : text.length - last_start; // Reallocate offsets array if needed if (offsets_count >= offsets_capacity) { @@ -1232,7 +1225,7 @@ static Strs *Str_split_( } // Export the offset - size_t will_continue = offset_in_remaining != text_remaining.length; + size_t will_continue = match != NULL; size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue; if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; } else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; } @@ -1282,11 +1275,10 @@ static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) { } } - sz_haystack_t text; - sz_needle_t separator; + sz_string_view_t text; + sz_string_view_t separator; int keepseparator; Py_ssize_t maxsplit; - separator.quadgram_offset = 0; // Validate and convert `text` if (!export_string_like(text_obj, &text.start, &text.length)) { @@ -1355,7 +1347,7 @@ static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs } } - sz_haystack_t text; + sz_string_view_t text; int keeplinebreaks; Py_ssize_t maxsplit = PY_SSIZE_T_MAX; // Default value for maxsplit @@ -1388,14 +1380,14 @@ static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs // https://docs.python.org/3/library/stdtypes.html#str.splitlines // \n, \r, \r\n, \v or \x0b, \f or \x0c, \x1c, \x1d, \x1e, \x85, \u2028, \u2029 // https://github.com/ashvardanian/StringZilla/issues/29 - sz_needle_t separator; + sz_string_view_t separator; separator.start = "\n"; separator.length = 1; return Str_split_(text_obj, text, separator, keeplinebreaks, maxsplit); } static PyObject *Str_concat(PyObject *self, PyObject *other) { - struct sz_haystack_t self_str, other_str; + struct sz_string_view_t self_str, other_str; // Validate and convert `self` if (!export_string_like(self, &self_str.start, &self_str.length)) { @@ -1453,7 +1445,8 @@ static PyNumberMethods Str_as_number = { #define sz_method_flags_m METH_VARARGS | METH_KEYWORDS -static PyMethodDef Str_methods[] = { // +static PyMethodDef Str_methods[] = { + // {"find", Str_find, sz_method_flags_m, "Find the first occurrence of a substring."}, {"index", Str_index, sz_method_flags_m, "Find the first occurrence of a substring or raise error if missing."}, {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."}, @@ -1537,14 +1530,14 @@ static PyObject *Strs_shuffle(Strs *self, PyObject *args, PyObject *kwargs) { // Get the parts and their count struct reordered_slices_t *reordered = &self->data.reordered; - sz_haystack_t *parts = reordered->parts; + sz_string_view_t *parts = reordered->parts; size_t count = reordered->count; // Fisher-Yates Shuffle Algorithm for (size_t i = count - 1; i > 0; --i) { size_t j = rand() % (i + 1); // Swap parts[i] and parts[j] - sz_haystack_t temp = parts[i]; + sz_string_view_t temp = parts[i]; parts[i] = parts[j]; parts[j] = temp; } @@ -1552,8 +1545,8 @@ static PyObject *Strs_shuffle(Strs *self, PyObject *args, PyObject *kwargs) { Py_RETURN_NONE; } -static boolean_t Strs_sort_(Strs *self, - sz_haystack_t **parts_output, +static sz_bool_t Strs_sort_(Strs *self, + sz_string_view_t **parts_output, sz_size_t **order_output, sz_size_t *count_output) { @@ -1565,7 +1558,7 @@ static boolean_t Strs_sort_(Strs *self, // Get the parts and their count // The only possible `self->type` by now is the `STRS_REORDERED` - sz_haystack_t *parts = self->data.reordered.parts; + sz_string_view_t *parts = self->data.reordered.parts; size_t count = self->data.reordered.count; // Allocate temporary memory to store the ordering offsets @@ -1627,7 +1620,7 @@ static PyObject *Strs_sort(Strs *self, PyObject *args, PyObject *kwargs) { } } - boolean_t reverse = 0; // Default is False + sz_bool_t reverse = 0; // Default is False if (reverse_obj) { if (!PyBool_Check(reverse_obj)) { PyErr_SetString(PyExc_TypeError, "The reverse must be a boolean"); @@ -1636,7 +1629,7 @@ static PyObject *Strs_sort(Strs *self, PyObject *args, PyObject *kwargs) { reverse = PyObject_IsTrue(reverse_obj); } - sz_haystack_t *parts = NULL; + sz_string_view_t *parts = NULL; sz_size_t *order = NULL; sz_size_t count = 0; if (!Strs_sort_(self, &parts, &order, &count)) return NULL; @@ -1680,7 +1673,7 @@ static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) { } } - boolean_t reverse = 0; // Default is False + sz_bool_t reverse = 0; // Default is False if (reverse_obj) { if (!PyBool_Check(reverse_obj)) { PyErr_SetString(PyExc_TypeError, "The reverse must be a boolean"); @@ -1689,7 +1682,7 @@ static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) { reverse = PyObject_IsTrue(reverse_obj); } - sz_haystack_t *parts = NULL; + sz_string_view_t *parts = NULL; sz_size_t *order = NULL; sz_size_t count = 0; if (!Strs_sort_(self, &parts, &order, &count)) return NULL; diff --git a/scripts/test.cpp b/scripts/test.cpp index 8dc1a4d2..b61b7d40 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -232,19 +232,22 @@ int main(int, char const **) { bench_search("std::search", full_text, [&]() mutable { return std::search(full_text.begin(), full_text.end(), needle.begin(), needle.end()) - full_text.begin(); }); - bench_search("sz_find_substr_swar", full_text, [&]() mutable { - sz_string_ptr_t ptr = sz_find_substr_swar(full_text.data(), full_text.size(), needle.data(), needle.size()); + bench_search("sz_find_substring_swar", full_text, [&]() mutable { + sz_string_start_t ptr = + sz_find_substring_swar(full_text.data(), full_text.size(), needle.data(), needle.size()); return ptr ? ptr - full_text.data() : full_text.size(); }); #if defined(__ARM_NEON) - bench_search("sz_find_substr_neon", full_text, [&]() mutable { - sz_string_ptr_t ptr = sz_find_substr_neon(full_text.data(), full_text.size(), needle.data(), needle.size()); + bench_search("sz_find_substring_neon", full_text, [&]() mutable { + sz_string_start_t ptr = + sz_find_substring_neon(full_text.data(), full_text.size(), needle.data(), needle.size()); return ptr ? ptr - full_text.data() : full_text.size(); }); #endif #if defined(__AVX2__) - bench_search("sz_find_substr_avx2", full_text, [&]() mutable { - sz_string_ptr_t ptr = sz_find_substr_avx2(full_text.data(), full_text.size(), needle.data(), needle.size()); + bench_search("sz_find_substring_avx2", full_text, [&]() mutable { + sz_string_start_t ptr = + sz_find_substring_avx2(full_text.data(), full_text.size(), needle.data(), needle.size()); return ptr ? ptr - full_text.data() : full_text.size(); }); #endif diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index ba7f5f39..c7c0ae49 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -45,16 +45,16 @@ typedef unsigned long sz_size_t; typedef unsigned sz_size_t; #endif -typedef int sz_bool_t; // Only one relevant bit -typedef unsigned sz_u32_t; // Always 32 bits -typedef unsigned long long sz_u64_t; // Always 64 bits -typedef char const *sz_string_ptr_t; // A type alias for `char const * ` +typedef int sz_bool_t; // Only one relevant bit +typedef unsigned sz_u32_t; // Always 32 bits +typedef unsigned long long sz_u64_t; // Always 64 bits +typedef char const *sz_string_start_t; // A type alias for `char const * ` /** * @brief Helper construct for higher-level bindings. */ typedef struct sz_string_view_t { - sz_string_ptr_t start; + sz_string_start_t start; sz_size_t length; } sz_string_view_t; @@ -72,8 +72,8 @@ typedef union _sz_anomaly_t { * Doesn't provide major performance improvements, but helps avoid the LibC dependency. * @return 1 for `true`, and 0 for `false`. */ -inline static sz_bool_t sz_equal(sz_string_ptr_t a, sz_string_ptr_t b, sz_size_t length) { - sz_string_ptr_t const a_end = a + length; +inline static sz_bool_t sz_equal(sz_string_start_t a, sz_string_start_t b, sz_size_t length) { + sz_string_start_t const a_end = a + length; while (a != a_end && *a == *b) a++, b++; return a_end == a; } @@ -82,13 +82,13 @@ inline static sz_bool_t sz_equal(sz_string_ptr_t a, sz_string_ptr_t b, sz_size_t * @brief Count the number of occurrences of a @b single-character needle in an arbitrary length haystack. * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_size_t sz_count_char_swar(sz_string_ptr_t const haystack, +inline static sz_size_t sz_count_char_swar(sz_string_start_t const haystack, sz_size_t const haystack_length, - sz_string_ptr_t const needle) { + sz_string_start_t const needle) { sz_size_t result = 0; - sz_string_ptr_t text = haystack; - sz_string_ptr_t const end = haystack + haystack_length; + sz_string_start_t text = haystack; + sz_string_start_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == *needle; @@ -117,12 +117,12 @@ inline static sz_size_t sz_count_char_swar(sz_string_ptr_t const haystack, * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. * Identical to `memchr(haystack, needle[0], haystack_length)`. */ -inline static sz_string_ptr_t sz_find_1char_swar(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_find_1char_swar(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { - sz_string_ptr_t text = haystack; - sz_string_ptr_t const end = haystack + haystack_length; + sz_string_start_t text = haystack; + sz_string_start_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text < end; ++text) @@ -154,12 +154,12 @@ inline static sz_string_ptr_t sz_find_1char_swar(sz_string_ptr_t const haystack, * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. * Identical to `memrchr(haystack, needle[0], haystack_length)`. */ -inline static sz_string_ptr_t sz_rfind_1char_swar(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_rfind_1char_swar(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { - sz_string_ptr_t const end = haystack + haystack_length; - sz_string_ptr_t text = end - 1; + sz_string_start_t const end = haystack + haystack_length; + sz_string_start_t text = end - 1; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text >= haystack; --text) @@ -190,12 +190,12 @@ inline static sz_string_ptr_t sz_rfind_1char_swar(sz_string_ptr_t const haystack * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_string_ptr_t sz_find_2char_swar(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_find_2char_swar(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { - sz_string_ptr_t text = haystack; - sz_string_ptr_t const end = haystack + haystack_length; + sz_string_start_t text = haystack; + sz_string_start_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text) @@ -239,12 +239,12 @@ inline static sz_string_ptr_t sz_find_2char_swar(sz_string_ptr_t const haystack, * @brief Find the first occurrence of a three-character needle in an arbitrary length haystack. * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_string_ptr_t sz_find_3char_swar(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_find_3char_swar(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { - sz_string_ptr_t text = haystack; - sz_string_ptr_t end = haystack + haystack_length; + sz_string_start_t text = haystack; + sz_string_start_t end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text) @@ -301,12 +301,12 @@ inline static sz_string_ptr_t sz_find_3char_swar(sz_string_ptr_t const haystack, * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_string_ptr_t sz_find_4char_swar(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_find_4char_swar(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { - sz_string_ptr_t text = haystack; - sz_string_ptr_t end = haystack + haystack_length; + sz_string_start_t text = haystack; + sz_string_start_t end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text) @@ -367,10 +367,10 @@ inline static sz_string_ptr_t sz_find_4char_swar(sz_string_ptr_t const haystack, * it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper. * Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core. */ -inline static sz_string_ptr_t sz_find_substr_swar( // - sz_string_ptr_t const haystack, +inline static sz_string_start_t sz_find_substring_swar( // + sz_string_start_t const haystack, sz_size_t const haystack_length, - sz_string_ptr_t const needle, + sz_string_start_t const needle, sz_size_t const needle_length) { if (haystack_length < needle_length) return NULL; @@ -383,12 +383,12 @@ inline static sz_string_ptr_t sz_find_substr_swar( // case 3: return sz_find_3char_swar(haystack, haystack_length, needle); case 4: return sz_find_4char_swar(haystack, haystack_length, needle); default: { - sz_string_ptr_t text = haystack; - sz_string_ptr_t const end = haystack + haystack_length; + sz_string_start_t text = haystack; + sz_string_start_t const end = haystack + haystack_length; _sz_anomaly_t n_anomaly, h_anomaly; sz_size_t const n_suffix_len = needle_length - 4 - anomaly_offset; - sz_string_ptr_t n_suffix_ptr = needle + 4 + anomaly_offset; + sz_string_start_t n_suffix_ptr = needle + 4 + anomaly_offset; n_anomaly.u8s[0] = needle[anomaly_offset]; n_anomaly.u8s[1] = needle[anomaly_offset + 1]; n_anomaly.u8s[2] = needle[anomaly_offset + 2]; @@ -401,10 +401,9 @@ inline static sz_string_ptr_t sz_find_substr_swar( // text += anomaly_offset; while (text + needle_length <= end) { h_anomaly.u8s[3] = text[3]; - if (h_anomaly.u32 == n_anomaly.u32) // Match anomaly. - if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. - if (sz_equal(text - anomaly_offset, needle, anomaly_offset)) // Match prefix - optimized out. - return text - anomaly_offset; + if (h_anomaly.u32 == n_anomaly.u32) // Match anomaly. + if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. + return text; h_anomaly.u32 >>= 8; ++text; @@ -417,8 +416,8 @@ inline static sz_string_ptr_t sz_find_substr_swar( // /** * Helper function, used in substring search operations. */ -inline static void _sz_find_substr_populate_anomaly( // - sz_string_ptr_t const needle, +inline static void _sz_find_substring_populate_anomaly( // + sz_string_start_t const needle, sz_size_t const needle_length, _sz_anomaly_t *anomaly_out, _sz_anomaly_t *mask_out) { @@ -455,16 +454,16 @@ inline static void _sz_find_substr_populate_anomaly( // * was practically more efficient than loading once and shifting around, as introduces * less data dependencies. */ -inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle, - sz_size_t const needle_length) { +inline static sz_string_start_t sz_find_substring_avx2(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle, + sz_size_t const needle_length) { // Precomputed constants - sz_string_ptr_t const end = haystack + haystack_length; + sz_string_start_t const end = haystack + haystack_length; _sz_anomaly_t anomaly; _sz_anomaly_t mask; - _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask); + _sz_find_substring_populate_anomaly(needle, needle_length, &anomaly, &mask); __m256i const anomalies = _mm256_set1_epi32(anomaly.u32); __m256i const masks = _mm256_set1_epi32(mask.u32); @@ -477,7 +476,7 @@ inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack // + 4 movemasks. // + 3 bitwise ANDs. // + 1 heavy (but very unlikely) branch. - sz_string_ptr_t text = haystack; + sz_string_start_t text = haystack; while (text + needle_length + 32 <= end) { // Performing many unaligned loads ends up being faster than loading once and shuffling around. @@ -511,7 +510,7 @@ inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack } // Don't forget the last (up to 35) characters. - return sz_find_substr_swar(text, end - text, needle, needle_length); + return sz_find_substring_swar(text, end - text, needle, needle_length); } #endif // x86 AVX2 @@ -524,21 +523,21 @@ inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack * was practically more efficient than loading once and shifting around, as introduces * less data dependencies. */ -inline static sz_string_ptr_t sz_find_substr_neon(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle, - sz_size_t const needle_length) { +inline static sz_string_start_t sz_find_substring_neon(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle, + sz_size_t const needle_length) { // Precomputed constants - sz_string_ptr_t const end = haystack + haystack_length; + sz_string_start_t const end = haystack + haystack_length; _sz_anomaly_t anomaly; _sz_anomaly_t mask; - _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask); + _sz_find_substring_populate_anomaly(needle, needle_length, &anomaly, &mask); uint32x4_t const anomalies = vld1q_dup_u32(&anomaly.u32); uint32x4_t const masks = vld1q_dup_u32(&mask.u32); uint32x4_t matches, matches0, matches1, matches2, matches3; - sz_string_ptr_t text = haystack; + sz_string_start_t text = haystack; while (text + needle_length + 16 <= end) { // Each of the following `matchesX` contains only 4 relevant bits - one per word. @@ -582,40 +581,40 @@ inline static sz_string_ptr_t sz_find_substr_neon(sz_string_ptr_t const haystack } // Don't forget the last (up to 16+3=19) characters. - return sz_find_substr_swar(text, end - text, needle, needle_length); + return sz_find_substring_swar(text, end - text, needle, needle_length); } #endif // Arm Neon -inline static sz_size_t sz_count_char(sz_string_ptr_t const haystack, +inline static sz_size_t sz_count_char(sz_string_start_t const haystack, sz_size_t const haystack_length, - sz_string_ptr_t const needle) { + sz_string_start_t const needle) { return sz_count_char_swar(haystack, haystack_length, needle); } -inline static sz_string_ptr_t sz_find_1char(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_find_1char(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { return sz_find_1char_swar(haystack, haystack_length, needle); } -inline static sz_string_ptr_t sz_rfind_1char(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_rfind_1char(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { return sz_rfind_1char_swar(haystack, haystack_length, needle); } -inline static sz_string_ptr_t sz_find_substr(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle, - sz_size_t const needle_length) { +inline static sz_string_start_t sz_find_substring(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle, + sz_size_t const needle_length) { if (haystack_length < needle_length || needle_length == 0) return NULL; #if defined(__ARM_NEON) - return sz_find_substr_neon(haystack, haystack_length, needle, needle_length); + return sz_find_substring_neon(haystack, haystack_length, needle, needle_length); #elif defined(__AVX2__) - return sz_find_substr_avx2(haystack, haystack_length, needle, needle_length); + return sz_find_substring_avx2(haystack, haystack_length, needle, needle_length); #else - return sz_find_substr_swar(haystack, haystack_length, needle, needle_length); + return sz_find_substring_swar(haystack, haystack_length, needle, needle_length); #endif } @@ -669,30 +668,46 @@ inline static char sz_toupper_ascii(char c) { return *(char *)&upped[(int)c]; } +inline static sz_u64_t sz_u64_unaligned_load(void const *ptr) { +#ifdef _MSC_VER + return *((__unaligned sz_u64_t *)ptr); +#else + __attribute__((aligned(1))) sz_u64_t const *uptr = (sz_u64_t const *)ptr; + return *uptr; +#endif +} + +inline static sz_u64_t sz_u64_byte_reverse(sz_u64_t val) { +#ifdef _MSC_VER + return _byteswap_uint64(val); +#else + return __builtin_bswap64(val); +#endif +} + /** * @brief Char-level lexicographic comparison of two strings. * Doesn't provide major performance improvements, but helps avoid the LibC dependency. */ -inline static sz_bool_t sz_is_less_ascii(sz_string_ptr_t const a, +inline static sz_bool_t sz_is_less_ascii(sz_string_start_t a, sz_size_t const a_length, - sz_string_ptr_t const b, + sz_string_start_t b, sz_size_t const b_length) { sz_size_t min_length = (a_length < b_length) ? a_length : b_length; - for (sz_size_t i = 0; i < min_length; ++i) { - if (a[i] < b[i]) return 1; - if (a[i] > b[i]) return 0; - } - return a_length < b_length; + sz_string_start_t const min_end = a + min_length; + while (a + 8 <= min_end && sz_u64_unaligned_load(a) == sz_u64_unaligned_load(b)) a += 8, b += 8; + while (a != min_end && *a == *b) a++, b++; + return a != min_end ? (*a < *b) : (a_length < b_length); } /** * @brief Char-level lexicographic comparison of two strings, insensitive to the case of ASCII symbols. * Doesn't provide major performance improvements, but helps avoid the LibC dependency. */ -inline static sz_bool_t sz_is_less_uncased_ascii(sz_string_ptr_t const a, +inline static sz_bool_t sz_is_less_uncased_ascii(sz_string_start_t const a, sz_size_t const a_length, - sz_string_ptr_t const b, + sz_string_start_t const b, sz_size_t const b_length) { sz_size_t min_length = (a_length < b_length) ? a_length : b_length; @@ -716,11 +731,11 @@ inline static void _sz_swap_order(sz_u64_t *a, sz_u64_t *b) { struct sz_sequence_t; -typedef sz_string_ptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); +typedef sz_string_start_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t); typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t); -typedef sz_bool_t (*sz_string_is_less_t)(sz_string_ptr_t, sz_size_t, sz_string_ptr_t, sz_size_t); +typedef sz_bool_t (*sz_string_is_less_t)(sz_string_start_t, sz_size_t, sz_string_start_t, sz_size_t); typedef struct sz_sequence_t { sz_u64_t *order; @@ -795,9 +810,12 @@ inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_compar // Utility functions inline static sz_size_t _sz_log2i(sz_size_t n) { - sz_size_t log2 = 0; - while (n >>= 1) ++log2; - return log2; + if (n == 0) return 0; // to avoid undefined behavior with __builtin_clz +#if defined(__LP64__) || defined(_WIN64) // 64-bit + return 63 - __builtin_clzll(n); +#else // 32-bit + return 31 - __builtin_clz(n); +#endif } inline static void _sz_sift_down( @@ -893,7 +911,7 @@ inline static void _sz_introsort( sz_u64_t pivot = sequence->order[median]; sz_size_t left = first; sz_size_t right = last - 1; - while (true) { + while (1) { while (less(sequence, sequence->order[left], pivot)) left++; while (less(sequence, pivot, sequence->order[right])) right--; if (left >= right) break; @@ -962,17 +980,17 @@ inline static void _sz_sort_recursion( // } inline static sz_bool_t _sz_sort_compare_less_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_string_ptr_t i_str = sequence->get_start(sequence, i_key); + sz_string_start_t i_str = sequence->get_start(sequence, i_key); sz_size_t i_len = sequence->get_length(sequence, i_key); - sz_string_ptr_t j_str = sequence->get_start(sequence, j_key); + sz_string_start_t j_str = sequence->get_start(sequence, j_key); sz_size_t j_len = sequence->get_length(sequence, j_key); return sz_is_less_ascii(i_str, i_len, j_str, j_len); } inline static sz_bool_t _sz_sort_compare_less_uncased_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_string_ptr_t i_str = sequence->get_start(sequence, i_key); + sz_string_start_t i_str = sequence->get_start(sequence, i_key); sz_size_t i_len = sequence->get_length(sequence, i_key); - sz_string_ptr_t j_str = sequence->get_start(sequence, j_key); + sz_string_start_t j_str = sequence->get_start(sequence, j_key); sz_size_t j_len = sequence->get_length(sequence, j_key); return sz_is_less_uncased_ascii(i_str, i_len, j_str, j_len); } @@ -994,7 +1012,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf // Export up to 4 bytes into the `sequence` bits themselves for (sz_size_t i = 0; i != sequence->count; ++i) { - sz_string_ptr_t begin = sequence->get_start(sequence, sequence->order[i]); + sz_string_start_t begin = sequence->get_start(sequence, sequence->order[i]); sz_size_t length = sequence->get_length(sequence, sequence->order[i]); length = length > 4ul ? 4ul : length; char *prefix = (char *)&sequence->order[i]; @@ -1038,9 +1056,9 @@ inline static levenstein_distance_t _sz_levenstein_minimum( // * It accepts an upper bound on the possible error. Quadratic complexity in time, linear in space. */ inline static levenstein_distance_t sz_levenstein( // - sz_string_ptr_t const a, + sz_string_start_t const a, sz_size_t const a_length, - sz_string_ptr_t const b, + sz_string_start_t const b, sz_size_t const b_length, levenstein_distance_t const bound, void *buffer) { @@ -1093,11 +1111,11 @@ inline static levenstein_distance_t sz_levenstein( // /** * @brief Hashes provided string using hardware-accelerated CRC32 instructions. */ -inline static sz_u32_t sz_hash_crc32_native(sz_string_ptr_t start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_native(sz_string_start_t start, sz_size_t length) { return 0; } -inline static sz_u32_t sz_hash_crc32_neon(sz_string_ptr_t start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_neon(sz_string_start_t start, sz_size_t length) { return 0; } -inline static sz_u32_t sz_hash_crc32_sse(sz_string_ptr_t start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_sse(sz_string_start_t start, sz_size_t length) { return 0; } #ifdef __cplusplus } From b0a280d783fef546b4b1b3245cdcaa86e169a97b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 16:51:35 -0700 Subject: [PATCH 63/72] Make: Formatting and docs --- .vscode/settings.json | 3 +- CMakeLists.txt | 140 ++++++++++++++++++++++-------------------- README.md | 10 +-- scripts/bench.ipynb | 20 ++++-- scripts/test.c | 14 ++--- 5 files changed, 100 insertions(+), 87 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 08c5bb65..575441f2 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -118,7 +118,8 @@ "strstream": "cpp", "filesystem": "cpp", "stringzilla.h": "c", - "__memory": "c" + "__memory": "c", + "charconv": "c" }, "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", "cSpell.words": [ diff --git a/CMakeLists.txt b/CMakeLists.txt index df569329..230c2a06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,105 +1,109 @@ # This CMake file is heavily inspired by following `stringzilla` CMake: # https://github.com/nlohmann/json/blob/develop/CMakeLists.txt cmake_minimum_required(VERSION 3.1) -project(stringzilla VERSION 0.1.0 LANGUAGES C CXX) +project( + stringzilla + VERSION 0.1.0 + LANGUAGES C CXX) -set (CMAKE_C_STANDARD 11) -set (CMAKE_CXX_STANDARD 17) +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) -# Determine if USearch is built as a subproject (using `add_subdirectory`) or if it is the main project +# Determine if USearch is built as a subproject (using `add_subdirectory`) or if +# it is the main project set(STRINGZILLA_IS_MAIN_PROJECT OFF) -if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) - set(STRINGZILLA_IS_MAIN_PROJECT ON) +if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) + set(STRINGZILLA_IS_MAIN_PROJECT ON) endif() # Options option(STRINGZILLA_INSTALL "Install CMake targets" OFF) -option(STRINGZILLA_BUILD_TEST "Compile a native unit test in C++" ${STRINGZILLA_IS_MAIN_PROJECT}) -option(STRINGZILLA_BUILD_BENCHMARK "Compile a native benchmark in C++" ${STRINGZILLA_IS_MAIN_PROJECT}) +option(STRINGZILLA_BUILD_TEST "Compile a native unit test in C++" + ${STRINGZILLA_IS_MAIN_PROJECT}) +option(STRINGZILLA_BUILD_BENCHMARK "Compile a native benchmark in C++" + ${STRINGZILLA_IS_MAIN_PROJECT}) option(STRINGZILLA_BUILD_WOLFRAM "Compile Wolfram Language bindings" OFF) # Includes set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) include(ExternalProject) -# Allow CMake 3.13+ to override options when using FetchContent / add_subdirectory -if (POLICY CMP0077) - cmake_policy(SET CMP0077 NEW) -endif () +# Allow CMake 3.13+ to override options when using FetchContent / +# add_subdirectory +if(POLICY CMP0077) + cmake_policy(SET CMP0077 NEW) +endif() # Configuration include(GNUInstallDirs) -set(STRINGZILLA_TARGET_NAME ${PROJECT_NAME}) -set(STRINGZILLA_CONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/cmake/${PROJECT_NAME}" CACHE INTERNAL "") -set(STRINGZILLA_INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}") -set(STRINGZILLA_TARGETS_EXPORT_NAME "${PROJECT_NAME}Targets") -set(STRINGZILLA_CMAKE_CONFIG_TEMPLATE "cmake/config.cmake.in") -set(STRINGZILLA_CMAKE_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}") -set(STRINGZILLA_CMAKE_VERSION_CONFIG_FILE "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}ConfigVersion.cmake") -set(STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Config.cmake") -set(STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Targets.cmake") -set(STRINGZILLA_PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/pkgconfig") - +set(STRINGZILLA_TARGET_NAME ${PROJECT_NAME}) +set(STRINGZILLA_CONFIG_INSTALL_DIR + "${CMAKE_INSTALL_DATADIR}/cmake/${PROJECT_NAME}" + CACHE INTERNAL "") +set(STRINGZILLA_INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}") +set(STRINGZILLA_TARGETS_EXPORT_NAME "${PROJECT_NAME}Targets") +set(STRINGZILLA_CMAKE_CONFIG_TEMPLATE "cmake/config.cmake.in") +set(STRINGZILLA_CMAKE_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}") +set(STRINGZILLA_CMAKE_VERSION_CONFIG_FILE + "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}ConfigVersion.cmake") +set(STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE + "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Config.cmake") +set(STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE + "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Targets.cmake") +set(STRINGZILLA_PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/pkgconfig") # Define our header-only library add_library(${STRINGZILLA_TARGET_NAME} INTERFACE) -add_library(${PROJECT_NAME}::${STRINGZILLA_TARGET_NAME} ALIAS ${STRINGZILLA_TARGET_NAME}) +add_library(${PROJECT_NAME}::${STRINGZILLA_TARGET_NAME} ALIAS + ${STRINGZILLA_TARGET_NAME}) set(STRINGZILLA_INCLUDE_BUILD_DIR "${PROJECT_SOURCE_DIR}/include/") target_compile_definitions( - ${STRINGZILLA_TARGET_NAME} - INTERFACE - $<$>:STRINGZILLA_USE_OPENMP=0> -) + ${STRINGZILLA_TARGET_NAME} + INTERFACE $<$>:STRINGZILLA_USE_OPENMP=0>) target_include_directories( - ${STRINGZILLA_TARGET_NAME} - ${STRINGZILLA_SYSTEM_INCLUDE} INTERFACE - $ - $ -) + ${STRINGZILLA_TARGET_NAME} ${STRINGZILLA_SYSTEM_INCLUDE} + INTERFACE $ + $) if(STRINGZILLA_INSTALL) - install( - DIRECTORY ${STRINGZILLA_INCLUDE_BUILD_DIR} - DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR} - ) - install( - FILES ${STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE} ${STRINGZILLA_CMAKE_VERSION_CONFIG_FILE} - DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR} - ) - export( - TARGETS ${STRINGZILLA_TARGET_NAME} - NAMESPACE ${PROJECT_NAME}:: - FILE ${STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE} - ) - install( - TARGETS ${STRINGZILLA_TARGET_NAME} - EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME} - INCLUDES DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR} - ) - install( - EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME} - NAMESPACE ${PROJECT_NAME}:: - DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR} - ) - install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc" - DESTINATION ${STRINGZILLA_PKGCONFIG_INSTALL_DIR} - ) + install(DIRECTORY ${STRINGZILLA_INCLUDE_BUILD_DIR} + DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR}) + install(FILES ${STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE} + ${STRINGZILLA_CMAKE_VERSION_CONFIG_FILE} + DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR}) + export( + TARGETS ${STRINGZILLA_TARGET_NAME} + NAMESPACE ${PROJECT_NAME}:: + FILE ${STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE}) + install( + TARGETS ${STRINGZILLA_TARGET_NAME} + EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME} + INCLUDES + DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR}) + install( + EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME} + NAMESPACE ${PROJECT_NAME}:: + DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR}) + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc" + DESTINATION ${STRINGZILLA_PKGCONFIG_INSTALL_DIR}) endif() if(${STRINGZILLA_BUILD_TEST} OR ${STRINGZILLA_BUILD_BENCHMARK}) - add_executable(stringzilla_test scripts/test.c) + add_executable(stringzilla_test scripts/test.cpp) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -O3 -flto -march=native -finline-functions -funroll-loops" + ) target_include_directories(stringzilla_test PRIVATE stringzilla) - set_target_properties(stringzilla_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + set_target_properties(stringzilla_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}) - if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER 3.13) - include(CTest) - enable_testing() - add_test(NAME stringzilla_test COMMAND stringzilla_test) + if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER + 3.13) + include(CTest) + enable_testing() + add_test(NAME stringzilla_test COMMAND stringzilla_test) endif() endif() - diff --git a/README.md b/README.md index 85032c34..8f0765c3 100644 --- a/README.md +++ b/README.md @@ -35,8 +35,8 @@ Coming soon. ## Quick Start: Python 🐍 -1️. Install via pip: `pip install stringzilla` -1. Import the classes you need: `from stringzilla import Str, Strs, File` +1. Install via pip: `pip install stringzilla` +2. Import the classes you need: `from stringzilla import Str, Strs, File` ### Basic Usage @@ -115,13 +115,13 @@ There is an ABI-stable C 99 interface, in case you have a database, an operating #include "stringzilla.h" // Initialize your haystack and needle -sz_haystack_t haystack = {your_text, your_text_length}; -sz_needle_t needle = {your_subtext, your_subtext_length, your_quadgram_offset}; +sz_string_view_t haystack = {your_text, your_text_length}; +sz_string_view_t needle = {your_subtext, your_subtext_length}; // Perform string-level operations size_t character_count = sz_count_char(haystack, 'a'); size_t character_position = sz_find_unigram(haystack, 'a'); -size_t substring_position = sz_find_substr(haystack, needle); +size_t substring_position = sz_find_substring(haystack, needle); // Perform collection level operations sz_sequence_t array = {your_order, your_count, your_get_start, your_get_length, your_handle}; diff --git a/scripts/bench.ipynb b/scripts/bench.ipynb index b3bc4392..492db50a 100644 --- a/scripts/bench.ipynb +++ b/scripts/bench.ipynb @@ -88,7 +88,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "150 ms ± 2.01 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" + "152 ms ± 3.24 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" ] } ], @@ -106,7 +106,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "37.8 ms ± 286 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" + "38.1 ms ± 312 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" ] } ], @@ -124,7 +124,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "173 ns ± 23.7 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" + "The slowest run took 7.28 times longer than the fastest. This could mean that an intermediate result is being cached.\n", + "186 ns ± 41.1 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" ] } ], @@ -142,8 +143,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "The slowest run took 82.51 times longer than the fastest. This could mean that an intermediate result is being cached.\n", - "94.3 ns ± 108 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" + "The slowest run took 120.95 times longer than the fastest. This could mean that an intermediate result is being cached.\n", + "99.6 ns ± 155 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" ] } ], @@ -152,6 +153,13 @@ "sz_str.find(pattern)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -176,7 +184,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.11" }, "orig_nbformat": 4 }, diff --git a/scripts/test.c b/scripts/test.c index 127975b0..b39fd982 100644 --- a/scripts/test.c +++ b/scripts/test.c @@ -18,8 +18,8 @@ void populate_random_string(char *buffer, int length, int variability) { buffer[length] = '\0'; } -// Test function for sz_find_substr -void test_sz_find_substr() { +// Test function for sz_find_substring +void test_sz_find_substring() { char buffer[MAX_LENGTH + 1]; char pattern[6]; // Maximum length of 5 + 1 for '\0' @@ -39,11 +39,11 @@ void test_sz_find_substr() { needle.length = pattern_length; // Comparing the result of your function with the standard library function. - sz_string_ptr_t result_libc = strstr(buffer, pattern); - sz_string_ptr_t result_stringzilla = - sz_find_substr(haystack.start, haystack.length, needle.start, needle.length); + sz_string_start_t result_libc = strstr(buffer, pattern); + sz_string_start_t result_stringzilla = + sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); - assert(((result_libc == NULL) ^ (result_stringzilla == NULL)) && "Test failed for sz_find_substr"); + assert(((result_libc == NULL) ^ (result_stringzilla == NULL)) && "Test failed for sz_find_substring"); } } } @@ -51,7 +51,7 @@ void test_sz_find_substr() { int main() { srand((unsigned int)time(NULL)); - test_sz_find_substr(); + test_sz_find_substring(); // Add calls to other test functions as you implement them printf("All tests passed!\n"); From bcaf7911d962ce641cab910bc0bc491ed6646ddd Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 17:03:25 -0700 Subject: [PATCH 64/72] Add: Benchmarks notebook --- scripts/bench.ipynb | 26 ++++++-------------------- stringzilla/stringzilla.h | 28 +++++++++++++--------------- 2 files changed, 19 insertions(+), 35 deletions(-) diff --git a/scripts/bench.ipynb b/scripts/bench.ipynb index 492db50a..838ca7af 100644 --- a/scripts/bench.ipynb +++ b/scripts/bench.ipynb @@ -88,7 +88,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "152 ms ± 3.24 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" + "152 ms ± 2.43 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" ] } ], @@ -106,7 +106,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "38.1 ms ± 312 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" + "37.7 ms ± 341 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" ] } ], @@ -124,8 +124,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "The slowest run took 7.28 times longer than the fastest. This could mean that an intermediate result is being cached.\n", - "186 ns ± 41.1 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" + "The slowest run took 8.67 times longer than the fastest. This could mean that an intermediate result is being cached.\n", + "182 ns ± 35 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" ] } ], @@ -143,8 +143,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "The slowest run took 120.95 times longer than the fastest. This could mean that an intermediate result is being cached.\n", - "99.6 ns ± 155 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" + "The slowest run took 40.69 times longer than the fastest. This could mean that an intermediate result is being cached.\n", + "90 ns ± 53.2 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" ] } ], @@ -152,20 +152,6 @@ "%%timeit -n 1 -r 1000\n", "sz_str.find(pattern)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index c7c0ae49..94bbde44 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -35,7 +35,7 @@ extern "C" { #endif /** - * @brief Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size. + * @brief Analogous to `sz_size_t` and `std::sz_size_t`, unsigned integer, identical to pointer size. * 64-bit on most platforms where pointers are 64-bit. * 32-bit on platforms where pointers are 32-bit. */ @@ -490,23 +490,21 @@ inline static sz_string_start_t sz_find_substring_avx2(sz_string_start_t const h int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies)); if (matches0 | matches1 | matches2 | matches3) { - int matches = // - (matches0 & 0x1111'1111u) | // - (matches1 & 0x2222'2222u) | // - (matches2 & 0x4444'4444u) | // - (matches3 & 0x8888'8888u); - size_t first_match_offset = _tzcnt_u32(matches); + int matches = // + (matches0 & 0x11111111u) | // + (matches1 & 0x22222222u) | // + (matches2 & 0x44444444u) | // + (matches3 & 0x88888888u); + sz_size_t first_match_offset = _tzcnt_u32(matches); if (needle_length > 4) { - if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4)) + if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4)) { return text + first_match_offset; - else - text += first_match_offset + 1; + } + else { text += first_match_offset + 1; } } - else - return text + first_match_offset; + else { return text + first_match_offset; } } - else - text += 32; + else { text += 32; } } // Don't forget the last (up to 35) characters. @@ -566,7 +564,7 @@ inline static sz_string_start_t sz_find_substring_neon(sz_string_start_t const h (vget_lane_u16(matches_u16x4, 3) << 12); // Find the first match - size_t first_match_offset = __builtin_ctz(matches_u16); + sz_size_t first_match_offset = __builtin_ctz(matches_u16); if (needle_length > 4) { if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4)) return text + first_match_offset; From 9bdbf236c8ca20648c2ccdf350035eba592662a7 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 17:04:35 -0700 Subject: [PATCH 65/72] Make: Automate major releases --- .releaserc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.releaserc b/.releaserc index faa6b938..ab603bc1 100644 --- a/.releaserc +++ b/.releaserc @@ -11,6 +11,10 @@ { "preset": "eslint", "releaseRules": [ + { + "tag": "Break", + "release": "major" + }, { "tag": "Add", "release": "minor" @@ -35,6 +39,10 @@ { "preset": "eslint", "releaseRules": [ + { + "tag": "Break", + "release": "major" + }, { "tag": "Add", "release": "minor" From a878eba876b23e5dfc1277c853add7edd43a91dd Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 17:40:23 -0700 Subject: [PATCH 66/72] Fix: MSVC-compliant initialization --- python/lib.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/lib.c b/python/lib.c index c0ad69d4..57902505 100644 --- a/python/lib.c +++ b/python/lib.c @@ -30,6 +30,8 @@ typedef SSIZE_T ssize_t; #include // Core CPython interfaces #include // NumPy +#include // `memset` + #include #pragma region Forward Declarations @@ -1573,8 +1575,10 @@ static sz_bool_t Strs_sort_(Strs *self, } // Call our sorting algorithm - sz_sequence_t sequence = {}; - sz_sort_config_t sort_config = {}; + sz_sequence_t sequence; + sz_sort_config_t sort_config; + memset(&sequence, 0, sizeof(sequence)); + memset(&sort_config, 0, sizeof(sort_config)); sequence.order = (sz_size_t *)temporary_memory.start; sequence.count = count; sequence.handle = parts; From 5d333d5af3d60b61a5da36f1df79337a9850d76c Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 17:46:27 -0700 Subject: [PATCH 67/72] Fix: Missing `__builtin_clzll` symbol --- stringzilla/stringzilla.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 94bbde44..cfcd0220 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -806,14 +806,26 @@ inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_compar } } -// Utility functions inline static sz_size_t _sz_log2i(sz_size_t n) { - if (n == 0) return 0; // to avoid undefined behavior with __builtin_clz + if (n == 0) return 0; + #if defined(__LP64__) || defined(_WIN64) // 64-bit +#ifdef _MSC_VER + unsigned long index; + _BitScanReverse64(&index, n); + return index; +#else return 63 - __builtin_clzll(n); +#endif #else // 32-bit +#ifdef _MSC_VER + unsigned long index; + _BitScanReverse(&index, n); + return index; +#else return 31 - __builtin_clz(n); #endif +#endif } inline static void _sz_sift_down( From 8f9ca8b14bd5b46a22809f077a4bdbf0954ad89a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:02:29 -0700 Subject: [PATCH 68/72] Improve: Identical bit-counting intrinsics --- stringzilla/stringzilla.h | 113 ++++++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 47 deletions(-) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index cfcd0220..00ef0964 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -9,13 +9,14 @@ #include #endif +/** + * Intrinsics aliases for MSVC, GCC, and Clang. + */ #ifdef _MSC_VER #include #define popcount64 __popcnt64 #define ctz64 _tzcnt_u64 #define clz64 _lzcnt_u64 -#define strncasecmp _strnicmp -#define strcasecmp _stricmp #else #define popcount64 __builtin_popcountll #define ctz64 __builtin_ctzll @@ -23,8 +24,8 @@ #endif /** - * Generally `NULL` is coming from locale.h, stddef.h, stdio.h, stdlib.h, string.h, time.h, and wchar.h, - * according to the C standard. + * @brief Generally `NULL` is coming from locale.h, stddef.h, stdio.h, stdlib.h, string.h, time.h, and wchar.h, + * according to the C standard. */ #ifndef NULL #define NULL ((void *)0) @@ -50,6 +51,11 @@ typedef unsigned sz_u32_t; // Always 32 bits typedef unsigned long long sz_u64_t; // Always 64 bits typedef char const *sz_string_start_t; // A type alias for `char const * ` +/** + * @brief For faster bounded Levenstein (Edit) distance computation no more than 255 characters are supported. + */ +typedef unsigned char levenstein_distance_t; + /** * @brief Helper construct for higher-level bindings. */ @@ -490,12 +496,12 @@ inline static sz_string_start_t sz_find_substring_avx2(sz_string_start_t const h int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies)); if (matches0 | matches1 | matches2 | matches3) { - int matches = // - (matches0 & 0x11111111u) | // - (matches1 & 0x22222222u) | // - (matches2 & 0x44444444u) | // - (matches3 & 0x88888888u); - sz_size_t first_match_offset = _tzcnt_u32(matches); + int matches = // + (matches0 & 0x11111111) | // + (matches1 & 0x22222222) | // + (matches2 & 0x44444444) | // + (matches3 & 0x88888888); + sz_size_t first_match_offset = ctz64(matches); if (needle_length > 4) { if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4)) { return text + first_match_offset; @@ -564,18 +570,16 @@ inline static sz_string_start_t sz_find_substring_neon(sz_string_start_t const h (vget_lane_u16(matches_u16x4, 3) << 12); // Find the first match - sz_size_t first_match_offset = __builtin_ctz(matches_u16); + sz_size_t first_match_offset = ctz64(matches_u16); if (needle_length > 4) { - if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4)) + if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4)) { return text + first_match_offset; - else - text += first_match_offset + 1; + } + else { text += first_match_offset + 1; } } - else - return text + first_match_offset; + else { return text + first_match_offset; } } - else - text += 16; + else { text += 16; } } // Don't forget the last (up to 16+3=19) characters. @@ -666,6 +670,13 @@ inline static char sz_toupper_ascii(char c) { return *(char *)&upped[(int)c]; } +/** + * @brief Load a 64-bit unsigned integer from a potentially unaligned pointer. + * + * @note This function uses compiler-specific attributes or keywords to + * ensure correct and efficient unaligned loads. It's designed to work + * with both MSVC and GCC/Clang. + */ inline static sz_u64_t sz_u64_unaligned_load(void const *ptr) { #ifdef _MSC_VER return *((__unaligned sz_u64_t *)ptr); @@ -675,6 +686,12 @@ inline static sz_u64_t sz_u64_unaligned_load(void const *ptr) { #endif } +/** + * @brief Reverse the byte order of a 64-bit unsigned integer. + * + * @note This function uses compiler-specific intrinsics to achieve the + * byte-reversal. It's designed to work with both MSVC and GCC/Clang. + */ inline static sz_u64_t sz_u64_byte_reverse(sz_u64_t val) { #ifdef _MSC_VER return _byteswap_uint64(val); @@ -683,6 +700,35 @@ inline static sz_u64_t sz_u64_byte_reverse(sz_u64_t val) { #endif } +/** + * @brief Compute the logarithm base 2 of an integer. + * + * @note If n is 0, the function returns 0 to avoid undefined behavior. + * @note This function uses compiler-specific intrinsics or built-ins + * to achieve the computation. It's designed to work with GCC/Clang and MSVC. + */ +inline static sz_size_t sz_log2i(sz_size_t n) { + if (n == 0) return 0; + +#if defined(__LP64__) || defined(_WIN64) // 64-bit +#ifdef _MSC_VER + unsigned long index; + _BitScanReverse64(&index, n); + return index; +#else + return 63 - __builtin_clzll(n); +#endif +#else // 32-bit +#ifdef _MSC_VER + unsigned long index; + _BitScanReverse(&index, n); + return index; +#else + return 31 - __builtin_clz(n); +#endif +#endif +} + /** * @brief Char-level lexicographic comparison of two strings. * Doesn't provide major performance improvements, but helps avoid the LibC dependency. @@ -806,28 +852,6 @@ inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_compar } } -inline static sz_size_t _sz_log2i(sz_size_t n) { - if (n == 0) return 0; - -#if defined(__LP64__) || defined(_WIN64) // 64-bit -#ifdef _MSC_VER - unsigned long index; - _BitScanReverse64(&index, n); - return index; -#else - return 63 - __builtin_clzll(n); -#endif -#else // 32-bit -#ifdef _MSC_VER - unsigned long index; - _BitScanReverse(&index, n); - return index; -#else - return 31 - __builtin_clz(n); -#endif -#endif -} - inline static void _sz_sift_down( sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, sz_size_t end) { sz_size_t root = start; @@ -936,13 +960,10 @@ inline static void _sz_introsort( } inline static void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) { - sz_size_t depth_limit = 2 * _sz_log2i(sequence->count); + sz_size_t depth_limit = 2 * sz_log2i(sequence->count); _sz_introsort(sequence, less, 0, sequence->count, depth_limit); } -/** - * @brief Internal Radix sorting procedure. - */ inline static void _sz_sort_recursion( // sz_sequence_t *sequence, sz_size_t bit_idx, @@ -1012,7 +1033,7 @@ typedef struct sz_sort_config_t { /** * @brief Sorting algorithm, combining Radix Sort for the first 32 bits of every word - * and a follow-up Quick Sort on resulting structure. + * and a follow-up by a more conventional sorting procedure on equally prefixed parts. */ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *config) { @@ -1042,8 +1063,6 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf _sz_sort_recursion(sequence, 0, 32, comparator, partial_order_length); } -typedef unsigned char levenstein_distance_t; - /** * @return Amount of temporary memory (in bytes) needed to efficiently compute * the Levenstein distance between two strings of given size. From 1a5b7260a96307202aea678d1bb22d4c00f19217 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 18:58:07 -0700 Subject: [PATCH 69/72] Add: SSE and Arm variants of CRC32 --- python/lib.c | 2 +- stringzilla/stringzilla.h | 110 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 106 insertions(+), 6 deletions(-) diff --git a/python/lib.c b/python/lib.c index 57902505..64e3ae70 100644 --- a/python/lib.c +++ b/python/lib.c @@ -561,7 +561,7 @@ static void Str_dealloc(Str *self) { static PyObject *Str_str(Str *self) { return PyUnicode_FromStringAndSize(self->start, self->length); } -static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash_crc32_native(self->start, self->length); } +static Py_hash_t Str_hash(Str *self) { return (Py_hash_t)sz_hash_crc32(self->start, self->length); } static Py_ssize_t Str_len(Str *self) { return self->length; } diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 00ef0964..57d17b89 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -9,6 +9,10 @@ #include #endif +#if defined(__ARM_FEATURE_CRC32) +#include +#endif + /** * Intrinsics aliases for MSVC, GCC, and Clang. */ @@ -1137,14 +1141,110 @@ inline static levenstein_distance_t sz_levenstein( // return previous_distances[b_length] <= bound ? previous_distances[b_length] : bound; } +inline static sz_u32_t sz_hash_crc32_swar(sz_string_start_t start, sz_size_t length) { + /* + * The following CRC lookup table was generated automagically using the + * following model parameters: + * + * Generator Polynomial = ................. 0x1EDC6F41 + * Generator Polynomial Length = .......... 32 bits + * Reflected Bits = ....................... TRUE + * Table Generation Offset = .............. 32 bits + * Number of Slices = ..................... 8 slices + * Slice Lengths = ........................ 8 8 8 8 8 8 8 8 + */ + + static sz_u32_t const table[256] = { + 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, // + 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, // + 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, // + 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, // + 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, // + 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, // + 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, // + 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, // + 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, // + 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, // + 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, // + 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, // + 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, // + 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, // + 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, // + 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, // + 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, // + 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, // + 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, // + 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, // + 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, // + 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, // + 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, // + 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, // + 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, // + 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, // + 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, // + 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, // + 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, // + 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, // + 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, // + 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351 // + }; + + sz_u32_t crc = 0xFFFFFFFF; + for (sz_string_start_t const end = start + length; start != end; ++start) + crc = (crc >> 8) ^ table[(crc ^ (sz_u32_t)*start) & 0xff]; + return crc ^ 0xFFFFFFFF; +} + +#if defined(__ARM_FEATURE_CRC32) +inline static sz_u32_t sz_hash_crc32_arm(sz_string_start_t start, sz_size_t length) { + sz_u32_t crc = 0xFFFFFFFF; + sz_string_start_t const end = start + length; + + // Align the input to the word boundary + while (((unsigned long)start & 7ul) && start != end) { crc = __crc32cb(crc, *start), start++; } + + // Process the body 8 bytes at a time + while (start + 8 <= end) { crc = __crc32cd(crc, *(unsigned long long *)start), start += 8; } + + // Process the tail bytes + if (start + 4 <= end) { crc = __crc32cw(crc, *(unsigned int *)start), start += 4; } + if (start + 2 <= end) { crc = __crc32ch(crc, *(unsigned short *)start), start += 2; } + if (start < end) { crc = __crc32cb(crc, *start); } + return crc ^ 0xFFFFFFFF; +} +#endif + +#if defined(__SSE4_2__) +inline static sz_u32_t sz_hash_crc32_sse(sz_string_start_t start, sz_size_t length) { + sz_u32_t crc = 0xFFFFFFFF; + sz_string_start_t const end = start + length; + + // Align the input to the word boundary + while (((unsigned long)start & 7ul) && start != end) { crc = _mm_crc32_u8(crc, *start), start++; } + + // Process the body 8 bytes at a time + while (start + 8 <= end) { crc = (sz_u32_t)_mm_crc32_u64(crc, *(unsigned long long *)start), start += 8; } + + // Process the tail bytes + if (start + 4 <= end) { crc = _mm_crc32_u32(crc, *(unsigned int *)start), start += 4; } + if (start + 2 <= end) { crc = _mm_crc32_u16(crc, *(unsigned short *)start), start += 2; } + if (start < end) { crc = _mm_crc32_u8(crc, *start); } + return crc ^ 0xFFFFFFFF; +} +#endif + /** * @brief Hashes provided string using hardware-accelerated CRC32 instructions. */ -inline static sz_u32_t sz_hash_crc32_native(sz_string_start_t start, sz_size_t length) { return 0; } - -inline static sz_u32_t sz_hash_crc32_neon(sz_string_start_t start, sz_size_t length) { return 0; } - -inline static sz_u32_t sz_hash_crc32_sse(sz_string_start_t start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32(sz_string_start_t start, sz_size_t length) { +#if defined(__ARM_FEATURE_CRC32) + return sz_hash_crc32_arm(start, length); +#elif defined(__SSE4_2__) + return sz_hash_crc32_sse(start, length); +#else + return sz_hash_crc32_swar(start, length); +#endif +} #ifdef __cplusplus } From 7b1e170e7ca2407e2d35a043dc34af29551ef7c2 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 19:08:48 -0700 Subject: [PATCH 70/72] Improve: BitScan dispatch on Windows --- stringzilla/stringzilla.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 57d17b89..136d93c5 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -714,19 +714,26 @@ inline static sz_u64_t sz_u64_byte_reverse(sz_u64_t val) { inline static sz_size_t sz_log2i(sz_size_t n) { if (n == 0) return 0; -#if defined(__LP64__) || defined(_WIN64) // 64-bit +#ifdef _WIN64 #ifdef _MSC_VER unsigned long index; - _BitScanReverse64(&index, n); - return index; + if (_BitScanReverse64(&index, n)) return index; + return 0; // This line might be redundant due to the initial check, but it's safer to include it. #else return 63 - __builtin_clzll(n); #endif -#else // 32-bit +#elif defined(_WIN32) #ifdef _MSC_VER unsigned long index; - _BitScanReverse(&index, n); - return index; + if (_BitScanReverse(&index, n)) return index; + return 0; // Same note as above. +#else + return 31 - __builtin_clz(n); +#endif +#else +// Handle non-Windows platforms. You can further differentiate between 32-bit and 64-bit if needed. +#if defined(__LP64__) + return 63 - __builtin_clzll(n); #else return 31 - __builtin_clz(n); #endif From 051f0a886ed2438541be1e2430758661662229a8 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 19:15:11 -0700 Subject: [PATCH 71/72] Test: Printing failed cases --- scripts/test_fuzzy.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/test_fuzzy.py b/scripts/test_fuzzy.py index dbefd485..694c1818 100644 --- a/scripts/test_fuzzy.py +++ b/scripts/test_fuzzy.py @@ -20,7 +20,9 @@ def get_random_string( def is_equal_strings(native_strings, big_strings): for native_slice, big_slice in zip(native_strings, big_strings): - assert native_slice == big_slice + assert ( + native_slice == big_slice + ), f"Mismatch between `{native_slice}` and `{str(big_slice)}`" def check_identical( @@ -47,7 +49,9 @@ def check_identical( if check_iterators: for i in range(len(native_strings)): assert len(native_strings[i]) == len(big_strings[i]) - assert native_strings[i] == big_strings[i] + assert ( + native_strings[i] == big_strings[i] + ), f"Mismatch between `{native_strings[i]}` and `{str(big_strings[i])}`" assert [c for c in native_strings[i]] == [c for c in big_strings[i]] is_equal_strings(native_strings, big_strings) From 9a575ce91a945430abb158ff892eef90d546c14c Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 21:28:28 -0700 Subject: [PATCH 72/72] Fix: `sz_size_t` size in MSVC --- .gitignore | 1 + python/lib.c | 9 +++++---- setup.py | 2 +- stringzilla/stringzilla.h | 31 ++++++++++++++++++++----------- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index cfbdf78a..ca44f760 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ substr_search_cpp *.so *.egg-info *.whl +*.pyd node_modules/ leipzig1M.txt \ No newline at end of file diff --git a/python/lib.c b/python/lib.c index 64e3ae70..0fa67358 100644 --- a/python/lib.c +++ b/python/lib.c @@ -71,7 +71,8 @@ typedef struct { * - Str(File("some-path.txt"), from=0, to=sys.maxint) */ typedef struct { - PyObject_HEAD PyObject *parent; + PyObject_HEAD // + PyObject *parent; sz_string_start_t start; sz_size_t length; } Str; @@ -782,13 +783,13 @@ static int Strs_contains(Str *self, PyObject *arg) { return 0; } static PyObject *Str_richcompare(PyObject *self, PyObject *other, int op) { - char const *a_start, *b_start; - size_t a_length, b_length; + sz_string_start_t a_start = NULL, b_start = NULL; + sz_size_t a_length = 0, b_length = 0; if (!export_string_like(self, &a_start, &a_length) || !export_string_like(other, &b_start, &b_length)) Py_RETURN_NOTIMPLEMENTED; // Perform byte-wise comparison up to the minimum length - size_t min_length = a_length < b_length ? a_length : b_length; + sz_size_t min_length = a_length < b_length ? a_length : b_length; int cmp_result = memcmp(a_start, b_start, min_length); // If the strings are equal up to `min_length`, then the shorter string is smaller diff --git a/setup.py b/setup.py index 1b8d83ce..12357369 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ link_args.append("-Xpreprocessor -lomp") if sys.platform == "win32": - compile_args.append("/std:c++17") + compile_args.append("/std:c99") compile_args.append("/O2") diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 136d93c5..b93e191a 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -35,6 +35,14 @@ #define NULL ((void *)0) #endif +/** + * @brief Compile-time assert macro. + */ +#define SZ_STATIC_ASSERT(condition, name) \ + typedef struct { \ + int static_assert_##name : (condition) ? 1 : -1; \ + } sz_static_assert_##name##_t + #ifdef __cplusplus extern "C" { #endif @@ -45,10 +53,11 @@ extern "C" { * 32-bit on platforms where pointers are 32-bit. */ #if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) -typedef unsigned long sz_size_t; +typedef unsigned long long sz_size_t; #else typedef unsigned sz_size_t; #endif +SZ_STATIC_ASSERT(sizeof(sz_size_t) == sizeof(void *), sz_size_t_must_be_pointer_size); typedef int sz_bool_t; // Only one relevant bit typedef unsigned sz_u32_t; // Always 32 bits @@ -101,7 +110,7 @@ inline static sz_size_t sz_count_char_swar(sz_string_start_t const haystack, sz_string_start_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == *needle; + for (; ((sz_size_t)text & 7ull) && text < end; ++text) result += *text == *needle; // This code simulates hyper-scalar execution, comparing 8 characters at a time. sz_u64_t nnnnnnnn = *needle; @@ -135,7 +144,7 @@ inline static sz_string_start_t sz_find_1char_swar(sz_string_start_t const hayst sz_string_start_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((unsigned long)text & 7ul) && text < end; ++text) + for (; ((sz_size_t)text & 7ull) && text < end; ++text) if (*text == *needle) return text; // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. @@ -172,7 +181,7 @@ inline static sz_string_start_t sz_rfind_1char_swar(sz_string_start_t const hays sz_string_start_t text = end - 1; // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((unsigned long)text & 7ul) && text >= haystack; --text) + for (; ((sz_size_t)text & 7ull) && text >= haystack; --text) if (*text == *needle) return text; // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. @@ -208,7 +217,7 @@ inline static sz_string_start_t sz_find_2char_swar(sz_string_start_t const hayst sz_string_start_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text) + for (; ((sz_size_t)text & 7ull) && text + 2 <= end; ++text) if (text[0] == needle[0] && text[1] == needle[1]) return text; // This code simulates hyper-scalar execution, analyzing 7 offsets at a time. @@ -257,7 +266,7 @@ inline static sz_string_start_t sz_find_3char_swar(sz_string_start_t const hayst sz_string_start_t end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text) + for (; ((sz_size_t)text & 7ull) && text + 3 <= end; ++text) if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2]) return text; // This code simulates hyper-scalar execution, analyzing 6 offsets at a time. @@ -319,7 +328,7 @@ inline static sz_string_start_t sz_find_4char_swar(sz_string_start_t const hayst sz_string_start_t end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text) + for (; ((sz_size_t)text & 7ull) && text + 4 <= end; ++text) if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2] && text[3] == needle[3]) return text; // This code simulates hyper-scalar execution, analyzing 4 offsets at a time. @@ -987,7 +996,7 @@ inline static void _sz_sort_recursion( // // Partition a range of integers according to a specific bit value sz_size_t split = 0; { - sz_u64_t mask = (1ul << 63) >> bit_idx; + sz_u64_t mask = (1ull << 63) >> bit_idx; while (split != sequence->count && !(sequence->order[split] & mask)) ++split; for (sz_size_t i = split + 1; i < sequence->count; ++i) if (!(sequence->order[i] & mask)) _sz_swap_order(sequence->order + i, sequence->order + split), ++split; @@ -1056,7 +1065,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf for (sz_size_t i = 0; i != sequence->count; ++i) { sz_string_start_t begin = sequence->get_start(sequence, sequence->order[i]); sz_size_t length = sequence->get_length(sequence, sequence->order[i]); - length = length > 4ul ? 4ul : length; + length = length > 4ull ? 4ull : length; char *prefix = (char *)&sequence->order[i]; for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; if (case_insensitive) { @@ -1208,7 +1217,7 @@ inline static sz_u32_t sz_hash_crc32_arm(sz_string_start_t start, sz_size_t leng sz_string_start_t const end = start + length; // Align the input to the word boundary - while (((unsigned long)start & 7ul) && start != end) { crc = __crc32cb(crc, *start), start++; } + while (((unsigned long)start & 7ull) && start != end) { crc = __crc32cb(crc, *start), start++; } // Process the body 8 bytes at a time while (start + 8 <= end) { crc = __crc32cd(crc, *(unsigned long long *)start), start += 8; } @@ -1227,7 +1236,7 @@ inline static sz_u32_t sz_hash_crc32_sse(sz_string_start_t start, sz_size_t leng sz_string_start_t const end = start + length; // Align the input to the word boundary - while (((unsigned long)start & 7ul) && start != end) { crc = _mm_crc32_u8(crc, *start), start++; } + while (((unsigned long)start & 7ull) && start != end) { crc = _mm_crc32_u8(crc, *start), start++; } // Process the body 8 bytes at a time while (start + 8 <= end) { crc = (sz_u32_t)_mm_crc32_u64(crc, *(unsigned long long *)start), start += 8; }