From 9fd664368298776efe2c0ca83b9b8984ca332ee5 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 30 Dec 2023 14:54:01 -0800 Subject: [PATCH] Add: Split ranges --- CMakeLists.txt | 7 +++ CONTRIBUTING.md | 25 ++++++++++ include/stringzilla/stringzilla.h | 43 ++++++++++++++++ include/stringzilla/stringzilla.hpp | 77 ++++++++++++++++++++++++++--- 4 files changed, 146 insertions(+), 6 deletions(-) create mode 100644 CONTRIBUTING.md diff --git a/CMakeLists.txt b/CMakeLists.txt index a9782020..fb15ad67 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,6 +83,13 @@ function(set_compiler_flags target) set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + # Maximum warnings level & warnings as error + # add_compile_options( + # "$<$<CXX_COMPILER_ID:MSVC>:/W4;/WX>" + # "$<$<CXX_COMPILER_ID:GNU>:-Wall;-Wextra;-pedantic;-Werror>" + # "$<$<CXX_COMPILER_ID:Clang>:-Wall;-Wextra;-pedantic;-Werror>" + # "$<$<CXX_COMPILER_ID:AppleClang>:-Wall;-Wextra;-pedantic;-Werror>" + # ) if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") target_compile_options(${target} PRIVATE "-march=native") target_compile_options(${target} PRIVATE "-fmax-errors=1") diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..dd50a2d2 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,25 @@ +# Contributing to StringZilla + +## Roadmap + +Future development plans include: + +- [x] [Replace PyBind11 with CPython](https://github.com/ashvardanian/StringZilla/issues/35), [blog](https://ashvardanian.com/posts/pybind11-cpython-tutorial/) +- [x] [Bindings for JavaScript](https://github.com/ashvardanian/StringZilla/issues/25) +- [ ] [Faster string sorting algorithm](https://github.com/ashvardanian/StringZilla/issues/45) +- [ ] [Reverse-order operations in Python](https://github.com/ashvardanian/StringZilla/issues/12) +- [ ] [Splitting with multiple separators at once](https://github.com/ashvardanian/StringZilla/issues/29) +- [ ] Splitting CSV rows into columns 
+- [ ] UTF-8 validation. +- [ ] Arm SVE backend +- [ ] Bindings for Java and Rust + +## Working on Alternative Hardware Backends + +## Working on Faster Edit Distances + +## Working on Random String Generators + +## Working on Sequence Processing and Sorting + + diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index aae0f831..8a243fc1 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -241,6 +241,9 @@ typedef struct sz_string_view_t { sz_size_t length; } sz_string_view_t; +/** + * @brief Bit-set structure with one bit for each of the 256 possible byte values (a superset of ASCII). Useful for filtering and search. + */ typedef union sz_u8_set_t { sz_u64_t _u64s[4]; sz_u8_t _u8s[32]; @@ -269,6 +272,46 @@ typedef struct sz_memory_allocator_t { void *handle; } sz_memory_allocator_t; +/** + * @brief Tiny memory-owning string structure with a Small String Optimization (SSO). + * Uses similar layout to Folly, 32-bytes long, like modern GCC and Clang STL. + * NOTE(review): the original sentence here ("In uninitialized") was left unfinished; `sz_string_init` zeroes `start`, `length`, and `capacity` and sets `tail` to 31 - confirm the intended invariant and document it. + */ +typedef union sz_string_t { + + union on_stack { + sz_u8_t u8s[32]; + char chars[32]; + } on_stack; + + struct on_heap { + sz_ptr_t start; + sz_size_t length; + sz_size_t capacity; + sz_size_t tail; + } on_heap; + +} sz_string_t; + +SZ_PUBLIC void sz_string_to_view(sz_string_t *string, sz_ptr_t *start, sz_size_t *length) { + // TODO: placeholder - the body is empty, so neither `*start` nor `*length` is written yet. +} + +SZ_PUBLIC void sz_string_init(sz_string_t *string) { /* Resets the `on_heap` view of the union: null `start`, zero `length` and `capacity`, `tail` set to 31. */ + string->on_heap.start = NULL; + string->on_heap.length = 0; + string->on_heap.capacity = 0; + string->on_heap.tail = 31; /* NOTE(review): presumably the unused inline capacity of the 32-byte buffer - confirm. */ +} + +SZ_PUBLIC void sz_string_append() {} /* Placeholder: no parameters and an empty body so far. */ + +SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator) {} /* Placeholder: empty body, nothing is released yet. */ + +SZ_PUBLIC void sz_copy(sz_cptr_t, sz_size_t, sz_ptr_t) {} /* Placeholder: empty body. NOTE(review): unnamed parameters in a function definition are accepted by C++ and C23 but not by earlier C standards - name them when implementing. */ + +SZ_PUBLIC void sz_fill(sz_ptr_t, sz_size_t, sz_u8_t) {} /* Placeholder: empty body; same unnamed-parameter caveat as `sz_copy`. */ + #pragma region Basic Functionality typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t); diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 80a299e5..50bffce3 100644 --- 
a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -66,8 +66,8 @@ struct matcher_find { string_view_ needle_; std::size_t skip_after_match_ = 1; - matcher_find(string_view_ needle, bool allow_interleaving = true) noexcept - : needle_(needle), skip_after_match_(allow_interleaving ? 1 : needle_.length()) {} + matcher_find(string_view_ needle, bool allow_overlaps = true) noexcept + : needle_(needle), skip_after_match_(allow_overlaps ? 1 : needle_.length()) {} size_type needle_length() const noexcept { return needle_.length(); } size_type skip_length() const noexcept { return skip_after_match_; } size_type operator()(string_view_ haystack) const noexcept { return haystack.find(needle_); } @@ -84,8 +84,8 @@ struct matcher_rfind { string_view_ needle_; std::size_t skip_after_match_ = 1; - matcher_rfind(string_view_ needle, bool allow_interleaving = true) noexcept - : needle_(needle), skip_after_match_(allow_interleaving ? 1 : needle_.length()) {} + matcher_rfind(string_view_ needle, bool allow_overlaps = true) noexcept + : needle_(needle), skip_after_match_(allow_overlaps ? 1 : needle_.length()) {} size_type needle_length() const noexcept { return needle_.length(); } size_type skip_length() const noexcept { return skip_after_match_; } size_type operator()(string_view_ haystack) const noexcept { return haystack.rfind(needle_); } @@ -143,7 +143,7 @@ struct end_sentinel_t {}; inline static constexpr end_sentinel_t end_sentinel; /** - * @brief A range of string views representing the matches of a substring search. + * @brief A range of string slices representing the matches of a substring search. * Compatible with C++23 ranges, C++11 string views, and of course, StringZilla. 
*/ template typename matcher_template_> @@ -198,10 +198,11 @@ class range_matches { iterator end() const noexcept { return iterator(string_view(), matcher_); } iterator::difference_type size() const noexcept { return std::distance(begin(), end()); } bool empty() const noexcept { return begin() == end_sentinel; } + bool allow_overlaps() const noexcept { return matcher_.skip_length() < matcher_.needle_length(); } }; /** - * @brief A range of string views representing the matches of a @b reverse-order substring search. + * @brief A range of string slices representing the matches of a @b reverse-order substring search. * Compatible with C++23 ranges, C++11 string views, and of course, StringZilla. */ template typename matcher_template_> @@ -258,6 +259,70 @@ class range_rmatches { matcher_); } + iterator end() const noexcept { return iterator(string_view(), matcher_); } + iterator::difference_type size() const noexcept { return std::distance(begin(), end()); } + bool empty() const noexcept { return begin() == end_sentinel; } + bool allow_overlaps() const noexcept { return matcher_.skip_length() < matcher_.needle_length(); } +}; + +/** + * @brief A range of string slices for different splits of the data. + * Compatible with C++23 ranges, C++11 string views, and of course, StringZilla. 
+ * + * In some sense, represents the inverse operation to `range_matches`, as it reports + */ +template typename matcher_template_> +class range_splits { + using string_view = string_view_; + using matcher = matcher_template_; + + string_view haystack_; + matcher matcher_; + bool include_empty_ = true; /* NOTE(review): not referenced anywhere else in this patch. */ + bool include_delimiter_ = false; /* NOTE(review): not referenced anywhere else in this patch. */ + + public: + range_splits(string_view haystack, matcher needle) : haystack_(haystack), matcher_(needle) {} + + class iterator { /* Iterator over a shrinking `remaining_` view; an empty `remaining_` compares equal to `end_sentinel_t`. */ + string_view remaining_; + matcher matcher_; + std::size_t next_offset_; /* NOTE(review): not initialized by the constructor and never read in this patch. */ + + public: + using iterator_category = std::forward_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = string_view; + using pointer = void; + using reference = void; + + iterator(string_view haystack, matcher matcher) noexcept : remaining_(haystack), matcher_(matcher) {} + value_type operator*() const noexcept { return remaining_.substr(0, matcher_.needle_length()); } /* Yields the first `needle_length()` characters of `remaining_`. NOTE(review): presumably that is the matched delimiter itself rather than the text between delimiters - confirm this is the intended split output. */ + + iterator &operator++() noexcept { /* Drops `skip_length()` characters, re-runs the matcher, then repositions `remaining_` at the reported position or empties it when the matcher returns `npos`. */ + remaining_.remove_prefix(matcher_.skip_length()); + auto position = matcher_(remaining_); + remaining_ = position != string_view::npos ? remaining_.substr(position) : string_view(); + return *this; + } + + iterator operator++(int) noexcept { /* Post-increment: returns a copy taken before advancing. */ + iterator temp = *this; + ++(*this); + return temp; + } + + bool operator!=(iterator const &other) const noexcept { return remaining_.size() != other.remaining_.size(); } /* Iterators are compared only by the size of `remaining_`. */ + bool operator==(iterator const &other) const noexcept { return remaining_.size() == other.remaining_.size(); } + bool operator!=(end_sentinel_t) const noexcept { return !remaining_.empty(); } + bool operator==(end_sentinel_t) const noexcept { return remaining_.empty(); } + }; + + iterator begin() const noexcept { + auto position = matcher_(haystack_); + return iterator(position != string_view::npos ? 
haystack_.substr(position) : string_view(), matcher_); + } + iterator end() const noexcept { return iterator(string_view(), matcher_); } iterator::difference_type size() const noexcept { return std::distance(begin(), end()); } bool empty() const noexcept { return begin() == end_sentinel; }