Skip to content

Commit

Permalink
Add: Split ranges
Browse files Browse the repository at this point in the history
  • Loading branch information
ashvardanian committed Dec 30, 2023
1 parent e42e9a6 commit 9fd6643
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 6 deletions.
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,13 @@ function(set_compiler_flags target)
set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY
${CMAKE_BINARY_DIR})

# Maximum warnings level & warnings as error
# add_compile_options(
# "$<$<CXX_COMPILER_ID:MSVC>:/W4;/WX>"
# "$<$<CXX_COMPILER_ID:GNU>:-Wall;-Wextra;-pedantic;-Werror>"
# "$<$<CXX_COMPILER_ID:Clang>:-Wall;-Wextra;-pedantic;-Werror>"
# "$<$<CXX_COMPILER_ID:AppleClang>:-Wall;-Wextra;-pedantic;-Werror>"
# )
if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
target_compile_options(${target} PRIVATE "-march=native")
target_compile_options(${target} PRIVATE "-fmax-errors=1")
Expand Down
25 changes: 25 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Contributing to StringZilla

## Roadmap

Future development plans include:

- [x] [Replace PyBind11 with CPython](https://github.com/ashvardanian/StringZilla/issues/35), [blog](https://ashvardanian.com/posts/pybind11-cpython-tutorial/)
- [x] [Bindings for JavaScript](https://github.com/ashvardanian/StringZilla/issues/25)
- [ ] [Faster string sorting algorithm](https://github.com/ashvardanian/StringZilla/issues/45)
- [ ] [Reverse-order operations in Python](https://github.com/ashvardanian/StringZilla/issues/12)
- [ ] [Splitting with multiple separators at once](https://github.com/ashvardanian/StringZilla/issues/29)
- [ ] Splitting CSV rows into columns
- [ ] UTF-8 validation
- [ ] Arm SVE backend
- [ ] Bindings for Java and Rust

## Working on Alternative Hardware Backends

## Working on Faster Edit Distances

## Working on Random String Generators

## Working on Sequence Processing and Sorting


43 changes: 43 additions & 0 deletions include/stringzilla/stringzilla.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,9 @@ typedef struct sz_string_view_t {
sz_size_t length;
} sz_string_view_t;

/**
* @brief Bit-set structure covering all 256 possible byte values (ASCII and beyond). Useful for filtering and search.
*/
typedef union sz_u8_set_t {
sz_u64_t _u64s[4];
sz_u8_t _u8s[32];
Expand Down Expand Up @@ -269,6 +272,46 @@ typedef struct sz_memory_allocator_t {
void *handle;
} sz_memory_allocator_t;

/**
 * @brief Tiny memory-owning string structure with a Small String Optimization (SSO).
 *        Uses similar layout to Folly, 32-bytes long, like modern GCC and Clang STL.
 *
 *        Both views below overlay the same storage. The on-stack view is exactly 32 bytes;
 *        the on-heap view is one pointer plus three `sz_size_t` fields (presumably also
 *        32 bytes on 64-bit targets — TODO confirm for other targets).
 *
 *        NOTE(review): the original description ended with an unfinished sentence
 *        ("In uninitialized"); how the active view is distinguished is not visible here.
 *        `sz_string_init` sets `start`, `length` and `capacity` to zero and `tail` to 31.
 */
typedef union sz_string_t {

// Inline storage viewed as raw bytes, either unsigned or as plain characters.
union on_stack {
sz_u8_t u8s[32];
char chars[32];
} on_stack;

// Heap-oriented view of the same bytes.
// NOTE(review): field meanings below are inferred from their names — confirm against the implementation.
struct on_heap {
sz_ptr_t start;     // presumably the beginning of the externally allocated buffer
sz_size_t length;   // presumably the number of bytes currently in use
sz_size_t capacity; // presumably the number of bytes allocated
sz_size_t tail;     // purpose not evident from this snippet; initialized to 31 by `sz_string_init`
} on_heap;

} sz_string_t;

/**
 * @brief Placeholder for exporting the contents of a string as a (start, length) pair.
 *        NOTE(review): the body is empty, so neither `*start` nor `*length` is written;
 *        callers must not rely on the outputs until this is implemented.
 */
SZ_PUBLIC void sz_string_to_view(sz_string_t *string, sz_ptr_t *start, sz_size_t *length) {
// TODO: not implemented yet; the output arguments are left untouched.
}

/**
 * @brief Puts a string into its initial state by writing through the heap view:
 *        no buffer, zero length, zero capacity, and a `tail` of 31.
 * @param string The string to initialize; it is dereferenced unconditionally.
 */
SZ_PUBLIC void sz_string_init(sz_string_t *string) {
    string->on_heap.tail = 31;
    string->on_heap.capacity = 0;
    string->on_heap.length = 0;
    string->on_heap.start = NULL;
}

/**
 * @brief Placeholder for appending to a string.
 *        NOTE(review): takes no arguments and has an empty body — not implemented yet.
 */
SZ_PUBLIC void sz_string_append() {}

/**
 * @brief Placeholder for releasing a string through the given allocator.
 *        NOTE(review): the body is empty — nothing is released and neither argument is used yet.
 */
SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *allocator) {}

SZ_PUBLIC void sz_copy(sz_cptr_t, sz_size_t, sz_ptr_t) {}

SZ_PUBLIC void sz_fill(sz_ptr_t, sz_size_t, sz_u8_t) {}

#pragma region Basic Functionality

typedef sz_u64_t (*sz_hash_t)(sz_cptr_t, sz_size_t);
Expand Down
77 changes: 71 additions & 6 deletions include/stringzilla/stringzilla.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ struct matcher_find {
string_view_ needle_;
std::size_t skip_after_match_ = 1;

matcher_find(string_view_ needle, bool allow_interleaving = true) noexcept
: needle_(needle), skip_after_match_(allow_interleaving ? 1 : needle_.length()) {}
/**
 * @brief Builds a forward substring matcher.
 * @param needle The substring to search for.
 * @param allow_overlaps When true, the skip length after a match is 1;
 *        otherwise it is the needle length, so reported matches cannot overlap.
 */
matcher_find(string_view_ needle, bool allow_overlaps = true) noexcept
    : needle_(needle),
      // Never skip by zero: with an empty needle and overlaps disabled, a zero skip
      // would keep reporting the same position and iteration would never finish.
      skip_after_match_((allow_overlaps || needle_.length() == 0) ? 1 : needle_.length()) {}
size_type needle_length() const noexcept { return needle_.length(); }
size_type skip_length() const noexcept { return skip_after_match_; }
size_type operator()(string_view_ haystack) const noexcept { return haystack.find(needle_); }
Expand All @@ -84,8 +84,8 @@ struct matcher_rfind {
string_view_ needle_;
std::size_t skip_after_match_ = 1;

matcher_rfind(string_view_ needle, bool allow_interleaving = true) noexcept
: needle_(needle), skip_after_match_(allow_interleaving ? 1 : needle_.length()) {}
/**
 * @brief Builds a reverse-order substring matcher.
 * @param needle The substring to search for.
 * @param allow_overlaps When true, the skip length after a match is 1;
 *        otherwise it is the needle length, so reported matches cannot overlap.
 */
matcher_rfind(string_view_ needle, bool allow_overlaps = true) noexcept
    : needle_(needle),
      // Never skip by zero: with an empty needle and overlaps disabled, a zero skip
      // would keep reporting the same position and iteration would never finish.
      skip_after_match_((allow_overlaps || needle_.length() == 0) ? 1 : needle_.length()) {}
size_type needle_length() const noexcept { return needle_.length(); }
size_type skip_length() const noexcept { return skip_after_match_; }
size_type operator()(string_view_ haystack) const noexcept { return haystack.rfind(needle_); }
Expand Down Expand Up @@ -143,7 +143,7 @@ struct end_sentinel_t {};
inline static constexpr end_sentinel_t end_sentinel;

/**
* @brief A range of string views representing the matches of a substring search.
* @brief A range of string slices representing the matches of a substring search.
* Compatible with C++23 ranges, C++11 string views, and of course, StringZilla.
*/
template <typename string_view_, template <typename> typename matcher_template_>
Expand Down Expand Up @@ -198,10 +198,11 @@ class range_matches {
iterator end() const noexcept { return iterator(string_view(), matcher_); }
iterator::difference_type size() const noexcept { return std::distance(begin(), end()); }
bool empty() const noexcept { return begin() == end_sentinel; }
/// @brief Reports whether successive matches may overlap, i.e. whether the matcher
///        advances by fewer characters than the needle length after each match.
bool allow_overlaps() const noexcept { return matcher_.skip_length() < matcher_.needle_length(); }
};

/**
* @brief A range of string views representing the matches of a @b reverse-order substring search.
* @brief A range of string slices representing the matches of a @b reverse-order substring search.
* Compatible with C++23 ranges, C++11 string views, and of course, StringZilla.
*/
template <typename string_view_, template <typename> typename matcher_template_>
Expand Down Expand Up @@ -258,6 +259,70 @@ class range_rmatches {
matcher_);
}

iterator end() const noexcept { return iterator(string_view(), matcher_); }
iterator::difference_type size() const noexcept { return std::distance(begin(), end()); }
bool empty() const noexcept { return begin() == end_sentinel; }
/// @brief Reports whether successive matches may overlap, i.e. whether the matcher
///        advances by fewer characters than the needle length after each match.
bool allow_overlaps() const noexcept { return matcher_.skip_length() < matcher_.needle_length(); }
};

/**
* @brief A range of string slices for different splits of the data.
* Compatible with C++23 ranges, C++11 string views, and of course, StringZilla.
*
* In some sense, represents the inverse operation to `range_matches`, as it is meant to report
* the slices between the matches, rather than the matches themselves.
*/
template <typename string_view_, template <typename> typename matcher_template_>
class range_splits {
using string_view = string_view_;
using matcher = matcher_template_<string_view>;

string_view haystack_;
matcher matcher_;
bool include_empty_ = true;
bool include_delimiter_ = false;

public:
range_splits(string_view haystack, matcher needle) : haystack_(haystack), matcher_(needle) {}

/**
 * @brief Forward iterator over the haystack, driven by the matcher.
 *
 * Holds the not-yet-consumed part of the haystack, positioned at the current match.
 * An iterator whose remaining part is empty compares equal to `end_sentinel`.
 *
 * NOTE(review): dereferencing currently yields the needle-sized prefix of the remaining
 * haystack (the match itself), not the text between matches.
 */
class iterator {
    string_view remaining_;
    matcher matcher_;
    // Not read by any member yet. Zero-initialized so that copying the iterator
    // (as `operator++(int)` does) never reads an indeterminate value.
    std::size_t next_offset_ = 0;

  public:
    using iterator_category = std::forward_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = string_view;
    using pointer = void;
    using reference = void;

    /// @brief Starts at the given (already positioned) part of the haystack.
    iterator(string_view haystack, matcher matcher) noexcept : remaining_(haystack), matcher_(matcher) {}

    /// @brief Returns the first `needle_length()` characters of the remaining haystack.
    value_type operator*() const noexcept { return remaining_.substr(0, matcher_.needle_length()); }

    /// @brief Skips past the current position and moves to the next match, or to the end if none is left.
    iterator &operator++() noexcept {
        remaining_.remove_prefix(matcher_.skip_length());
        auto position = matcher_(remaining_);
        remaining_ = position != string_view::npos ? remaining_.substr(position) : string_view();
        return *this;
    }

    /// @brief Post-increment: advances and returns the previous state.
    iterator operator++(int) noexcept {
        iterator temp = *this;
        ++(*this);
        return temp;
    }

    // Iterators are compared by how much of the haystack they have left.
    bool operator!=(iterator const &other) const noexcept { return remaining_.size() != other.remaining_.size(); }
    bool operator==(iterator const &other) const noexcept { return remaining_.size() == other.remaining_.size(); }
    bool operator!=(end_sentinel_t) const noexcept { return !remaining_.empty(); }
    bool operator==(end_sentinel_t) const noexcept { return remaining_.empty(); }
};

/// @brief Returns an iterator positioned at the first match in the haystack,
///        or an iterator over an empty view when the matcher finds nothing.
iterator begin() const noexcept {
    auto first_match = matcher_(haystack_);
    if (first_match == string_view::npos) return iterator(string_view(), matcher_);
    return iterator(haystack_.substr(first_match), matcher_);
}

iterator end() const noexcept { return iterator(string_view(), matcher_); }
iterator::difference_type size() const noexcept { return std::distance(begin(), end()); }
bool empty() const noexcept { return begin() == end_sentinel; }
Expand Down

0 comments on commit 9fd6643

Please sign in to comment.