diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index a7d0e259..6efbdef6 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -18,6 +18,32 @@ permissions: contents: read jobs: + versioning: + name: Update Version + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + - name: Run TinySemVer + uses: ashvardanian/tinysemver@v2.0.1 + with: + verbose: "true" + version-file: "VERSION" + update-version-in: | + Cargo.toml:^version = "(\d+\.\d+\.\d+)" + package.json:"version": "(\d+\.\d+\.\d+)" + CMakeLists.txt:VERSION (\d+\.\d+\.\d+) + update-major-version-in: | + include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MAJOR (\d+) + update-minor-version-in: | + include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MINOR (\d+) + update-patch-version-in: | + include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_PATCH (\d+) + dry-run: "true" + test_ubuntu_gcc: name: Ubuntu (GCC 12) runs-on: ubuntu-22.04 @@ -230,7 +256,7 @@ jobs: wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh sudo ./llvm.sh 16 - + - name: Build C/C++ run: | cmake -B build_artifacts \ diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1f34d98e..a7b42e9e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -18,22 +18,38 @@ permissions: jobs: versioning: - name: Semantic Release - runs-on: ubuntu-22.04 + name: Update Version + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - name: Checkout + uses: actions/checkout@v4 with: + fetch-depth: 0 persist-credentials: false - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: 20 - name: Set up Cargo uses: actions-rs/toolchain@v1 with: toolchain: stable override: true - - run: npm install --ignore-scripts --save-dev --prefix ./package-ci @semantic-release/exec @semantic-release/git conventional-changelog-eslint semantic-release && npx --prefix ./package-ci semantic-release + - name: Run TinySemVer + uses: ashvardanian/tinysemver@v2.0.1 + with: + verbose: "true" + version-file: "VERSION" + update-version-in: | + Cargo.toml:^version = "(\d+\.\d+\.\d+)" + package.json:"version": "(\d+\.\d+\.\d+)" + CMakeLists.txt:VERSION (\d+\.\d+\.\d+) + update-major-version-in: | + include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MAJOR (\d+) + update-minor-version-in: | + include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_MINOR (\d+) + update-patch-version-in: | + include/stringzilla/stringzilla.h:^#define STRINGZILLA_VERSION_PATCH (\d+) + dry-run: "false" + push: "true" + create-release: "true" + github-token: ${{ secrets.SEMANTIC_RELEASE_TOKEN }} rebase: name: Rebase Dev. Branch diff --git a/.github/workflows/update_version.sh b/.github/workflows/update_version.sh deleted file mode 100644 index fc70eda3..00000000 --- a/.github/workflows/update_version.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/sh - -echo $1 > VERSION && - sed -i "s/^\(#define STRINGZILLA_VERSION_MAJOR \).*/\1$(echo "$1" | cut -d. -f1)/" ./include/stringzilla/stringzilla.h && - sed -i "s/^\(#define STRINGZILLA_VERSION_MINOR \).*/\1$(echo "$1" | cut -d. -f2)/" ./include/stringzilla/stringzilla.h && - sed -i "s/^\(#define STRINGZILLA_VERSION_PATCH \).*/\1$(echo "$1" | cut -d. -f3)/" ./include/stringzilla/stringzilla.h && - sed -i "s/^version = \".*\"/version = \"$1\"/" Cargo.toml && - sed -i "s/\"version\": \".*\"/\"version\": \"$1\"/" package.json && - sed -i "s/VERSION [0-9]\+\.[0-9]\+\.[0-9]\+/VERSION $1/" CMakeLists.txt - -# Update the version in the Cargo.lock file, but don't report an error if it fails... -# as `cargo` may not be available in the current environment. -cargo update || true diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dd76bd41..c9d6a950 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -56,6 +56,10 @@ unzip enwik9.zip && rm enwik9.zip && mv enwik9 enwik9.txt # 4.7 GB (1.7 GB compressed), 1'004'598 lines of UTF8, 268'435'456 tokens of mean length 8 wget --no-clobber -O xlsum.csv.gz https://github.com/ashvardanian/xl-sum/releases/download/v1.0.0/xlsum.csv.gz gzip -d xlsum.csv.gz + +# Human chromosome generator dataset generated by https://github.com/rghilduta/human-chromosome-data-generator/blob/main/generate_chromosome_data.sh +# 1200 rows, each 800 characters long (939K) +wget --no-clobber -O human_protein_1200row_800len.txt https://media.githubusercontent.com/media/rghilduta/human-chromosome-data-generator/main/examples/human_protein_1200row_800len.txt ``` ## IDE Integrations diff --git a/cli/wc.py b/cli/wc.py index e73bd3a5..d1533ab0 100755 --- a/cli/wc.py +++ b/cli/wc.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -import sys +import sys, os import argparse import stringzilla from stringzilla import File, Str @@ -30,6 +30,13 @@ def parse_arguments(): parser.add_argument( "-w", "--words", action="store_true", help="print the word counts" ) + parser.add_argument( + "--files0-from", + metavar="filename", + help="Read input from the files specified by NUL-terminated names in file F;" + " If F is - then read names from standard input", + ) + parser.add_argument("--version", action="version", version=stringzilla.__version__) return parser.parse_args() @@ -45,27 +52,28 @@ def wc(file_path, args): except RuntimeError: # File gives a RuntimeError if the file does not exist return f"No such file: {file_path}", False - line_count = mapped_bytes.count("\n") - word_count = mapped_bytes.count(" ") + 1 - char_count = mapped_bytes.__len__() - counts = { - "line_count": line_count, - "word_count": word_count, - "char_count": char_count, - } + counts = {} + if args.lines: + counts["line_count"] = mapped_bytes.count("\n") + if args.words: + counts["word_count"] = mapped_bytes.count(" ") + 1 + if args.chars: + counts["char_count"] = mapped_bytes.__len__() if args.max_line_length: - max_line_length = max(len(line) for line in str(mapped_bytes).split("\n")) + max_line_length = max(len(line) for line in mapped_bytes.split("\n")) counts["max_line_length"] = max_line_length - if args.bytes or args.chars: - byte_count = char_count # assume 1 char = 1 byte - counts["byte_count"] = byte_count + if args.bytes: + if args.chars: + counts["byte_count"] = counts["char_count"] + else: + counts["byte_count"] = mapped_bytes.__len__() return counts, True -def format_output(counts, args): +def format_output(counts, args, just): selected_counts = [] if args.lines: selected_counts.append(counts["line_count"]) @@ -74,18 +82,18 @@ def format_output(counts, args): if args.chars: selected_counts.append(counts["char_count"]) if args.bytes: - selected_counts.append(counts.get("byte_count", counts["char_count"])) + selected_counts.append(counts["byte_count"]) if args.max_line_length: selected_counts.append(counts.get("max_line_length", 0)) - if not any([args.lines, args.words, args.chars, args.bytes, args.max_line_length]): - selected_counts = [ - counts["line_count"], - counts["word_count"], - counts["char_count"], - ] + return " ".join(str(count).rjust(just) for count in selected_counts) + - return " ".join(str(count) for count in selected_counts) +def get_files_from(fn): + f = open(fn, "r") + s = f.read() + f.close() + return [x for x in s.split("\0") if os.path.isfile(x)] def main(): @@ -97,19 +105,33 @@ def main(): "max_line_length": 0, "byte_count": 0, } + if not any([args.lines, args.words, args.chars, args.bytes, args.max_line_length]): + args.lines = 1 + args.words = 1 + args.bytes = 1 + + # wc uses the file size to determine column width when printing + if args.files0_from: + if args.files[0] == "-": + args.files = get_files_from(args.files0_from) + if len(args.files) == 0: + # print(" No filenames found in ", args.files0_from) + exit(0) + + just = max(len(str(os.stat(fn).st_size)) for fn in args.files) for file_path in args.files: counts, success = wc(file_path, args) if success: for key in total_counts.keys(): total_counts[key] += counts.get(key, 0) - output = format_output(counts, args) + f" {file_path}" + output = format_output(counts, args, just) + f" {file_path}" print(output) else: print(counts) if len(args.files) > 1: - total_output = format_output(total_counts, args) + " total" + total_output = format_output(total_counts, args, just) + " total" print(total_output) diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index 5c8d64c5..36b98cc7 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -2194,7 +2194,7 @@ SZ_INTERNAL sz_cptr_t _sz_find_with_prefix(sz_cptr_t h, sz_size_t h_length, sz_c // Verify the remaining part of the needle sz_size_t remaining = h_length - (found - h); - if (remaining < suffix_length) return SZ_NULL_CHAR; + if (remaining < n_length) return SZ_NULL_CHAR; if (sz_equal(found + prefix_length, n + prefix_length, suffix_length)) return found; // Adjust the position. @@ -2246,7 +2246,6 @@ SZ_INTERNAL sz_cptr_t _sz_rfind_horspool_over_256bytes_serial(sz_cptr_t h, sz_si } SZ_PUBLIC sz_cptr_t sz_find_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) { - // This almost never fires, but it's better to be safe than sorry. if (h_length < n_length || !n_length) return SZ_NULL_CHAR; @@ -2620,11 +2619,9 @@ SZ_PUBLIC sz_size_t sz_edit_distance_serial( // // Skip the matching prefixes and suffixes, they won't affect the distance. for (sz_cptr_t a_end = longer + longer_length, b_end = shorter + shorter_length; longer != a_end && shorter != b_end && *longer == *shorter; - ++longer, ++shorter, --longer_length, --shorter_length) - ; + ++longer, ++shorter, --longer_length, --shorter_length); for (; longer_length && shorter_length && longer[longer_length - 1] == shorter[shorter_length - 1]; - --longer_length, --shorter_length) - ; + --longer_length, --shorter_length); // Bounded computations may exit early. if (bound) { diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index 98e1c2ec..499d4f29 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -1348,9 +1348,7 @@ class basic_string_slice { * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. * @return 0 if equal, negative if `*this` is less than `other`, positive if `*this` is greater than `other`. */ - int compare(string_view other) const noexcept { - return (int)sz_order(start_, length_, other.start_, other.length_); - } + int compare(string_view other) const noexcept { return (int)sz_order(data(), size(), other.data(), other.size()); } /** * @brief Compares two strings lexicographically. If prefix matches, lengths are compared. @@ -1443,7 +1441,7 @@ class basic_string_slice { /** @brief Checks if the string starts with the other string. */ bool starts_with(string_view other) const noexcept { - return length_ >= other.length_ && sz_equal(start_, other.start_, other.length_) == sz_true_k; + return length_ >= other.size() && sz_equal(start_, other.data(), other.size()) == sz_true_k; } /** @brief Checks if the string starts with the other string. */ @@ -1457,8 +1455,8 @@ class basic_string_slice { /** @brief Checks if the string ends with the other string. */ bool ends_with(string_view other) const noexcept { - return length_ >= other.length_ && - sz_equal(start_ + length_ - other.length_, other.start_, other.length_) == sz_true_k; + return length_ >= other.size() && + sz_equal(start_ + length_ - other.size(), other.data(), other.size()) == sz_true_k; } /** @brief Checks if the string ends with the other string. */ @@ -1472,12 +1470,12 @@ class basic_string_slice { /** @brief Python-like convenience function, dropping the matching prefix. */ string_slice remove_prefix(string_view other) const noexcept { - return starts_with(other) ? string_slice {start_ + other.length_, length_ - other.length_} : *this; + return starts_with(other) ? string_slice {start_ + other.size(), length_ - other.size()} : *this; } /** @brief Python-like convenience function, dropping the matching suffix. */ string_slice remove_suffix(string_view other) const noexcept { - return ends_with(other) ? string_slice {start_, length_ - other.length_} : *this; + return ends_with(other) ? string_slice {start_, length_ - other.size()} : *this; } #pragma endregion @@ -1497,7 +1495,7 @@ class basic_string_slice { * @return The offset of the first character of the match, or `npos` if not found. */ size_type find(string_view other, size_type skip = 0) const noexcept { - auto ptr = sz_find(start_ + skip, length_ - skip, other.start_, other.length_); + auto ptr = sz_find(start_ + skip, length_ - skip, other.data(), other.size()); return ptr ? ptr - start_ : npos; } @@ -1525,7 +1523,7 @@ class basic_string_slice { * @return The offset of the first character of the match, or `npos` if not found. */ size_type rfind(string_view other) const noexcept { - auto ptr = sz_rfind(start_, length_, other.start_, other.length_); + auto ptr = sz_rfind(start_, length_, other.data(), other.size()); return ptr ? ptr - start_ : npos; } @@ -1731,7 +1729,7 @@ class basic_string_slice { */ string_slice lstrip(char_set set) const noexcept { set = set.inverted(); - auto new_start = sz_find_charset(start_, length_, &set.raw()); + auto new_start = (pointer)sz_find_charset(start_, length_, &set.raw()); return new_start ? string_slice {new_start, length_ - static_cast(new_start - start_)} : string_slice(); } @@ -1742,7 +1740,7 @@ class basic_string_slice { */ string_slice rstrip(char_set set) const noexcept { set = set.inverted(); - auto new_end = sz_rfind_charset(start_, length_, &set.raw()); + auto new_end = (pointer)sz_rfind_charset(start_, length_, &set.raw()); return new_end ? string_slice {start_, static_cast(new_end - start_ + 1)} : string_slice(); } @@ -1752,7 +1750,7 @@ class basic_string_slice { */ string_slice strip(char_set set) const noexcept { set = set.inverted(); - auto new_start = sz_find_charset(start_, length_, &set.raw()); + auto new_start = (pointer)sz_find_charset(start_, length_, &set.raw()); return new_start ? string_slice {new_start, static_cast( sz_rfind_charset(new_start, length_ - (new_start - start_), &set.raw()) - @@ -1811,7 +1809,7 @@ class basic_string_slice { rsplit_chars_type rsplit(char_set set = whitespaces_set()) const noexcept { return {*this, {set}}; } /** @brief Split around the occurrences of all newline characters. */ - split_chars_type splitlines() const noexcept { return split(newlines_set); } + split_chars_type splitlines() const noexcept { return split(newlines_set()); } #pragma endregion @@ -1826,9 +1824,9 @@ class basic_string_slice { } private: - sz_constexpr_if_cpp20 string_view &assign(string_view const &other) noexcept { - start_ = other.start_; - length_ = other.length_; + sz_constexpr_if_cpp20 string_slice &assign(string_view const &other) noexcept { + start_ = (pointer)other.data(); + length_ = other.size(); return *this; } @@ -1841,17 +1839,17 @@ class basic_string_slice { template partition_type partition_(pattern_ &&pattern, std::size_t pattern_length) const noexcept { size_type pos = find(pattern); - if (pos == npos) return {*this, string_view(), string_view()}; - return {string_view(start_, pos), string_view(start_ + pos, pattern_length), - string_view(start_ + pos + pattern_length, length_ - pos - pattern_length)}; + if (pos == npos) return {string_slice(*this), string_slice(), string_slice()}; + return {string_slice(start_, pos), string_slice(start_ + pos, pattern_length), + string_slice(start_ + pos + pattern_length, length_ - pos - pattern_length)}; } template partition_type rpartition_(pattern_ &&pattern, std::size_t pattern_length) const noexcept { size_type pos = rfind(pattern); - if (pos == npos) return {*this, string_view(), string_view()}; - return {string_view(start_, pos), string_view(start_ + pos, pattern_length), - string_view(start_ + pos + pattern_length, length_ - pos - pattern_length)}; + if (pos == npos) return {string_slice(*this), string_slice(), string_slice()}; + return {string_slice(start_, pos), string_slice(start_ + pos, pattern_length), + string_slice(start_ + pos + pattern_length, length_ - pos - pattern_length)}; } }; @@ -3191,7 +3189,7 @@ class basic_string { return basic_string {concatenation {view(), other}}; } basic_string operator+(std::initializer_list other) const noexcept(false) { - return basic_string {concatenation {view(), other}}; + return basic_string {concatenation {view(), string_view(other)}}; } #pragma endregion diff --git a/package-ci.json b/package-ci.json deleted file mode 100644 index 6288162c..00000000 --- a/package-ci.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "name": "stringzilla-ci", - "version": "1.0.0", - "devDependencies": { - "@semantic-release/exec": "^6.0.3", - "@semantic-release/git": "^10.0.1", - "conventional-changelog-eslint": "^3.0.9", - "semantic-release": "^21.1.2" - } -} \ No newline at end of file diff --git a/rust/lib.rs b/rust/lib.rs index 652bba5a..30150efb 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -696,6 +696,265 @@ pub mod sz { } } +pub trait Matcher<'a> { + fn find(&self, haystack: &'a [u8]) -> Option; + fn needle_length(&self) -> usize; + fn skip_length(&self, include_overlaps: bool, is_reverse: bool) -> usize; +} + +pub enum MatcherType<'a> { + Find(&'a [u8]), + RFind(&'a [u8]), + FindFirstOf(&'a [u8]), + FindLastOf(&'a [u8]), + FindFirstNotOf(&'a [u8]), + FindLastNotOf(&'a [u8]), +} + +impl<'a> Matcher<'a> for MatcherType<'a> { + fn find(&self, haystack: &'a [u8]) -> Option { + match self { + MatcherType::Find(needle) => sz::find(haystack, needle), + MatcherType::RFind(needle) => sz::rfind(haystack, needle), + MatcherType::FindFirstOf(needles) => sz::find_char_from(haystack, needles), + MatcherType::FindLastOf(needles) => sz::rfind_char_from(haystack, needles), + MatcherType::FindFirstNotOf(needles) => sz::find_char_not_from(haystack, needles), + MatcherType::FindLastNotOf(needles) => sz::rfind_char_not_from(haystack, needles), + } + } + + fn needle_length(&self) -> usize { + match self { + MatcherType::Find(needle) | MatcherType::RFind(needle) => needle.len(), + _ => 1, + } + } + + fn skip_length(&self, include_overlaps: bool, is_reverse: bool) -> usize { + match (include_overlaps, is_reverse) { + (true, true) => self.needle_length().saturating_sub(1), + (true, false) => 1, + (false, true) => 0, + (false, false) => self.needle_length(), + } + } +} + +/// An iterator over non-overlapping matches of a pattern in a string slice. +/// This iterator yields the matched substrings in the order they are found. +/// +/// # Examples +/// +/// ``` +/// use stringzilla::{sz, MatcherType, RangeMatches}; +/// +/// let haystack = b"abababa"; +/// let matcher = MatcherType::Find(b"aba"); +/// let matches: Vec<&[u8]> = RangeMatches::new(haystack, matcher, false).collect(); +/// assert_eq!(matches, vec![b"aba", b"aba"]); +/// ``` +pub struct RangeMatches<'a> { + haystack: &'a [u8], + matcher: MatcherType<'a>, + position: usize, + include_overlaps: bool, +} + +impl<'a> RangeMatches<'a> { + pub fn new(haystack: &'a [u8], matcher: MatcherType<'a>, include_overlaps: bool) -> Self { + Self { + haystack, + matcher, + position: 0, + include_overlaps, + } + } +} + +impl<'a> Iterator for RangeMatches<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option { + if self.position >= self.haystack.len() { + return None; + } + + if let Some(index) = self.matcher.find(&self.haystack[self.position..]) { + let start = self.position + index; + let end = start + self.matcher.needle_length(); + self.position = start + self.matcher.skip_length(self.include_overlaps, false); + Some(&self.haystack[start..end]) + } else { + self.position = self.haystack.len(); + None + } + } +} + +/// An iterator over non-overlapping splits of a string slice by a pattern. +/// This iterator yields the substrings between the matches of the pattern. +/// +/// # Examples +/// +/// ``` +/// use stringzilla::{sz, MatcherType, RangeSplits}; +/// +/// let haystack = b"a,b,c,d"; +/// let matcher = MatcherType::Find(b","); +/// let splits: Vec<&[u8]> = RangeSplits::new(haystack, matcher).collect(); +/// assert_eq!(splits, vec![b"a", b"b", b"c", b"d"]); +/// ``` +pub struct RangeSplits<'a> { + haystack: &'a [u8], + matcher: MatcherType<'a>, + position: usize, + last_match: Option, +} + +impl<'a> RangeSplits<'a> { + pub fn new(haystack: &'a [u8], matcher: MatcherType<'a>) -> Self { + Self { + haystack, + matcher, + position: 0, + last_match: None, + } + } +} + +impl<'a> Iterator for RangeSplits<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option { + if self.position > self.haystack.len() { + return None; + } + + if let Some(index) = self.matcher.find(&self.haystack[self.position..]) { + let start = self.position; + let end = self.position + index; + self.position = end + self.matcher.needle_length(); + self.last_match = Some(end); + Some(&self.haystack[start..end]) + } else if self.position < self.haystack.len() || self.last_match.is_some() { + let start = self.position; + self.position = self.haystack.len() + 1; + Some(&self.haystack[start..]) + } else { + None + } + } +} + +/// An iterator over non-overlapping matches of a pattern in a string slice, searching from the end. +/// This iterator yields the matched substrings in reverse order. +/// +/// # Examples +/// +/// ``` +/// use stringzilla::{sz, MatcherType, RangeRMatches}; +/// +/// let haystack = b"abababa"; +/// let matcher = MatcherType::RFind(b"aba"); +/// let matches: Vec<&[u8]> = RangeRMatches::new(haystack, matcher, false).collect(); +/// assert_eq!(matches, vec![b"aba", b"aba"]); +/// ``` +pub struct RangeRMatches<'a> { + haystack: &'a [u8], + matcher: MatcherType<'a>, + position: usize, + include_overlaps: bool, +} + +impl<'a> RangeRMatches<'a> { + pub fn new(haystack: &'a [u8], matcher: MatcherType<'a>, include_overlaps: bool) -> Self { + Self { + haystack, + matcher, + position: haystack.len(), + include_overlaps, + } + } +} + +impl<'a> Iterator for RangeRMatches<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option { + if self.position == 0 { + return None; + } + + let search_area = &self.haystack[..self.position]; + if let Some(index) = self.matcher.find(search_area) { + let start = index; + let end = start + self.matcher.needle_length(); + let result = Some(&self.haystack[start..end]); + + let skip = self.matcher.skip_length(self.include_overlaps, true); + self.position = start + skip; + + result + } else { + None + } + } +} + +/// An iterator over non-overlapping splits of a string slice by a pattern, searching from the end. +/// This iterator yields the substrings between the matches of the pattern in reverse order. +/// +/// # Examples +/// +/// ``` +/// use stringzilla::{sz, MatcherType, RangeRSplits}; +/// +/// let haystack = b"a,b,c,d"; +/// let matcher = MatcherType::RFind(b","); +/// let splits: Vec<&[u8]> = RangeRSplits::new(haystack, matcher).collect(); +/// assert_eq!(splits, vec![b"d", b"c", b"b", b"a"]); +/// ``` +pub struct RangeRSplits<'a> { + haystack: &'a [u8], + matcher: MatcherType<'a>, + position: usize, +} + +impl<'a> RangeRSplits<'a> { + pub fn new(haystack: &'a [u8], matcher: MatcherType<'a>) -> Self { + Self { + haystack, + matcher, + position: haystack.len(), + } + } +} + +impl<'a> Iterator for RangeRSplits<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option { + if self.position == 0 { + return None; + } + + let search_area = &self.haystack[..self.position]; + if let Some(index) = self.matcher.find(search_area) { + let end = self.position; + let start = index + self.matcher.needle_length(); + let result = Some(&self.haystack[start..end]); + + self.position = index; + + result + } else { + let result = Some(&self.haystack[..self.position]); + self.position = 0; + result + } + } +} + /// Provides extensions for string searching and manipulation functionalities /// on types that can reference byte slices ([u8]). This trait extends the capability /// of any type implementing `AsRef<[u8]>`, allowing easy integration of SIMD-accelerated @@ -724,9 +983,9 @@ pub mod sz { /// /// assert_eq!(haystack.sz_find(needle.as_bytes()), Some(2)); /// ``` -pub trait StringZilla +pub trait StringZilla<'a, N> where - N: AsRef<[u8]>, + N: AsRef<[u8]> + 'a, { /// Searches for the first occurrence of `needle` in `self`. /// @@ -828,12 +1087,157 @@ where /// assert_eq!(first.sz_alignment_score(second.as_bytes(), matrix, gap_penalty), -3); /// ``` fn sz_alignment_score(&self, other: N, matrix: [[i8; 256]; 256], gap: i8) -> isize; + + /// Returns an iterator over all non-overlapping matches of the given `needle` in `self`. + /// + /// # Arguments + /// + /// * `needle`: The byte slice to search for within `self`. + /// + /// # Examples + /// + /// ``` + /// use stringzilla::StringZilla; + /// + /// let haystack = b"abababa"; + /// let needle = b"aba"; + /// let matches: Vec<&[u8]> = haystack.sz_matches(needle).collect(); + /// assert_eq!(matches, vec![b"aba", b"aba", b"aba"]); + /// ``` + fn sz_matches(&'a self, needle: &'a N) -> RangeMatches<'a>; + + /// Returns an iterator over all non-overlapping matches of the given `needle` in `self`, searching from the end. + /// + /// # Arguments + /// + /// * `needle`: The byte slice to search for within `self`. + /// + /// # Examples + /// + /// ``` + /// use stringzilla::StringZilla; + /// + /// let haystack = b"abababa"; + /// let needle = b"aba"; + /// let matches: Vec<&[u8]> = haystack.sz_rmatches(needle).collect(); + /// assert_eq!(matches, vec![b"aba", b"aba", b"aba"]); + /// ``` + fn sz_rmatches(&'a self, needle: &'a N) -> RangeRMatches<'a>; + + /// Returns an iterator over the substrings of `self` that are separated by the given `needle`. + /// + /// # Arguments + /// + /// * `needle`: The byte slice to split `self` by. + /// + /// # Examples + /// + /// ``` + /// use stringzilla::StringZilla; + /// + /// let haystack = b"a,b,c,d"; + /// let needle = b","; + /// let splits: Vec<&[u8]> = haystack.sz_splits(needle).collect(); + /// assert_eq!(splits, vec![b"a", b"b", b"c", b"d"]); + /// ``` + fn sz_splits(&'a self, needle: &'a N) -> RangeSplits<'a>; + + /// Returns an iterator over the substrings of `self` that are separated by the given `needle`, searching from the end. + /// + /// # Arguments + /// + /// * `needle`: The byte slice to split `self` by. + /// + /// # Examples + /// + /// ``` + /// use stringzilla::StringZilla; + /// + /// let haystack = b"a,b,c,d"; + /// let needle = b","; + /// let splits: Vec<&[u8]> = haystack.sz_rsplits(needle).collect(); + /// assert_eq!(splits, vec![b"d", b"c", b"b", b"a"]); + /// ``` + fn sz_rsplits(&'a self, needle: &'a N) -> RangeRSplits<'a>; + + /// Returns an iterator over all non-overlapping matches of any of the bytes in `needles` within `self`. + /// + /// # Arguments + /// + /// * `needles`: The set of bytes to search for within `self`. + /// + /// # Examples + /// + /// ``` + /// use stringzilla::StringZilla; + /// + /// let haystack = b"Hello, world!"; + /// let needles = b"aeiou"; + /// let matches: Vec<&[u8]> = haystack.sz_find_first_of(needles).collect(); + /// assert_eq!(matches, vec![b"e", b"o", b"o"]); + /// ``` + fn sz_find_first_of(&'a self, needles: &'a N) -> RangeMatches<'a>; + + /// Returns an iterator over all non-overlapping matches of any of the bytes in `needles` within `self`, searching from the end. + /// + /// # Arguments + /// + /// * `needles`: The set of bytes to search for within `self`. + /// + /// # Examples + /// + /// ``` + /// use stringzilla::StringZilla; + /// + /// let haystack = b"Hello, world!"; + /// let needles = b"aeiou"; + /// let matches: Vec<&[u8]> = haystack.sz_find_last_of(needles).collect(); + /// assert_eq!(matches, vec![b"o", b"o", b"e"]); + /// ``` + fn sz_find_last_of(&'a self, needles: &'a N) -> RangeRMatches<'a>; + + /// Returns an iterator over all non-overlapping matches of any byte not in `needles` within `self`. + /// + /// # Arguments + /// + /// * `needles`: The set of bytes that should not be matched within `self`. + /// + /// # Examples + /// + /// ``` + /// use stringzilla::StringZilla; + /// + /// let haystack = b"Hello, world!"; + /// let needles = b"aeiou"; + /// let matches: Vec<&[u8]> = haystack.sz_find_first_not_of(needles).collect(); + /// assert_eq!(matches, vec![b"H", b"l", b"l", b",", b" ", b"w", b"r", b"l", b"d", b"!"]); + /// ``` + fn sz_find_first_not_of(&'a self, needles: &'a N) -> RangeMatches<'a>; + + /// Returns an iterator over all non-overlapping matches of any byte not in `needles` within `self`, searching from the end. + /// + /// # Arguments + /// + /// * `needles`: The set of bytes that should not be matched within `self`. + ///q + /// # Examples + /// + /// ``` + /// use stringzilla::StringZilla; + /// + /// let haystack = b"Hello, world!"; + /// let needles = b"aeiou"; + /// let matches: Vec<&[u8]> = haystack.sz_find_last_not_of(needles).collect(); + /// assert_eq!(matches, vec![b"!", b"d", b"l", b"r", b"w", b" ", b",", b"l", b"l", b"H"]); + /// ``` + fn sz_find_last_not_of(&'a self, needles: &'a N) -> RangeRMatches<'a>; + } -impl StringZilla for T +impl<'a, T, N> StringZilla<'a, N> for T where - T: AsRef<[u8]>, - N: AsRef<[u8]>, + T: AsRef<[u8]> + ?Sized, + N: AsRef<[u8]> + 'a, { fn sz_find(&self, needle: N) -> Option { sz::find(self, needle) @@ -866,6 +1270,54 @@ where fn sz_alignment_score(&self, other: N, matrix: [[i8; 256]; 256], gap: i8) -> isize { sz::alignment_score(self, other, matrix, gap) } + + fn sz_matches(&'a self, needle: &'a N) -> RangeMatches<'a> { + RangeMatches::new(self.as_ref(), MatcherType::Find(needle.as_ref()), true) + } + + fn sz_rmatches(&'a self, needle: &'a N) -> RangeRMatches<'a> { + RangeRMatches::new(self.as_ref(), MatcherType::RFind(needle.as_ref()), true) + } + + fn sz_splits(&'a self, needle: &'a N) -> RangeSplits<'a> { + RangeSplits::new(self.as_ref(), MatcherType::Find(needle.as_ref())) + } + + fn sz_rsplits(&'a self, needle: &'a N) -> RangeRSplits<'a> { + RangeRSplits::new(self.as_ref(), MatcherType::RFind(needle.as_ref())) + } + + fn sz_find_first_of(&'a self, needles: &'a N) -> RangeMatches<'a> { + RangeMatches::new( + self.as_ref(), + MatcherType::FindFirstOf(needles.as_ref()), + true, + ) + } + + fn sz_find_last_of(&'a self, needles: &'a N) -> RangeRMatches<'a> { + RangeRMatches::new( + self.as_ref(), + MatcherType::FindLastOf(needles.as_ref()), + true, + ) + } + + fn sz_find_first_not_of(&'a self, needles: &'a N) -> RangeMatches<'a> { + RangeMatches::new( + self.as_ref(), + MatcherType::FindFirstNotOf(needles.as_ref()), + true, + ) + } + + fn sz_find_last_not_of(&'a self, needles: &'a N) -> RangeRMatches<'a> { + RangeRMatches::new( + self.as_ref(), + MatcherType::FindLastNotOf(needles.as_ref()), + true, + ) + } } /// Provides a tool for mutating a byte slice by filling it with random data from a specified alphabet. @@ -1016,4 +1468,164 @@ mod tests { .iter() .all(|&b| b == b'd' || b == b'c' || b == b'b' || b == b'a')); } + + mod search_split_iterators { + use super::*; + use crate::{MatcherType, RangeMatches, RangeRMatches}; + + #[test] + fn test_matches() { + let haystack = b"hello world hello universe"; + let needle = b"hello"; + let matches: Vec<_> = haystack.sz_matches(needle).collect(); + assert_eq!(matches, vec![b"hello", b"hello"]); + } + + #[test] + fn test_rmatches() { + let haystack = b"hello world hello universe"; + let needle = b"hello"; + let matches: Vec<_> = haystack.sz_rmatches(needle).collect(); + assert_eq!(matches, vec![b"hello", b"hello"]); + } + + #[test] + fn test_splits() { + let haystack = b"alpha,beta;gamma"; + let needle = b","; + let splits: Vec<_> = haystack.sz_splits(needle).collect(); + assert_eq!(splits, vec![&b"alpha"[..], &b"beta;gamma"[..]]); + } + + #[test] + fn test_rsplits() { + let haystack = b"alpha,beta;gamma"; + let needle = b";"; + let splits: Vec<_> = haystack.sz_rsplits(needle).collect(); + assert_eq!(splits, vec![&b"gamma"[..], &b"alpha,beta"[..]]); + } + + #[test] + fn test_splits_with_empty_parts() { + let haystack = b"a,,b,"; + let needle = b","; + let splits: Vec<_> = haystack.sz_splits(needle).collect(); + assert_eq!(splits, vec![b"a", &b""[..], b"b", &b""[..]]); + } + + #[test] + fn test_matches_with_overlaps() { + let haystack = b"aaaa"; + let needle = b"aa"; + let matches: Vec<_> = haystack.sz_matches(needle).collect(); + assert_eq!(matches, vec![b"aa", b"aa", b"aa"]); + } + + #[test] + fn test_splits_with_utf8() { + let haystack = "こんにちは,世界".as_bytes(); + let needle = b","; + let splits: Vec<_> = haystack.sz_splits(needle).collect(); + assert_eq!(splits, vec!["こんにちは".as_bytes(), "世界".as_bytes()]); + } + + #[test] + fn test_find_first_of() { + let haystack = b"hello world"; + let needles = b"or"; + let matches: Vec<_> = haystack.sz_find_first_of(needles).collect(); + assert_eq!(matches, vec![b"o", b"o", b"r"]); + } + + #[test] + fn test_find_last_of() { + let haystack = b"hello world"; + let needles = b"or"; + let matches: Vec<_> = haystack.sz_find_last_of(needles).collect(); + assert_eq!(matches, vec![b"r", b"o", b"o"]); + } + + #[test] + fn test_find_first_not_of() { + let haystack = b"aabbbcccd"; + let needles = b"ab"; + let matches: Vec<_> = haystack.sz_find_first_not_of(needles).collect(); + assert_eq!(matches, vec![b"c", b"c", b"c", b"d"]); + } + + #[test] + fn test_find_last_not_of() { + let haystack = b"aabbbcccd"; + let needles = b"cd"; + let matches: Vec<_> = haystack.sz_find_last_not_of(needles).collect(); + assert_eq!(matches, vec![b"b", b"b", b"b", b"a", b"a"]); + } + + #[test] + fn test_find_first_of_empty_needles() { + let haystack = b"hello world"; + let needles = b""; + let matches: Vec<_> = haystack.sz_find_first_of(needles).collect(); + assert_eq!(matches, Vec::<&[u8]>::new()); + } + + #[test] + fn test_find_last_of_empty_haystack() { + let haystack = b""; + let needles = b"abc"; + let matches: Vec<_> = haystack.sz_find_last_of(needles).collect(); + assert_eq!(matches, Vec::<&[u8]>::new()); + } + + #[test] + fn test_find_first_not_of_all_matching() { + let haystack = b"aaabbbccc"; + let needles = b"abc"; + let matches: Vec<_> = haystack.sz_find_first_not_of(needles).collect(); + assert_eq!(matches, Vec::<&[u8]>::new()); + } + + #[test] + fn test_find_last_not_of_all_not_matching() { + let haystack = b"hello world"; + let needles = b"xyz"; + let matches: Vec<_> = haystack.sz_find_last_not_of(needles).collect(); + assert_eq!( + matches, + vec![b"d", b"l", b"r", b"o", b"w", b" ", b"o", b"l", b"l", b"e", b"h"] + ); + } + + #[test] + fn test_range_matches_overlapping() { + let haystack = b"aaaa"; + let matcher = MatcherType::Find(b"aa"); + let matches: Vec<_> = RangeMatches::new(haystack, matcher, true).collect(); + assert_eq!(matches, vec![&b"aa"[..], &b"aa"[..], &b"aa"[..]]); + } + + #[test] + fn test_range_matches_non_overlapping() { + let haystack = b"aaaa"; + let matcher = MatcherType::Find(b"aa"); + let matches: Vec<_> = RangeMatches::new(haystack, matcher, false).collect(); + assert_eq!(matches, vec![&b"aa"[..], &b"aa"[..]]); + } + + #[test] + fn test_range_rmatches_overlapping() { + let haystack = b"aaaa"; + let matcher = MatcherType::RFind(b"aa"); + let matches: Vec<_> = RangeRMatches::new(haystack, matcher, true).collect(); + assert_eq!(matches, vec![&b"aa"[..], &b"aa"[..], &b"aa"[..]]); + } + + #[test] + fn test_range_rmatches_non_overlapping() { + let haystack = b"aaaa"; + let matcher = MatcherType::RFind(b"aa"); + let matches: Vec<_> = RangeRMatches::new(haystack, matcher, false).collect(); + assert_eq!(matches, vec![&b"aa"[..], &b"aa"[..]]); + } + } } diff --git a/scripts/test.cpp b/scripts/test.cpp index 12465c94..6e88b0e4 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1,4 +1,4 @@ -#undef NDEBUG // Enable all assertions +#undef NDEBUG // Enable all assertions // Enable assertions for iterators #if !defined(_ITERATOR_DEBUG_LEVEL) || _ITERATOR_DEBUG_LEVEL == 0 @@ -46,6 +46,17 @@ namespace sz = ashvardanian::stringzilla; using namespace sz::scripts; using sz::literals::operator""_sz; +/* + * Instantiate all the templates to make the symbols visible and also check + * for weird compilation errors on uncommon paths. + */ +#if SZ_DETECT_CPP_17 && __cpp_lib_string_view +template class std::basic_string_view; +#endif +template class sz::basic_string_slice; +template class std::basic_string; +template class sz::basic_string; + /** * @brief Several string processing operations rely on computing integer logarithms. * Failures in such operations will result in wrong `resize` outcomes and heap corruption. @@ -230,6 +241,7 @@ static void test_api_readonly() { // More complex queries. assert(str("abbabbaaaaaa").find("aa") == 6); assert(str("abcdabcd").substr(2, 4).find("abc") == str::npos); + assert(str("hello, world!").substr(0, 11).find("world") == str::npos); // ! `rfind` and `find_last_of` are not consistent in meaning of their arguments. assert(str("hello").find_first_of("le") == 1); @@ -277,7 +289,7 @@ static void test_api_readonly() { assert(str("abcdefgh" "\x01" "\xC6" "ijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" "\xC0" "\xFA" "0123456789+-", 68).find_last_of("\xC0\xC1") == 54); // sets // clang-format on - // Boundary consitions. + // Boundary conditions. assert(str("hello").find_first_of("ox", 4) == 4); assert(str("hello").find_first_of("ox", 5) == str::npos); assert(str("hello").find_last_of("ox", 4) == 4); @@ -952,11 +964,20 @@ static void test_search() { assert(rfinds.size() == 3); assert(rfinds[0] == "c"); - auto splits = ".a..c."_sz.split(sz::char_set(".")).template to>(); - assert(splits.size() == 5); - assert(splits[0] == ""); - assert(splits[1] == "a"); - assert(splits[4] == ""); + { + auto splits = ".a..c."_sz.split(sz::char_set(".")).template to>(); + assert(splits.size() == 5); + assert(splits[0] == ""); + assert(splits[1] == "a"); + assert(splits[4] == ""); + } + + { + auto splits = "line1\nline2\nline3"_sz.split("line3").template to>(); + assert(splits.size() == 2); + assert(splits[0] == "line1\nline2\n"); + assert(splits[1] == ""); + } assert(""_sz.split(".").size() == 1); assert(""_sz.rsplit(".").size() == 1); diff --git a/scripts/test.py b/scripts/test.py index f4bf8628..66be4266 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -386,6 +386,13 @@ def test_unit_globals(): assert sz.find("", "abcdef") == "".find("abcdef") assert sz.rfind("", "abcdef") == "".rfind("abcdef") + assert sz.find("Hello, world!", "world", 0, 11) == "Hello, world!".find( + "world", 0, 11 + ) + assert sz.rfind("Hello, world!", "world", 0, 11) == "Hello, world!".rfind( + "world", 0, 11 + ) + # Compare partitioning functions assert sz.partition("abcdef", "c") == ("ab", "c", "def") assert sz.rpartition("abcdef", "c") == ("ab", "c", "def")