Skip to content

Commit 2caefac

Browse files
committed
Fix: Compilation of all bindings
1 parent 2ce2b49 commit 2caefac

File tree

16 files changed

+690
-697
lines changed

16 files changed

+690
-697
lines changed

CONTRIBUTING.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,13 @@ npm ci && npm test
432432
swift build && swift test
433433
```
434434

435+
To format, consider using [SwiftFormat](https://github.com/nicklockwood/SwiftFormat):
436+
437+
```bash
438+
brew install swiftformat
439+
swiftformat .
440+
```
441+
435442
Running Swift on Linux requires a couple of extra steps, as the Swift compiler is not available in the default repositories.
436443
Please get the most recent Swift tarball from the [official website](https://www.swift.org/install/).
437444
At the time of writing, for a 64-bit Arm CPU running Ubuntu 22.04, the following commands would work:
@@ -467,6 +474,13 @@ sudo docker run --rm -v "$PWD:/workspace" -w /workspace swift:5.9 /bin/bash -cl
467474
cargo test
468475
```
469476

477+
If you need to isolate a failing test:
478+
479+
```bash
480+
export RUST_BACKTRACE=full
481+
cargo test -- --test-threads=1 --nocapture
482+
```
483+
470484
If you are updating the package contents, you can validate the list of included files using the following command:
471485

472486
```bash

Package.swift

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,16 @@ let package = Package(
55
name: "StringZilla",
66
platforms: [
77
// Linux doesn't have to be explicitly listed
8-
.iOS(.v13), // For iOS, version 13 and later
9-
.tvOS(.v13), // For tvOS, version 13 and later
8+
.iOS(.v13), // For iOS, version 13 and later
9+
.tvOS(.v13), // For tvOS, version 13 and later
1010
.macOS(.v10_15), // For macOS, version 10.15 (Catalina) and later
11-
.watchOS(.v6) // For watchOS, version 6 and later
11+
.watchOS(.v6), // For watchOS, version 6 and later
1212
],
1313
products: [
1414
.library(
1515
name: "StringZilla",
1616
targets: ["StringZillaC", "StringZilla"]
17-
)
17+
),
1818
],
1919
targets: [
2020
.target(
@@ -27,7 +27,7 @@ let package = Package(
2727
.define("SZ_AVOID_LIBC", to: "0"), // We need `malloc` from LibC
2828
.define("SZ_DEBUG", to: "0"), // We don't need any extra assertions in the C layer
2929
.headerSearchPath("include/stringzilla"), // Specify header search paths
30-
.unsafeFlags(["-Wall"]) // Use with caution: specify custom compiler flags
30+
.unsafeFlags(["-Wall"]), // Use with caution: specify custom compiler flags
3131
]
3232
),
3333
.target(
@@ -41,7 +41,7 @@ let package = Package(
4141
dependencies: ["StringZilla"],
4242
path: "swift",
4343
sources: ["Test.swift"]
44-
)
44+
),
4545
],
4646
cLanguageStandard: CLanguageStandard.c99
4747
)

README.md

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ __Who is this for?__
137137
<span style="color:#ABABAB;">arm:</span> <b>0.02</b> GB/s
138138
</td>
139139
<td align="center">
140-
<code>sz_find_charset</code><br/>
140+
<code>sz_find_byteset</code><br/>
141141
<span style="color:#ABABAB;">x86:</span> <b>4.08</b> &centerdot;
142142
<span style="color:#ABABAB;">arm:</span> <b>3.22</b> GB/s
143143
</td>
@@ -155,7 +155,7 @@ __Who is this for?__
155155
</td>
156156
<td align="center">⚪</td>
157157
<td align="center">
158-
<code>sz_rfind_charset</code><br/>
158+
<code>sz_rfind_byteset</code><br/>
159159
<span style="color:#ABABAB;">x86:</span> <b>0.43</b> &centerdot;
160160
<span style="color:#ABABAB;">arm:</span> <b>0.23</b> GB/s
161161
</td>
@@ -181,7 +181,7 @@ __Who is this for?__
181181
<span style="color:#ABABAB;">arm:</span> <b>5.9</b> MB/s
182182
</td>
183183
<td align="center">
184-
<code>sz_generate</code><br/>
184+
<code>sz_fill_random</code><br/>
185185
<span style="color:#ABABAB;">x86:</span> <b>56.2</b> &centerdot;
186186
<span style="color:#ABABAB;">arm:</span> <b>25.8</b> MB/s
187187
</td>
@@ -203,7 +203,7 @@ __Who is this for?__
203203
<span style="color:#ABABAB;">arm:</span> <b>140.0</b> MB/s
204204
</td>
205205
<td align="center">
206-
<code>sz_look_up_transform</code><br/>
206+
<code>sz_lookup</code><br/>
207207
<span style="color:#ABABAB;">x86:</span> <b>21.2</b> &centerdot;
208208
<span style="color:#ABABAB;">arm:</span> <b>8.5</b> GB/s
209209
</td>
@@ -247,7 +247,7 @@ __Who is this for?__
247247
<span style="color:#ABABAB;">arm:</span> <b>2,220</b> ns
248248
</td>
249249
<td align="center">
250-
<code>sz_edit_distance</code><br/>
250+
<code>sz_levenshtein_distance</code><br/>
251251
<span style="color:#ABABAB;">x86:</span> <b>99</b> &centerdot;
252252
<span style="color:#ABABAB;">arm:</span> <b>180</b> ns
253253
</td>
@@ -265,7 +265,7 @@ __Who is this for?__
265265
<span style="color:#ABABAB;">arm:</span> <b>367</b> ms
266266
</td>
267267
<td align="center">
268-
<code>sz_alignment_score</code><br/>
268+
<code>sz_needleman_wunsch_score</code><br/>
269269
<span style="color:#ABABAB;">x86:</span> <b>73</b> &centerdot;
270270
<span style="color:#ABABAB;">arm:</span> <b>177</b> ms
271271
</td>
@@ -396,8 +396,8 @@ x: int = text.find_first_of('chars', start=0, end=sys.maxsize)
396396
x: int = text.find_last_of('chars', start=0, end=sys.maxsize)
397397
x: int = text.find_first_not_of('chars', start=0, end=sys.maxsize)
398398
x: int = text.find_last_not_of('chars', start=0, end=sys.maxsize)
399-
x: Strs = text.split_charset(separator='chars', maxsplit=sys.maxsize, keepseparator=False)
400-
x: Strs = text.rsplit_charset(separator='chars', maxsplit=sys.maxsize, keepseparator=False)
399+
x: Strs = text.split_byteset(separator='chars', maxsplit=sys.maxsize, keepseparator=False)
400+
x: Strs = text.rsplit_byteset(separator='chars', maxsplit=sys.maxsize, keepseparator=False)
401401
```
402402

403403
You can also transform the string using Look-Up Tables (LUTs), mapping it to a different character set.
@@ -453,8 +453,8 @@ StringZilla saves a lot of memory by viewing existing memory regions as substrin
453453
```py
454454
x: SplitIterator[Str] = text.split_iter(separator=' ', keepseparator=False)
455455
x: SplitIterator[Str] = text.rsplit_iter(separator=' ', keepseparator=False)
456-
x: SplitIterator[Str] = text.split_charset_iter(separator='chars', keepseparator=False)
457-
x: SplitIterator[Str] = text.rsplit_charset_iter(separator='chars', keepseparator=False)
456+
x: SplitIterator[Str] = text.split_byteset_iter(separator='chars', keepseparator=False)
457+
x: SplitIterator[Str] = text.rsplit_byteset_iter(separator='chars', keepseparator=False)
458458
```
459459

460460
StringZilla can easily be 10x more memory-efficient than native Python classes for tokenization.
@@ -654,7 +654,7 @@ By design, StringZilla has a couple of notable differences from LibC:
654654
655655
That way `sz_find` and `sz_rfind` are similar to `strstr` and `strrstr` in LibC.
656656
Similarly, `sz_find_byte` and `sz_rfind_byte` replace `memchr` and `memrchr`.
657-
The `sz_find_charset` maps to `strspn` and `strcspn`, while `sz_rfind_charset` has no sibling in LibC.
657+
The `sz_find_byteset` maps to `strspn` and `strcspn`, while `sz_rfind_byteset` has no sibling in LibC.
658658
659659
<table>
660660
<tr>
@@ -679,11 +679,11 @@ The `sz_find_charset` maps to `strspn` and `strcspn`, while `sz_rfind_charset` h
679679
</tr>
680680
<tr>
681681
<td><code>strcspn(haystack, needles)</code></td>
682-
<td><code>sz_rfind_charset(haystack, haystack_length, needles_bitset)</code></td>
682+
<td><code>sz_rfind_byteset(haystack, haystack_length, needles_bitset)</code></td>
683683
</tr>
684684
<tr>
685685
<td><code>strspn(haystack, needles)</code></td>
686-
<td><code>sz_find_charset(haystack, haystack_length, needles_bitset)</code></td>
686+
<td><code>sz_find_byteset(haystack, haystack_length, needles_bitset)</code></td>
687687
</tr>
688688
<tr>
689689
<td><code>memmem(haystack, haystack_length, needle, needle_length)</code>, <code>strstr</code></td>
@@ -923,7 +923,7 @@ StringZilla provides a convenient `partition` function, which returns a tuple of
923923
```cpp
924924
auto parts = haystack.partition(':'); // Matching a character
925925
auto [before, match, after] = haystack.partition(':'); // Structure unpacking
926-
auto [before, match, after] = haystack.partition(sz::char_set(":;")); // Character-set argument
926+
auto [before, match, after] = haystack.partition(sz::byteset(":;")); // Character-set argument
927927
auto [before, match, after] = haystack.partition(" : "); // String argument
928928
auto [before, match, after] = haystack.rpartition(sz::whitespaces_set()); // Split around the last whitespace
929929
```
@@ -951,8 +951,8 @@ Here is a sneak peek of the most useful ones.
951951
```cpp
952952
text.hash(); // -> 64 bit unsigned integer
953953
text.ssize(); // -> 64 bit signed length to avoid `static_cast<std::ssize_t>(text.size())`
954-
text.contains_only(" \w\t"); // == text.find_first_not_of(sz::char_set(" \w\t")) == npos;
955-
text.contains(sz::whitespaces_set()); // == text.find(sz::char_set(sz::whitespaces_set())) != npos;
954+
text.contains_only(" \w\t"); // == text.find_first_not_of(sz::byteset(" \w\t")) == npos;
955+
text.contains(sz::whitespaces_set()); // == text.find(sz::byteset(sz::whitespaces_set())) != npos;
956956

957957
// Simpler slicing than `substr`
958958
text.front(10); // -> sz::string_view
@@ -997,7 +997,7 @@ To avoid those, StringZilla provides lazily-evaluated ranges, compatible with th
997997

998998
```cpp
999999
for (auto line : haystack.split("\r\n"))
1000-
for (auto word : line.split(sz::char_set(" \w\t.,;:!?")))
1000+
for (auto word : line.split(sz::byteset(" \w\t.,;:!?")))
10011001
std::cout << word << std::endl;
10021002
```
10031003

@@ -1006,9 +1006,9 @@ It also allows interleaving matches, if you want both inclusions of `xx` in `xxx
10061006
Debugging pointer offsets is not a pleasant exercise, so keep the following functions in mind.
10071007

10081008
- `haystack.[r]find_all(needle, interleaving)`
1009-
- `haystack.[r]find_all(sz::char_set(""))`
1009+
- `haystack.[r]find_all(sz::byteset(""))`
10101010
- `haystack.[r]split(needle)`
1011-
- `haystack.[r]split(sz::char_set(""))`
1011+
- `haystack.[r]split(sz::byteset(""))`
10121012

10131013
For $N$ matches the split functions will report $N+1$ matches, potentially including empty strings.
10141014
Ranges have a few convenience methods as well:
@@ -1065,7 +1065,7 @@ sz::string random_string(std::size_t length, char const *alphabet, std::size_t c
10651065
```
10661066
10671067
Mouthful and slow.
1068-
StringZilla provides a C native method - `sz_generate` and a convenient C++ wrapper - `sz::generate`.
1068+
StringZilla provides a C native method - `sz_fill_random` and a convenient C++ wrapper - `sz::generate`.
10691069
Similar to Python it also defines the commonly used character sets.
10701070
10711071
```cpp
@@ -1085,9 +1085,9 @@ In text processing, it's often necessary to replace all occurrences of a specifi
10851085
Standard library functions may not offer the most efficient or convenient methods for performing bulk replacements, especially when dealing with large strings or performance-critical applications.
10861086

10871087
- `haystack.replace_all(needle_string, replacement_string)`
1088-
- `haystack.replace_all(sz::char_set(""), replacement_string)`
1088+
- `haystack.replace_all(sz::byteset(""), replacement_string)`
10891089
- `haystack.try_replace_all(needle_string, replacement_string)`
1090-
- `haystack.try_replace_all(sz::char_set(""), replacement_string)`
1090+
- `haystack.try_replace_all(sz::byteset(""), replacement_string)`
10911091
- `haystack.transform(sz::look_up_table::identity())`
10921092
- `haystack.transform(sz::look_up_table::identity(), haystack.data())`
10931093

@@ -1250,8 +1250,8 @@ sz::find("Hello, world!", "world") // 7
12501250
sz::rfind("Hello, world!", "world") // 7
12511251

12521252
// Generalizations of `memchr::memrchr[123]`
1253-
sz::find_char_from("Hello, world!", "world") // 2
1254-
sz::rfind_char_from("Hello, world!", "world") // 11
1253+
sz::find_byte_from("Hello, world!", "world") // 2
1254+
sz::rfind_byte_from("Hello, world!", "world") // 11
12551255
```
12561256

12571257
Unlike `memchr`, the throughput of `stringzilla` is [high in both normal and reverse-order searches][memchr-benchmarks].
@@ -1268,10 +1268,10 @@ let my_cow_str = Cow::from(&my_string);
12681268
// Use the generic function with a String
12691269
assert_eq!(my_string.sz_find("world"), Some(7));
12701270
assert_eq!(my_string.sz_rfind("world"), Some(7));
1271-
assert_eq!(my_string.sz_find_char_from("world"), Some(2));
1272-
assert_eq!(my_string.sz_rfind_char_from("world"), Some(11));
1273-
assert_eq!(my_string.sz_find_char_not_from("world"), Some(0));
1274-
assert_eq!(my_string.sz_rfind_char_not_from("world"), Some(12));
1271+
assert_eq!(my_string.sz_find_byte_from("world"), Some(2));
1272+
assert_eq!(my_string.sz_rfind_byte_from("world"), Some(11));
1273+
assert_eq!(my_string.sz_find_byte_not_from("world"), Some(0));
1274+
assert_eq!(my_string.sz_rfind_byte_not_from("world"), Some(12));
12751275

12761276
// Same works for &str and Cow<'_, str>
12771277
assert_eq!(my_str.sz_find("world"), Some(7));
@@ -1315,7 +1315,7 @@ s[s.findLast(substring: "o")!...] // "o StringZilla. 👋")
13151315
s[s.findFirst(characterFrom: "aeiou")!...] // "ello, world! Welcome to StringZilla. 👋"
13161316
s[s.findLast(characterFrom: "aeiou")!...] // "a. 👋"
13171317
s[s.findFirst(characterNotFrom: "aeiou")!...] // "Hello, world! Welcome to StringZilla. 👋"
1318-
s.editDistance(from: "Hello, world!")! // 29
1318+
s.levenshteinDistance(from: "Hello, world!")! // 29
13191319
```
13201320

13211321
## Algorithms & Design Decisions 📚
@@ -1561,7 +1561,7 @@ Most StringZilla operations are byte-level, so they work well with ASCII and UTF
15611561
In some cases, like edit-distance computation, the result of byte-level evaluation and character-level evaluation may differ.
15621562
So StringZilla provides the following functions to work with Unicode:
15631563

1564-
- `sz_edit_distance_utf8` - computes the Levenshtein distance between two UTF-8 strings.
1564+
- `sz_levenshtein_distance_utf8` - computes the Levenshtein distance between two UTF-8 strings.
15651565
- `sz_hamming_distance_utf8` - computes the Hamming distance between two UTF-8 strings.
15661566

15671567
Java, JavaScript, Python 2, C#, and Objective-C, however, use wide characters (`wchar`) - two-byte-long codes, instead of the more reasonable fixed-length UTF-32 or variable-length UTF-8.

0 commit comments

Comments
 (0)