Skip to content

Commit

Permalink
Merge pull request #111 from ashvardanian/main-dev
Browse files Browse the repository at this point in the history
String decoding & MSVC builds
  • Loading branch information
ashvardanian authored Mar 3, 2024
2 parents 66fd8a9 + 184c1e6 commit a86ae00
Show file tree
Hide file tree
Showing 10 changed files with 226 additions and 51 deletions.
36 changes: 36 additions & 0 deletions .github/workflows/prerelease.yml
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,43 @@ jobs:
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
- uses: ilammy/msvc-dev-cmd@v1

# Configure with the Ninja generator and build into `build_artifacts` (RelWithDebInfo, benchmarks and tests enabled).
# Build output is redirected to `build_artifacts/logs.txt`; if the build fails, the step prints that log,
# the exported `compile_commands.json`, and the CPU description reported by `wmic`, then exits with code 1.
- name: Build C/C++
shell: cmd
run: |
cmake -GNinja -B build_artifacts ^
-DCMAKE_BUILD_TYPE=RelWithDebInfo ^
-DCMAKE_EXPORT_COMPILE_COMMANDS=1 ^
-DSTRINGZILLA_BUILD_BENCHMARK=1 ^
-DSTRINGZILLA_BUILD_TEST=1
cmake --build build_artifacts --config RelWithDebInfo > build_artifacts/logs.txt 2>&1 || (
echo "Compilation failed. Here are the logs:"
type build_artifacts\logs.txt
echo "The original compilation commands:"
type build_artifacts\compile_commands.json
echo:
echo "CPU Features:"
wmic cpu list /format:list
exit 1
)
# Run the C++20 test executable from the `build_artifacts` directory.
- name: Test C++
run: .\build_artifacts\stringzilla_test_cpp20.exe
# Runs every benchmark binary once against the file named by DATASET_PATH.
# No `shell:` is set, so on Windows runners this script runs under PowerShell, where
# environment variables must be read as `$env:NAME`; `${NAME}` would be an unset PowerShell variable.
- name: Test on Real World Data
run: |
.\build_artifacts\stringzilla_bench_search.exe $env:DATASET_PATH # for substring search
.\build_artifacts\stringzilla_bench_token.exe $env:DATASET_PATH # for hashing, equality comparisons, etc.
.\build_artifacts\stringzilla_bench_similarity.exe $env:DATASET_PATH # for edit distances and alignment scores
.\build_artifacts\stringzilla_bench_sort.exe $env:DATASET_PATH # for sorting arrays of strings
.\build_artifacts\stringzilla_bench_container.exe $env:DATASET_PATH # for STL containers with string keys
env:
DATASET_PATH: ./README.md
# Don't overload GitHub with our benchmarks.
# The results in such an unstable environment will be meaningless anyway.
if: 0

# Python
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v5
with:
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
build/
build_debug/
build_release/
build_artifacts*

# Yes, everyone loves keeping this file in the history.
# But with a very minimalistic binding and just a couple of dependencies
Expand All @@ -27,6 +28,7 @@ CMakeFiles
*.pyd
.venv/*
node_modules/
.vs/

# Recommended datasets
leipzig1M.txt
Expand Down
3 changes: 2 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,8 @@
"xmemory": "cpp",
"xtr1common": "cpp",
"xtree": "cpp",
"xutility": "cpp"
"xutility": "cpp",
"errno.h": "c"
},
"python.pythonPath": "~/miniconda3/bin/python"
}
12 changes: 9 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -239,9 +239,15 @@ if(${STRINGZILLA_BUILD_TEST})
# compile multiple backends: disabling all SIMD, enabling only AVX2, only AVX-512, only Arm Neon.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64")
# x86 specific backends
define_launcher(stringzilla_test_cpp20_x86_serial scripts/test.cpp 20 "ivybridge")
define_launcher(stringzilla_test_cpp20_x86_avx2 scripts/test.cpp 20 "haswell")
define_launcher(stringzilla_test_cpp20_x86_avx512 scripts/test.cpp 20 "sapphirerapids")
# The same three test launchers are defined for every compiler; only the last argument differs.
# MSVC is given instruction-set names ("AVX", "AVX2", "AVX512"), other compilers are given CPU
# micro-architecture names. NOTE(review): presumably `define_launcher` forwards these to `/arch:`
# and `-march=` respectively - confirm against its definition.
if (MSVC)
define_launcher(stringzilla_test_cpp20_x86_serial scripts/test.cpp 20 "AVX")
define_launcher(stringzilla_test_cpp20_x86_avx2 scripts/test.cpp 20 "AVX2")
define_launcher(stringzilla_test_cpp20_x86_avx512 scripts/test.cpp 20 "AVX512")
else()
define_launcher(stringzilla_test_cpp20_x86_serial scripts/test.cpp 20 "ivybridge")
define_launcher(stringzilla_test_cpp20_x86_avx2 scripts/test.cpp 20 "haswell")
define_launcher(stringzilla_test_cpp20_x86_avx512 scripts/test.cpp 20 "sapphirerapids")
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
# ARM specific backends
define_launcher(stringzilla_test_cpp20_arm_serial scripts/test.cpp 20 "armv8-a")
Expand Down
75 changes: 48 additions & 27 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -309,14 +309,14 @@ Consider contributing, if you need a feature that's not yet implemented.

### Basic Usage

If you've ever used the Python `str` or `bytes` class, you'll know what to expect.
If you've ever used the Python `str`, `bytes`, `bytearray`, or `memoryview` classes, you'll know what to expect.
StringZilla's `Str` class is a hybrid of those, providing a `str`-like interface to byte arrays.

```python
from stringzilla import Str, File

text_from_str = Str('some-string')
text_from_file = Str(File('some-file.txt'))
text_from_str = Str('some-string') # no copies, just a view
text_from_file = Str(File('some-file.txt')) # memory-mapped file
```

The `File` class memory-maps a file from persistent memory without loading its copy into RAM.
Expand All @@ -328,18 +328,23 @@ A standard dataset pre-processing use case would be to map a sizeable textual da
- Length: `len(text) -> int`
- Indexing: `text[42] -> str`
- Slicing: `text[42:46] -> Str`
- String conversion: `str(text) -> str`
- Substring check: `'substring' in text -> bool`
- Hashing: `hash(text) -> int`
- String conversion: `str(text) -> str`

### Advanced Operations

- `text.contains('substring', start=0, end=9223372036854775807) -> bool`
- `text.find('substring', start=0, end=9223372036854775807) -> int`
- `text.count('substring', start=0, end=9223372036854775807, allowoverlap=False) -> int`
- `text.split(separator=' ', maxsplit=9223372036854775807, keepseparator=False) -> Strs`
- `text.rsplit(separator=' ', maxsplit=9223372036854775807, keepseparator=False) -> Strs`
- `text.splitlines(keeplinebreaks=False, maxsplit=9223372036854775807) -> Strs`
```py
import sys

x: bool = text.contains('substring', start=0, end=sys.maxsize)
x: int = text.find('substring', start=0, end=sys.maxsize)
x: int = text.count('substring', start=0, end=sys.maxsize, allowoverlap=False)
x: str = text.decode(encoding='utf-8', errors='strict')
x: Strs = text.split(separator=' ', maxsplit=sys.maxsize, keepseparator=False)
x: Strs = text.rsplit(separator=' ', maxsplit=sys.maxsize, keepseparator=False)
x: Strs = text.splitlines(keeplinebreaks=False, maxsplit=sys.maxsize)
```

It's important to note that the behavior of the last function, `splitlines`, is slightly different from Python's `str.splitlines`.
The [native version][faq-splitlines] matches `\n`, `\r`, `\v` or `\x0b`, `\f` or `\x0c`, `\x1c`, `\x1d`, `\x1e`, `\x85`, `\r\n`, `\u2028`, `\u2029`, including 3x two-bytes-long runes.
Expand All @@ -353,15 +358,14 @@ Python strings don't natively support character set operations.
This forces people to use regular expressions, which are slow and hard to read.
To avoid the need for `re.finditer`, StringZilla provides the following interfaces:

- `text.find_first_of('chars', start=0, end=9223372036854775807) -> int`
- `text.find_last_of('chars', start=0, end=9223372036854775807) -> int`
- `text.find_first_not_of('chars', start=0, end=9223372036854775807) -> int`
- `text.find_last_not_of('chars', start=0, end=9223372036854775807) -> int`

Similarly, for splitting operations:

- `text.split_charset(separator='chars', maxsplit=9223372036854775807, keepseparator=False) -> Strs`
- `text.rsplit_charset(separator='chars', maxsplit=9223372036854775807, keepseparator=False) -> Strs`
```py
x: int = text.find_first_of('chars', start=0, end=sys.maxsize)
x: int = text.find_last_of('chars', start=0, end=sys.maxsize)
x: int = text.find_first_not_of('chars', start=0, end=sys.maxsize)
x: int = text.find_last_not_of('chars', start=0, end=sys.maxsize)
x: Strs = text.split_charset(separator='chars', maxsplit=sys.maxsize, keepseparator=False)
x: Strs = text.rsplit_charset(separator='chars', maxsplit=sys.maxsize, keepseparator=False)
```

### Collection-Level Operations

Expand Down Expand Up @@ -420,9 +424,9 @@ Assuming StringZilla CPython bindings are implemented [without any intermediate
```py
import stringzilla as sz

contains: bool = sz.contains("haystack", "needle", start=0, end=9223372036854775807)
offset: int = sz.find("haystack", "needle", start=0, end=9223372036854775807)
count: int = sz.count("haystack", "needle", start=0, end=9223372036854775807, allowoverlap=False)
contains: bool = sz.contains("haystack", "needle", start=0, end=sys.maxsize)
offset: int = sz.find("haystack", "needle", start=0, end=sys.maxsize)
count: int = sz.count("haystack", "needle", start=0, end=sys.maxsize, allowoverlap=False)
```

### Edit Distances
Expand Down Expand Up @@ -515,6 +519,20 @@ next_doc_offset = next_doc.offset_within(web_archieve)
web_archieve.write_to("next_doc.html")
```

#### PyArrow

A `Str` is easy to cast to [PyArrow](https://arrow.apache.org/docs/python/arrays.html#string-and-binary-types) buffers.

```py
from pyarrow import foreign_buffer
from stringzilla import Str

original = "hello"
view = Str(original)
arrow = foreign_buffer(view.address, view.nbytes, view)
```

That means you can convert `Str` to `pyarrow.Buffer` and `Strs` to `pyarrow.Array` without extra copies.

## Quick Start: C/C++ 🛠️

Expand Down Expand Up @@ -1369,13 +1387,16 @@ Another one is the [Fibonacci hash trick](https://probablydance.com/2018/06/16/f

### Unicode, UTF-8, and Wide Characters

StringZilla does not __yet__ implement any Unicode-specific algorithms.
The content is addressed at byte-level, and the string is assumed to be encoded in UTF-8 or extended ASCII.
Refer to [simdutf](https://github.com/simdutf/simdutf) for fast conversions and [icu](https://github.com/unicode-org/icu) for character metadata.
Most StringZilla operations are byte-level, so they work well with ASCII and UTF-8 content out of the box.
In some cases, like edit-distance computation, the result of byte-level evaluation and character-level evaluation may differ.
So StringZilla provides the following functions to work with Unicode:

- `sz_edit_distance_utf8` - computes the Levenshtein distance between two UTF-8 strings.
- `sz_hamming_distance_utf8` - computes the Hamming distance between two UTF-8 strings.

This may introduce frictions, when binding to some programming languages.
Namely, Java, JavaScript, Python 2, C#, and Objective-C use wide characters (`wchar`) - two byte long codes.
Java, JavaScript, Python 2, C#, and Objective-C, however, use wide characters (`wchar`) - two-byte-long codes, instead of the more reasonable fixed-length UTF-32 or variable-length UTF-8.
This leads [to all kinds of offset-counting issues][wide-char-offsets] when facing four-byte long Unicode characters.
So consider transcoding with [simdutf](https://github.com/simdutf/simdutf) if you are coming from such environments.

[wide-char-offsets]: https://josephg.com/blog/string-length-lies/

Expand Down
8 changes: 4 additions & 4 deletions include/stringzilla/stringzilla.h
Original file line number Diff line number Diff line change
Expand Up @@ -4507,10 +4507,10 @@ SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t win
chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm);

// ... and prefetch the next four characters into Level 2 or higher.
_mm_prefetch(text_fourth + 1, _MM_HINT_T1);
_mm_prefetch(text_third + 1, _MM_HINT_T1);
_mm_prefetch(text_second + 1, _MM_HINT_T1);
_mm_prefetch(text_first + 1, _MM_HINT_T1);
_mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1);
_mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1);
_mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1);
_mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1);

// 3. Add the incoming characters.
hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm);
Expand Down
15 changes: 11 additions & 4 deletions include/stringzilla/stringzilla.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -458,8 +458,9 @@ class range_matches {
return temp;
}

bool operator!=(iterator const &other) const noexcept { return remaining_.begin() != other.remaining_.begin(); }
bool operator==(iterator const &other) const noexcept { return remaining_.begin() == other.remaining_.begin(); }
// Two iterators are considered equal when their unconsumed ranges start at the same address.
// This is only meaningful when both iterators walk the same underlying string.
bool operator==(iterator const &other) const noexcept {
    return remaining_.data() == other.remaining_.data();
}
bool operator!=(iterator const &other) const noexcept {
    return !(remaining_.data() == other.remaining_.data());
}
// Compared against the end sentinel, the iterator is finished once nothing remains to scan.
bool operator==(end_sentinel_type) const noexcept {
    return remaining_.empty();
}
bool operator!=(end_sentinel_type) const noexcept {
    return !remaining_.empty();
}
};
Expand Down Expand Up @@ -550,8 +551,14 @@ class range_rmatches {
return temp;
}

bool operator!=(iterator const &other) const noexcept { return remaining_.end() != other.remaining_.end(); }
bool operator==(iterator const &other) const noexcept { return remaining_.end() == other.remaining_.end(); }
// Two reverse iterators are considered equal when their unconsumed ranges end at the same address.
// This is only meaningful when both iterators walk the same underlying string.
// The end address is spelled `.data() + .size()` to stay compatible with `std::string_view` on MSVC.
bool operator==(iterator const &other) const noexcept {
    auto const own_tail = remaining_.data() + remaining_.size();
    auto const other_tail = other.remaining_.data() + other.remaining_.size();
    return own_tail == other_tail;
}
bool operator!=(iterator const &other) const noexcept {
    auto const own_tail = remaining_.data() + remaining_.size();
    auto const other_tail = other.remaining_.data() + other.remaining_.size();
    return !(own_tail == other_tail);
}
// Compared against the end sentinel, the iterator is finished once nothing remains to scan.
bool operator==(end_sentinel_type) const noexcept {
    return remaining_.empty();
}
bool operator!=(end_sentinel_type) const noexcept {
    return !remaining_.empty();
}
};
Expand Down
Loading

0 comments on commit a86ae00

Please sign in to comment.