diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index e9abbc34..0213f679 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -262,7 +262,43 @@ jobs: runs-on: windows-latest steps: - uses: actions/checkout@v4 + - uses: ilammy/msvc-dev-cmd@v1 + - name: Build C/C++ + shell: cmd + run: | + cmake -GNinja -B build_artifacts ^ + -DCMAKE_BUILD_TYPE=RelWithDebInfo ^ + -DCMAKE_EXPORT_COMPILE_COMMANDS=1 ^ + -DSTRINGZILLA_BUILD_BENCHMARK=1 ^ + -DSTRINGZILLA_BUILD_TEST=1 + + cmake --build build_artifacts --config RelWithDebInfo > build_artifacts/logs.txt 2>&1 || ( + echo "Compilation failed. Here are the logs:" + type build_artifacts\logs.txt + echo "The original compilation commands:" + type build_artifacts\compile_commands.json + echo: + echo "CPU Features:" + wmic cpu list /format:list + exit 1 + ) + - name: Test C++ + run: .\build_artifacts\stringzilla_test_cpp20.exe + - name: Test on Real World Data + run: | + .\build_artifacts\stringzilla_bench_search.exe ${DATASET_PATH} # for substring search + .\build_artifacts\stringzilla_bench_token.exe ${DATASET_PATH} # for hashing, equality comparisons, etc. + .\build_artifacts\stringzilla_bench_similarity.exe ${DATASET_PATH} # for edit distances and alignment scores + .\build_artifacts\stringzilla_bench_sort.exe ${DATASET_PATH} # for sorting arrays of strings + .\build_artifacts\stringzilla_bench_container.exe ${DATASET_PATH} # for STL containers with string keys + env: + DATASET_PATH: ./README.md + # Don't overload GitHub with our benchmarks. + # The results in such an unstable environment will be meaningless anyway. + if: 0 + + # Python - name: Set up Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v5 with: diff --git a/.gitignore b/.gitignore index 412c78b6..6fd5cd1b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ build/ build_debug/ build_release/ +build_artifacts* # Yes, everyone loves keeping this file in the history. 
# But with a very minimalistic binding and just a couple of dependencies @@ -27,6 +28,7 @@ CMakeFiles *.pyd .venv/* node_modules/ +.vs/ # Recommended datasets leipzig1M.txt diff --git a/.vscode/settings.json b/.vscode/settings.json index 63d0f17a..e7194e7a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -252,7 +252,8 @@ "xmemory": "cpp", "xtr1common": "cpp", "xtree": "cpp", - "xutility": "cpp" + "xutility": "cpp", + "errno.h": "c" }, "python.pythonPath": "~/miniconda3/bin/python" } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a52a1ef..153ed226 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -239,9 +239,15 @@ if(${STRINGZILLA_BUILD_TEST}) # compile multiple backends: disabling all SIMD, enabling only AVX2, only AVX-512, only Arm Neon. if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64") # x86 specific backends - define_launcher(stringzilla_test_cpp20_x86_serial scripts/test.cpp 20 "ivybridge") - define_launcher(stringzilla_test_cpp20_x86_avx2 scripts/test.cpp 20 "haswell") - define_launcher(stringzilla_test_cpp20_x86_avx512 scripts/test.cpp 20 "sapphirerapids") + if (MSVC) + define_launcher(stringzilla_test_cpp20_x86_serial scripts/test.cpp 20 "AVX") + define_launcher(stringzilla_test_cpp20_x86_avx2 scripts/test.cpp 20 "AVX2") + define_launcher(stringzilla_test_cpp20_x86_avx512 scripts/test.cpp 20 "AVX512") + else() + define_launcher(stringzilla_test_cpp20_x86_serial scripts/test.cpp 20 "ivybridge") + define_launcher(stringzilla_test_cpp20_x86_avx2 scripts/test.cpp 20 "haswell") + define_launcher(stringzilla_test_cpp20_x86_avx512 scripts/test.cpp 20 "sapphirerapids") + endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") # ARM specific backends define_launcher(stringzilla_test_cpp20_arm_serial scripts/test.cpp 20 "armv8-a") diff --git a/README.md b/README.md index e6592fff..4bafcd1e 100644 --- a/README.md +++ b/README.md @@ -309,14 +309,14 @@ Consider contributing, if you need a feature that's 
not yet implemented. ### Basic Usage -If you've ever used the Python `str` or `bytes` class, you'll know what to expect. +If you've ever used the Python `str`, `bytes`, `bytearray`, `memoryview` class, you'll know what to expect. StringZilla's `Str` class is a hybrid of those two, providing `str`-like interface to byte-arrays. ```python from stringzilla import Str, File -text_from_str = Str('some-string') -text_from_file = Str(File('some-file.txt')) +text_from_str = Str('some-string') # no copies, just a view +text_from_file = Str(File('some-file.txt')) # memory-mapped file ``` The `File` class memory-maps a file from persistent memory without loading its copy into RAM. @@ -328,18 +328,23 @@ A standard dataset pre-processing use case would be to map a sizeable textual da - Length: `len(text) -> int` - Indexing: `text[42] -> str` - Slicing: `text[42:46] -> Str` -- String conversion: `str(text) -> str` - Substring check: `'substring' in text -> bool` - Hashing: `hash(text) -> int` +- String conversion: `str(text) -> str` ### Advanced Operations -- `text.contains('substring', start=0, end=9223372036854775807) -> bool` -- `text.find('substring', start=0, end=9223372036854775807) -> int` -- `text.count('substring', start=0, end=9223372036854775807, allowoverlap=False) -> int` -- `text.split(separator=' ', maxsplit=9223372036854775807, keepseparator=False) -> Strs` -- `text.rsplit(separator=' ', maxsplit=9223372036854775807, keepseparator=False) -> Strs` -- `text.splitlines(keeplinebreaks=False, maxsplit=9223372036854775807) -> Strs` +```py +import sys + +x: bool = text.contains('substring', start=0, end=sys.maxsize) +x: int = text.find('substring', start=0, end=sys.maxsize) +x: int = text.count('substring', start=0, end=sys.maxsize, allowoverlap=False) +x: str = text.decode(encoding='utf-8', errors='strict') +x: Strs = text.split(separator=' ', maxsplit=sys.maxsize, keepseparator=False) +x: Strs = text.rsplit(separator=' ', maxsplit=sys.maxsize, keepseparator=False) +x: 
Strs = text.splitlines(keeplinebreaks=False, maxsplit=sys.maxsize) +``` It's important to note, that the last function behavior is slightly different from Python's `str.splitlines`. The [native version][faq-splitlines] matches `\n`, `\r`, `\v` or `\x0b`, `\f` or `\x0c`, `\x1c`, `\x1d`, `\x1e`, `\x85`, `\r\n`, `\u2028`, `\u2029`, including 3x two-bytes-long runes. @@ -353,15 +358,14 @@ Python strings don't natively support character set operations. This forces people to use regular expressions, which are slow and hard to read. To avoid the need for `re.finditer`, StringZilla provides the following interfaces: -- `text.find_first_of('chars', start=0, end=9223372036854775807) -> int` -- `text.find_last_of('chars', start=0, end=9223372036854775807) -> int` -- `text.find_first_not_of('chars', start=0, end=9223372036854775807) -> int` -- `text.find_last_not_of('chars', start=0, end=9223372036854775807) -> int` - -Similarly, for splitting operations: - -- `text.split_charset(separator='chars', maxsplit=9223372036854775807, keepseparator=False) -> Strs` -- `text.rsplit_charset(separator='chars', maxsplit=9223372036854775807, keepseparator=False) -> Strs` +```py +x: int = text.find_first_of('chars', start=0, end=sys.maxsize) +x: int = text.find_last_of('chars', start=0, end=sys.maxsize) +x: int = text.find_first_not_of('chars', start=0, end=sys.maxsize) +x: int = text.find_last_not_of('chars', start=0, end=sys.maxsize) +x: Strs = text.split_charset(separator='chars', maxsplit=sys.maxsize, keepseparator=False) +x: Strs = text.rsplit_charset(separator='chars', maxsplit=sys.maxsize, keepseparator=False) +``` ### Collection-Level Operations @@ -420,9 +424,9 @@ Assuming StringZilla CPython bindings are implemented [without any intermediate ```py import stringzilla as sz -contains: bool = sz.contains("haystack", "needle", start=0, end=9223372036854775807) -offset: int = sz.find("haystack", "needle", start=0, end=9223372036854775807) -count: int = sz.count("haystack", "needle", 
start=0, end=9223372036854775807, allowoverlap=False) +contains: bool = sz.contains("haystack", "needle", start=0, end=sys.maxsize) +offset: int = sz.find("haystack", "needle", start=0, end=sys.maxsize) +count: int = sz.count("haystack", "needle", start=0, end=sys.maxsize, allowoverlap=False) ``` ### Edit Distances @@ -515,6 +519,20 @@ next_doc_offset = next_doc.offset_within(web_archieve) web_archieve.write_to("next_doc.html") ``` +#### PyArrow + +A `Str` is easy to cast to [PyArrow](https://arrow.apache.org/docs/python/arrays.html#string-and-binary-types) buffers. + +```py +from pyarrow import foreign_buffer +from stringzilla import Str + +original = "hello" +view = Str(original) +arrow = foreign_buffer(view.address, view.nbytes, view) +``` + +That means you can convert `Str` to `pyarrow.Buffer` and `Strs` to `pyarrow.Array` without extra copies. ## Quick Start: C/C++ 🛠️ @@ -1369,13 +1387,16 @@ Another one is the [Fibonacci hash trick](https://probablydance.com/2018/06/16/f ### Unicode, UTF-8, and Wide Characters -StringZilla does not __yet__ implement any Unicode-specific algorithms. -The content is addressed at byte-level, and the string is assumed to be encoded in UTF-8 or extended ASCII. -Refer to [simdutf](https://github.com/simdutf/simdutf) for fast conversions and [icu](https://github.com/unicode-org/icu) for character metadata. +Most StringZilla operations are byte-level, so they work well with ASCII and UTF-8 content out of the box. +In some cases, like edit-distance computation, the result of byte-level evaluation and character-level evaluation may differ. +So StringZilla provides the following functions to work with Unicode: + +- `sz_edit_distance_utf8` - computes the Levenshtein distance between two UTF-8 strings. +- `sz_hamming_distance_utf8` - computes the Hamming distance between two UTF-8 strings. -This may introduce frictions, when binding to some programming languages.
-Namely, Java, JavaScript, Python 2, C#, and Objective-C use wide characters (`wchar`) - two byte long codes. +Java, JavaScript, Python 2, C#, and Objective-C, however, use wide characters (`wchar`) - two byte long codes, instead of the more reasonable fixed-length UTF32 or variable-length UTF8. This leads [to all kinds of offset-counting issues][wide-char-offsets] when facing four-byte long Unicode characters. +So consider transcoding with [simdutf](https://github.com/simdutf/simdutf), if you are coming from such environments. [wide-char-offsets]: https://josephg.com/blog/string-length-lies/ diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index fe55692a..ef066513 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -4507,10 +4507,10 @@ SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t win chars_vec.zmm = _mm512_add_epi8(chars_vec.zmm, shift_vec.zmm); // ... and prefetch the next four characters into Level 2 or higher. - _mm_prefetch(text_fourth + 1, _MM_HINT_T1); - _mm_prefetch(text_third + 1, _MM_HINT_T1); - _mm_prefetch(text_second + 1, _MM_HINT_T1); - _mm_prefetch(text_first + 1, _MM_HINT_T1); + _mm_prefetch((sz_cptr_t)text_fourth + 1, _MM_HINT_T1); + _mm_prefetch((sz_cptr_t)text_third + 1, _MM_HINT_T1); + _mm_prefetch((sz_cptr_t)text_second + 1, _MM_HINT_T1); + _mm_prefetch((sz_cptr_t)text_first + 1, _MM_HINT_T1); // 3. Add the incoming characters. 
hash_vec.zmm = _mm512_add_epi64(hash_vec.zmm, chars_vec.zmm); diff --git a/include/stringzilla/stringzilla.hpp b/include/stringzilla/stringzilla.hpp index daf47d50..e64ebc44 100644 --- a/include/stringzilla/stringzilla.hpp +++ b/include/stringzilla/stringzilla.hpp @@ -458,8 +458,9 @@ class range_matches { return temp; } - bool operator!=(iterator const &other) const noexcept { return remaining_.begin() != other.remaining_.begin(); } - bool operator==(iterator const &other) const noexcept { return remaining_.begin() == other.remaining_.begin(); } + // Assumes both iterators point to the same underlying string. + bool operator!=(iterator const &other) const noexcept { return remaining_.data() != other.remaining_.data(); } + bool operator==(iterator const &other) const noexcept { return remaining_.data() == other.remaining_.data(); } bool operator!=(end_sentinel_type) const noexcept { return !remaining_.empty(); } bool operator==(end_sentinel_type) const noexcept { return remaining_.empty(); } }; @@ -550,8 +551,14 @@ class range_rmatches { return temp; } - bool operator!=(iterator const &other) const noexcept { return remaining_.end() != other.remaining_.end(); } - bool operator==(iterator const &other) const noexcept { return remaining_.end() == other.remaining_.end(); } + // Assumes both iterators point to the same underlying string. + // This has to be `.data() + .size()`, to be compatible with `std::string_view` on MSVC. 
+ bool operator!=(iterator const &other) const noexcept { + return remaining_.data() + remaining_.size() != other.remaining_.data() + other.remaining_.size(); + } + bool operator==(iterator const &other) const noexcept { + return remaining_.data() + remaining_.size() == other.remaining_.data() + other.remaining_.size(); + } bool operator!=(end_sentinel_type) const noexcept { return !remaining_.empty(); } bool operator==(end_sentinel_type) const noexcept { return remaining_.empty(); } }; diff --git a/python/lib.c b/python/lib.c index d4b74e84..75acfad6 100644 --- a/python/lib.c +++ b/python/lib.c @@ -35,6 +35,7 @@ typedef SSIZE_T ssize_t; #include // Core CPython interfaces +#include // `errno` #include // `fopen` #include // `rand`, `srand` #include // `memset`, `memcpy` @@ -78,7 +79,7 @@ typedef struct { * - Str() # Empty string * - Str("some-string") # Full-range slice of a Python `str` * - Str(File("some-path.txt")) # Full-range view of a persisted file - * - Str(File("some-path.txt"), from=0, to=sys.maxint) + * - Str(File("some-path.txt"), from=0, to=sys.maxsize) */ typedef struct { PyObject ob_base; @@ -441,9 +442,18 @@ static int File_init(File *self, PyObject *positional_args, PyObject *named_args if (!PyArg_ParseTuple(positional_args, "s", &path)) return -1; #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) + DWORD path_attributes = GetFileAttributes(path); + if (path_attributes == INVALID_FILE_ATTRIBUTES) { + PyErr_SetString(PyExc_OSError, "Couldn't get file attributes!"); + return -1; + } + if (path_attributes & FILE_ATTRIBUTE_DIRECTORY) { + PyErr_SetString(PyExc_ValueError, "The provided path is a directory, not a normal file!"); + return -1; + } self->file_handle = CreateFile(path, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); if (self->file_handle == INVALID_HANDLE_VALUE) { - PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); + PyErr_SetString(PyExc_OSError, "Couldn't map the 
file!"); return -1; } @@ -451,7 +461,7 @@ static int File_init(File *self, PyObject *positional_args, PyObject *named_args if (self->mapping_handle == 0) { CloseHandle(self->file_handle); self->file_handle = NULL; - PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); + PyErr_SetString(PyExc_OSError, "Couldn't map the file!"); return -1; } @@ -461,18 +471,31 @@ static int File_init(File *self, PyObject *positional_args, PyObject *named_args self->mapping_handle = NULL; CloseHandle(self->file_handle); self->file_handle = NULL; - PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); + PyErr_SetString(PyExc_OSError, "Couldn't map the file!"); return -1; } self->start = file; self->length = GetFileSize(self->file_handle, 0); #else - struct stat sb; self->file_descriptor = open(path, O_RDONLY); + if (self->file_descriptor == -1) { + PyErr_Format(PyExc_OSError, "Couldn't open the file at '%s': %s", path, strerror(errno)); + return -1; + } + // No permissions are required on the file itself to get its properties from the existing descriptor.
+ // https://linux.die.net/man/2/fstat + struct stat sb; if (fstat(self->file_descriptor, &sb) != 0) { close(self->file_descriptor); self->file_descriptor = 0; - PyErr_SetString(PyExc_RuntimeError, "Can't retrieve file size!"); + PyErr_Format(PyExc_OSError, "Can't retrieve file size at '%s': %s", path, strerror(errno)); + return -1; + } + // Check if it's a regular file + if (!S_ISREG(sb.st_mode)) { + close(self->file_descriptor); + self->file_descriptor = 0; + PyErr_Format(PyExc_ValueError, "The provided path is not a normal file at '%s'", path); return -1; } size_t file_size = sb.st_size; @@ -480,7 +503,7 @@ static int File_init(File *self, PyObject *positional_args, PyObject *named_args if (map == MAP_FAILED) { close(self->file_descriptor); self->file_descriptor = 0; - PyErr_SetString(PyExc_RuntimeError, "Couldn't map the file!"); + PyErr_Format(PyExc_OSError, "Couldn't map the file at '%s': %s", path, strerror(errno)); return -1; } self->start = map; @@ -1162,6 +1185,48 @@ static PyObject *Strs_richcompare(PyObject *self, PyObject *other, int op) { } } +static PyObject *Str_decode(PyObject *self, PyObject *args, PyObject *kwargs) { + int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); + Py_ssize_t nargs = PyTuple_Size(args); + if (nargs < !is_member || nargs > !is_member + 2) { + PyErr_Format(PyExc_TypeError, "Invalid number of arguments"); + return NULL; + } + + PyObject *text_obj = is_member ? self : PyTuple_GET_ITEM(args, 0); + PyObject *encoding_obj = nargs > !is_member + 0 ? PyTuple_GET_ITEM(args, !is_member + 0) : NULL; + PyObject *errors_obj = nargs > !is_member + 1 ? 
PyTuple_GET_ITEM(args, !is_member + 1) : NULL; + + if (kwargs) { + Py_ssize_t pos = 0; + PyObject *key, *value; + while (PyDict_Next(kwargs, &pos, &key, &value)) + if (PyUnicode_CompareWithASCIIString(key, "encoding") == 0) { encoding_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "errors") == 0) { errors_obj = value; } + else if (PyErr_Format(PyExc_TypeError, "Got an unexpected keyword argument '%U'", key)) + return NULL; + } + + // Convert `encoding` and `errors` to `NULL` if they are `None` + if (encoding_obj == Py_None) encoding_obj = NULL; + if (errors_obj == Py_None) errors_obj = NULL; + + sz_string_view_t text, encoding, errors; + if ((!export_string_like(text_obj, &text.start, &text.length)) || + (encoding_obj && !export_string_like(encoding_obj, &encoding.start, &encoding.length)) || + (errors_obj && !export_string_like(errors_obj, &errors.start, &errors.length))) { + PyErr_Format(PyExc_TypeError, "text, encoding, and errors must be string-like"); + return NULL; + } + + if (encoding_obj == NULL) encoding = (sz_string_view_t) {"utf-8", 5}; + if (errors_obj == NULL) errors = (sz_string_view_t) {"strict", 6}; + + // Python docs: https://docs.python.org/3/library/stdtypes.html#bytes.decode + // CPython docs: https://docs.python.org/3/c-api/unicode.html#c.PyUnicode_Decode + return PyUnicode_Decode(text.start, text.length, encoding.start, errors.start); +} + /** * @brief Saves a StringZilla string to disk. 
*/ @@ -2335,12 +2400,13 @@ static PyGetSetDef Str_getsetters[] = { #define SZ_METHOD_FLAGS METH_VARARGS | METH_KEYWORDS static PyMethodDef Str_methods[] = { - // Basic `str`-like functionality + // Basic `str`, `bytes`, and `bytearray`-like functionality {"contains", Str_contains, SZ_METHOD_FLAGS, "Check if a string contains a substring."}, {"count", Str_count, SZ_METHOD_FLAGS, "Count the occurrences of a substring."}, {"splitlines", Str_splitlines, SZ_METHOD_FLAGS, "Split a string by line breaks."}, {"startswith", Str_startswith, SZ_METHOD_FLAGS, "Check if a string starts with a given prefix."}, {"endswith", Str_endswith, SZ_METHOD_FLAGS, "Check if a string ends with a given suffix."}, + {"decode", Str_decode, SZ_METHOD_FLAGS, "Decode the bytes into `str` with a given encoding"}, // Bidirectional operations {"find", Str_find, SZ_METHOD_FLAGS, "Find the first occurrence of a substring."}, @@ -2888,12 +2954,13 @@ static void stringzilla_cleanup(PyObject *m) { } static PyMethodDef stringzilla_methods[] = { - // Basic `str`-like functionality + // Basic `str`, `bytes`, and `bytearray`-like functionality {"contains", Str_contains, SZ_METHOD_FLAGS, "Check if a string contains a substring."}, {"count", Str_count, SZ_METHOD_FLAGS, "Count the occurrences of a substring."}, {"splitlines", Str_splitlines, SZ_METHOD_FLAGS, "Split a string by line breaks."}, {"startswith", Str_startswith, SZ_METHOD_FLAGS, "Check if a string starts with a given prefix."}, {"endswith", Str_endswith, SZ_METHOD_FLAGS, "Check if a string ends with a given suffix."}, + {"decode", Str_decode, SZ_METHOD_FLAGS, "Decode the bytes into `str` with a given encoding"}, // Bidirectional operations {"find", Str_find, SZ_METHOD_FLAGS, "Find the first occurrence of a substring."}, diff --git a/scripts/test.cpp b/scripts/test.cpp index 80f43442..bc730d65 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1,4 +1,10 @@ #undef NDEBUG // Enable all assertions + +// Enable assertions for iterators +#if 
!defined(_ITERATOR_DEBUG_LEVEL) || _ITERATOR_DEBUG_LEVEL == 0 +#define _ITERATOR_DEBUG_LEVEL 1 +#endif + #include // assertions // Overload the following with caution. diff --git a/scripts/test.py b/scripts/test.py index 2c180cbc..735fbfd2 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -378,9 +378,38 @@ def test_unit_globals(): assert sz.edit_distance("abababab", "aaaaaaaa", bound=2) == 2 -def test_unit_len(): - w = sz.Str("abcd") - assert 4 == len(w) +def test_string_lengths(): + assert 4 == len(sz.Str("abcd")) + assert 8 == len(sz.Str("αβγδ")) + + +@pytest.mark.parametrize( + "byte_string, encoding, expected", + [ + (b"hello world", "utf-8", "hello world"), + (b"\xf0\x9f\x98\x81", "utf-8", "😁"), # Emoji + (b"hello world", "ascii", "hello world"), + (b"\xf0hello world", "latin-1", "ðhello world"), + (b"", "utf-8", ""), # Empty string case + ], +) +def test_decoding_valid_strings(byte_string, encoding, expected): + assert byte_string.decode(encoding) == expected + assert sz.Str(byte_string).decode(encoding) == expected + + +@pytest.mark.parametrize( + "byte_string, encoding", + [ + (b"\xff", "utf-8"), # Invalid UTF-8 byte + (b"\x80hello", "ascii"), # Non-ASCII byte in ASCII string + ], +) +def test_decoding_exceptions(byte_string, encoding): + with pytest.raises(UnicodeDecodeError): + byte_string.decode(encoding) + with pytest.raises(UnicodeDecodeError): + sz.Str(byte_string).decode(encoding) def test_slice_of_split():