From 245da6df67df2a51cca453ff8fd5dd48f20f5994 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Oct 2023 18:12:38 -0700 Subject: [PATCH 1/8] Improve: drop `ctype`, `stddef`, `stdint` headers --- .vscode/settings.json | 2 + README.md | 4 +- python/lib.c | 42 +++--- scripts/bench.ipynb | 2 +- stringzilla/stringzilla.h | 274 +++++++++++++++++++++++++------------- 5 files changed, 203 insertions(+), 121 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 886d1d22..08c5bb65 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -151,6 +151,7 @@ "NOMINMAX", "NOTIMPLEMENTED", "numpy", + "octogram", "pytest", "Pythonic", "quadgram", @@ -166,6 +167,7 @@ "substr", "SWAR", "TPFLAGS", + "unigram", "Vardanian", "vectorcallfunc", "XDECREF", diff --git a/README.md b/README.md index 3c04c219..85032c34 100644 --- a/README.md +++ b/README.md @@ -116,11 +116,11 @@ There is an ABI-stable C 99 interface, in case you have a database, an operating // Initialize your haystack and needle sz_haystack_t haystack = {your_text, your_text_length}; -sz_needle_t needle = {your_subtext, your_subtext_length, your_anomaly_offset}; +sz_needle_t needle = {your_subtext, your_subtext_length, your_quadgram_offset}; // Perform string-level operations size_t character_count = sz_count_char(haystack, 'a'); -size_t character_position = sz_find_char(haystack, 'a'); +size_t character_position = sz_find_unigram(haystack, 'a'); size_t substring_position = sz_find_substr(haystack, needle); // Perform collection level operations diff --git a/python/lib.c b/python/lib.c index ad10f196..a0f6caca 100644 --- a/python/lib.c +++ b/python/lib.c @@ -48,12 +48,12 @@ static struct { * native `mmap` module, as it exposes the address of the mapping in memory. */ typedef struct { - PyObject_HEAD; + PyObject_HEAD #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) - HANDLE file_handle; + HANDLE file_handle; HANDLE mapping_handle; #else - int file_descriptor; + int file_descriptor; #endif void *start; size_t length; @@ -72,8 +72,7 @@ typedef struct { * - Str(File("some-path.txt"), from=0, to=sys.maxint) */ typedef struct { - PyObject_HEAD; - PyObject *parent; + PyObject_HEAD PyObject *parent; char const *start; size_t length; } Str; @@ -83,14 +82,14 @@ typedef struct { * for faster sorting, shuffling, joins, and lookups. */ typedef struct { - PyObject_HEAD; + PyObject_HEAD - enum { - STRS_CONSECUTIVE_32, - STRS_CONSECUTIVE_64, - STRS_REORDERED, - STRS_MULTI_SOURCE, - } type; + enum { + STRS_CONSECUTIVE_32, + STRS_CONSECUTIVE_64, + STRS_REORDERED, + STRS_MULTI_SOURCE, + } type; union { /** @@ -641,7 +640,7 @@ static void Str_releasebuffer(PyObject *_, Py_buffer *view) { static int Str_in(Str *self, PyObject *arg) { sz_needle_t needle_struct; - needle_struct.anomaly_offset = 0; + needle_struct.quadgram_offset = 0; if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) { PyErr_SetString(PyExc_TypeError, "Unsupported argument type"); return -1; @@ -851,7 +850,7 @@ static int Str_find_( // Py_ssize_t start, end; // Validate and convert `haystack` and `needle` - needle.anomaly_offset = 0; + needle.quadgram_offset = 0; if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || !export_string_like(needle_obj, &needle.start, &needle.length)) { PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); @@ -1000,7 +999,7 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX; int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0; - needle.anomaly_offset = 0; + needle.quadgram_offset = 0; if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || !export_string_like(needle_obj, &needle.start, &needle.length)) return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL; @@ -1287,7 +1286,7 @@ static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) { sz_needle_t separator; int keepseparator; Py_ssize_t maxsplit; - separator.anomaly_offset = 0; + separator.quadgram_offset = 0; // Validate and convert `text` if (!export_string_like(text_obj, &text.start, &text.length)) { @@ -1565,14 +1564,9 @@ static boolean_t Strs_sort_(Strs *self, } // Get the parts and their count - sz_haystack_t *parts = NULL; - size_t count = 0; - switch (self->type) { - case STRS_REORDERED: - parts = self->data.reordered.parts; - count = self->data.reordered.count; - break; - } + // The only possible `self->type` by now is the `STRS_REORDERED` + sz_haystack_t *parts = self->data.reordered.parts; + size_t count = self->data.reordered.count; // Allocate temporary memory to store the ordering offsets size_t memory_needed = sizeof(sz_size_t) * count; diff --git a/scripts/bench.ipynb b/scripts/bench.ipynb index b69d2f8f..b3bc4392 100644 --- a/scripts/bench.ipynb +++ b/scripts/bench.ipynb @@ -176,7 +176,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.10.13" }, "orig_nbformat": 4 }, diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 7b664ca6..51319f01 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -1,10 +1,7 @@ #ifndef STRINGZILLA_H_ #define STRINGZILLA_H_ -#include // `tolower` #include // `qsort_s` -#include // `sz_size_t` -#include // `uint8_t` #include // `qsort_r` #include // `memcpy` @@ -30,11 +27,71 @@ extern "C" { #endif -typedef uint32_t sz_anomaly_t; -typedef uint64_t sz_size_t; +#if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) +typedef unsigned long sz_size_t; // 64-bit on most platforms when pointers are 64-bit +#else +typedef unsigned sz_size_t; // 32-bit on most platforms when pointers are 32-bit +#endif + +typedef unsigned sz_u32_t; // Always 32 bits +typedef unsigned long long sz_u64_t; // Always 64 bits + +typedef union sz_quadgram_t { + unsigned u32; + unsigned char u8s[4]; +} sz_quadgram_t; // Always 32-bit unsigned integer, representing 8 bytes/characters + +typedef union sz_octogram_t { + unsigned long long u64; + unsigned char u8s[8]; +} sz_octogram_t; // Always 64-bit unsigned integer, representing 8 bytes/characters inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { return (x + (divisor - 1)) / divisor; } +inline static sz_size_t sz_tolower_ascii(char c) { + static char lowered[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // + 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, // + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // + }; + return lowered[(int)c]; +} + +inline static sz_size_t sz_toupper_ascii(char c) { + static char upped[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // + 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // + 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // + }; + return upped[(int)c]; +} + /** * @brief This is a faster alternative to `strncmp(a, b, length) == 0`. * @return 1 for `true`, and 0 for `false`. @@ -53,28 +110,29 @@ typedef struct sz_haystack_t { typedef struct sz_needle_t { char const *start; sz_size_t length; - sz_size_t anomaly_offset; + sz_size_t quadgram_offset; } sz_needle_t; /** * @brief SWAR single-character counting procedure, jumping 8 bytes at a time. */ -inline static sz_size_t sz_count_char_swar(sz_haystack_t h, char n) { +inline static sz_size_t sz_count_unigram_swar(sz_haystack_t h, char n) { sz_size_t result = 0; char const *text = h.start; char const *end = h.start + h.length; - for (; (uint64_t)text % 8 != 0 && text < end; ++text) result += *text == n; + // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == n; // This code simulates hyper-scalar execution, comparing 8 characters at a time. - uint64_t nnnnnnnn = n; + sz_u64_t nnnnnnnn = n; nnnnnnnn |= nnnnnnnn << 8; nnnnnnnn |= nnnnnnnn << 16; nnnnnnnn |= nnnnnnnn << 32; for (; text + 8 <= end; text += 8) { - uint64_t text_slice = *(uint64_t const *)text; - uint64_t match_indicators = ~(text_slice ^ nnnnnnnn); + sz_u64_t text_slice = *(sz_u64_t const *)text; + sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn); match_indicators &= match_indicators >> 1; match_indicators &= match_indicators >> 2; match_indicators &= match_indicators >> 4; @@ -89,22 +147,23 @@ inline static sz_size_t sz_count_char_swar(sz_haystack_t h, char n) { /** * @brief SWAR single-character search in string, jumping 8 bytes at a time. */ -inline static sz_size_t sz_find_char_swar(sz_haystack_t h, char n) { +inline static sz_size_t sz_find_unigram_swar(sz_haystack_t h, char n) { char const *text = h.start; char const *end = h.start + h.length; - for (; (uint64_t)text % 8 != 0 && text < end; ++text) + // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((unsigned long)text & 7ul) && text < end; ++text) if (*text == n) return text - h.start; // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. - uint64_t nnnnnnnn = n; + sz_u64_t nnnnnnnn = n; nnnnnnnn |= nnnnnnnn << 8; // broadcast `n` into `nnnnnnnn` nnnnnnnn |= nnnnnnnn << 16; // broadcast `n` into `nnnnnnnn` nnnnnnnn |= nnnnnnnn << 32; // broadcast `n` into `nnnnnnnn` for (; text + 8 <= end; text += 8) { - uint64_t text_slice = *(uint64_t const *)text; - uint64_t match_indicators = ~(text_slice ^ nnnnnnnn); + sz_u64_t text_slice = *(sz_u64_t const *)text; + sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn); match_indicators &= match_indicators >> 1; match_indicators &= match_indicators >> 2; match_indicators &= match_indicators >> 4; @@ -121,26 +180,31 @@ inline static sz_size_t sz_find_char_swar(sz_haystack_t h, char n) { /** * @brief SWAR character-bigram search in string, jumping 8 bytes at a time. */ -inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) { +inline static sz_size_t sz_find_bigram_swar(sz_haystack_t h, char const *n) { char const *text = h.start; char const *end = h.start + h.length; + // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text) + if (text[0] == n[0] && text[1] == n[1]) return text - h.start; + // This code simulates hyper-scalar execution, analyzing 7 offsets at a time. - uint64_t nnnn = ((uint64_t)(n[0]) << 0) | ((uint64_t)(n[1]) << 8); // broadcast `n` into `nnnn` + sz_u64_t nnnn = ((sz_u64_t)(n[0]) << 0) | ((sz_u64_t)(n[1]) << 8); // broadcast `n` into `nnnn` nnnn |= nnnn << 16; // broadcast `n` into `nnnn` nnnn |= nnnn << 32; // broadcast `n` into `nnnn` - uint64_t text_slice; for (; text + 8 <= end; text += 7) { - memcpy(&text_slice, text, 8); - uint64_t even_indicators = ~(text_slice ^ nnnn); - uint64_t odd_indicators = ~((text_slice << 8) ^ nnnn); + sz_u64_t text_slice = *(sz_u64_t const *)text; + sz_u64_t even_indicators = ~(text_slice ^ nnnn); + sz_u64_t odd_indicators = ~((text_slice << 8) ^ nnnn); + // For every even match - 2 char (16 bits) must be identical. even_indicators &= even_indicators >> 1; even_indicators &= even_indicators >> 2; even_indicators &= even_indicators >> 4; even_indicators &= even_indicators >> 8; even_indicators &= 0x0001000100010001; + // For every odd match - 2 char (16 bits) must be identical. odd_indicators &= odd_indicators >> 1; odd_indicators &= odd_indicators >> 2; @@ -149,7 +213,7 @@ inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) { odd_indicators &= 0x0001000100010000; if (even_indicators + odd_indicators) { - uint64_t match_indicators = even_indicators | (odd_indicators >> 8); + sz_u64_t match_indicators = even_indicators | (odd_indicators >> 8); return text - h.start + ctz64(match_indicators) / 8; } } @@ -162,23 +226,26 @@ inline static sz_size_t sz_find_2chars_swar(sz_haystack_t h, char const *n) { /** * @brief SWAR character-trigram search in string, jumping 8 bytes at a time. */ -inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) { +inline static sz_size_t sz_find_trigram_swar(sz_haystack_t h, char const *n) { char const *text = h.start; char const *end = h.start + h.length; + // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text) + if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start; + // This code simulates hyper-scalar execution, analyzing 6 offsets at a time. // We have two unused bytes at the end. - uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16); // broadcast `n` into `nn` + sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16); // broadcast `n` into `nn` nn |= nn << 24; // broadcast `n` into `nn` nn <<= 16; // broadcast `n` into `nn` for (; text + 8 <= end; text += 6) { - uint64_t text_slice; - memcpy(&text_slice, text, 8); - uint64_t first_indicators = ~(text_slice ^ nn); - uint64_t second_indicators = ~((text_slice << 8) ^ nn); - uint64_t third_indicators = ~((text_slice << 16) ^ nn); + sz_u64_t text_slice = *(sz_u64_t const *)text; + sz_u64_t first_indicators = ~(text_slice ^ nn); + sz_u64_t second_indicators = ~((text_slice << 8) ^ nn); + sz_u64_t third_indicators = ~((text_slice << 16) ^ nn); // For every first match - 3 chars (24 bits) must be identical. // For that merge every byte state and then combine those three-way. first_indicators &= first_indicators >> 1; @@ -203,7 +270,7 @@ inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) { third_indicators = (third_indicators >> 16) & (third_indicators >> 8) & (third_indicators >> 0) & 0x0000010000010000; - uint64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16); + sz_u64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16); if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8; } @@ -215,29 +282,32 @@ inline static sz_size_t sz_find_3chars_swar(sz_haystack_t h, char const *n) { /** * @brief SWAR character-quadgram search in string, jumping 8 bytes at a time. */ -inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) { +inline static sz_size_t sz_find_quadgram_swar(sz_haystack_t h, char const *n) { char const *text = h.start; char const *end = h.start + h.length; + // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text) + if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start; + // This code simulates hyper-scalar execution, analyzing 4 offsets at a time. - uint64_t nn = (uint64_t)(n[0] << 0) | ((uint64_t)(n[1]) << 8) | ((uint64_t)(n[2]) << 16) | ((uint64_t)(n[3]) << 24); + sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16) | ((sz_u64_t)(n[3]) << 24); nn |= nn << 32; // - uint8_t lookup[16] = {0}; - lookup[0b0010] = lookup[0b0110] = lookup[0b1010] = lookup[0b1110] = 1; - lookup[0b0100] = lookup[0b1100] = 2; - lookup[0b1000] = 3; + unsigned char lookup[16] = {0}; + lookup[0x2] = lookup[0x6] = lookup[0xA] = lookup[0xE] = 1; + lookup[0x4] = lookup[0xC] = 2; + lookup[0x8] = 3; // We can perform 5 comparisons per load, but it's easier to perform 4, minimizing the size of the lookup table. for (; text + 8 <= end; text += 4) { - uint64_t text_slice; - memcpy(&text_slice, text, 8); - uint64_t text01 = (text_slice & 0x00000000FFFFFFFF) | ((text_slice & 0x000000FFFFFFFF00) << 24); - uint64_t text23 = ((text_slice & 0x0000FFFFFFFF0000) >> 16) | ((text_slice & 0x00FFFFFFFF000000) << 8); - uint64_t text01_indicators = ~(text01 ^ nn); - uint64_t text23_indicators = ~(text23 ^ nn); + sz_u64_t text_slice = *(sz_u64_t const *)text; + sz_u64_t text01 = (text_slice & 0x00000000FFFFFFFF) | ((text_slice & 0x000000FFFFFFFF00) << 24); + sz_u64_t text23 = ((text_slice & 0x0000FFFFFFFF0000) >> 16) | ((text_slice & 0x00FFFFFFFF000000) << 8); + sz_u64_t text01_indicators = ~(text01 ^ nn); + sz_u64_t text23_indicators = ~(text23 ^ nn); // For every first match - 4 chars (32 bits) must be identical. text01_indicators &= text01_indicators >> 1; @@ -258,7 +328,7 @@ inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) { if (text01_indicators + text23_indicators) { // Assuming we have performed 4 comparisons, we can only have 2^4=16 outcomes. // Which is small enough for a lookup table. - uint8_t match_indicators = (uint8_t)( // + unsigned char match_indicators = (unsigned char)( // (text01_indicators >> 31) | (text01_indicators << 0) | // (text23_indicators >> 29) | (text23_indicators << 2)); return text - h.start + lookup[match_indicators]; @@ -272,7 +342,7 @@ inline static sz_size_t sz_find_4chars_swar(sz_haystack_t h, char const *n) { /** * @brief Trivial substring search with scalar code. Instead of comparing characters one-by-one - * it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper. + * it compares 4-byte quadgrams first, most commonly prefixes. It's computationally cheaper. * Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core. */ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) { @@ -281,26 +351,36 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) { switch (n.length) { case 0: return 0; - case 1: return sz_find_char_swar(h, *n.start); - case 2: return sz_find_2chars_swar(h, n.start); - case 3: return sz_find_3chars_swar(h, n.start); - case 4: return sz_find_4chars_swar(h, n.start); + case 1: return sz_find_unigram_swar(h, *n.start); + case 2: return sz_find_bigram_swar(h, n.start); + case 3: return sz_find_trigram_swar(h, n.start); + case 4: return sz_find_quadgram_swar(h, n.start); default: { char const *text = h.start; char const *const end = h.start + h.length; - sz_anomaly_t n_anomaly, h_anomaly; - sz_size_t const n_suffix_len = n.length - 4 - n.anomaly_offset; - char const *n_suffix_ptr = n.start + 4 + n.anomaly_offset; - memcpy(&n_anomaly, n.start + n.anomaly_offset, 4); - - text += n.anomaly_offset; - for (; text + n.length <= end; text++) { - memcpy(&h_anomaly, text, 4); - if (h_anomaly == n_anomaly) // Match anomaly. - if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. - if (sz_equal(text - n.anomaly_offset, n.start, n.anomaly_offset)) // Match prefix. - return text - h.start - n.anomaly_offset; + sz_quadgram_t n_quadgram, h_quadgram; + sz_size_t const n_suffix_len = n.length - 4 - n.quadgram_offset; + char const *n_suffix_ptr = n.start + 4 + n.quadgram_offset; + n_quadgram.u8s[0] = n.start[n.quadgram_offset]; + n_quadgram.u8s[1] = n.start[n.quadgram_offset + 1]; + n_quadgram.u8s[2] = n.start[n.quadgram_offset + 2]; + n_quadgram.u8s[3] = n.start[n.quadgram_offset + 3]; + h_quadgram.u8s[0] = h.start[0]; + h_quadgram.u8s[1] = h.start[1]; + h_quadgram.u8s[2] = h.start[2]; + h_quadgram.u8s[3] = h.start[3]; + + text += n.quadgram_offset; + while (text + n.length <= end) { + if (h_quadgram.u32 == n_quadgram.u32) // Match quadgram. + if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. + if (sz_equal(text - n.quadgram_offset, n.start, n.quadgram_offset)) // Match prefix. + return text - h.start - n.quadgram_offset; + + h_quadgram.u32 <<= 8; + h_quadgram.u8s[3] = *text; + ++text; } return h.length; } @@ -319,17 +399,17 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { // Precomputed constants char const *const end = h.start + h.length; - uint32_t anomaly = 0; - uint32_t mask = 0; + sz_quadgram_t quadgram = 0; + sz_quadgram_t mask = 0; switch (n.length) { - case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.start, 1); break; - case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.start, 2); break; - case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.start, 3); break; - default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.start, 4); break; + case 1: memset(&mask, 0xFF, 1), memcpy(&quadgram, n.start, 1); break; + case 2: memset(&mask, 0xFF, 2), memcpy(&quadgram, n.start, 2); break; + case 3: memset(&mask, 0xFF, 3), memcpy(&quadgram, n.start, 3); break; + default: memset(&mask, 0xFF, 4), memcpy(&quadgram, n.start, 4); break; } - __m256i const anomalies = _mm256_set1_epi32(*(uint32_t const *)&anomaly); - __m256i const masks = _mm256_set1_epi32(*(uint32_t const *)&mask); + __m256i const quadgrams = _mm256_set1_epi32(quadgram.u32); + __m256i const masks = _mm256_set1_epi32(mask.u32); // Top level for-loop changes dramatically. // In sequential computing model for 32 offsets we would do: @@ -345,13 +425,13 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { // Performing many unaligned loads ends up being faster than loading once and shuffling around. __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks); - int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, anomalies)); + int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, quadgrams)); __m256i texts1 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 1)), masks); - int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, anomalies)); + int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, quadgrams)); __m256i text2 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 2)), masks); - int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, anomalies)); + int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, quadgrams)); __m256i texts3 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 3)), masks); - int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies)); + int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, quadgrams)); if (matches0 | matches1 | matches2 | matches3) { for (sz_size_t i = 0; i < 32; i++) { @@ -382,16 +462,22 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { // Precomputed constants char const *const end = h.start + h.length; - uint32_t anomaly = 0; - uint32_t mask = 0; + sz_quadgram_t quadgram = {}; + sz_quadgram_t mask = {}; switch (n.length) { - case 1: memset(&mask, 0xFF, 1), memcpy(&anomaly, n.start, 1); break; - case 2: memset(&mask, 0xFF, 2), memcpy(&anomaly, n.start, 2); break; - case 3: memset(&mask, 0xFF, 3), memcpy(&anomaly, n.start, 3); break; - default: memset(&mask, 0xFF, 4), memcpy(&anomaly, n.start, 4); break; + case 1: mask.u8s[0] = 0xFF, quadgram.u8s[0] = n.start[0]; break; + case 2: mask.u8s[0] = mask.u8s[1] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1]; break; + case 3: + mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], + quadgram.u8s[2] = n.start[2]; + break; + default: + mask.u32 = 0xFFFFFFFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], + quadgram.u8s[3] = n.start[3]; + break; } - uint32x4_t const anomalies = vld1q_dup_u32(&anomaly); + uint32x4_t const quadgrams = vld1q_dup_u32(&quadgram.u32); uint32x4_t const masks = vld1q_dup_u32(&mask); uint32x4_t matches, matches0, matches1, matches2, matches3; @@ -400,10 +486,10 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { // Each of the following `matchesX` contains only 4 relevant bits - one per word. // Each signifies a match at the given offset. - matches0 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 0)), masks), anomalies); - matches1 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 1)), masks), anomalies); - matches2 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 2)), masks), anomalies); - matches3 = vceqq_u32(vandq_u32(vld1q_u32((uint32_t const *)(text + 3)), masks), anomalies); + matches0 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 0)), masks), quadgrams); + matches1 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 1)), masks), quadgrams); + matches2 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 2)), masks), quadgrams); + matches3 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 3)), masks), quadgrams); matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3)); if (vmaxvq_u32(matches)) { @@ -448,8 +534,8 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { #endif // Arm Neon -inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_char_swar(h, n); } -inline static sz_size_t sz_find_char(sz_haystack_t h, char n) { return sz_find_char_swar(h, n); } +inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_unigram_swar(h, n); } +inline static sz_size_t sz_find_unigram(sz_haystack_t h, char n) { return sz_find_unigram_swar(h, n); } inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) { if (h.length < n.length) return h.length; @@ -665,10 +751,10 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf char *prefix = (char *)&sequence->order[i]; for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; if (case_insensitive) { - prefix[0] = tolower(prefix[0]); - prefix[1] = tolower(prefix[1]); - prefix[2] = tolower(prefix[2]); - prefix[3] = tolower(prefix[3]); + prefix[0] = sz_tolower_ascii(prefix[0]); + prefix[1] = sz_tolower_ascii(prefix[1]); + prefix[2] = sz_tolower_ascii(prefix[2]); + prefix[3] = sz_tolower_ascii(prefix[3]); } } @@ -679,7 +765,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf _sz_sort_recursion(sequence, 0, 32, comparator); } -typedef uint8_t levenstein_distance_t; +typedef unsigned char levenstein_distance_t; /** * @return Amount of temporary memory (in bytes) needed to efficiently compute @@ -758,11 +844,11 @@ inline static levenstein_distance_t sz_levenstein( // /** * @brief Hashes provided string using hardware-accelerated CRC32 instructions. */ -inline static uint32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; } -inline static uint32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; } -inline static uint32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; } #ifdef __cplusplus } From b62b9c666c8970bb4219f1227b225ecb44a0d707 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Oct 2023 19:42:11 -0700 Subject: [PATCH 2/8] Fix: SWAR search bug --- stringzilla/stringzilla.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 51319f01..7353024a 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -373,13 +373,13 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) { text += n.quadgram_offset; while (text + n.length <= end) { + h_quadgram.u8s[3] = text[3]; if (h_quadgram.u32 == n_quadgram.u32) // Match quadgram. if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. if (sz_equal(text - n.quadgram_offset, n.start, n.quadgram_offset)) // Match prefix. return text - h.start - n.quadgram_offset; - h_quadgram.u32 <<= 8; - h_quadgram.u8s[3] = *text; + h_quadgram.u32 >>= 8; ++text; } return h.length; From ac7012a2796e613af75fde91e205ef55fb84944b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 7 Oct 2023 20:39:44 -0700 Subject: [PATCH 3/8] Improve: avoiding nested loop in AVX2 --- stringzilla/stringzilla.h | 93 +++++++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 32 deletions(-) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 7353024a..6b481dda 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -387,6 +387,40 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) { } } +/** + * Helper function, used in substring search operations. + */ +inline static void _sz_find_substr_populate_quadgram( // + sz_haystack_t h, + sz_needle_t n, + sz_quadgram_t *quadgram_out, + sz_quadgram_t *mask_out) { + + sz_quadgram_t quadgram; + sz_quadgram_t mask; + switch (n.length) { + case 1: + mask.u8s[0] = 0xFF, mask.u8s[1] = mask.u8s[2] = mask.u8s[3] = 0; + quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = quadgram.u8s[2] = quadgram.u8s[3] = 0; + break; + case 2: + mask.u8s[0] = mask.u8s[1] = 0xFF, mask.u8s[2] = mask.u8s[3] = 0; + quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = quadgram.u8s[3] = 0; + break; + case 3: + mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, mask.u8s[3] = 0; + quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], quadgram.u8s[3] = 0; + break; + default: + mask.u32 = 0xFFFFFFFF; + quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], + quadgram.u8s[3] = n.start[3]; + break; + } + *quadgram_out = quadgram; + *mask_out = mask; +} + #if defined(__AVX2__) /** @@ -399,15 +433,9 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { // Precomputed constants char const *const end = h.start + h.length; - sz_quadgram_t quadgram = 0; - sz_quadgram_t mask = 0; - switch (n.length) { - case 1: memset(&mask, 0xFF, 1), memcpy(&quadgram, n.start, 1); break; - case 2: memset(&mask, 0xFF, 2), memcpy(&quadgram, n.start, 2); break; - case 3: memset(&mask, 0xFF, 3), memcpy(&quadgram, n.start, 3); break; - default: memset(&mask, 0xFF, 4), memcpy(&quadgram, n.start, 4); break; - } - + sz_quadgram_t quadgram; + sz_quadgram_t mask; + _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask); __m256i const quadgrams = _mm256_set1_epi32(quadgram.u32); __m256i const masks = _mm256_set1_epi32(mask.u32); @@ -421,7 +449,7 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { // + 3 bitwise ANDs. // + 1 heavy (but very unlikely) branch. char const *text = h.start; - for (; (text + n.length + 32) <= end; text += 32) { + while (text + n.length + 32 <= end) { // Performing many unaligned loads ends up being faster than loading once and shuffling around. __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks); @@ -434,10 +462,23 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, quadgrams)); if (matches0 | matches1 | matches2 | matches3) { - for (sz_size_t i = 0; i < 32; i++) { - if (sz_equal(text + i, n.start, n.length)) return i + (text - h.start); + int matches = // + (matches0 & 0x1111'1111u) | // + (matches1 & 0x2222'2222u) | // + (matches2 & 0x4444'4444u) | // + (matches3 & 0x8888'8888u); + size_t first_match_offset = _tzcnt_u32(matches); + if (n.length > 4) { + if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4)) + return text + first_match_offset - h.start; + else + text += first_match_offset + 1; } - } + else + return text + first_match_offset - h.start; + } + else + text += 32; } // Don't forget the last (up to 35) characters. @@ -462,21 +503,9 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { // Precomputed constants char const *const end = h.start + h.length; - sz_quadgram_t quadgram = {}; - sz_quadgram_t mask = {}; - switch (n.length) { - case 1: mask.u8s[0] = 0xFF, quadgram.u8s[0] = n.start[0]; break; - case 2: mask.u8s[0] = mask.u8s[1] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1]; break; - case 3: - mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], - quadgram.u8s[2] = n.start[2]; - break; - default: - mask.u32 = 0xFFFFFFFF, quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], - quadgram.u8s[3] = n.start[3]; - break; - } - + sz_quadgram_t quadgram; + sz_quadgram_t mask; + _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask); uint32x4_t const quadgrams = vld1q_dup_u32(&quadgram.u32); uint32x4_t const masks = vld1q_dup_u32(&mask); uint32x4_t matches, matches0, matches1, matches2, matches3; @@ -486,10 +515,10 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { // Each of the following `matchesX` contains only 4 relevant bits - one per word. // Each signifies a match at the given offset. - matches0 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 0)), masks), quadgrams); - matches1 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 1)), masks), quadgrams); - matches2 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 2)), masks), quadgrams); - matches3 = vceqq_u32(vandq_u32(vld1q_u32((sz_quadgram_t const *)(text + 3)), masks), quadgrams); + matches0 = vceqq_u32(vandq_u32(vld1q_u32(text + 0), masks), quadgrams); + matches1 = vceqq_u32(vandq_u32(vld1q_u32(text + 1), masks), quadgrams); + matches2 = vceqq_u32(vandq_u32(vld1q_u32(text + 2), masks), quadgrams); + matches3 = vceqq_u32(vandq_u32(vld1q_u32(text + 3), masks), quadgrams); matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3)); if (vmaxvq_u32(matches)) { From efafbbf0687f1d315c94b54b08e5b93f91e88be0 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 13:01:13 -0700 Subject: [PATCH 4/8] Break: Avoiding LibC and new API --- stringzilla/stringzilla.h | 789 +++++++++++++++++++++----------------- 1 file changed, 446 insertions(+), 343 deletions(-) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 6b481dda..0aa8774b 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -1,13 +1,10 @@ #ifndef STRINGZILLA_H_ #define STRINGZILLA_H_ -#include // `qsort_s` -#include // `qsort_r` -#include // `memcpy` - #if defined(__AVX2__) #include #endif + #if defined(__ARM_NEON) #include #endif @@ -16,117 +13,88 @@ #include #define popcount64 __popcnt64 #define ctz64 _tzcnt_u64 +#define clz64 _lzcnt_u64 #define strncasecmp _strnicmp #define strcasecmp _stricmp #else #define popcount64 __builtin_popcountll #define ctz64 __builtin_ctzll +#define clz64 __builtin_clzll +#endif + +/** + * Generally `NULL` is coming from locale.h, stddef.h, stdio.h, stdlib.h, string.h, time.h, and wchar.h, + * according to the C standard. + */ +#ifndef NULL +#define NULL ((void *)0) #endif #ifdef __cplusplus extern "C" { #endif +/** + * @brief Analogous to `size_t` and `std::size_t`, unsigned integer, identical to pointer size. + * 64-bit on most platforms where pointers are 64-bit. + * 32-bit on platforms where pointers are 32-bit. + */ #if defined(__LP64__) || defined(_LP64) || defined(__x86_64__) || defined(_WIN64) -typedef unsigned long sz_size_t; // 64-bit on most platforms when pointers are 64-bit +typedef unsigned long sz_size_t; #else -typedef unsigned sz_size_t; // 32-bit on most platforms when pointers are 32-bit +typedef unsigned sz_size_t; #endif +typedef int sz_bool_t; // Only one relevant bit typedef unsigned sz_u32_t; // Always 32 bits typedef unsigned long long sz_u64_t; // Always 64 bits +typedef char const *sz_string_ptr_t; // A type alias for `char const * ` + +/** + * @brief Helper construct for higher-level bindings. + */ +typedef struct sz_string_view_t { + sz_string_ptr_t start; + sz_size_t length; +} sz_string_view_t; -typedef union sz_quadgram_t { +/** + * @brief Internal data-structure, used to address "anomalies" (often prefixes), + * during substring search. Always a 32-bit unsigned integer, containing 4 chars. + */ +typedef union _sz_anomaly_t { unsigned u32; unsigned char u8s[4]; -} sz_quadgram_t; // Always 32-bit unsigned integer, representing 8 bytes/characters - -typedef union sz_octogram_t { - unsigned long long u64; - unsigned char u8s[8]; -} sz_octogram_t; // Always 64-bit unsigned integer, representing 8 bytes/characters - -inline static sz_size_t sz_divide_round_up(sz_size_t x, sz_size_t divisor) { return (x + (divisor - 1)) / divisor; } - -inline static sz_size_t sz_tolower_ascii(char c) { - static char lowered[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return lowered[(int)c]; -} - -inline static sz_size_t sz_toupper_ascii(char c) { - static char upped[256] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // - 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // - 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // - }; - return upped[(int)c]; -} +} _sz_anomaly_t; /** - * @brief This is a faster alternative to `strncmp(a, b, length) == 0`. + * @brief This is a slightly faster alternative to `strncmp(a, b, length) == 0`. + * Doesn't provide major performance improvements, but helps avoid the LibC dependency. * @return 1 for `true`, and 0 for `false`. */ -inline static int sz_equal(char const *a, char const *b, sz_size_t length) { - char const *const a_end = a + length; +inline static sz_bool_t sz_equal(sz_string_ptr_t a, sz_string_ptr_t b, sz_size_t length) { + sz_string_ptr_t const a_end = a + length; while (a != a_end && *a == *b) a++, b++; return a_end == a; } -typedef struct sz_haystack_t { - char const *start; - sz_size_t length; -} sz_haystack_t; - -typedef struct sz_needle_t { - char const *start; - sz_size_t length; - sz_size_t quadgram_offset; -} sz_needle_t; - /** - * @brief SWAR single-character counting procedure, jumping 8 bytes at a time. + * @brief Count the number of occurrences of a @b single-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_size_t sz_count_unigram_swar(sz_haystack_t h, char n) { +inline static sz_size_t sz_count_char_swar(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { sz_size_t result = 0; - char const *text = h.start; - char const *end = h.start + h.length; + sz_string_ptr_t text = haystack; + sz_string_ptr_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. - for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == n; + for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == *needle; // This code simulates hyper-scalar execution, comparing 8 characters at a time. - sz_u64_t nnnnnnnn = n; + sz_u64_t nnnnnnnn = *needle; nnnnnnnn |= nnnnnnnn << 8; nnnnnnnn |= nnnnnnnn << 16; nnnnnnnn |= nnnnnnnn << 32; @@ -140,27 +108,31 @@ inline static sz_size_t sz_count_unigram_swar(sz_haystack_t h, char n) { result += popcount64(match_indicators); } - for (; text < end; ++text) result += *text == n; + for (; text < end; ++text) result += *text == *needle; return result; } /** - * @brief SWAR single-character search in string, jumping 8 bytes at a time. + * @brief Find the first occurrence of a @b single-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. + * Identical to `memchr(haystack, needle[0], haystack_length)`. */ -inline static sz_size_t sz_find_unigram_swar(sz_haystack_t h, char n) { +inline static sz_string_ptr_t sz_find_1char_swar(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { - char const *text = h.start; - char const *end = h.start + h.length; + sz_string_ptr_t text = haystack; + sz_string_ptr_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text < end; ++text) - if (*text == n) return text - h.start; + if (*text == *needle) return text; // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. - sz_u64_t nnnnnnnn = n; - nnnnnnnn |= nnnnnnnn << 8; // broadcast `n` into `nnnnnnnn` - nnnnnnnn |= nnnnnnnn << 16; // broadcast `n` into `nnnnnnnn` - nnnnnnnn |= nnnnnnnn << 32; // broadcast `n` into `nnnnnnnn` + sz_u64_t nnnnnnnn = *needle; + nnnnnnnn |= nnnnnnnn << 8; // broadcast `needle` into `nnnnnnnn` + nnnnnnnn |= nnnnnnnn << 16; // broadcast `needle` into `nnnnnnnn` + nnnnnnnn |= nnnnnnnn << 32; // broadcast `needle` into `nnnnnnnn` for (; text + 8 <= end; text += 8) { sz_u64_t text_slice = *(sz_u64_t const *)text; sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn); @@ -169,30 +141,70 @@ inline static sz_size_t sz_find_unigram_swar(sz_haystack_t h, char n) { match_indicators &= match_indicators >> 4; match_indicators &= 0x0101010101010101; - if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8; + if (match_indicators != 0) return text + ctz64(match_indicators) / 8; } for (; text < end; ++text) - if (*text == n) return text - h.start; - return h.length; + if (*text == *needle) return text; + return NULL; +} + +/** + * @brief Find the last occurrence of a @b single-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. + * Identical to `memrchr(haystack, needle[0], haystack_length)`. + */ +inline static sz_string_ptr_t sz_rfind_1char_swar(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { + + sz_string_ptr_t const end = haystack + haystack_length; + sz_string_ptr_t text = end - 1; + + // Process the misaligned head, to void UB on unaligned 64-bit loads. + for (; ((unsigned long)text & 7ul) && text >= haystack; --text) + if (*text == *needle) return text; + + // This code simulates hyper-scalar execution, analyzing 8 offsets at a time. + sz_u64_t nnnnnnnn = *needle; + nnnnnnnn |= nnnnnnnn << 8; // broadcast `needle` into `nnnnnnnn` + nnnnnnnn |= nnnnnnnn << 16; // broadcast `needle` into `nnnnnnnn` + nnnnnnnn |= nnnnnnnn << 32; // broadcast `needle` into `nnnnnnnn` + for (; text - 8 >= haystack; text -= 8) { + sz_u64_t text_slice = *(sz_u64_t const *)text; + sz_u64_t match_indicators = ~(text_slice ^ nnnnnnnn); + match_indicators &= match_indicators >> 1; + match_indicators &= match_indicators >> 2; + match_indicators &= match_indicators >> 4; + match_indicators &= 0x0101010101010101; + + if (match_indicators != 0) return text - 8 + clz64(match_indicators) / 8; + } + + for (; text >= haystack; --text) + if (*text == *needle) return text; + return NULL; } /** - * @brief SWAR character-bigram search in string, jumping 8 bytes at a time. + * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_size_t sz_find_bigram_swar(sz_haystack_t h, char const *n) { +inline static sz_string_ptr_t sz_find_2char_swar(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { - char const *text = h.start; - char const *end = h.start + h.length; + sz_string_ptr_t text = haystack; + sz_string_ptr_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1]) return text - h.start; + if (text[0] == needle[0] && text[1] == needle[1]) return text; // This code simulates hyper-scalar execution, analyzing 7 offsets at a time. - sz_u64_t nnnn = ((sz_u64_t)(n[0]) << 0) | ((sz_u64_t)(n[1]) << 8); // broadcast `n` into `nnnn` - nnnn |= nnnn << 16; // broadcast `n` into `nnnn` - nnnn |= nnnn << 32; // broadcast `n` into `nnnn` + sz_u64_t nnnn = ((sz_u64_t)(needle[0]) << 0) | ((sz_u64_t)(needle[1]) << 8); // broadcast `needle` into `nnnn` + nnnn |= nnnn << 16; // broadcast `needle` into `nnnn` + nnnn |= nnnn << 32; // broadcast `needle` into `nnnn` for (; text + 8 <= end; text += 7) { sz_u64_t text_slice = *(sz_u64_t const *)text; sz_u64_t even_indicators = ~(text_slice ^ nnnn); @@ -214,32 +226,38 @@ inline static sz_size_t sz_find_bigram_swar(sz_haystack_t h, char const *n) { if (even_indicators + odd_indicators) { sz_u64_t match_indicators = even_indicators | (odd_indicators >> 8); - return text - h.start + ctz64(match_indicators) / 8; + return text + ctz64(match_indicators) / 8; } } for (; text + 2 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1]) return text - h.start; - return h.length; + if (text[0] == needle[0] && text[1] == needle[1]) return text; + return NULL; } /** - * @brief SWAR character-trigram search in string, jumping 8 bytes at a time. + * @brief Find the first occurrence of a three-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_size_t sz_find_trigram_swar(sz_haystack_t h, char const *n) { +inline static sz_string_ptr_t sz_find_3char_swar(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { - char const *text = h.start; - char const *end = h.start + h.length; + sz_string_ptr_t text = haystack; + sz_string_ptr_t end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start; + if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2]) return text; // This code simulates hyper-scalar execution, analyzing 6 offsets at a time. // We have two unused bytes at the end. - sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16); // broadcast `n` into `nn` - nn |= nn << 24; // broadcast `n` into `nn` - nn <<= 16; // broadcast `n` into `nn` + sz_u64_t nn = // broadcast `needle` into `nn` + (sz_u64_t)(needle[0] << 0) | // broadcast `needle` into `nn` + ((sz_u64_t)(needle[1]) << 8) | // broadcast `needle` into `nn` + ((sz_u64_t)(needle[2]) << 16); // broadcast `needle` into `nn` + nn |= nn << 24; // broadcast `needle` into `nn` + nn <<= 16; // broadcast `needle` into `nn` for (; text + 8 <= end; text += 6) { sz_u64_t text_slice = *(sz_u64_t const *)text; @@ -271,35 +289,39 @@ inline static sz_size_t sz_find_trigram_swar(sz_haystack_t h, char const *n) { (third_indicators >> 16) & (third_indicators >> 8) & (third_indicators >> 0) & 0x0000010000010000; sz_u64_t match_indicators = first_indicators | (second_indicators >> 8) | (third_indicators >> 16); - if (match_indicators != 0) return text - h.start + ctz64(match_indicators) / 8; + if (match_indicators != 0) return text + ctz64(match_indicators) / 8; } for (; text + 3 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2]) return text - h.start; - return h.length; + if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2]) return text; + return NULL; } /** - * @brief SWAR character-quadgram search in string, jumping 8 bytes at a time. + * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. + * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_size_t sz_find_quadgram_swar(sz_haystack_t h, char const *n) { +inline static sz_string_ptr_t sz_find_4char_swar(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { - char const *text = h.start; - char const *end = h.start + h.length; + sz_string_ptr_t text = haystack; + sz_string_ptr_t end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start; + if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2] && text[3] == needle[3]) return text; // This code simulates hyper-scalar execution, analyzing 4 offsets at a time. - sz_u64_t nn = (sz_u64_t)(n[0] << 0) | ((sz_u64_t)(n[1]) << 8) | ((sz_u64_t)(n[2]) << 16) | ((sz_u64_t)(n[3]) << 24); + sz_u64_t nn = (sz_u64_t)(needle[0] << 0) | ((sz_u64_t)(needle[1]) << 8) | ((sz_u64_t)(needle[2]) << 16) | + ((sz_u64_t)(needle[3]) << 24); nn |= nn << 32; // - unsigned char lookup[16] = {0}; - lookup[0x2] = lookup[0x6] = lookup[0xA] = lookup[0xE] = 1; - lookup[0x4] = lookup[0xC] = 2; - lookup[0x8] = 3; + unsigned char offset_in_slice[16] = {0}; + offset_in_slice[0x2] = offset_in_slice[0x6] = offset_in_slice[0xA] = offset_in_slice[0xE] = 1; + offset_in_slice[0x4] = offset_in_slice[0xC] = 2; + offset_in_slice[0x8] = 3; // We can perform 5 comparisons per load, but it's easier to perform 4, minimizing the size of the lookup table. for (; text + 8 <= end; text += 4) { @@ -331,58 +353,63 @@ inline static sz_size_t sz_find_quadgram_swar(sz_haystack_t h, char const *n) { unsigned char match_indicators = (unsigned char)( // (text01_indicators >> 31) | (text01_indicators << 0) | // (text23_indicators >> 29) | (text23_indicators << 2)); - return text - h.start + lookup[match_indicators]; + return text + offset_in_slice[match_indicators]; } } for (; text + 4 <= end; ++text) - if (text[0] == n[0] && text[1] == n[1] && text[2] == n[2] && text[3] == n[3]) return text - h.start; - return h.length; + if (text[0] == needle[0] && text[1] == needle[1] && text[2] == needle[2] && text[3] == needle[3]) return text; + return NULL; } /** - * @brief Trivial substring search with scalar code. Instead of comparing characters one-by-one - * it compares 4-byte quadgrams first, most commonly prefixes. It's computationally cheaper. + * @brief Trivial substring search with scalar SWAR code. Instead of comparing characters one-by-one + * it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper. * Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core. */ -inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) { - - if (h.length < n.length) return h.length; - - switch (n.length) { - case 0: return 0; - case 1: return sz_find_unigram_swar(h, *n.start); - case 2: return sz_find_bigram_swar(h, n.start); - case 3: return sz_find_trigram_swar(h, n.start); - case 4: return sz_find_quadgram_swar(h, n.start); +inline static sz_string_ptr_t sz_find_substr_swar( // + sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle, + sz_size_t const needle_length) { + + if (haystack_length < needle_length) return NULL; + + sz_size_t anomaly_offset = 0; + switch (needle_length) { + case 0: return NULL; + case 1: return sz_find_1char_swar(haystack, haystack_length, needle); + case 2: return sz_find_2char_swar(haystack, haystack_length, needle); + case 3: return sz_find_3char_swar(haystack, haystack_length, needle); + case 4: return sz_find_4char_swar(haystack, haystack_length, needle); default: { - char const *text = h.start; - char const *const end = h.start + h.length; - - sz_quadgram_t n_quadgram, h_quadgram; - sz_size_t const n_suffix_len = n.length - 4 - n.quadgram_offset; - char const *n_suffix_ptr = n.start + 4 + n.quadgram_offset; - n_quadgram.u8s[0] = n.start[n.quadgram_offset]; - n_quadgram.u8s[1] = n.start[n.quadgram_offset + 1]; - n_quadgram.u8s[2] = n.start[n.quadgram_offset + 2]; - n_quadgram.u8s[3] = n.start[n.quadgram_offset + 3]; - h_quadgram.u8s[0] = h.start[0]; - h_quadgram.u8s[1] = h.start[1]; - h_quadgram.u8s[2] = h.start[2]; - h_quadgram.u8s[3] = h.start[3]; - - text += n.quadgram_offset; - while (text + n.length <= end) { - h_quadgram.u8s[3] = text[3]; - if (h_quadgram.u32 == n_quadgram.u32) // Match quadgram. - if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. - if (sz_equal(text - n.quadgram_offset, n.start, n.quadgram_offset)) // Match prefix. - return text - h.start - n.quadgram_offset; - - h_quadgram.u32 >>= 8; + sz_string_ptr_t text = haystack; + sz_string_ptr_t const end = haystack + haystack_length; + + _sz_anomaly_t n_anomaly, h_anomaly; + sz_size_t const n_suffix_len = needle_length - 4 - anomaly_offset; + sz_string_ptr_t n_suffix_ptr = needle + 4 + anomaly_offset; + n_anomaly.u8s[0] = needle[anomaly_offset]; + n_anomaly.u8s[1] = needle[anomaly_offset + 1]; + n_anomaly.u8s[2] = needle[anomaly_offset + 2]; + n_anomaly.u8s[3] = needle[anomaly_offset + 3]; + h_anomaly.u8s[0] = haystack[0]; + h_anomaly.u8s[1] = haystack[1]; + h_anomaly.u8s[2] = haystack[2]; + h_anomaly.u8s[3] = haystack[3]; + + text += anomaly_offset; + while (text + needle_length <= end) { + h_anomaly.u8s[3] = text[3]; + if (h_anomaly.u32 == n_anomaly.u32) // Match anomaly. + if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. + if (sz_equal(text - anomaly_offset, needle, anomaly_offset)) // Match prefix - optimized out. + return text - anomaly_offset; + + h_anomaly.u32 >>= 8; ++text; } - return h.length; + return NULL; } } } @@ -390,34 +417,33 @@ inline static sz_size_t sz_find_substr_swar(sz_haystack_t h, sz_needle_t n) { /** * Helper function, used in substring search operations. */ -inline static void _sz_find_substr_populate_quadgram( // - sz_haystack_t h, - sz_needle_t n, - sz_quadgram_t *quadgram_out, - sz_quadgram_t *mask_out) { - - sz_quadgram_t quadgram; - sz_quadgram_t mask; - switch (n.length) { +inline static void _sz_find_substr_populate_anomaly( // + sz_string_ptr_t const needle, + sz_size_t const needle_length, + _sz_anomaly_t *anomaly_out, + _sz_anomaly_t *mask_out) { + + _sz_anomaly_t anomaly; + _sz_anomaly_t mask; + switch (needle_length) { case 1: mask.u8s[0] = 0xFF, mask.u8s[1] = mask.u8s[2] = mask.u8s[3] = 0; - quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = quadgram.u8s[2] = quadgram.u8s[3] = 0; + anomaly.u8s[0] = needle[0], anomaly.u8s[1] = anomaly.u8s[2] = anomaly.u8s[3] = 0; break; case 2: mask.u8s[0] = mask.u8s[1] = 0xFF, mask.u8s[2] = mask.u8s[3] = 0; - quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = quadgram.u8s[3] = 0; + anomaly.u8s[0] = needle[0], anomaly.u8s[1] = needle[1], anomaly.u8s[2] = anomaly.u8s[3] = 0; break; case 3: mask.u8s[0] = mask.u8s[1] = mask.u8s[2] = 0xFF, mask.u8s[3] = 0; - quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], quadgram.u8s[3] = 0; + anomaly.u8s[0] = needle[0], anomaly.u8s[1] = needle[1], anomaly.u8s[2] = needle[2], anomaly.u8s[3] = 0; break; default: mask.u32 = 0xFFFFFFFF; - quadgram.u8s[0] = n.start[0], quadgram.u8s[1] = n.start[1], quadgram.u8s[2] = n.start[2], - quadgram.u8s[3] = n.start[3]; + anomaly.u8s[0] = needle[0], anomaly.u8s[1] = needle[1], anomaly.u8s[2] = needle[2], anomaly.u8s[3] = needle[3]; break; } - *quadgram_out = quadgram; + *anomaly_out = anomaly; *mask_out = mask; } @@ -429,14 +455,17 @@ inline static void _sz_find_substr_populate_quadgram( // * was practically more efficient than loading once and shifting around, as introduces * less data dependencies. */ -inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { +inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle, + sz_size_t const needle_length) { // Precomputed constants - char const *const end = h.start + h.length; - sz_quadgram_t quadgram; - sz_quadgram_t mask; - _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask); - __m256i const quadgrams = _mm256_set1_epi32(quadgram.u32); + sz_string_ptr_t const end = haystack + haystack_length; + _sz_anomaly_t anomaly; + _sz_anomaly_t mask; + _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask); + __m256i const anomalies = _mm256_set1_epi32(anomaly.u32); __m256i const masks = _mm256_set1_epi32(mask.u32); // Top level for-loop changes dramatically. @@ -448,18 +477,18 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { // + 4 movemasks. // + 3 bitwise ANDs. // + 1 heavy (but very unlikely) branch. - char const *text = h.start; - while (text + n.length + 32 <= end) { + sz_string_ptr_t text = haystack; + while (text + needle_length + 32 <= end) { // Performing many unaligned loads ends up being faster than loading once and shuffling around. __m256i texts0 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 0)), masks); - int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, quadgrams)); + int matches0 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts0, anomalies)); __m256i texts1 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 1)), masks); - int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, quadgrams)); + int matches1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts1, anomalies)); __m256i text2 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 2)), masks); - int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, quadgrams)); + int matches2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(text2, anomalies)); __m256i texts3 = _mm256_and_si256(_mm256_loadu_si256((__m256i const *)(text + 3)), masks); - int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, quadgrams)); + int matches3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(texts3, anomalies)); if (matches0 | matches1 | matches2 | matches3) { int matches = // @@ -468,25 +497,21 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { (matches2 & 0x4444'4444u) | // (matches3 & 0x8888'8888u); size_t first_match_offset = _tzcnt_u32(matches); - if (n.length > 4) { - if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4)) - return text + first_match_offset - h.start; + if (needle_length > 4) { + if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4)) + return text + first_match_offset; else text += first_match_offset + 1; } else - return text + first_match_offset - h.start; - } + return text + first_match_offset; + } else text += 32; } // Don't forget the last (up to 35) characters. - sz_haystack_t tail; - tail.start = text; - tail.length = end - text; - size_t tail_match = sz_find_substr_swar(tail, n); - return text + tail_match - h.start; + return sz_find_substr_swar(text, end - text, needle, needle_length); } #endif // x86 AVX2 @@ -499,26 +524,29 @@ inline static sz_size_t sz_find_substr_avx2(sz_haystack_t h, sz_needle_t n) { * was practically more efficient than loading once and shifting around, as introduces * less data dependencies. */ -inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { +inline static sz_string_ptr_t sz_find_substr_neon(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle, + sz_size_t const needle_length) { // Precomputed constants - char const *const end = h.start + h.length; - sz_quadgram_t quadgram; - sz_quadgram_t mask; - _sz_find_substr_populate_quadgram(h, n, &quadgram, &mask); - uint32x4_t const quadgrams = vld1q_dup_u32(&quadgram.u32); - uint32x4_t const masks = vld1q_dup_u32(&mask); + sz_string_ptr_t const end = haystack + haystack_length; + _sz_anomaly_t anomaly; + _sz_anomaly_t mask; + _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask); + uint32x4_t const anomalies = vld1q_dup_u32(&anomaly.u32); + uint32x4_t const masks = vld1q_dup_u32(&mask.u32); uint32x4_t matches, matches0, matches1, matches2, matches3; - char const *text = h.start; - while (text + n.length + 16 <= end) { + sz_string_ptr_t text = haystack; + while (text + needle_length + 16 <= end) { // Each of the following `matchesX` contains only 4 relevant bits - one per word. // Each signifies a match at the given offset. - matches0 = vceqq_u32(vandq_u32(vld1q_u32(text + 0), masks), quadgrams); - matches1 = vceqq_u32(vandq_u32(vld1q_u32(text + 1), masks), quadgrams); - matches2 = vceqq_u32(vandq_u32(vld1q_u32(text + 2), masks), quadgrams); - matches3 = vceqq_u32(vandq_u32(vld1q_u32(text + 3), masks), quadgrams); + matches0 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 0)), masks), anomalies); + matches1 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 1)), masks), anomalies); + matches2 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 2)), masks), anomalies); + matches3 = vceqq_u32(vandq_u32(vreinterpretq_u32_u8(vld1q_u8((unsigned char *)text + 3)), masks), anomalies); matches = vorrq_u32(vorrq_u32(matches0, matches1), vorrq_u32(matches2, matches3)); if (vmaxvq_u32(matches)) { @@ -540,73 +568,172 @@ inline static sz_size_t sz_find_substr_neon(sz_haystack_t h, sz_needle_t n) { // Find the first match size_t first_match_offset = __builtin_ctz(matches_u16); - if (n.length > 4) { - if (sz_equal(text + first_match_offset + 4, n.start + 4, n.length - 4)) - return text + first_match_offset - h.start; + if (needle_length > 4) { + if (sz_equal(text + first_match_offset + 4, needle + 4, needle_length - 4)) + return text + first_match_offset; else text += first_match_offset + 1; } else - return text + first_match_offset - h.start; + return text + first_match_offset; } else text += 16; } // Don't forget the last (up to 16+3=19) characters. - sz_haystack_t tail; - tail.start = text; - tail.length = end - text; - size_t tail_match = sz_find_substr_swar(tail, n); - return text + tail_match - h.start; + return sz_find_substr_swar(text, end - text, needle, needle_length); } #endif // Arm Neon -inline static sz_size_t sz_count_char(sz_haystack_t h, char n) { return sz_count_unigram_swar(h, n); } -inline static sz_size_t sz_find_unigram(sz_haystack_t h, char n) { return sz_find_unigram_swar(h, n); } +inline static sz_size_t sz_count_char(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { + return sz_count_char_swar(haystack, haystack_length, needle); +} -inline static sz_size_t sz_find_substr(sz_haystack_t h, sz_needle_t n) { - if (h.length < n.length) return h.length; +inline static sz_string_ptr_t sz_find_1char(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { + return sz_find_1char_swar(haystack, haystack_length, needle); +} +inline static sz_string_ptr_t sz_rfind_1char(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle) { + return sz_rfind_1char_swar(haystack, haystack_length, needle); +} + +inline static sz_string_ptr_t sz_find_substr(sz_string_ptr_t const haystack, + sz_size_t const haystack_length, + sz_string_ptr_t const needle, + sz_size_t const needle_length) { + if (haystack_length < needle_length) return NULL; #if defined(__ARM_NEON) - return sz_find_substr_neon(h, n); + return sz_find_substr_neon(haystack, haystack_length, needle, needle_length); #elif defined(__AVX2__) - return sz_find_substr_avx2(h, n); + return sz_find_substr_avx2(haystack, haystack_length, needle, needle_length); #else - return sz_find_substr_swar(h, n); + return sz_find_substr_swar(haystack, haystack_length, needle, needle_length); #endif } -inline static void sz_swap(sz_size_t *a, sz_size_t *b) { - sz_size_t t = *a; +/** + * @brief Maps any ASCII character to itself, or the lowercase variant, if available. + */ +inline static char sz_tolower_ascii(char c) { + static unsigned char lowered[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // + 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, // + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // + }; + return *(char *)&lowered[(int)c]; +} + +/** + * @brief Maps any ASCII character to itself, or the uppercase variant, if available. + */ +inline static char sz_toupper_ascii(char c) { + static unsigned char upped[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // + 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, // + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 91, 92, 93, 94, 95, // + 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 215, 248, 249, 250, 251, 252, 253, 254, 223, // + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, // + }; + return *(char *)&upped[(int)c]; +} + +/** + * @brief Char-level lexicographic comparison of two strings. + * Doesn't provide major performance improvements, but helps avoid the LibC dependency. + */ +inline static sz_bool_t sz_is_less_ascii(sz_string_ptr_t const a, + sz_size_t const a_length, + sz_string_ptr_t const b, + sz_size_t const b_length) { + + sz_size_t min_length = (a_length < b_length) ? a_length : b_length; + for (sz_size_t i = 0; i < min_length; ++i) { + if (a[i] < b[i]) return 1; + if (a[i] > b[i]) return 0; + } + return a_length < b_length; +} + +/** + * @brief Char-level lexicographic comparison of two strings, insensitive to the case of ASCII symbols. + * Doesn't provide major performance improvements, but helps avoid the LibC dependency. + */ +inline static sz_bool_t sz_is_less_uncased_ascii(sz_string_ptr_t const a, + sz_size_t const a_length, + sz_string_ptr_t const b, + sz_size_t const b_length) { + + sz_size_t min_length = (a_length < b_length) ? a_length : b_length; + for (sz_size_t i = 0; i < min_length; ++i) { + char a_lower = sz_tolower_ascii(a[i]); + char b_lower = sz_tolower_ascii(b[i]); + if (a_lower < b_lower) return 1; + if (a_lower > b_lower) return 0; + } + return a_length < b_length; +} + +/** + * @brief Helper, that swaps two 64-bit integers representing the order of elements in the sequence. + */ +inline static void _sz_swap_order(sz_u64_t *a, sz_u64_t *b) { + sz_u64_t t = *a; *a = *b; *b = t; } -typedef char const *(*sz_sequence_get_start_t)(void const *, sz_size_t); -typedef sz_size_t (*sz_sequence_get_length_t)(void const *, sz_size_t); -typedef int (*sz_sequence_predicate_t)(void const *, sz_size_t); -typedef int (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t); +struct sz_sequence_s; -// Define a type for the comparison function, depending on the platform. -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || defined(__APPLE__) -typedef int (*sz_qsort_comparison_func_t)(void *, void const *, void const *); -#else -typedef int (*sz_qsort_comparison_func_t)(void const *, void const *, void *); -#endif +typedef sz_string_ptr_t (*sz_sequence_member_start_t)(void const *, sz_size_t); +typedef sz_size_t (*sz_sequence_member_length_t)(void const *, sz_size_t); +typedef sz_bool_t (*sz_sequence_predicate_t)(void const *, sz_size_t); +typedef sz_bool_t (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t); +typedef sz_bool_t (*sz_string_is_less_t)(sz_string_ptr_t, sz_size_t, sz_string_ptr_t, sz_size_t); -typedef struct sz_sequence_t { - sz_size_t *order; +typedef struct sz_sequence_s { + sz_u64_t *order; sz_size_t count; - sz_sequence_get_start_t get_start; - sz_sequence_get_length_t get_length; + sz_sequence_member_start_t get_start; + sz_sequence_member_length_t get_length; void const *handle; } sz_sequence_t; /** - * @brief Similar to `std::partition`, given a predicate splits the - * sequence into two parts. + * @brief Similar to `std::partition`, given a predicate splits the sequence into two parts. + * The algorithm is unstable, meaning that elements may change relative order, as long + * as they are in the right partition. This is the simpler algorithm for partitioning. */ inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { @@ -615,14 +742,16 @@ inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predic for (sz_size_t i = matches + 1; i < sequence->count; ++i) if (predicate(sequence->handle, sequence->order[i])) - sz_swap(sequence->order + i, sequence->order + matches), ++matches; + _sz_swap_order(sequence->order + i, sequence->order + matches), ++matches; return matches; } /** - * @brief Inplace `std::set_union` for two consecutive chunks forming - * the same continuous sequence. + * @brief Inplace `std::set_union` for two consecutive chunks forming the same continuous `sequence`. + * + * @param partition The number of elements in the first sub-sequence in `sequence`. + * @param less Comparison function, to determine the lexicographic ordering. */ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_sequence_comparator_t less) { @@ -642,10 +771,7 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq // Shift all the elements between element 1 // element 2, right by 1. - while (index != start_a) { - sequence->order[index] = sequence->order[index - 1]; - index--; - } + while (index != start_a) { sequence->order[index] = sequence->order[index - 1], index--; } sequence->order[start_a] = value; // Update all the pointers @@ -656,112 +782,86 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq } } +inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t comparator) { + sz_u64_t *keys = sequence->order; + sz_size_t keys_count = sequence->count; + for (sz_size_t i = 1; i < keys_count; i++) { + sz_u64_t i_key = keys[i]; + // Move elements of arr[0..i-1] that are greater than key to one position ahead of their current position + sz_size_t j = i; + for (; j > 0 && comparator(sequence, keys[j - 1], i) != 0; --j) keys[j] = keys[j - 1]; + keys[j] = i_key; + } +} + +/** + * @brief Internal Radix sorting procedure. + */ inline static void _sz_sort_recursion( // sz_sequence_t *sequence, sz_size_t bit_idx, sz_size_t bit_max, - sz_qsort_comparison_func_t qsort_comparator) { + sz_sequence_comparator_t comparator, + sz_size_t partial_order_length) { if (!sequence->count) return; // Partition a range of integers according to a specific bit value sz_size_t split = 0; { - sz_size_t mask = (1ul << 63) >> bit_idx; + sz_u64_t mask = (1ul << 63) >> bit_idx; while (split != sequence->count && !(sequence->order[split] & mask)) ++split; for (sz_size_t i = split + 1; i < sequence->count; ++i) - if (!(sequence->order[i] & mask)) sz_swap(sequence->order + i, sequence->order + split), ++split; + if (!(sequence->order[i] & mask)) _sz_swap_order(sequence->order + i, sequence->order + split), ++split; } // Go down recursively if (bit_idx < bit_max) { sz_sequence_t a = *sequence; a.count = split; - _sz_sort_recursion(&a, bit_idx + 1, bit_max, qsort_comparator); + _sz_sort_recursion(&a, bit_idx + 1, bit_max, comparator, partial_order_length); sz_sequence_t b = *sequence; b.order += split; b.count -= split; - _sz_sort_recursion(&b, bit_idx + 1, bit_max, qsort_comparator); + _sz_sort_recursion(&b, bit_idx + 1, bit_max, comparator, partial_order_length); } // Reached the end of recursion else { // Discard the prefixes - for (sz_size_t i = 0; i != sequence->count; ++i) { memset((char *)(&sequence->order[i]) + 4, 0, 4ul); } - - // Perform sorts on smaller chunks instead of the whole handle -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) - // https://stackoverflow.com/a/39561369 - // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/qsort-s?view=msvc-170 - qsort_s(sequence->order, split, sizeof(sz_size_t), qsort_comparator, (void *)sequence); - qsort_s(sequence->order + split, - sequence->count - split, - sizeof(sz_size_t), - qsort_comparator, - (void *)sequence); -#elif __APPLE__ - qsort_r(sequence->order, split, sizeof(sz_size_t), (void *)sequence, qsort_comparator); - qsort_r(sequence->order + split, - sequence->count - split, - sizeof(sz_size_t), - (void *)sequence, - qsort_comparator); -#else - // https://linux.die.net/man/3/qsort_r - qsort_r(sequence->order, split, sizeof(sz_size_t), qsort_comparator, (void *)sequence); - qsort_r(sequence->order + split, - sequence->count - split, - sizeof(sz_size_t), - qsort_comparator, - (void *)sequence); -#endif + sz_u32_t *order_half_words = (sz_u32_t *)sequence->order; + for (sz_size_t i = 0; i != sequence->count; ++i) { order_half_words[i * 2 + 1] = 0; } + + sz_sequence_t a = *sequence; + a.count = split; + sz_sort_insertion(&a, comparator); + + sz_sequence_t b = *sequence; + b.order += split; + b.count -= split; + sz_sort_insertion(&b, comparator); } } -inline static int _sz_sort_sequence_strncmp( -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__ - void *sequence_raw, void const *a_raw, void const *b_raw -#else - void const *a_raw, void const *b_raw, void *sequence_raw -#endif -) { - // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1 - // https://www.man7.org/linux/man-pages/man3/strcmp.3.html - sz_sequence_t *sequence = (sz_sequence_t *)sequence_raw; - sz_size_t a = *(sz_size_t *)a_raw; - sz_size_t b = *(sz_size_t *)b_raw; - sz_size_t a_len = sequence->get_length(sequence->handle, a); - sz_size_t b_len = sequence->get_length(sequence->handle, b); - int res = strncmp( // - sequence->get_start(sequence->handle, a), - sequence->get_start(sequence->handle, b), - a_len > b_len ? b_len : a_len); - return res ? res : a_len - b_len; +inline static sz_bool_t _sz_sort_compare_less_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { + sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key); + sz_size_t i_len = sequence->get_length(sequence->handle, i_key); + sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key); + sz_size_t j_len = sequence->get_length(sequence->handle, j_key); + return sz_is_less_ascii(i_str, i_len, j_str, j_len); } -inline static int _sz_sort_sequence_strncasecmp( -#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__) || __APPLE__ - void *sequence_raw, void const *a_raw, void const *b_raw -#else - void const *a_raw, void const *b_raw, void *sequence_raw -#endif -) { - // https://man.freebsd.org/cgi/man.cgi?query=qsort_s&sektion=3&n=1 - // https://www.man7.org/linux/man-pages/man3/strcmp.3.html - sz_sequence_t *sequence = (sz_sequence_t *)sequence_raw; - sz_size_t a = *(sz_size_t *)a_raw; - sz_size_t b = *(sz_size_t *)b_raw; - sz_size_t a_len = sequence->get_length(sequence->handle, a); - sz_size_t b_len = sequence->get_length(sequence->handle, b); - int res = strncasecmp( // - sequence->get_start(sequence->handle, a), - sequence->get_start(sequence->handle, b), - a_len > b_len ? b_len : a_len); - return res ? res : a_len - b_len; +inline static sz_bool_t _sz_sort_compare_less_uncased_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { + sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key); + sz_size_t i_len = sequence->get_length(sequence->handle, i_key); + sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key); + sz_size_t j_len = sequence->get_length(sequence->handle, j_key); + return sz_is_less_uncased_ascii(i_str, i_len, j_str, j_len); } typedef struct sz_sort_config_t { - int case_insensitive; + sz_bool_t case_insensitive; + sz_size_t partial_order_length; } sz_sort_config_t; /** @@ -770,11 +870,13 @@ typedef struct sz_sort_config_t { */ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *config) { - int case_insensitive = config && config->case_insensitive; + sz_bool_t case_insensitive = config && config->case_insensitive; + sz_size_t partial_order_length = + config && config->partial_order_length ? config->partial_order_length : sequence->count; // Export up to 4 bytes into the `sequence` bits themselves for (sz_size_t i = 0; i != sequence->count; ++i) { - char const *begin = sequence->get_start(sequence->handle, sequence->order[i]); + sz_string_ptr_t begin = sequence->get_start(sequence->handle, sequence->order[i]); sz_size_t length = sequence->get_length(sequence->handle, sequence->order[i]); length = length > 4ul ? 4ul : length; char *prefix = (char *)&sequence->order[i]; @@ -787,11 +889,11 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf } } - sz_qsort_comparison_func_t comparator = _sz_sort_sequence_strncmp; - if (case_insensitive) comparator = _sz_sort_sequence_strncasecmp; + sz_sequence_comparator_t comparator = (sz_sequence_comparator_t)_sz_sort_compare_less_ascii; + if (case_insensitive) comparator = (sz_sequence_comparator_t)_sz_sort_compare_less_uncased_ascii; // Perform optionally-parallel radix sort on them - _sz_sort_recursion(sequence, 0, 32, comparator); + _sz_sort_recursion(sequence, 0, 32, comparator, partial_order_length); } typedef unsigned char levenstein_distance_t; @@ -806,9 +908,9 @@ inline static sz_size_t sz_levenstein_memory_needed(sz_size_t _, sz_size_t b_len * @brief Auxiliary function, that computes the minimum of three values. */ inline static levenstein_distance_t _sz_levenstein_minimum( // - levenstein_distance_t a, - levenstein_distance_t b, - levenstein_distance_t c) { + levenstein_distance_t const a, + levenstein_distance_t const b, + levenstein_distance_t const c) { return (a < b ? (a < c ? a : c) : (b < c ? b : c)); } @@ -818,11 +920,11 @@ inline static levenstein_distance_t _sz_levenstein_minimum( // * It accepts an upper bound on the possible error. Quadratic complexity in time, linear in space. */ inline static levenstein_distance_t sz_levenstein( // - char const *a, - sz_size_t a_length, - char const *b, - sz_size_t b_length, - levenstein_distance_t bound, + sz_string_ptr_t const a, + sz_size_t const a_length, + sz_string_ptr_t const b, + sz_size_t const b_length, + levenstein_distance_t const bound, void *buffer) { // If one of the strings is empty - the edit distance is equal to the length of the other one @@ -873,11 +975,11 @@ inline static levenstein_distance_t sz_levenstein( // /** * @brief Hashes provided string using hardware-accelerated CRC32 instructions. */ -inline static sz_u32_t sz_hash_crc32_native(char const *start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_native(sz_string_ptr_t start, sz_size_t length) { return 0; } -inline static sz_u32_t sz_hash_crc32_neon(char const *start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_neon(sz_string_ptr_t start, sz_size_t length) { return 0; } -inline static sz_u32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_sse(sz_string_ptr_t start, sz_size_t length) { return 0; } #ifdef __cplusplus } @@ -889,5 +991,6 @@ inline static sz_u32_t sz_hash_crc32_sse(char const *start, sz_size_t length) { #endif #undef popcount64 #undef ctz64 +#undef clz64 #endif // STRINGZILLA_H_ From 644630b852ad43be6ba092c3091b458446688c4c Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 14:38:44 -0700 Subject: [PATCH 5/8] Improve: Intro-sort --- stringzilla/stringzilla.h | 167 ++++++++++++++++++++++++++++++++------ 1 file changed, 142 insertions(+), 25 deletions(-) diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 0aa8774b..84e864cf 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -714,15 +714,15 @@ inline static void _sz_swap_order(sz_u64_t *a, sz_u64_t *b) { *b = t; } -struct sz_sequence_s; +struct sz_sequence_t; -typedef sz_string_ptr_t (*sz_sequence_member_start_t)(void const *, sz_size_t); -typedef sz_size_t (*sz_sequence_member_length_t)(void const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_predicate_t)(void const *, sz_size_t); -typedef sz_bool_t (*sz_sequence_comparator_t)(void const *, sz_size_t, sz_size_t); +typedef sz_string_ptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); +typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); +typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t); +typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t); typedef sz_bool_t (*sz_string_is_less_t)(sz_string_ptr_t, sz_size_t, sz_string_ptr_t, sz_size_t); -typedef struct sz_sequence_s { +typedef struct sz_sequence_t { sz_u64_t *order; sz_size_t count; sz_sequence_member_start_t get_start; @@ -738,10 +738,10 @@ typedef struct sz_sequence_s { inline static sz_size_t sz_partition(sz_sequence_t *sequence, sz_sequence_predicate_t predicate) { sz_size_t matches = 0; - while (matches != sequence->count && predicate(sequence->handle, sequence->order[matches])) ++matches; + while (matches != sequence->count && predicate(sequence, sequence->order[matches])) ++matches; for (sz_size_t i = matches + 1; i < sequence->count; ++i) - if (predicate(sequence->handle, sequence->order[i])) + if (predicate(sequence, sequence->order[i])) _sz_swap_order(sequence->order + i, sequence->order + matches), ++matches; return matches; @@ -758,13 +758,13 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq sz_size_t start_b = partition + 1; // If the direct merge is already sorted - if (!less(sequence->handle, sequence->order[start_b], sequence->order[partition])) return; + if (!less(sequence, sequence->order[start_b], sequence->order[partition])) return; sz_size_t start_a = 0; while (start_a <= partition && start_b <= sequence->count) { // If element 1 is in right place - if (!less(sequence->handle, sequence->order[start_b], sequence->order[start_a])) { start_a++; } + if (!less(sequence, sequence->order[start_b], sequence->order[start_a])) { start_a++; } else { sz_size_t value = sequence->order[start_b]; sz_size_t index = start_b; @@ -782,18 +782,135 @@ inline static void sz_merge(sz_sequence_t *sequence, sz_size_t partition, sz_seq } } -inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t comparator) { +inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_comparator_t less) { sz_u64_t *keys = sequence->order; sz_size_t keys_count = sequence->count; for (sz_size_t i = 1; i < keys_count; i++) { sz_u64_t i_key = keys[i]; - // Move elements of arr[0..i-1] that are greater than key to one position ahead of their current position sz_size_t j = i; - for (; j > 0 && comparator(sequence, keys[j - 1], i) != 0; --j) keys[j] = keys[j - 1]; + for (; j > 0 && less(sequence, i_key, keys[j - 1]); --j) keys[j] = keys[j - 1]; keys[j] = i_key; } } +// Utility functions +inline static sz_size_t _sz_log2i(sz_size_t n) { + sz_size_t log2 = 0; + while (n >>= 1) ++log2; + return log2; +} + +inline static void _sz_sift_down( + sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_u64_t *order, sz_size_t start, sz_size_t end) { + sz_size_t root = start; + while (2 * root + 1 <= end) { + sz_size_t child = 2 * root + 1; + if (child + 1 <= end && less(sequence, order[child], order[child + 1])) { child++; } + if (!less(sequence, order[root], order[child])) { return; } + _sz_swap_order(order + root, order + child); + root = child; + } +} + +inline static void _sz_heapify(sz_sequence_t *sequence, + sz_sequence_comparator_t less, + sz_u64_t *order, + sz_size_t count) { + sz_size_t start = (count - 2) / 2; + while (1) { + _sz_sift_down(sequence, less, order, start, count - 1); + if (start == 0) return; + start--; + } +} + +inline static void _sz_heapsort(sz_sequence_t *sequence, + sz_sequence_comparator_t less, + sz_size_t first, + sz_size_t last) { + sz_u64_t *order = sequence->order; + sz_size_t count = last - first; + _sz_heapify(sequence, less, order + first, count); + sz_size_t end = count - 1; + while (end > 0) { + _sz_swap_order(order + first, order + first + end); + end--; + _sz_sift_down(sequence, less, order + first, 0, end); + } +} + +inline static void _sz_introsort( + sz_sequence_t *sequence, sz_sequence_comparator_t less, sz_size_t first, sz_size_t last, sz_size_t depth) { + + sz_size_t length = last - first; + switch (length) { + case 0: + case 1: return; + case 2: + if (less(sequence, sequence->order[first + 1], sequence->order[first])) + _sz_swap_order(&sequence->order[first], &sequence->order[first + 1]); + return; + case 3: + sz_u64_t a = sequence->order[first]; + sz_u64_t b = sequence->order[first + 1]; + sz_u64_t c = sequence->order[first + 2]; + if (less(sequence, b, a)) _sz_swap_order(&a, &b); + if (less(sequence, c, b)) _sz_swap_order(&c, &b); + if (less(sequence, b, a)) _sz_swap_order(&a, &b); + sequence->order[first] = a; + sequence->order[first + 1] = b; + sequence->order[first + 2] = c; + return; + } + // Until a certain length, the quadratic-complexity insertion-sort is fine + if (length <= 16) { + sz_sequence_t sub_seq = *sequence; + sub_seq.order += first; + sub_seq.count = length; + sz_sort_insertion(&sub_seq, less); + return; + } + + // Fallback to N-logN-complexity heap-sort + if (depth == 0) { + _sz_heapsort(sequence, less, first, last); + return; + } + + --depth; + + // Median-of-three logic to choose pivot + sz_size_t median = first + length / 2; + if (less(sequence, sequence->order[median], sequence->order[first])) + _sz_swap_order(&sequence->order[first], &sequence->order[median]); + if (less(sequence, sequence->order[last - 1], sequence->order[first])) + _sz_swap_order(&sequence->order[first], &sequence->order[last - 1]); + if (less(sequence, sequence->order[median], sequence->order[last - 1])) + _sz_swap_order(&sequence->order[median], &sequence->order[last - 1]); + + // Partition using the median-of-three as the pivot + sz_u64_t pivot = sequence->order[median]; + sz_size_t left = first; + sz_size_t right = last - 1; + while (true) { + while (less(sequence, sequence->order[left], pivot)) left++; + while (less(sequence, pivot, sequence->order[right])) right--; + if (left >= right) break; + _sz_swap_order(&sequence->order[left], &sequence->order[right]); + left++; + right--; + } + + // Recursively sort the partitions + _sz_introsort(sequence, less, first, left, depth); + _sz_introsort(sequence, less, right + 1, last, depth); +} + +inline static void sz_sort_introsort(sz_sequence_t *sequence, sz_sequence_comparator_t less) { + sz_size_t depth_limit = 2 * _sz_log2i(sequence->count); + _sz_introsort(sequence, less, 0, sequence->count, depth_limit); +} + /** * @brief Internal Radix sorting procedure. */ @@ -834,28 +951,28 @@ inline static void _sz_sort_recursion( // sz_sequence_t a = *sequence; a.count = split; - sz_sort_insertion(&a, comparator); + sz_sort_introsort(&a, comparator); sz_sequence_t b = *sequence; b.order += split; b.count -= split; - sz_sort_insertion(&b, comparator); + sz_sort_introsort(&b, comparator); } } inline static sz_bool_t _sz_sort_compare_less_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key); - sz_size_t i_len = sequence->get_length(sequence->handle, i_key); - sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key); - sz_size_t j_len = sequence->get_length(sequence->handle, j_key); + sz_string_ptr_t i_str = sequence->get_start(sequence, i_key); + sz_size_t i_len = sequence->get_length(sequence, i_key); + sz_string_ptr_t j_str = sequence->get_start(sequence, j_key); + sz_size_t j_len = sequence->get_length(sequence, j_key); return sz_is_less_ascii(i_str, i_len, j_str, j_len); } inline static sz_bool_t _sz_sort_compare_less_uncased_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_string_ptr_t i_str = sequence->get_start(sequence->handle, i_key); - sz_size_t i_len = sequence->get_length(sequence->handle, i_key); - sz_string_ptr_t j_str = sequence->get_start(sequence->handle, j_key); - sz_size_t j_len = sequence->get_length(sequence->handle, j_key); + sz_string_ptr_t i_str = sequence->get_start(sequence, i_key); + sz_size_t i_len = sequence->get_length(sequence, i_key); + sz_string_ptr_t j_str = sequence->get_start(sequence, j_key); + sz_size_t j_len = sequence->get_length(sequence, j_key); return sz_is_less_uncased_ascii(i_str, i_len, j_str, j_len); } @@ -876,8 +993,8 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf // Export up to 4 bytes into the `sequence` bits themselves for (sz_size_t i = 0; i != sequence->count; ++i) { - sz_string_ptr_t begin = sequence->get_start(sequence->handle, sequence->order[i]); - sz_size_t length = sequence->get_length(sequence->handle, sequence->order[i]); + sz_string_ptr_t begin = sequence->get_start(sequence, sequence->order[i]); + sz_size_t length = sequence->get_length(sequence, sequence->order[i]); length = length > 4ul ? 4ul : length; char *prefix = (char *)&sequence->order[i]; for (sz_size_t j = 0; j != length; ++j) prefix[7 - j] = begin[j]; From 05a409ce7ab1f76582c7936d2a4d2d6c99e7b3ed Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 16:30:38 -0700 Subject: [PATCH 6/8] Refactor: New C API for JS --- javascript/lib.c | 62 +++++++++++++------------- javascript/test/find.js | 14 +++--- scripts/test.c | 13 +++--- scripts/test.cpp | 92 +++++++++++++++++++++------------------ stringzilla/stringzilla.h | 9 ++-- 5 files changed, 97 insertions(+), 93 deletions(-) diff --git a/javascript/lib.c b/javascript/lib.c index fe1f5f68..18e36a1b 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -8,17 +8,18 @@ * @see NodeJS docs: https://nodejs.org/api/n-api.html */ -#include -#include +#include // `napi_*` functions +#include // `malloc` +#include // `sz_*` functions -napi_value FindAPI(napi_env env, napi_callback_info info) { +napi_value indexOfAPI(napi_env env, napi_callback_info info) { size_t argc = 2; napi_value args[2]; napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - sz_haystack_t haystack_sz = {NULL, 0}; - sz_needle_t needle_sz = {NULL, 0, 0}; + sz_string_view_t haystack_sz = {NULL, 0}; + sz_string_view_t needle_sz = {NULL, 0}; // For haystack napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length); @@ -38,37 +39,32 @@ napi_value FindAPI(napi_env env, napi_callback_info info) { needle_sz.length + 1, (size_t *)&needle_sz.length); - // Perform the find operation - sz_size_t result = sz_find_substr(haystack_sz, needle_sz); - - // Cleanup - free((void *)haystack_sz.start); - free((void *)needle_sz.start); - // Convert the result to JavaScript BigInt and return napi_value js_result; + if (needle_sz.length == 0) { napi_create_bigint_int64(env, 0, &js_result); } + else { + sz_string_ptr_t result = + sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length); - // In JavaScript, if `find` is unable to find the specified value, then it should return -1 - if (result == haystack_sz.length) napi_create_bigint_int64(env, -1, &js_result); - else - napi_create_bigint_uint64(env, result, &js_result); + // In JavaScript, if `indexOf` is unable to indexOf the specified value, then it should return -1 + if (result == NULL) { napi_create_bigint_int64(env, -1, &js_result); } + else { napi_create_bigint_uint64(env, result - haystack_sz.start, &js_result); } + } + // Cleanup + free((void *)haystack_sz.start); + free((void *)needle_sz.start); return js_result; } -size_t count_char(sz_haystack_t haystack_sz, char needle) { - size_t result = sz_count_char(haystack_sz, needle); - return result; -} - -napi_value CountAPI(napi_env env, napi_callback_info info) { +napi_value countAPI(napi_env env, napi_callback_info info) { size_t argc = 3; napi_value args[3]; napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - sz_haystack_t haystack_sz = {NULL, 0}; - sz_needle_t needle_sz = {NULL, 0, 0}; + sz_string_view_t haystack_sz = {NULL, 0}; + sz_string_view_t needle_sz = {NULL, 0}; // For haystack napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length); @@ -95,11 +91,13 @@ napi_value CountAPI(napi_env env, napi_callback_info info) { size_t count = 0; if (needle_sz.length == 0 || haystack_sz.length == 0 || haystack_sz.length < needle_sz.length) { count = 0; } - else if (needle_sz.length == 1) { count = count_char(haystack_sz, needle_sz.start[0]); } + else if (needle_sz.length == 1) { count = sz_count_char(haystack_sz.start, haystack_sz.length, needle_sz.start); } else if (overlap) { while (haystack_sz.length) { - sz_size_t offset = sz_find_substr(haystack_sz, needle_sz); - int found = offset != haystack_sz.length; + sz_string_ptr_t ptr = + sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length); + sz_bool_t found = ptr != NULL; + sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length; count += found; haystack_sz.start += offset + found; haystack_sz.length -= offset + found; @@ -107,8 +105,10 @@ napi_value CountAPI(napi_env env, napi_callback_info info) { } else { while (haystack_sz.length) { - sz_size_t offset = sz_find_substr(haystack_sz, needle_sz); - int found = offset != haystack_sz.length; + sz_string_ptr_t ptr = + sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length); + sz_bool_t found = ptr != NULL; + sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length; count += found; haystack_sz.start += offset + needle_sz.length; haystack_sz.length -= offset + needle_sz.length * found; @@ -129,8 +129,8 @@ napi_value CountAPI(napi_env env, napi_callback_info info) { napi_value Init(napi_env env, napi_value exports) { // Define an array of property descriptors - napi_property_descriptor findDesc = {"find", 0, FindAPI, 0, 0, 0, napi_default, 0}; - napi_property_descriptor countDesc = {"count", 0, CountAPI, 0, 0, 0, napi_default, 0}; + napi_property_descriptor findDesc = {"indexOf", 0, indexOfAPI, 0, 0, 0, napi_default, 0}; + napi_property_descriptor countDesc = {"count", 0, countAPI, 0, 0, 0, napi_default, 0}; napi_property_descriptor properties[] = {findDesc, countDesc}; // Define the properties on the `exports` object diff --git a/javascript/test/find.js b/javascript/test/find.js index cd2a800d..9fe4e5b7 100644 --- a/javascript/test/find.js +++ b/javascript/test/find.js @@ -5,26 +5,26 @@ import assert from 'node:assert'; const stringzilla = bindings('stringzilla'); test('Find Word in Text - Positive Case', () => { - const result = stringzilla.find('hello world, hello john', 'hello'); + const result = stringzilla.indexOf('hello world, hello john', 'hello'); assert.strictEqual(result, 0n); }); test('Find Word in Text - Negative Case (Word Not Found)', () => { - const result_1 = stringzilla.find('ha', 'aaa'); + const result_1 = stringzilla.indexOf('ha', 'aaa'); assert.strictEqual(result_1, -1n); - const result_2 = stringzilla.find('g', 'a'); + const result_2 = stringzilla.indexOf('g', 'a'); assert.strictEqual(result_2, -1n); }); test('Find Word in Text - Negative Case (Empty String Inputs)', () => { - const result_1 = stringzilla.find('hello world', ''); + const result_1 = stringzilla.indexOf('hello world', ''); assert.strictEqual(result_1, 0n); - const result_2 = stringzilla.find('', 'a'); + const result_2 = stringzilla.indexOf('', 'a'); assert.strictEqual(result_2, -1n); - const result_3 = stringzilla.find('', ''); - assert.strictEqual(result_2, -1n); + const result_3 = stringzilla.indexOf('', ''); + assert.strictEqual(result_3, 0n); }); diff --git a/scripts/test.c b/scripts/test.c index a921e76d..127975b0 100644 --- a/scripts/test.c +++ b/scripts/test.c @@ -27,24 +27,23 @@ void test_sz_find_substr() { for (int variability = 1; variability < VARIABILITY; variability++) { populate_random_string(buffer, length, variability); - struct sz_haystack_t haystack; + sz_string_view_t haystack; haystack.start = buffer; haystack.length = length; int pattern_length = rand() % 5 + 1; populate_random_string(pattern, pattern_length, variability); - struct sz_needle_t needle; + sz_string_view_t needle; needle.start = pattern; needle.length = pattern_length; // Comparing the result of your function with the standard library function. - const char *result_libc = strstr(buffer, pattern); - uint64_t result_stringzilla = sz_find_substr(haystack, needle); + sz_string_ptr_t result_libc = strstr(buffer, pattern); + sz_string_ptr_t result_stringzilla = + sz_find_substr(haystack.start, haystack.length, needle.start, needle.length); - assert(((result_libc && result_stringzilla == (uint64_t)(result_libc - buffer)) || - (!result_libc && result_stringzilla == (uint64_t)-1)) && - "Test failed for sz_find_substr"); + assert(((result_libc == NULL) ^ (result_stringzilla == NULL)) && "Test failed for sz_find_substr"); } } } diff --git a/scripts/test.cpp b/scripts/test.cpp index ddef4e82..8dc1a4d2 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -1,39 +1,39 @@ -#include +#include #include -#include +#include #include -#include -#include -#include +#include #include -#include +#include +#include #include +#include #include using strings_t = std::vector; using idx_t = sz_size_t; -using permute_t = std::vector; +using permute_t = std::vector; #pragma region - C callbacks -static char const *get_start(void const *array_c, sz_size_t i) { - strings_t const &array = *reinterpret_cast(array_c); +static char const *get_start(sz_sequence_t const *array_c, sz_size_t i) { + strings_t const &array = *reinterpret_cast(array_c->handle); return array[i].c_str(); } -static sz_size_t get_length(void const *array_c, sz_size_t i) { - strings_t const &array = *reinterpret_cast(array_c); +static sz_size_t get_length(sz_sequence_t const *array_c, sz_size_t i) { + strings_t const &array = *reinterpret_cast(array_c->handle); return array[i].size(); } -static int is_less(void const *array_c, sz_size_t i, sz_size_t j) { - strings_t const &array = *reinterpret_cast(array_c); +static int is_less(sz_sequence_t const *array_c, sz_size_t i, sz_size_t j) { + strings_t const &array = *reinterpret_cast(array_c->handle); return array[i] < array[j]; } -static int has_under_four_chars(void const *array_c, sz_size_t i) { - strings_t const &array = *reinterpret_cast(array_c); +static int has_under_four_chars(sz_sequence_t const *array_c, sz_size_t i) { + strings_t const &array = *reinterpret_cast(array_c->handle); return array[i].size() < 4; } @@ -64,7 +64,7 @@ void populate_with_test(strings_t &strings) { constexpr size_t offset_in_word = 0; -inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) { +inline static idx_t hybrid_sort_cpp(strings_t const &strings, sz_u64_t *order) { // What if we take up-to 4 first characters and the index for (size_t i = 0; i != strings.size(); ++i) @@ -72,7 +72,7 @@ inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) { strings[order[i]].c_str(), std::min(strings[order[i]].size(), 4ul)); - std::sort(order, order + strings.size(), [&](idx_t i, idx_t j) { + std::sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { char *i_bytes = (char *)&i; char *j_bytes = (char *)&j; return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word); @@ -80,7 +80,7 @@ inline static idx_t hybrid_sort_cpp(strings_t const &strings, idx_t *order) { for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul); - std::sort(order, order + strings.size(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; }); + std::sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { return strings[i] < strings[j]; }); return strings.size(); } @@ -92,14 +92,14 @@ int hybrid_sort_c_compare_uint32_t(const void *a, const void *b) { } int hybrid_sort_c_compare_strings(void *arg, const void *a, const void *b) { - sz_sequence_t *seq = (sz_sequence_t *)arg; + sz_sequence_t *sequence = (sz_sequence_t *)arg; sz_size_t idx_a = *(sz_size_t *)a; sz_size_t idx_b = *(sz_size_t *)b; - const char *str_a = seq->get_start(seq->handle, idx_a); - const char *str_b = seq->get_start(seq->handle, idx_b); - sz_size_t len_a = seq->get_length(seq->handle, idx_a); - sz_size_t len_b = seq->get_length(seq->handle, idx_b); + const char *str_a = sequence->get_start(sequence, idx_a); + const char *str_b = sequence->get_start(sequence, idx_b); + sz_size_t len_a = sequence->get_length(sequence, idx_a); + sz_size_t len_b = sequence->get_length(sequence, idx_b); int res = strncmp(str_a, str_b, len_a < len_b ? len_a : len_b); return res ? res : (int)(len_a - len_b); @@ -108,8 +108,8 @@ int hybrid_sort_c_compare_strings(void *arg, const void *a, const void *b) { sz_size_t hybrid_sort_c(sz_sequence_t *sequence) { // Copy up to 4 first characters into the 'order' array. for (sz_size_t i = 0; i < sequence->count; ++i) { - const char *str = sequence->get_start(sequence->handle, sequence->order[i]); - sz_size_t len = sequence->get_length(sequence->handle, sequence->order[i]); + const char *str = sequence->get_start(sequence, sequence->order[i]); + sz_size_t len = sequence->get_length(sequence, sequence->order[i]); len = len > 4 ? 4 : len; memcpy((char *)&sequence->order[i] + sizeof(sz_size_t) - 4, str, len); } @@ -128,7 +128,7 @@ sz_size_t hybrid_sort_c(sz_sequence_t *sequence) { return sequence->count; } -inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *order) { +inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, sz_u64_t *order) { // What if we take up-to 4 first characters and the index for (size_t i = 0; i != strings.size(); ++i) @@ -136,7 +136,7 @@ inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *orde strings[order[i]].c_str(), std::min(strings[order[i]].size(), 4ul)); - std::stable_sort(order, order + strings.size(), [&](idx_t i, idx_t j) { + std::stable_sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { char *i_bytes = (char *)&i; char *j_bytes = (char *)&j; return *(uint32_t *)(i_bytes + offset_in_word) < *(uint32_t *)(j_bytes + offset_in_word); @@ -144,7 +144,7 @@ inline static idx_t hybrid_stable_sort_cpp(strings_t const &strings, idx_t *orde for (size_t i = 0; i != strings.size(); ++i) std::memset((char *)&order[i] + offset_in_word, 0, 4ul); - std::stable_sort(order, order + strings.size(), [&](idx_t i, idx_t j) { return strings[i] < strings[j]; }); + std::stable_sort(order, order + strings.size(), [&](sz_u64_t i, sz_u64_t j) { return strings[i] < strings[j]; }); return strings.size(); } @@ -209,7 +209,7 @@ int main(int, char const **) { std::printf("Hey, Ash!\n"); strings_t strings; - populate_from_file("leipzig1M.txt", strings, 10000000); + populate_from_file("leipzig1M.txt", strings, 1000000); std::size_t mean_bytes = 0; for (std::string const &str : strings) mean_bytes += str.size(); mean_bytes /= strings.size(); @@ -229,26 +229,23 @@ int main(int, char const **) { for (std::size_t needle_len = 1; needle_len <= 0; ++needle_len) { std::string needle(needle_len, '\4'); std::printf("---- Needle length: %zu\n", needle_len); - bench_search("std::search", full_text, [&]() { + bench_search("std::search", full_text, [&]() mutable { return std::search(full_text.begin(), full_text.end(), needle.begin(), needle.end()) - full_text.begin(); }); - bench_search("sz_find_substr_swar", full_text, [&]() { - sz_haystack_t h {full_text.data(), full_text.size()}; - sz_needle_t n {needle.data(), needle.size()}; - return sz_find_substr_swar(h, n); + bench_search("sz_find_substr_swar", full_text, [&]() mutable { + sz_string_ptr_t ptr = sz_find_substr_swar(full_text.data(), full_text.size(), needle.data(), needle.size()); + return ptr ? ptr - full_text.data() : full_text.size(); }); #if defined(__ARM_NEON) - bench_search("sz_find_substr_neon", full_text, [&]() { - sz_haystack_t h {full_text.data(), full_text.size()}; - sz_needle_t n {needle.data(), needle.size()}; - return sz_find_substr_neon(h, n); + bench_search("sz_find_substr_neon", full_text, [&]() mutable { + sz_string_ptr_t ptr = sz_find_substr_neon(full_text.data(), full_text.size(), needle.data(), needle.size()); + return ptr ? ptr - full_text.data() : full_text.size(); }); #endif #if defined(__AVX2__) - bench_search("sz_find_substr_avx2", full_text, [&]() { - sz_haystack_t h {full_text.data(), full_text.size()}; - sz_needle_t n {needle.data(), needle.size()}; - return sz_find_substr_avx2(h, n); + bench_search("sz_find_substr_avx2", full_text, [&]() mutable { + sz_string_ptr_t ptr = sz_find_substr_avx2(full_text.data(), full_text.size(), needle.data(), needle.size()); + return ptr ? ptr - full_text.data() : full_text.size(); }); #endif } @@ -300,6 +297,17 @@ int main(int, char const **) { }); expect_sorted(strings, permute_new); + bench_permute("sz_sort_introsort", strings, permute_new, [](strings_t const &strings, permute_t &permute) { + sz_sequence_t array; + array.order = permute.data(); + array.count = strings.size(); + array.handle = &strings; + array.get_start = get_start; + array.get_length = get_length; + sz_sort_introsort(&array, (sz_sequence_comparator_t)_sz_sort_compare_less_ascii); + }); + expect_sorted(strings, permute_new); + bench_permute("hybrid_sort_c", strings, permute_new, [](strings_t const &strings, permute_t &permute) { sz_sequence_t array; array.order = permute.data(); diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index 84e864cf..ba7f5f39 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -609,7 +609,7 @@ inline static sz_string_ptr_t sz_find_substr(sz_string_ptr_t const haystack, sz_size_t const haystack_length, sz_string_ptr_t const needle, sz_size_t const needle_length) { - if (haystack_length < needle_length) return NULL; + if (haystack_length < needle_length || needle_length == 0) return NULL; #if defined(__ARM_NEON) return sz_find_substr_neon(haystack, haystack_length, needle, needle_length); #elif defined(__AVX2__) @@ -850,7 +850,7 @@ inline static void _sz_introsort( if (less(sequence, sequence->order[first + 1], sequence->order[first])) _sz_swap_order(&sequence->order[first], &sequence->order[first + 1]); return; - case 3: + case 3: { sz_u64_t a = sequence->order[first]; sz_u64_t b = sequence->order[first + 1]; sz_u64_t c = sequence->order[first + 2]; @@ -862,6 +862,7 @@ inline static void _sz_introsort( sequence->order[first + 2] = c; return; } + } // Until a certain length, the quadratic-complexity insertion-sort is fine if (length <= 16) { sz_sequence_t sub_seq = *sequence; @@ -1102,10 +1103,6 @@ inline static sz_u32_t sz_hash_crc32_sse(sz_string_ptr_t start, sz_size_t length } #endif -#ifdef _MSC_VER -#undef strncasecmp -#undef strcasecmp -#endif #undef popcount64 #undef ctz64 #undef clz64 From cffae4a684437eafe3ed75299d2fb8c82baa1019 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 16:50:13 -0700 Subject: [PATCH 7/8] Refactor: Sync up Py and JS bindings --- javascript/lib.c | 87 ++++++--------- python/lib.c | 183 +++++++++++++++--------------- scripts/test.cpp | 15 ++- stringzilla/stringzilla.h | 226 ++++++++++++++++++++------------------ 4 files changed, 253 insertions(+), 258 deletions(-) diff --git a/javascript/lib.c b/javascript/lib.c index 18e36a1b..8ebe72eb 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -18,42 +18,33 @@ napi_value indexOfAPI(napi_env env, napi_callback_info info) { napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - sz_string_view_t haystack_sz = {NULL, 0}; - sz_string_view_t needle_sz = {NULL, 0}; + sz_string_view_t haystack = {NULL, 0}; + sz_string_view_t needle = {NULL, 0}; // For haystack - napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length); - haystack_sz.start = malloc(haystack_sz.length + 1); - napi_get_value_string_utf8(env, - args[0], - (char *)haystack_sz.start, - haystack_sz.length + 1, - (size_t *)&haystack_sz.length); + napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack.length); + haystack.start = malloc(haystack.length + 1); + napi_get_value_string_utf8(env, args[0], (char *)haystack.start, haystack.length + 1, (size_t *)&haystack.length); // For needle - napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle_sz.length); - needle_sz.start = malloc(needle_sz.length + 1); - napi_get_value_string_utf8(env, - args[1], - (char *)needle_sz.start, - needle_sz.length + 1, - (size_t *)&needle_sz.length); + napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle.length); + needle.start = malloc(needle.length + 1); + napi_get_value_string_utf8(env, args[1], (char *)needle.start, needle.length + 1, (size_t *)&needle.length); // Convert the result to JavaScript BigInt and return napi_value js_result; - if (needle_sz.length == 0) { napi_create_bigint_int64(env, 0, &js_result); } + if (needle.length == 0) { napi_create_bigint_int64(env, 0, &js_result); } else { - sz_string_ptr_t result = - sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length); + sz_string_start_t result = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); // In JavaScript, if `indexOf` is unable to indexOf the specified value, then it should return -1 if (result == NULL) { napi_create_bigint_int64(env, -1, &js_result); } - else { napi_create_bigint_uint64(env, result - haystack_sz.start, &js_result); } + else { napi_create_bigint_uint64(env, result - haystack.start, &js_result); } } // Cleanup - free((void *)haystack_sz.start); - free((void *)needle_sz.start); + free((void *)haystack.start); + free((void *)needle.start); return js_result; } @@ -63,55 +54,45 @@ napi_value countAPI(napi_env env, napi_callback_info info) { napi_get_cb_info(env, info, &argc, args, NULL, NULL); // Extract the C string from the JavaScript string for haystack and needle - sz_string_view_t haystack_sz = {NULL, 0}; - sz_string_view_t needle_sz = {NULL, 0}; + sz_string_view_t haystack = {NULL, 0}; + sz_string_view_t needle = {NULL, 0}; // For haystack - napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack_sz.length); - haystack_sz.start = malloc(haystack_sz.length + 1); - napi_get_value_string_utf8(env, - args[0], - (char *)haystack_sz.start, - haystack_sz.length + 1, - (size_t *)&haystack_sz.length); + napi_get_value_string_utf8(env, args[0], NULL, 0, (size_t *)&haystack.length); + haystack.start = malloc(haystack.length + 1); + napi_get_value_string_utf8(env, args[0], (char *)haystack.start, haystack.length + 1, (size_t *)&haystack.length); // For needle - napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle_sz.length); - needle_sz.start = malloc(needle_sz.length + 1); - napi_get_value_string_utf8(env, - args[1], - (char *)needle_sz.start, - needle_sz.length + 1, - (size_t *)&needle_sz.length); + napi_get_value_string_utf8(env, args[1], NULL, 0, (size_t *)&needle.length); + needle.start = malloc(needle.length + 1); + napi_get_value_string_utf8(env, args[1], (char *)needle.start, needle.length + 1, (size_t *)&needle.length); bool overlap = false; if (argc > 2) { napi_get_value_bool(env, args[2], &overlap); } - void const *haystack_start = haystack_sz.start, *needle_start = needle_sz.start; + void const *haystack_start = haystack.start, *needle_start = needle.start; size_t count = 0; - if (needle_sz.length == 0 || haystack_sz.length == 0 || haystack_sz.length < needle_sz.length) { count = 0; } - else if (needle_sz.length == 1) { count = sz_count_char(haystack_sz.start, haystack_sz.length, needle_sz.start); } + if (needle.length == 0 || haystack.length == 0 || haystack.length < needle.length) { count = 0; } + else if (needle.length == 1) { count = sz_count_char(haystack.start, haystack.length, needle.start); } else if (overlap) { - while (haystack_sz.length) { - sz_string_ptr_t ptr = - sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length); + while (haystack.length) { + sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); sz_bool_t found = ptr != NULL; - sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length; + sz_size_t offset = found ? ptr - haystack.start : haystack.length; count += found; - haystack_sz.start += offset + found; - haystack_sz.length -= offset + found; + haystack.start += offset + found; + haystack.length -= offset + found; } } else { - while (haystack_sz.length) { - sz_string_ptr_t ptr = - sz_find_substr(haystack_sz.start, haystack_sz.length, needle_sz.start, needle_sz.length); + while (haystack.length) { + sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); sz_bool_t found = ptr != NULL; - sz_size_t offset = found ? ptr - haystack_sz.start : haystack_sz.length; + sz_size_t offset = found ? ptr - haystack.start : haystack.length; count += found; - haystack_sz.start += offset + needle_sz.length; - haystack_sz.length -= offset + needle_sz.length * found; + haystack.start += offset + needle.length; + haystack.length -= offset + needle.length * found; } } diff --git a/python/lib.c b/python/lib.c index a0f6caca..c0ad69d4 100644 --- a/python/lib.c +++ b/python/lib.c @@ -38,10 +38,7 @@ static PyTypeObject FileType; static PyTypeObject StrType; static PyTypeObject StrsType; -static struct { - void *start; - size_t length; -} temporary_memory = {NULL, 0}; +static sz_string_view_t temporary_memory = {NULL, 0}; /** * @brief Describes an on-disk file mapped into RAM, which is different from Python's @@ -55,8 +52,8 @@ typedef struct { #else int file_descriptor; #endif - void *start; - size_t length; + sz_string_start_t start; + sz_size_t length; } File; /** @@ -73,8 +70,8 @@ typedef struct { */ typedef struct { PyObject_HEAD PyObject *parent; - char const *start; - size_t length; + sz_string_start_t start; + sz_size_t length; } Str; /** @@ -133,7 +130,7 @@ typedef struct { struct reordered_slices_t { size_t count; PyObject *parent; - sz_haystack_t *parts; + sz_string_view_t *parts; } reordered; } data; @@ -144,10 +141,13 @@ typedef struct { #pragma region Helpers -typedef int boolean_t; +inline static sz_string_start_t haystacks_get_start(sz_sequence_t *seq, sz_size_t i) { + return ((sz_string_view_t const *)seq->handle)[i].start; +} -inline static char const *haystacks_get_start(sz_haystack_t const *parts, sz_size_t i) { return parts[i].start; } -inline static size_t haystacks_get_length(sz_haystack_t const *parts, sz_size_t i) { return parts[i].length; } +inline static sz_size_t haystacks_get_length(sz_sequence_t *seq, sz_size_t i) { + return ((sz_string_view_t const *)seq->handle)[i].length; +} void reverse_offsets(sz_size_t *array, size_t length) { size_t i, j; @@ -159,21 +159,21 @@ void reverse_offsets(sz_size_t *array, size_t length) { } } -void reverse_haystacks(sz_haystack_t *array, size_t length) { +void reverse_haystacks(sz_string_view_t *array, size_t length) { size_t i, j; // Swap array[i] and array[j] for (i = 0, j = length - 1; i < j; i++, j--) { - sz_haystack_t temp = array[i]; + sz_string_view_t temp = array[i]; array[i] = array[j]; array[j] = temp; } } -void apply_order(sz_haystack_t *array, sz_size_t *order, size_t length) { - for (size_t i = 0; i < length; ++i) { +void apply_order(sz_string_view_t *array, sz_u64_t *order, size_t length) { + for (sz_u64_t i = 0; i < length; ++i) { if (i == order[i]) continue; - sz_haystack_t temp = array[i]; - size_t k = i, j; + sz_string_view_t temp = array[i]; + sz_u64_t k = i, j; while (i != (j = order[k])) { array[k] = array[j]; order[k] = k; @@ -205,7 +205,7 @@ void slice(size_t length, ssize_t start, ssize_t end, size_t *normalized_offset, *normalized_length = end - start; } -boolean_t export_string_like(PyObject *object, char const **start, size_t *length) { +sz_bool_t export_string_like(PyObject *object, sz_string_start_t **start, sz_size_t *length) { if (PyUnicode_Check(object)) { // Handle Python str Py_ssize_t signed_length; @@ -277,7 +277,7 @@ get_string_at_offset_t str_at_offset_getter(Strs *strs) { } } -boolean_t prepare_strings_for_reordering(Strs *strs) { +sz_bool_t prepare_strings_for_reordering(Strs *strs) { // Allocate memory for reordered slices size_t count = 0; @@ -306,7 +306,7 @@ boolean_t prepare_strings_for_reordering(Strs *strs) { return 0; } - sz_haystack_t *new_parts = (sz_haystack_t *)malloc(count * sizeof(sz_haystack_t)); + sz_string_view_t *new_parts = (sz_string_view_t *)malloc(count * sizeof(sz_string_view_t)); if (new_parts == NULL) { PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for reordered slices"); return 0; @@ -333,7 +333,7 @@ boolean_t prepare_strings_for_reordering(Strs *strs) { return 1; } -boolean_t prepare_strings_for_extension(Strs *strs, size_t new_parents, size_t new_parts) { return 1; } +sz_bool_t prepare_strings_for_extension(Strs *strs, size_t new_parents, size_t new_parts) { return 1; } #pragma endregion @@ -622,8 +622,8 @@ static int Str_getbuffer(Str *self, Py_buffer *view, int flags) { view->itemsize = sizeof(char); view->format = "c"; // https://docs.python.org/3/library/struct.html#format-characters view->ndim = 1; - view->shape = &self->length; // 1-D array, so shape is just a pointer to the length - view->strides = itemsize; // strides in a 1-D array is just the item size + view->shape = (Py_ssize_t *)&self->length; // 1-D array, so shape is just a pointer to the length + view->strides = itemsize; // strides in a 1-D array is just the item size view->suboffsets = NULL; view->internal = NULL; @@ -639,18 +639,13 @@ static void Str_releasebuffer(PyObject *_, Py_buffer *view) { static int Str_in(Str *self, PyObject *arg) { - sz_needle_t needle_struct; - needle_struct.quadgram_offset = 0; + sz_string_view_t needle_struct; if (!export_string_like(arg, &needle_struct.start, &needle_struct.length)) { PyErr_SetString(PyExc_TypeError, "Unsupported argument type"); return -1; } - sz_haystack_t haystack; - haystack.start = self->start; - haystack.length = self->length; - size_t position = sz_find_substr(haystack, needle_struct); - return position != haystack.length; + return sz_find_substring(self->start, self->length, needle_struct.start, needle_struct.length) != NULL; } static Py_ssize_t Strs_len(Strs *self) { @@ -756,12 +751,12 @@ static PyObject *Strs_subscript(Strs *self, PyObject *key) { to->count = stop - start; to->parent = from->parent; - to->parts = malloc(sizeof(sz_haystack_t) * to->count); + to->parts = malloc(sizeof(sz_string_view_t) * to->count); if (to->parts == NULL && PyErr_NoMemory()) { Py_XDECREF(self_slice); return NULL; } - memcpy(to->parts, from->parts + start, sizeof(sz_haystack_t) * to->count); + memcpy(to->parts, from->parts + start, sizeof(sz_string_view_t) * to->count); Py_INCREF(to->parent); break; } @@ -816,8 +811,8 @@ static int Str_find_( // PyObject *args, PyObject *kwargs, Py_ssize_t *offset_out, - sz_haystack_t *haystack_out, - sz_needle_t *needle_out) { + sz_string_view_t *haystack_out, + sz_string_view_t *needle_out) { int is_member = self != NULL && PyObject_TypeCheck(self, &StrType); Py_ssize_t nargs = PyTuple_Size(args); @@ -845,12 +840,11 @@ static int Str_find_( // } } - sz_haystack_t haystack; - sz_needle_t needle; + sz_string_view_t haystack; + sz_string_view_t needle; Py_ssize_t start, end; // Validate and convert `haystack` and `needle` - needle.quadgram_offset = 0; if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || !export_string_like(needle_obj, &needle.start, &needle.length)) { PyErr_SetString(PyExc_TypeError, "Haystack and needle must be string-like"); @@ -884,9 +878,9 @@ static int Str_find_( // haystack.length = normalized_length; // Perform contains operation - size_t offset = sz_find_substr(haystack, needle); - if (offset == haystack.length) { *offset_out = -1; } - else { *offset_out = (Py_ssize_t)offset; } + sz_string_start_t match = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); + if (match == NULL) { *offset_out = -1; } + else { *offset_out = (Py_ssize_t)(match - haystack.start); } *haystack_out = haystack; *needle_out = needle; @@ -895,16 +889,16 @@ static int Str_find_( // static PyObject *Str_find(PyObject *self, PyObject *args, PyObject *kwargs) { Py_ssize_t signed_offset; - sz_haystack_t text; - sz_needle_t separator; + sz_string_view_t text; + sz_string_view_t separator; if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL; return PyLong_FromSsize_t(signed_offset); } static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) { Py_ssize_t signed_offset; - sz_haystack_t text; - sz_needle_t separator; + sz_string_view_t text; + sz_string_view_t separator; if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL; if (signed_offset == -1) { PyErr_SetString(PyExc_ValueError, "substring not found"); @@ -915,8 +909,8 @@ static PyObject *Str_index(PyObject *self, PyObject *args, PyObject *kwargs) { static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs) { Py_ssize_t signed_offset; - sz_haystack_t text; - sz_needle_t separator; + sz_string_view_t text; + sz_string_view_t separator; if (!Str_find_(self, args, kwargs, &signed_offset, &text, &separator)) return NULL; if (signed_offset == -1) { Py_RETURN_FALSE; } else { Py_RETURN_TRUE; } @@ -924,8 +918,8 @@ static PyObject *Str_contains(PyObject *self, PyObject *args, PyObject *kwargs) static PyObject *Str_partition(PyObject *self, PyObject *args, PyObject *kwargs) { Py_ssize_t separator_index; - sz_haystack_t text; - sz_needle_t separator; + sz_string_view_t text; + sz_string_view_t separator; PyObject *result_tuple; // Use Str_find_ to get the index of the separator @@ -993,13 +987,12 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { return NULL; } - sz_haystack_t haystack; - sz_needle_t needle; + sz_string_view_t haystack; + sz_string_view_t needle; Py_ssize_t start = start_obj ? PyLong_AsSsize_t(start_obj) : 0; Py_ssize_t end = end_obj ? PyLong_AsSsize_t(end_obj) : PY_SSIZE_T_MAX; int allowoverlap = allowoverlap_obj ? PyObject_IsTrue(allowoverlap_obj) : 0; - needle.quadgram_offset = 0; if (!export_string_like(haystack_obj, &haystack.start, &haystack.length) || !export_string_like(needle_obj, &needle.start, &needle.length)) return PyErr_Format(PyExc_TypeError, "Haystack and needle must be string-like"), NULL; @@ -1013,27 +1006,28 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) { size_t count = 0; if (needle.length == 0 || haystack.length == 0 || haystack.length < needle.length) { count = 0; } - else if (needle.length == 1) { count = sz_count_char(haystack, needle.start[0]); } - else if (needle.length != 1) { - if (allowoverlap) { - while (haystack.length) { - sz_size_t offset = sz_find_substr(haystack, needle); - int found = offset != haystack.length; - count += found; - haystack.start += offset + found; - haystack.length -= offset + found; - } + else if (needle.length == 1) { count = sz_count_char(haystack.start, haystack.length, needle.start); } + else if (allowoverlap) { + while (haystack.length) { + sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); + sz_bool_t found = ptr != NULL; + sz_size_t offset = found ? ptr - haystack.start : haystack.length; + count += found; + haystack.start += offset + found; + haystack.length -= offset + found; } - else { - while (haystack.length) { - sz_size_t offset = sz_find_substr(haystack, needle); - int found = offset != haystack.length; - count += found; - haystack.start += offset + needle.length; - haystack.length -= offset + needle.length * found; - } + } + else { + while (haystack.length) { + sz_string_start_t ptr = sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); + sz_bool_t found = ptr != NULL; + sz_size_t offset = found ? ptr - haystack.start : haystack.length; + count += found; + haystack.start += offset + needle.length; + haystack.length -= offset + needle.length * found; } } + return PyLong_FromSize_t(count); } @@ -1068,7 +1062,7 @@ static PyObject *Str_levenstein(PyObject *self, PyObject *args, PyObject *kwargs return NULL; } - sz_haystack_t str1, str2; + sz_string_view_t str1, str2; if (!export_string_like(str1_obj, &str1.start, &str1.length) || !export_string_like(str2_obj, &str2.start, &str2.length)) { PyErr_Format(PyExc_TypeError, "Both arguments must be string-like"); @@ -1119,7 +1113,7 @@ static PyObject *Str_startswith(PyObject *self, PyObject *args, PyObject *kwargs return NULL; } - sz_haystack_t str, prefix; + sz_string_view_t str, prefix; if (!export_string_like(str_obj, &str.start, &str.length) || !export_string_like(prefix_obj, &prefix.start, &prefix.length)) { PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); @@ -1162,7 +1156,7 @@ static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs) return NULL; } - sz_haystack_t str, suffix; + sz_string_view_t str, suffix; if (!export_string_like(str_obj, &str.start, &str.length) || !export_string_like(suffix_obj, &suffix.start, &suffix.length)) { PyErr_SetString(PyExc_TypeError, "Both arguments must be string-like"); @@ -1180,7 +1174,7 @@ static PyObject *Str_endswith(PyObject *self, PyObject *args, PyObject *kwargs) } static Strs *Str_split_( - PyObject *parent, sz_haystack_t text, sz_needle_t separator, int keepseparator, Py_ssize_t maxsplit) { + PyObject *parent, sz_string_view_t text, sz_string_view_t separator, int keepseparator, Py_ssize_t maxsplit) { // Create Strs object Strs *result = (Strs *)PyObject_New(Strs, &StrsType); @@ -1209,10 +1203,9 @@ static Strs *Str_split_( // Iterate through string, keeping track of the sz_size_t last_start = 0; while (last_start <= text.length && offsets_count < maxsplit) { - sz_haystack_t text_remaining; - text_remaining.start = text.start + last_start; - text_remaining.length = text.length - last_start; - sz_size_t offset_in_remaining = sz_find_substr(text_remaining, separator); + sz_string_start_t match = + sz_find_substring(text.start + last_start, text.length - last_start, separator.start, separator.length); + sz_size_t offset_in_remaining = match ? match - text.start - last_start : text.length - last_start; // Reallocate offsets array if needed if (offsets_count >= offsets_capacity) { @@ -1232,7 +1225,7 @@ static Strs *Str_split_( } // Export the offset - size_t will_continue = offset_in_remaining != text_remaining.length; + size_t will_continue = match != NULL; size_t next_offset = last_start + offset_in_remaining + separator.length * will_continue; if (text.length >= UINT32_MAX) { ((uint64_t *)offsets_endings)[offsets_count++] = (uint64_t)next_offset; } else { ((uint32_t *)offsets_endings)[offsets_count++] = (uint32_t)next_offset; } @@ -1282,11 +1275,10 @@ static PyObject *Str_split(PyObject *self, PyObject *args, PyObject *kwargs) { } } - sz_haystack_t text; - sz_needle_t separator; + sz_string_view_t text; + sz_string_view_t separator; int keepseparator; Py_ssize_t maxsplit; - separator.quadgram_offset = 0; // Validate and convert `text` if (!export_string_like(text_obj, &text.start, &text.length)) { @@ -1355,7 +1347,7 @@ static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs } } - sz_haystack_t text; + sz_string_view_t text; int keeplinebreaks; Py_ssize_t maxsplit = PY_SSIZE_T_MAX; // Default value for maxsplit @@ -1388,14 +1380,14 @@ static PyObject *Str_splitlines(PyObject *self, PyObject *args, PyObject *kwargs // https://docs.python.org/3/library/stdtypes.html#str.splitlines // \n, \r, \r\n, \v or \x0b, \f or \x0c, \x1c, \x1d, \x1e, \x85, \u2028, \u2029 // https://github.com/ashvardanian/StringZilla/issues/29 - sz_needle_t separator; + sz_string_view_t separator; separator.start = "\n"; separator.length = 1; return Str_split_(text_obj, text, separator, keeplinebreaks, maxsplit); } static PyObject *Str_concat(PyObject *self, PyObject *other) { - struct sz_haystack_t self_str, other_str; + struct sz_string_view_t self_str, other_str; // Validate and convert `self` if (!export_string_like(self, &self_str.start, &self_str.length)) { @@ -1453,7 +1445,8 @@ static PyNumberMethods Str_as_number = { #define sz_method_flags_m METH_VARARGS | METH_KEYWORDS -static PyMethodDef Str_methods[] = { // +static PyMethodDef Str_methods[] = { + // {"find", Str_find, sz_method_flags_m, "Find the first occurrence of a substring."}, {"index", Str_index, sz_method_flags_m, "Find the first occurrence of a substring or raise error if missing."}, {"contains", Str_contains, sz_method_flags_m, "Check if a string contains a substring."}, @@ -1537,14 +1530,14 @@ static PyObject *Strs_shuffle(Strs *self, PyObject *args, PyObject *kwargs) { // Get the parts and their count struct reordered_slices_t *reordered = &self->data.reordered; - sz_haystack_t *parts = reordered->parts; + sz_string_view_t *parts = reordered->parts; size_t count = reordered->count; // Fisher-Yates Shuffle Algorithm for (size_t i = count - 1; i > 0; --i) { size_t j = rand() % (i + 1); // Swap parts[i] and parts[j] - sz_haystack_t temp = parts[i]; + sz_string_view_t temp = parts[i]; parts[i] = parts[j]; parts[j] = temp; } @@ -1552,8 +1545,8 @@ static PyObject *Strs_shuffle(Strs *self, PyObject *args, PyObject *kwargs) { Py_RETURN_NONE; } -static boolean_t Strs_sort_(Strs *self, - sz_haystack_t **parts_output, +static sz_bool_t Strs_sort_(Strs *self, + sz_string_view_t **parts_output, sz_size_t **order_output, sz_size_t *count_output) { @@ -1565,7 +1558,7 @@ static boolean_t Strs_sort_(Strs *self, // Get the parts and their count // The only possible `self->type` by now is the `STRS_REORDERED` - sz_haystack_t *parts = self->data.reordered.parts; + sz_string_view_t *parts = self->data.reordered.parts; size_t count = self->data.reordered.count; // Allocate temporary memory to store the ordering offsets @@ -1627,7 +1620,7 @@ static PyObject *Strs_sort(Strs *self, PyObject *args, PyObject *kwargs) { } } - boolean_t reverse = 0; // Default is False + sz_bool_t reverse = 0; // Default is False if (reverse_obj) { if (!PyBool_Check(reverse_obj)) { PyErr_SetString(PyExc_TypeError, "The reverse must be a boolean"); @@ -1636,7 +1629,7 @@ static PyObject *Strs_sort(Strs *self, PyObject *args, PyObject *kwargs) { reverse = PyObject_IsTrue(reverse_obj); } - sz_haystack_t *parts = NULL; + sz_string_view_t *parts = NULL; sz_size_t *order = NULL; sz_size_t count = 0; if (!Strs_sort_(self, &parts, &order, &count)) return NULL; @@ -1680,7 +1673,7 @@ static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) { } } - boolean_t reverse = 0; // Default is False + sz_bool_t reverse = 0; // Default is False if (reverse_obj) { if (!PyBool_Check(reverse_obj)) { PyErr_SetString(PyExc_TypeError, "The reverse must be a boolean"); @@ -1689,7 +1682,7 @@ static PyObject *Strs_order(Strs *self, PyObject *args, PyObject *kwargs) { reverse = PyObject_IsTrue(reverse_obj); } - sz_haystack_t *parts = NULL; + sz_string_view_t *parts = NULL; sz_size_t *order = NULL; sz_size_t count = 0; if (!Strs_sort_(self, &parts, &order, &count)) return NULL; diff --git a/scripts/test.cpp b/scripts/test.cpp index 8dc1a4d2..b61b7d40 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -232,19 +232,22 @@ int main(int, char const **) { bench_search("std::search", full_text, [&]() mutable { return std::search(full_text.begin(), full_text.end(), needle.begin(), needle.end()) - full_text.begin(); }); - bench_search("sz_find_substr_swar", full_text, [&]() mutable { - sz_string_ptr_t ptr = sz_find_substr_swar(full_text.data(), full_text.size(), needle.data(), needle.size()); + bench_search("sz_find_substring_swar", full_text, [&]() mutable { + sz_string_start_t ptr = + sz_find_substring_swar(full_text.data(), full_text.size(), needle.data(), needle.size()); return ptr ? ptr - full_text.data() : full_text.size(); }); #if defined(__ARM_NEON) - bench_search("sz_find_substr_neon", full_text, [&]() mutable { - sz_string_ptr_t ptr = sz_find_substr_neon(full_text.data(), full_text.size(), needle.data(), needle.size()); + bench_search("sz_find_substring_neon", full_text, [&]() mutable { + sz_string_start_t ptr = + sz_find_substring_neon(full_text.data(), full_text.size(), needle.data(), needle.size()); return ptr ? ptr - full_text.data() : full_text.size(); }); #endif #if defined(__AVX2__) - bench_search("sz_find_substr_avx2", full_text, [&]() mutable { - sz_string_ptr_t ptr = sz_find_substr_avx2(full_text.data(), full_text.size(), needle.data(), needle.size()); + bench_search("sz_find_substring_avx2", full_text, [&]() mutable { + sz_string_start_t ptr = + sz_find_substring_avx2(full_text.data(), full_text.size(), needle.data(), needle.size()); return ptr ? ptr - full_text.data() : full_text.size(); }); #endif diff --git a/stringzilla/stringzilla.h b/stringzilla/stringzilla.h index ba7f5f39..c7c0ae49 100644 --- a/stringzilla/stringzilla.h +++ b/stringzilla/stringzilla.h @@ -45,16 +45,16 @@ typedef unsigned long sz_size_t; typedef unsigned sz_size_t; #endif -typedef int sz_bool_t; // Only one relevant bit -typedef unsigned sz_u32_t; // Always 32 bits -typedef unsigned long long sz_u64_t; // Always 64 bits -typedef char const *sz_string_ptr_t; // A type alias for `char const * ` +typedef int sz_bool_t; // Only one relevant bit +typedef unsigned sz_u32_t; // Always 32 bits +typedef unsigned long long sz_u64_t; // Always 64 bits +typedef char const *sz_string_start_t; // A type alias for `char const * ` /** * @brief Helper construct for higher-level bindings. */ typedef struct sz_string_view_t { - sz_string_ptr_t start; + sz_string_start_t start; sz_size_t length; } sz_string_view_t; @@ -72,8 +72,8 @@ typedef union _sz_anomaly_t { * Doesn't provide major performance improvements, but helps avoid the LibC dependency. * @return 1 for `true`, and 0 for `false`. */ -inline static sz_bool_t sz_equal(sz_string_ptr_t a, sz_string_ptr_t b, sz_size_t length) { - sz_string_ptr_t const a_end = a + length; +inline static sz_bool_t sz_equal(sz_string_start_t a, sz_string_start_t b, sz_size_t length) { + sz_string_start_t const a_end = a + length; while (a != a_end && *a == *b) a++, b++; return a_end == a; } @@ -82,13 +82,13 @@ inline static sz_bool_t sz_equal(sz_string_ptr_t a, sz_string_ptr_t b, sz_size_t * @brief Count the number of occurrences of a @b single-character needle in an arbitrary length haystack. * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_size_t sz_count_char_swar(sz_string_ptr_t const haystack, +inline static sz_size_t sz_count_char_swar(sz_string_start_t const haystack, sz_size_t const haystack_length, - sz_string_ptr_t const needle) { + sz_string_start_t const needle) { sz_size_t result = 0; - sz_string_ptr_t text = haystack; - sz_string_ptr_t const end = haystack + haystack_length; + sz_string_start_t text = haystack; + sz_string_start_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text < end; ++text) result += *text == *needle; @@ -117,12 +117,12 @@ inline static sz_size_t sz_count_char_swar(sz_string_ptr_t const haystack, * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. * Identical to `memchr(haystack, needle[0], haystack_length)`. */ -inline static sz_string_ptr_t sz_find_1char_swar(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_find_1char_swar(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { - sz_string_ptr_t text = haystack; - sz_string_ptr_t const end = haystack + haystack_length; + sz_string_start_t text = haystack; + sz_string_start_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text < end; ++text) @@ -154,12 +154,12 @@ inline static sz_string_ptr_t sz_find_1char_swar(sz_string_ptr_t const haystack, * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. * Identical to `memrchr(haystack, needle[0], haystack_length)`. */ -inline static sz_string_ptr_t sz_rfind_1char_swar(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_rfind_1char_swar(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { - sz_string_ptr_t const end = haystack + haystack_length; - sz_string_ptr_t text = end - 1; + sz_string_start_t const end = haystack + haystack_length; + sz_string_start_t text = end - 1; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text >= haystack; --text) @@ -190,12 +190,12 @@ inline static sz_string_ptr_t sz_rfind_1char_swar(sz_string_ptr_t const haystack * @brief Find the first occurrence of a @b two-character needle in an arbitrary length haystack. * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_string_ptr_t sz_find_2char_swar(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_find_2char_swar(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { - sz_string_ptr_t text = haystack; - sz_string_ptr_t const end = haystack + haystack_length; + sz_string_start_t text = haystack; + sz_string_start_t const end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text + 2 <= end; ++text) @@ -239,12 +239,12 @@ inline static sz_string_ptr_t sz_find_2char_swar(sz_string_ptr_t const haystack, * @brief Find the first occurrence of a three-character needle in an arbitrary length haystack. * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_string_ptr_t sz_find_3char_swar(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_find_3char_swar(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { - sz_string_ptr_t text = haystack; - sz_string_ptr_t end = haystack + haystack_length; + sz_string_start_t text = haystack; + sz_string_start_t end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text + 3 <= end; ++text) @@ -301,12 +301,12 @@ inline static sz_string_ptr_t sz_find_3char_swar(sz_string_ptr_t const haystack, * @brief Find the first occurrence of a @b four-character needle in an arbitrary length haystack. * This implementation uses hardware-agnostic SWAR technique, to process 8 characters at a time. */ -inline static sz_string_ptr_t sz_find_4char_swar(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_find_4char_swar(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { - sz_string_ptr_t text = haystack; - sz_string_ptr_t end = haystack + haystack_length; + sz_string_start_t text = haystack; + sz_string_start_t end = haystack + haystack_length; // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((unsigned long)text & 7ul) && text + 4 <= end; ++text) @@ -367,10 +367,10 @@ inline static sz_string_ptr_t sz_find_4char_swar(sz_string_ptr_t const haystack, * it compares 4-byte anomalies first, most commonly prefixes. It's computationally cheaper. * Matching performance fluctuates between 1 GB/s and 3,5 GB/s per core. */ -inline static sz_string_ptr_t sz_find_substr_swar( // - sz_string_ptr_t const haystack, +inline static sz_string_start_t sz_find_substring_swar( // + sz_string_start_t const haystack, sz_size_t const haystack_length, - sz_string_ptr_t const needle, + sz_string_start_t const needle, sz_size_t const needle_length) { if (haystack_length < needle_length) return NULL; @@ -383,12 +383,12 @@ inline static sz_string_ptr_t sz_find_substr_swar( // case 3: return sz_find_3char_swar(haystack, haystack_length, needle); case 4: return sz_find_4char_swar(haystack, haystack_length, needle); default: { - sz_string_ptr_t text = haystack; - sz_string_ptr_t const end = haystack + haystack_length; + sz_string_start_t text = haystack; + sz_string_start_t const end = haystack + haystack_length; _sz_anomaly_t n_anomaly, h_anomaly; sz_size_t const n_suffix_len = needle_length - 4 - anomaly_offset; - sz_string_ptr_t n_suffix_ptr = needle + 4 + anomaly_offset; + sz_string_start_t n_suffix_ptr = needle + 4 + anomaly_offset; n_anomaly.u8s[0] = needle[anomaly_offset]; n_anomaly.u8s[1] = needle[anomaly_offset + 1]; n_anomaly.u8s[2] = needle[anomaly_offset + 2]; @@ -401,10 +401,9 @@ inline static sz_string_ptr_t sz_find_substr_swar( // text += anomaly_offset; while (text + needle_length <= end) { h_anomaly.u8s[3] = text[3]; - if (h_anomaly.u32 == n_anomaly.u32) // Match anomaly. - if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. - if (sz_equal(text - anomaly_offset, needle, anomaly_offset)) // Match prefix - optimized out. - return text - anomaly_offset; + if (h_anomaly.u32 == n_anomaly.u32) // Match anomaly. + if (sz_equal(text + 4, n_suffix_ptr, n_suffix_len)) // Match suffix. + return text; h_anomaly.u32 >>= 8; ++text; @@ -417,8 +416,8 @@ inline static sz_string_ptr_t sz_find_substr_swar( // /** * Helper function, used in substring search operations. */ -inline static void _sz_find_substr_populate_anomaly( // - sz_string_ptr_t const needle, +inline static void _sz_find_substring_populate_anomaly( // + sz_string_start_t const needle, sz_size_t const needle_length, _sz_anomaly_t *anomaly_out, _sz_anomaly_t *mask_out) { @@ -455,16 +454,16 @@ inline static void _sz_find_substr_populate_anomaly( // * was practically more efficient than loading once and shifting around, as introduces * less data dependencies. */ -inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle, - sz_size_t const needle_length) { +inline static sz_string_start_t sz_find_substring_avx2(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle, + sz_size_t const needle_length) { // Precomputed constants - sz_string_ptr_t const end = haystack + haystack_length; + sz_string_start_t const end = haystack + haystack_length; _sz_anomaly_t anomaly; _sz_anomaly_t mask; - _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask); + _sz_find_substring_populate_anomaly(needle, needle_length, &anomaly, &mask); __m256i const anomalies = _mm256_set1_epi32(anomaly.u32); __m256i const masks = _mm256_set1_epi32(mask.u32); @@ -477,7 +476,7 @@ inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack // + 4 movemasks. // + 3 bitwise ANDs. // + 1 heavy (but very unlikely) branch. - sz_string_ptr_t text = haystack; + sz_string_start_t text = haystack; while (text + needle_length + 32 <= end) { // Performing many unaligned loads ends up being faster than loading once and shuffling around. @@ -511,7 +510,7 @@ inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack } // Don't forget the last (up to 35) characters. - return sz_find_substr_swar(text, end - text, needle, needle_length); + return sz_find_substring_swar(text, end - text, needle, needle_length); } #endif // x86 AVX2 @@ -524,21 +523,21 @@ inline static sz_string_ptr_t sz_find_substr_avx2(sz_string_ptr_t const haystack * was practically more efficient than loading once and shifting around, as introduces * less data dependencies. */ -inline static sz_string_ptr_t sz_find_substr_neon(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle, - sz_size_t const needle_length) { +inline static sz_string_start_t sz_find_substring_neon(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle, + sz_size_t const needle_length) { // Precomputed constants - sz_string_ptr_t const end = haystack + haystack_length; + sz_string_start_t const end = haystack + haystack_length; _sz_anomaly_t anomaly; _sz_anomaly_t mask; - _sz_find_substr_populate_anomaly(needle, needle_length, &anomaly, &mask); + _sz_find_substring_populate_anomaly(needle, needle_length, &anomaly, &mask); uint32x4_t const anomalies = vld1q_dup_u32(&anomaly.u32); uint32x4_t const masks = vld1q_dup_u32(&mask.u32); uint32x4_t matches, matches0, matches1, matches2, matches3; - sz_string_ptr_t text = haystack; + sz_string_start_t text = haystack; while (text + needle_length + 16 <= end) { // Each of the following `matchesX` contains only 4 relevant bits - one per word. @@ -582,40 +581,40 @@ inline static sz_string_ptr_t sz_find_substr_neon(sz_string_ptr_t const haystack } // Don't forget the last (up to 16+3=19) characters. - return sz_find_substr_swar(text, end - text, needle, needle_length); + return sz_find_substring_swar(text, end - text, needle, needle_length); } #endif // Arm Neon -inline static sz_size_t sz_count_char(sz_string_ptr_t const haystack, +inline static sz_size_t sz_count_char(sz_string_start_t const haystack, sz_size_t const haystack_length, - sz_string_ptr_t const needle) { + sz_string_start_t const needle) { return sz_count_char_swar(haystack, haystack_length, needle); } -inline static sz_string_ptr_t sz_find_1char(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_find_1char(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { return sz_find_1char_swar(haystack, haystack_length, needle); } -inline static sz_string_ptr_t sz_rfind_1char(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle) { +inline static sz_string_start_t sz_rfind_1char(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle) { return sz_rfind_1char_swar(haystack, haystack_length, needle); } -inline static sz_string_ptr_t sz_find_substr(sz_string_ptr_t const haystack, - sz_size_t const haystack_length, - sz_string_ptr_t const needle, - sz_size_t const needle_length) { +inline static sz_string_start_t sz_find_substring(sz_string_start_t const haystack, + sz_size_t const haystack_length, + sz_string_start_t const needle, + sz_size_t const needle_length) { if (haystack_length < needle_length || needle_length == 0) return NULL; #if defined(__ARM_NEON) - return sz_find_substr_neon(haystack, haystack_length, needle, needle_length); + return sz_find_substring_neon(haystack, haystack_length, needle, needle_length); #elif defined(__AVX2__) - return sz_find_substr_avx2(haystack, haystack_length, needle, needle_length); + return sz_find_substring_avx2(haystack, haystack_length, needle, needle_length); #else - return sz_find_substr_swar(haystack, haystack_length, needle, needle_length); + return sz_find_substring_swar(haystack, haystack_length, needle, needle_length); #endif } @@ -669,30 +668,46 @@ inline static char sz_toupper_ascii(char c) { return *(char *)&upped[(int)c]; } +inline static sz_u64_t sz_u64_unaligned_load(void const *ptr) { +#ifdef _MSC_VER + return *((__unaligned sz_u64_t *)ptr); +#else + __attribute__((aligned(1))) sz_u64_t const *uptr = (sz_u64_t const *)ptr; + return *uptr; +#endif +} + +inline static sz_u64_t sz_u64_byte_reverse(sz_u64_t val) { +#ifdef _MSC_VER + return _byteswap_uint64(val); +#else + return __builtin_bswap64(val); +#endif +} + /** * @brief Char-level lexicographic comparison of two strings. * Doesn't provide major performance improvements, but helps avoid the LibC dependency. */ -inline static sz_bool_t sz_is_less_ascii(sz_string_ptr_t const a, +inline static sz_bool_t sz_is_less_ascii(sz_string_start_t a, sz_size_t const a_length, - sz_string_ptr_t const b, + sz_string_start_t b, sz_size_t const b_length) { sz_size_t min_length = (a_length < b_length) ? a_length : b_length; - for (sz_size_t i = 0; i < min_length; ++i) { - if (a[i] < b[i]) return 1; - if (a[i] > b[i]) return 0; - } - return a_length < b_length; + sz_string_start_t const min_end = a + min_length; + while (a + 8 <= min_end && sz_u64_unaligned_load(a) == sz_u64_unaligned_load(b)) a += 8, b += 8; + while (a != min_end && *a == *b) a++, b++; + return a != min_end ? (*a < *b) : (a_length < b_length); } /** * @brief Char-level lexicographic comparison of two strings, insensitive to the case of ASCII symbols. * Doesn't provide major performance improvements, but helps avoid the LibC dependency. */ -inline static sz_bool_t sz_is_less_uncased_ascii(sz_string_ptr_t const a, +inline static sz_bool_t sz_is_less_uncased_ascii(sz_string_start_t const a, sz_size_t const a_length, - sz_string_ptr_t const b, + sz_string_start_t const b, sz_size_t const b_length) { sz_size_t min_length = (a_length < b_length) ? a_length : b_length; @@ -716,11 +731,11 @@ inline static void _sz_swap_order(sz_u64_t *a, sz_u64_t *b) { struct sz_sequence_t; -typedef sz_string_ptr_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); +typedef sz_string_start_t (*sz_sequence_member_start_t)(struct sz_sequence_t const *, sz_size_t); typedef sz_size_t (*sz_sequence_member_length_t)(struct sz_sequence_t const *, sz_size_t); typedef sz_bool_t (*sz_sequence_predicate_t)(struct sz_sequence_t const *, sz_size_t); typedef sz_bool_t (*sz_sequence_comparator_t)(struct sz_sequence_t const *, sz_size_t, sz_size_t); -typedef sz_bool_t (*sz_string_is_less_t)(sz_string_ptr_t, sz_size_t, sz_string_ptr_t, sz_size_t); +typedef sz_bool_t (*sz_string_is_less_t)(sz_string_start_t, sz_size_t, sz_string_start_t, sz_size_t); typedef struct sz_sequence_t { sz_u64_t *order; @@ -795,9 +810,12 @@ inline static void sz_sort_insertion(sz_sequence_t *sequence, sz_sequence_compar // Utility functions inline static sz_size_t _sz_log2i(sz_size_t n) { - sz_size_t log2 = 0; - while (n >>= 1) ++log2; - return log2; + if (n == 0) return 0; // to avoid undefined behavior with __builtin_clz +#if defined(__LP64__) || defined(_WIN64) // 64-bit + return 63 - __builtin_clzll(n); +#else // 32-bit + return 31 - __builtin_clz(n); +#endif } inline static void _sz_sift_down( @@ -893,7 +911,7 @@ inline static void _sz_introsort( sz_u64_t pivot = sequence->order[median]; sz_size_t left = first; sz_size_t right = last - 1; - while (true) { + while (1) { while (less(sequence, sequence->order[left], pivot)) left++; while (less(sequence, pivot, sequence->order[right])) right--; if (left >= right) break; @@ -962,17 +980,17 @@ inline static void _sz_sort_recursion( // } inline static sz_bool_t _sz_sort_compare_less_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_string_ptr_t i_str = sequence->get_start(sequence, i_key); + sz_string_start_t i_str = sequence->get_start(sequence, i_key); sz_size_t i_len = sequence->get_length(sequence, i_key); - sz_string_ptr_t j_str = sequence->get_start(sequence, j_key); + sz_string_start_t j_str = sequence->get_start(sequence, j_key); sz_size_t j_len = sequence->get_length(sequence, j_key); return sz_is_less_ascii(i_str, i_len, j_str, j_len); } inline static sz_bool_t _sz_sort_compare_less_uncased_ascii(sz_sequence_t *sequence, sz_size_t i_key, sz_size_t j_key) { - sz_string_ptr_t i_str = sequence->get_start(sequence, i_key); + sz_string_start_t i_str = sequence->get_start(sequence, i_key); sz_size_t i_len = sequence->get_length(sequence, i_key); - sz_string_ptr_t j_str = sequence->get_start(sequence, j_key); + sz_string_start_t j_str = sequence->get_start(sequence, j_key); sz_size_t j_len = sequence->get_length(sequence, j_key); return sz_is_less_uncased_ascii(i_str, i_len, j_str, j_len); } @@ -994,7 +1012,7 @@ inline static void sz_sort(sz_sequence_t *sequence, sz_sort_config_t const *conf // Export up to 4 bytes into the `sequence` bits themselves for (sz_size_t i = 0; i != sequence->count; ++i) { - sz_string_ptr_t begin = sequence->get_start(sequence, sequence->order[i]); + sz_string_start_t begin = sequence->get_start(sequence, sequence->order[i]); sz_size_t length = sequence->get_length(sequence, sequence->order[i]); length = length > 4ul ? 4ul : length; char *prefix = (char *)&sequence->order[i]; @@ -1038,9 +1056,9 @@ inline static levenstein_distance_t _sz_levenstein_minimum( // * It accepts an upper bound on the possible error. Quadratic complexity in time, linear in space. */ inline static levenstein_distance_t sz_levenstein( // - sz_string_ptr_t const a, + sz_string_start_t const a, sz_size_t const a_length, - sz_string_ptr_t const b, + sz_string_start_t const b, sz_size_t const b_length, levenstein_distance_t const bound, void *buffer) { @@ -1093,11 +1111,11 @@ inline static levenstein_distance_t sz_levenstein( // /** * @brief Hashes provided string using hardware-accelerated CRC32 instructions. */ -inline static sz_u32_t sz_hash_crc32_native(sz_string_ptr_t start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_native(sz_string_start_t start, sz_size_t length) { return 0; } -inline static sz_u32_t sz_hash_crc32_neon(sz_string_ptr_t start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_neon(sz_string_start_t start, sz_size_t length) { return 0; } -inline static sz_u32_t sz_hash_crc32_sse(sz_string_ptr_t start, sz_size_t length) { return 0; } +inline static sz_u32_t sz_hash_crc32_sse(sz_string_start_t start, sz_size_t length) { return 0; } #ifdef __cplusplus } From 416b885429d2eb97e6c677eaac0eba6de5ff9fc4 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 9 Oct 2023 16:51:35 -0700 Subject: [PATCH 8/8] Make: Formatting and docs --- .vscode/settings.json | 3 +- CMakeLists.txt | 140 ++++++++++++++++++++++-------------------- README.md | 10 +-- scripts/bench.ipynb | 20 ++++-- scripts/test.c | 14 ++--- 5 files changed, 100 insertions(+), 87 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 08c5bb65..575441f2 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -118,7 +118,8 @@ "strstream": "cpp", "filesystem": "cpp", "stringzilla.h": "c", - "__memory": "c" + "__memory": "c", + "charconv": "c" }, "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", "cSpell.words": [ diff --git a/CMakeLists.txt b/CMakeLists.txt index df569329..230c2a06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,105 +1,109 @@ # This CMake file is heavily inspired by following `stringzilla` CMake: # https://github.com/nlohmann/json/blob/develop/CMakeLists.txt cmake_minimum_required(VERSION 3.1) -project(stringzilla VERSION 0.1.0 LANGUAGES C CXX) +project( + stringzilla + VERSION 0.1.0 + LANGUAGES C CXX) -set (CMAKE_C_STANDARD 11) -set (CMAKE_CXX_STANDARD 17) +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) -# Determine if USearch is built as a subproject (using `add_subdirectory`) or if it is the main project +# Determine if USearch is built as a subproject (using `add_subdirectory`) or if +# it is the main project set(STRINGZILLA_IS_MAIN_PROJECT OFF) -if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) - set(STRINGZILLA_IS_MAIN_PROJECT ON) +if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) + set(STRINGZILLA_IS_MAIN_PROJECT ON) endif() # Options option(STRINGZILLA_INSTALL "Install CMake targets" OFF) -option(STRINGZILLA_BUILD_TEST "Compile a native unit test in C++" ${STRINGZILLA_IS_MAIN_PROJECT}) -option(STRINGZILLA_BUILD_BENCHMARK "Compile a native benchmark in C++" ${STRINGZILLA_IS_MAIN_PROJECT}) +option(STRINGZILLA_BUILD_TEST "Compile a native unit test in C++" + ${STRINGZILLA_IS_MAIN_PROJECT}) +option(STRINGZILLA_BUILD_BENCHMARK "Compile a native benchmark in C++" + ${STRINGZILLA_IS_MAIN_PROJECT}) option(STRINGZILLA_BUILD_WOLFRAM "Compile Wolfram Language bindings" OFF) # Includes set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) include(ExternalProject) -# Allow CMake 3.13+ to override options when using FetchContent / add_subdirectory -if (POLICY CMP0077) - cmake_policy(SET CMP0077 NEW) -endif () +# Allow CMake 3.13+ to override options when using FetchContent / +# add_subdirectory +if(POLICY CMP0077) + cmake_policy(SET CMP0077 NEW) +endif() # Configuration include(GNUInstallDirs) -set(STRINGZILLA_TARGET_NAME ${PROJECT_NAME}) -set(STRINGZILLA_CONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/cmake/${PROJECT_NAME}" CACHE INTERNAL "") -set(STRINGZILLA_INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}") -set(STRINGZILLA_TARGETS_EXPORT_NAME "${PROJECT_NAME}Targets") -set(STRINGZILLA_CMAKE_CONFIG_TEMPLATE "cmake/config.cmake.in") -set(STRINGZILLA_CMAKE_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}") -set(STRINGZILLA_CMAKE_VERSION_CONFIG_FILE "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}ConfigVersion.cmake") -set(STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Config.cmake") -set(STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Targets.cmake") -set(STRINGZILLA_PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/pkgconfig") - +set(STRINGZILLA_TARGET_NAME ${PROJECT_NAME}) +set(STRINGZILLA_CONFIG_INSTALL_DIR + "${CMAKE_INSTALL_DATADIR}/cmake/${PROJECT_NAME}" + CACHE INTERNAL "") +set(STRINGZILLA_INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}") +set(STRINGZILLA_TARGETS_EXPORT_NAME "${PROJECT_NAME}Targets") +set(STRINGZILLA_CMAKE_CONFIG_TEMPLATE "cmake/config.cmake.in") +set(STRINGZILLA_CMAKE_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}") +set(STRINGZILLA_CMAKE_VERSION_CONFIG_FILE + "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}ConfigVersion.cmake") +set(STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE + "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Config.cmake") +set(STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE + "${STRINGZILLA_CMAKE_CONFIG_DIR}/${PROJECT_NAME}Targets.cmake") +set(STRINGZILLA_PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/pkgconfig") # Define our header-only library add_library(${STRINGZILLA_TARGET_NAME} INTERFACE) -add_library(${PROJECT_NAME}::${STRINGZILLA_TARGET_NAME} ALIAS ${STRINGZILLA_TARGET_NAME}) +add_library(${PROJECT_NAME}::${STRINGZILLA_TARGET_NAME} ALIAS + ${STRINGZILLA_TARGET_NAME}) set(STRINGZILLA_INCLUDE_BUILD_DIR "${PROJECT_SOURCE_DIR}/include/") target_compile_definitions( - ${STRINGZILLA_TARGET_NAME} - INTERFACE - $<$>:STRINGZILLA_USE_OPENMP=0> -) + ${STRINGZILLA_TARGET_NAME} + INTERFACE $<$>:STRINGZILLA_USE_OPENMP=0>) target_include_directories( - ${STRINGZILLA_TARGET_NAME} - ${STRINGZILLA_SYSTEM_INCLUDE} INTERFACE - $ - $ -) + ${STRINGZILLA_TARGET_NAME} ${STRINGZILLA_SYSTEM_INCLUDE} + INTERFACE $ + $) if(STRINGZILLA_INSTALL) - install( - DIRECTORY ${STRINGZILLA_INCLUDE_BUILD_DIR} - DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR} - ) - install( - FILES ${STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE} ${STRINGZILLA_CMAKE_VERSION_CONFIG_FILE} - DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR} - ) - export( - TARGETS ${STRINGZILLA_TARGET_NAME} - NAMESPACE ${PROJECT_NAME}:: - FILE ${STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE} - ) - install( - TARGETS ${STRINGZILLA_TARGET_NAME} - EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME} - INCLUDES DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR} - ) - install( - EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME} - NAMESPACE ${PROJECT_NAME}:: - DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR} - ) - install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc" - DESTINATION ${STRINGZILLA_PKGCONFIG_INSTALL_DIR} - ) + install(DIRECTORY ${STRINGZILLA_INCLUDE_BUILD_DIR} + DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR}) + install(FILES ${STRINGZILLA_CMAKE_PROJECT_CONFIG_FILE} + ${STRINGZILLA_CMAKE_VERSION_CONFIG_FILE} + DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR}) + export( + TARGETS ${STRINGZILLA_TARGET_NAME} + NAMESPACE ${PROJECT_NAME}:: + FILE ${STRINGZILLA_CMAKE_PROJECT_TARGETS_FILE}) + install( + TARGETS ${STRINGZILLA_TARGET_NAME} + EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME} + INCLUDES + DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR}) + install( + EXPORT ${STRINGZILLA_TARGETS_EXPORT_NAME} + NAMESPACE ${PROJECT_NAME}:: + DESTINATION ${STRINGZILLA_CONFIG_INSTALL_DIR}) + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc" + DESTINATION ${STRINGZILLA_PKGCONFIG_INSTALL_DIR}) endif() if(${STRINGZILLA_BUILD_TEST} OR ${STRINGZILLA_BUILD_BENCHMARK}) - add_executable(stringzilla_test scripts/test.c) + add_executable(stringzilla_test scripts/test.cpp) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -O3 -flto -march=native -finline-functions -funroll-loops" + ) target_include_directories(stringzilla_test PRIVATE stringzilla) - set_target_properties(stringzilla_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + set_target_properties(stringzilla_test PROPERTIES RUNTIME_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}) - if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER 3.13) - include(CTest) - enable_testing() - add_test(NAME stringzilla_test COMMAND stringzilla_test) + if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER + 3.13) + include(CTest) + enable_testing() + add_test(NAME stringzilla_test COMMAND stringzilla_test) endif() endif() - diff --git a/README.md b/README.md index 85032c34..8f0765c3 100644 --- a/README.md +++ b/README.md @@ -35,8 +35,8 @@ Coming soon. ## Quick Start: Python 🐍 -1️. Install via pip: `pip install stringzilla` -1. Import the classes you need: `from stringzilla import Str, Strs, File` +1. Install via pip: `pip install stringzilla` +2. Import the classes you need: `from stringzilla import Str, Strs, File` ### Basic Usage @@ -115,13 +115,13 @@ There is an ABI-stable C 99 interface, in case you have a database, an operating #include "stringzilla.h" // Initialize your haystack and needle -sz_haystack_t haystack = {your_text, your_text_length}; -sz_needle_t needle = {your_subtext, your_subtext_length, your_quadgram_offset}; +sz_string_view_t haystack = {your_text, your_text_length}; +sz_string_view_t needle = {your_subtext, your_subtext_length}; // Perform string-level operations size_t character_count = sz_count_char(haystack, 'a'); size_t character_position = sz_find_unigram(haystack, 'a'); -size_t substring_position = sz_find_substr(haystack, needle); +size_t substring_position = sz_find_substring(haystack, needle); // Perform collection level operations sz_sequence_t array = {your_order, your_count, your_get_start, your_get_length, your_handle}; diff --git a/scripts/bench.ipynb b/scripts/bench.ipynb index b3bc4392..492db50a 100644 --- a/scripts/bench.ipynb +++ b/scripts/bench.ipynb @@ -88,7 +88,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "150 ms ± 2.01 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" + "152 ms ± 3.24 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" ] } ], @@ -106,7 +106,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "37.8 ms ± 286 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" + "38.1 ms ± 312 µs per loop (mean ± std. dev. of 100 runs, 1 loop each)\n" ] } ], @@ -124,7 +124,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "173 ns ± 23.7 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" + "The slowest run took 7.28 times longer than the fastest. This could mean that an intermediate result is being cached.\n", + "186 ns ± 41.1 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" ] } ], @@ -142,8 +143,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "The slowest run took 82.51 times longer than the fastest. This could mean that an intermediate result is being cached.\n", - "94.3 ns ± 108 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" + "The slowest run took 120.95 times longer than the fastest. This could mean that an intermediate result is being cached.\n", + "99.6 ns ± 155 ns per loop (mean ± std. dev. of 1000 runs, 1 loop each)\n" ] } ], @@ -152,6 +153,13 @@ "sz_str.find(pattern)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -176,7 +184,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.11" }, "orig_nbformat": 4 }, diff --git a/scripts/test.c b/scripts/test.c index 127975b0..b39fd982 100644 --- a/scripts/test.c +++ b/scripts/test.c @@ -18,8 +18,8 @@ void populate_random_string(char *buffer, int length, int variability) { buffer[length] = '\0'; } -// Test function for sz_find_substr -void test_sz_find_substr() { +// Test function for sz_find_substring +void test_sz_find_substring() { char buffer[MAX_LENGTH + 1]; char pattern[6]; // Maximum length of 5 + 1 for '\0' @@ -39,11 +39,11 @@ void test_sz_find_substr() { needle.length = pattern_length; // Comparing the result of your function with the standard library function. - sz_string_ptr_t result_libc = strstr(buffer, pattern); - sz_string_ptr_t result_stringzilla = - sz_find_substr(haystack.start, haystack.length, needle.start, needle.length); + sz_string_start_t result_libc = strstr(buffer, pattern); + sz_string_start_t result_stringzilla = + sz_find_substring(haystack.start, haystack.length, needle.start, needle.length); - assert(((result_libc == NULL) ^ (result_stringzilla == NULL)) && "Test failed for sz_find_substr"); + assert(((result_libc == NULL) ^ (result_stringzilla == NULL)) && "Test failed for sz_find_substring"); } } } @@ -51,7 +51,7 @@ void test_sz_find_substr() { int main() { srand((unsigned int)time(NULL)); - test_sz_find_substr(); + test_sz_find_substring(); // Add calls to other test functions as you implement them printf("All tests passed!\n");