Skip to content

Commit 0fda5f4

Browse files
committed
Add: Random strings generator
1 parent 461afd0 commit 0fda5f4

File tree

3 files changed

+59
-87
lines changed

3 files changed

+59
-87
lines changed

include/stringzilla/stringzilla.h

Lines changed: 39 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -181,7 +181,7 @@
181181
#endif
182182
#endif
183183

184-
#define sz_assert(condition, message, ...) \
184+
#define SZ_ASSERT(condition, message, ...) \
185185
do { \
186186
if (!(condition)) { \
187187
fprintf(stderr, "Assertion failed: %s, in file %s, line %d\n", #condition, __FILE__, __LINE__); \
@@ -258,6 +258,7 @@ SZ_PUBLIC void sz_u8_set_invert(sz_u8_set_t *f) {
258258

259259
typedef sz_ptr_t (*sz_memory_allocate_t)(sz_size_t, void *);
260260
typedef void (*sz_memory_free_t)(sz_ptr_t, sz_size_t, void *);
261+
typedef sz_u64_t (*sz_random_generator_t)(void *);
261262

262263
/**
263264
* @brief Some complex pattern matching algorithms may require memory allocations.
@@ -402,7 +403,7 @@ SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result);
402403

403404
/**
404405
* @brief Generates a random string for a given alphabet, avoiding integer division and modulo operations.
405-
* Similar to `result[i] = alphabet[rand() % size]`.
406+
* Similar to `text[i] = alphabet[rand() % cardinality]`.
406407
*
407408
* The modulo operation is expensive, and should be avoided in performance-critical code.
408409
* We avoid it using small lookup tables and replacing it with a multiplication and shifts, similar to libdivide.
@@ -411,11 +412,14 @@ SZ_PUBLIC void sz_toascii(sz_cptr_t text, sz_size_t length, sz_ptr_t result);
411412
* - Barret reduction: https://www.nayuki.io/page/barrett-reduction-algorithm
412413
* - Lemire's trick: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
413414
*
414-
* @param text String to be normalized.
415-
* @param length Number of bytes in the string.
416-
* @param result Output string, can point to the same address as ::text.
415+
* @param alphabet Set of characters to sample from.
416+
* @param cardinality Number of characters to sample from.
417+
* @param text Output string of ::length bytes, to be filled with characters sampled from ::alphabet.
418+
* @param generate Callback producing random numbers given the generator state.
419+
* @param generator Generator state, can be a pointer to a seed, or a pointer to a random number generator.
417420
*/
418-
SZ_PUBLIC void sz_generate(sz_cptr_t alphabet, sz_size_t size, sz_ptr_t result, sz_size_t length);
421+
SZ_PUBLIC void sz_generate(sz_cptr_t alphabet, sz_size_t cardinality, sz_ptr_t text, sz_size_t length,
422+
sz_random_generator_t generate, void *generator);
419423

420424
#pragma endregion
421425

@@ -1763,6 +1767,9 @@ SZ_INTERNAL sz_u8_t sz_u8_toupper(sz_u8_t c) {
17631767
/**
17641768
* @brief Uses two small lookup tables (768 bytes total) to accelerate division by a small
17651769
* unsigned integer. Performs two lookups, one multiplication, two shifts, and two accumulations.
1770+
*
1771+
* @param divisor Integral value larger than one.
1772+
* @param number Integral value to divide.
17661773
*/
17671774
SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) {
17681775
static sz_u16_t multipliers[256] = {
@@ -1783,6 +1790,7 @@ SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) {
17831790
9363, 9030, 8700, 8373, 8049, 7727, 7409, 7093, 6780, 6470, 6162, 5857, 5554, 5254, 4957, 4662,
17841791
4370, 4080, 3792, 3507, 3224, 2943, 2665, 2388, 2115, 1843, 1573, 1306, 1041, 778, 517, 258,
17851792
};
1793+
// This table can be avoided using a single addition and counting trailing zeros.
17861794
static sz_u8_t shifts[256] = {
17871795
0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, //
17881796
4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, //
@@ -1802,26 +1810,40 @@ SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) {
18021810
}
18031811

18041812
SZ_PUBLIC void sz_tolower_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) {
1805-
for (sz_cptr_t end = text + length; text != end; ++text, ++result) {
1806-
*result = sz_u8_tolower(*(sz_u8_t const *)text);
1807-
}
1813+
sz_u8_t *unsigned_result = (sz_u8_t *)result;
1814+
sz_u8_t const *unsigned_text = (sz_u8_t const *)text;
1815+
sz_u8_t const *end = unsigned_text + length;
1816+
for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_tolower(*unsigned_text);
18081817
}
18091818

18101819
SZ_PUBLIC void sz_toupper_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) {
1811-
for (sz_cptr_t end = text + length; text != end; ++text, ++result) {
1812-
*result = sz_u8_toupper(*(sz_u8_t const *)text);
1813-
}
1820+
sz_u8_t *unsigned_result = (sz_u8_t *)result;
1821+
sz_u8_t const *unsigned_text = (sz_u8_t const *)text;
1822+
sz_u8_t const *end = unsigned_text + length;
1823+
for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = sz_u8_toupper(*unsigned_text);
18141824
}
18151825

18161826
SZ_PUBLIC void sz_toascii_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) {
1817-
for (sz_cptr_t end = text + length; text != end; ++text, ++result) { *result = *(sz_u8_t const *)text & 0x7F; }
1827+
sz_u8_t *unsigned_result = (sz_u8_t *)result;
1828+
sz_u8_t const *unsigned_text = (sz_u8_t const *)text;
1829+
sz_u8_t const *end = unsigned_text + length;
1830+
for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = *unsigned_text & 0x7F;
18181831
}
18191832

1820-
SZ_PUBLIC void sz_toascii_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) {
1821-
for (sz_cptr_t end = text + length; text != end; ++text, ++result) { *result = *(sz_u8_t const *)text & 0x7F; }
1822-
}
1833+
SZ_PUBLIC void sz_generate(sz_cptr_t alphabet, sz_size_t alphabet_size, sz_ptr_t result, sz_size_t result_length,
1834+
sz_random_generator_t generator, void *generator_user_data) {
1835+
1836+
SZ_ASSERT(alphabet_size > 0 && alphabet_size <= 256, "Inadequate alphabet size");
18231837

1824-
SZ_PUBLIC void sz_generate(sz_cptr_t alphabet, sz_size_t size, sz_ptr_t result, sz_size_t length) {}
1838+
if (alphabet_size == 1)
1839+
for (sz_cptr_t end = result + result_length; result != end; ++result) *result = *alphabet;
1840+
1841+
else {
1842+
SZ_ASSERT(generator, "Expects a valid random generator");
1843+
for (sz_cptr_t end = result + result_length; result != end; ++result)
1844+
*result = alphabet[sz_u8_divide(generator(generator_user_data) & 0xFF, alphabet_size)];
1845+
}
1846+
}
18251847

18261848
#pragma endregion
18271849

@@ -2641,8 +2663,6 @@ SZ_PUBLIC sz_cptr_t sz_find_last_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr
26412663
*/
26422664
#pragma region Compile-Time Dispatching
26432665

2644-
#include <stringzilla/stringzilla.h>
2645-
26462666
SZ_PUBLIC sz_u64_t sz_hash(sz_cptr_t text, sz_size_t length) { return sz_hash_serial(text, length); }
26472667

26482668
SZ_PUBLIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length) {

scripts/test_sampling.py

Lines changed: 0 additions & 68 deletions
This file was deleted.

scripts/validate_fast_division.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,20 @@
1+
"""PyTest + Cppyy test of the `sz_u8_divide` utility function."""
2+
3+
import pytest
4+
import cppyy
5+
6+
cppyy.include("include/stringzilla/stringzilla.h")
7+
cppyy.cppdef(
8+
"""
9+
sz_u32_t sz_u8_divide_as_u32(sz_u8_t number, sz_u8_t divisor) {
10+
return sz_u8_divide(number, divisor);
11+
}
12+
"""
13+
)
14+
15+
16+
@pytest.mark.parametrize("number", range(0, 256))
17+
@pytest.mark.parametrize("divisor", range(2, 256))
18+
def test_efficient_division(number: int, divisor: int):
19+
sz_u8_divide = cppyy.gbl.sz_u8_divide_as_u32
20+
assert (number // divisor) == sz_u8_divide(number, divisor)

0 commit comments

Comments
 (0)