Skip to content

Commit

Permalink
Fix: Matching N3322 for memcpy UB in C2y
Browse files Browse the repository at this point in the history
  • Loading branch information
ashvardanian committed Dec 11, 2024
1 parent 152ed04 commit 14ee92c
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 15 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1196,8 +1196,10 @@ __`SZ_AVOID_LIBC`__ and __`SZ_OVERRIDE_LIBC`__:
> This may affect the type resolution system on obscure hardware platforms.
> Moreover, one may let `stringzilla` override the common symbols like the `memcpy` and `memset` with its own implementations.
> In that case you can use the [`LD_PRELOAD` trick][ld-preload-trick] to prioritize it's symbols over the ones from the LibC and accelerate existing string-heavy applications without recompiling them.
> It also adds a layer of security, as the `stringzilla` isn't [undefined for NULL inputs][redhat-memcpy-ub] like `memcpy(NULL, NULL, 0)`.
[ld-preload-trick]: https://ashvardanian.com/posts/ld-preload-libsee
[redhat-memcpy-ub]: https://developers.redhat.com/articles/2024/12/11/making-memcpynull-null-0-well-defined

__`SZ_AVOID_STL`__ and __`SZ_SAFETY_OVER_COMPATIBILITY`__:

Expand Down
32 changes: 19 additions & 13 deletions include/stringzilla/stringzilla.h
Original file line number Diff line number Diff line change
Expand Up @@ -1941,7 +1941,7 @@ SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr
if (!h_length) return SZ_NULL_CHAR;
sz_cptr_t const h_end = h + h_length;

#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety.
#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity.
#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads.
for (; ((sz_size_t)h & 7ull) && h < h_end; ++h)
if (*h == *n) return h;
Expand Down Expand Up @@ -1978,7 +1978,7 @@ sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
// Reposition the `h` pointer to the end, as we will be walking backwards.
h = h + h_length - 1;

#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety.
#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity.
#if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads.
for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h)
if (*h == *n) return h;
Expand Down Expand Up @@ -2464,9 +2464,9 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( //
sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1;
next_distances[i + 1] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution);
}
// Don't forget to populate the first row and the fiest column of the Levenshtein matrix.
// Don't forget to populate the first row and the first column of the Levenshtein matrix.
next_distances[0] = next_distances[next_skew_diagonal_length - 1] = next_skew_diagonal_index;
// Perform a circular rotarion of those buffers, to reuse the memory.
// Perform a circular rotation of those buffers, to reuse the memory.
sz_size_t *temporary = previous_distances;
previous_distances = current_distances;
current_distances = next_distances;
Expand All @@ -2486,7 +2486,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( //
sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1;
next_distances[i] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution);
}
// Perform a circular rotarion of those buffers, to reuse the memory, this time, with a shift,
// Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift,
// dropping the first element in the current array.
sz_size_t *temporary = previous_distances;
previous_distances = current_distances + 1;
Expand Down Expand Up @@ -3486,32 +3486,38 @@ SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *alloca
sz_string_init(string);
}

// When overriding libc, disable optimisations for this function beacuse MSVC will optimize the loops into a memset.
// When overriding libc, disable optimizations for this function because MSVC will optimize the loops into a memset.
// Which then causes a stack overflow due to infinite recursion (memset -> sz_fill_serial -> memset).
#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC
#pragma optimize("", off)
#endif
SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) {
sz_ptr_t end = target + length;
// Dealing with short strings, a single sequential pass would be faster.
// If the size is larger than 2 words, then at least 1 of them will be aligned.
// But just one aligned word may not be worth SWAR.
if (length < SZ_SWAR_THRESHOLD)
while (target != end) *(target++) = value;
while (length--) *(target++) = value;

// In case of long strings, skip unaligned bytes, and then fill the rest in 64-bit chunks.
else {
sz_u64_t value64 = (sz_u64_t)value * 0x0101010101010101ull;
while ((sz_size_t)target & 7ull) *(target++) = value;
while (target + 8 <= end) *(sz_u64_t *)target = value64, target += 8;
while (target != end) *(target++) = value;
while ((sz_size_t)target & 7ull) *(target++) = value, length--;
while (length >= 8) *(sz_u64_t *)target = value64, target += 8, length -= 8;
while (length--) *(target++) = value;
}
}
#if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC
#pragma optimize("", on)
#endif

SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
// The most typical implementation of `memcpy` suffers from Undefined Behavior:
//
// for (char const *end = source + length; source < end; source++) *target++ = *source;
//
// As NULL pointer arithmetic is undefined for calls like `memcpy(NULL, NULL, 0)`.
// That's mitigated in C2y with the N3322 proposal, but our solution uses a design, that has no such issues.
// https://developers.redhat.com/articles/2024/12/11/making-memcpynull-null-0-well-defined
#if SZ_USE_MISALIGNED_LOADS
while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)source, target += 8, source += 8, length -= 8;
#endif
Expand Down Expand Up @@ -5215,7 +5221,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( //
}
// Don't forget to populate the first row and the fiest column of the Levenshtein matrix.
next_distances[0] = next_distances[next_skew_diagonal_length - 1] = (sz_u16_t)next_skew_diagonal_index;
// Perform a circular rotarion of those buffers, to reuse the memory.
// Perform a circular rotation of those buffers, to reuse the memory.
sz_u16_t *temporary = previous_distances;
previous_distances = current_distances;
current_distances = next_distances;
Expand Down Expand Up @@ -5257,7 +5263,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( //
i += register_length;
}

// Perform a circular rotarion of those buffers, to reuse the memory, this time, with a shift,
// Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift,
// dropping the first element in the current array.
sz_u16_t *temporary = previous_distances;
previous_distances = current_distances + 1;
Expand Down
13 changes: 11 additions & 2 deletions scripts/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,23 @@ inline void expect_equality(char const *a, char const *b, std::size_t size) {
* Uses a large heap-allocated buffer to ensure that operations optimized for @b larger-than-L2-cache memory
* regions are tested. Uses a combination of deterministic and random tests with uniform and exponential distributions.
*/
static void test_memory_utilities(std::size_t experiments = 1024ull * 1024ull,
std::size_t max_l2_size = 1024ull * 1024ull) {
static void test_memory_utilities( //
std::size_t experiments = 1024ull * 1024ull, std::size_t max_l2_size = 1024ull * 1024ull) {

// We will be mirroring the operations on both standard and StringZilla strings.
std::string text_stl(max_l2_size, '-');
std::string text_sz(max_l2_size, '-');
expect_equality(text_stl.data(), text_sz.data(), max_l2_size);

// The traditional `memset` and `memcpy` functions are undefined for zero-length buffers and NULL pointers
// for older C standards. However, with the N3322 proposal for C2y, that issue has been resolved.
// https://developers.redhat.com/articles/2024/12/11/making-memcpynull-null-0-well-defined
//
// Let's make sure, that our versions don't trigger any undefined behavior.
sz::memset(NULL, 0, 0);
sz::memcpy(NULL, NULL, 0);
sz::memmove(NULL, NULL, 0);

// First start with simple deterministic tests.
// Let's use `memset` to fill the strings with a pattern like "122333444455555...00000000000011111111111..."
std::size_t count_groups = 0;
Expand Down

0 comments on commit 14ee92c

Please sign in to comment.