@@ -453,6 +453,25 @@ SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b,
/** @copydoc sz_order */
SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length);

+ /**
+  * @brief Look Up Table @b (LUT) transformation of a string. Equivalent to `for (char & c : text) c = lut[c]`.
+  *
+  * Can be used to implement some form of string normalization, partially masking punctuation marks,
+  * or converting between different character sets, like uppercase or lowercase. Surprisingly, also has
+  * broad implications in image processing, where image channel transformations are often done using LUTs.
+  *
+  * @param text String to be normalized.
+  * @param length Number of bytes in the string.
+  * @param lut Look Up Table to apply. Must be exactly @b 256 bytes long.
+  * @param result Output string, can point to the same address as ::text.
+  */
+ SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result);
+
+ typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t);
+
+ /** @copydoc sz_look_up_transform */
+ SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result);
+
/**
 * @brief Equivalent to `for (char & c : text) c = tolower(c)`.
 *
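For context (not part of the patch): a minimal usage sketch of the public API declared above, assuming the usual `<stringzilla/stringzilla.h>` include path; the punctuation-masking table is only an illustration of the "partially masking punctuation marks" use-case mentioned in the docstring.

    #include <ctype.h>  /* ispunct */
    #include <stdio.h>
    #include <string.h>
    #include <stringzilla/stringzilla.h> /* assumed include path */

    int main(void) {
        /* Build a 256-byte table once: identity, except punctuation maps to a space. */
        char lut[256];
        for (int i = 0; i < 256; ++i) lut[i] = (char)(ispunct(i) ? ' ' : i);

        char text[] = "Hello, world!";
        /* `result` may alias `text`, so the transform can run in place. */
        sz_look_up_transform(text, strlen(text), lut, text);
        printf("%s\n", text); /* prints "Hello  world " */
        return 0;
    }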
@@ -1169,6 +1188,8 @@ SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt
SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
/** @copydoc sz_fill */
SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value);
+ /** @copydoc sz_look_up_transform */
+ SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target);
/** @copydoc sz_find_byte */
SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
/** @copydoc sz_rfind_byte */
@@ -3095,6 +3116,14 @@ SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) {
    return (sz_u8_t)(t >> shift);
}

+ SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result) {
+     sz_u8_t const *unsigned_lut = (sz_u8_t const *)lut;
+     sz_u8_t const *unsigned_text = (sz_u8_t const *)text;
+     sz_u8_t *unsigned_result = (sz_u8_t *)result;
+     sz_u8_t const *end = unsigned_text + length;
+     for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = unsigned_lut[*unsigned_text];
+ }
+
SZ_PUBLIC void sz_tolower_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) {
    sz_u8_t *unsigned_result = (sz_u8_t *)result;
    sz_u8_t const *unsigned_text = (sz_u8_t const *)text;
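A side note, not part of the patch: because the transform is a pure byte-wise mapping, two consecutive passes can be fused into one by composing the tables first, since `second[first[c]]` is itself a 256-byte table. A hedged sketch, assuming `sz_cptr_t`/`sz_ptr_t` are plain `char` pointers and `sz_size_t` accepts a `size_t`; the helper names are made up for illustration.

    #include <stddef.h>
    #include <stringzilla/stringzilla.h> /* assumed include path */

    /* Applying `fused` once is equivalent to applying `first`, then `second`. */
    static void compose_luts(unsigned char const *first, unsigned char const *second, unsigned char *fused) {
        for (int i = 0; i < 256; ++i) fused[i] = second[first[i]];
    }

    static void transform_twice(char const *text, size_t length, char const *first, char const *second, char *result) {
        unsigned char fused[256];
        compose_luts((unsigned char const *)first, (unsigned char const *)second, fused);
        sz_look_up_transform_serial(text, length, (char const *)fused, result); /* one pass instead of two */
    }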
@@ -5106,6 +5135,108 @@ SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t win
#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \
                             apply_to = function)

+ SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) {
+
+     // If the input is tiny (especially smaller than the look-up table itself), we may end up paying
+     // more for organizing the SIMD registers and changing the CPU state than for the actual computation.
+     // But if at least 3 cache lines are touched, the AVX-512 implementation should be faster.
+     if (length <= 128) {
+         sz_look_up_transform_serial(source, length, lut, target);
+         return;
+     }
+
+     // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail -
+     // and may include more cache lines in-between. Knowing this, we can avoid expensive unaligned stores
+     // by computing two masks - one for the head and one for the tail - using masked stores for those
+     // and unmasked aligned stores for the body.
+     sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less.
+     sz_size_t tail_length = (sz_size_t)(target + length) % 64;    // 63 or less.
+     __mmask64 head_mask = _sz_u64_mask_until(head_length);
+     __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
+
+     // We need to pull the lookup table into 4x ZMM registers.
+     // We could use the `vpermi2b` instruction to perform the lookup across two ZMM registers with the
+     // `_mm512_permutex2var_epi8` intrinsic, but it has a 6-cycle latency on Sapphire Rapids and requires
+     // AVX512-VBMI. Assuming we need to operate on 4 registers, it might be cleaner to use 4x separate
+     // `_mm512_permutexvar_epi8` calls, combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards.
+     //
+     // - `_mm512_mask_blend_epi8` - 1 cycle latency, and generally 2x can run in parallel.
+     // - `_mm512_test_epi8_mask` - 3 cycles latency, same as most comparison functions in AVX-512.
+     sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec;
+     lut_0_to_63_vec.zmm = _mm512_loadu_si512(lut);
+     lut_64_to_127_vec.zmm = _mm512_loadu_si512(lut + 64);
+     lut_128_to_191_vec.zmm = _mm512_loadu_si512(lut + 128);
+     lut_192_to_255_vec.zmm = _mm512_loadu_si512(lut + 192);
+
+     sz_u512_vec_t first_bit_vec, second_bit_vec;
+     first_bit_vec.zmm = _mm512_set1_epi8((char)0x80);
+     second_bit_vec.zmm = _mm512_set1_epi8((char)0x40);
+
+     __mmask64 first_bit_mask, second_bit_mask;
+     sz_u512_vec_t source_vec;
+     // If the top bit is set in a byte of `source_vec`, then we use `lookup_128_to_191_vec` or `lookup_192_to_255_vec`.
+     // If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`.
+     sz_u512_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec;
+     sz_u512_vec_t blended_0_to_127_vec, blended_128_to_255_vec, blended_0_to_255_vec;
+
+     // Handling the head.
+     if (head_length) {
+         source_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, source);
+         lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm);
+         lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm);
+         lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm);
+         lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm);
+         first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm);
+         second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm);
+         blended_0_to_127_vec.zmm =
+             _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm);
+         blended_128_to_255_vec.zmm =
+             _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm);
+         blended_0_to_255_vec.zmm =
+             _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm);
+         _mm512_mask_storeu_epi8(target, head_mask, blended_0_to_255_vec.zmm);
+         source += head_length, target += head_length, length -= head_length;
+     }
+
+     // Handling the body in 64-byte chunks aligned to cache-line boundaries with respect to `target`.
+     while (length >= 64) {
+         source_vec.zmm = _mm512_loadu_si512(source);
+         lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm);
+         lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm);
+         lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm);
+         lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm);
+         first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm);
+         second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm);
+         blended_0_to_127_vec.zmm =
+             _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm);
+         blended_128_to_255_vec.zmm =
+             _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm);
+         blended_0_to_255_vec.zmm =
+             _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm);
+         _mm512_store_si512(target, blended_0_to_255_vec.zmm); //! Aligned store, our main weapon!
+         source += 64, target += 64, length -= 64;
+     }
+
+     // Handling the tail.
+     if (tail_length) {
+         source_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, source);
+         lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm);
+         lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm);
+         lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm);
+         lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm);
+         first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm);
+         second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm);
+         blended_0_to_127_vec.zmm =
+             _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm);
+         blended_128_to_255_vec.zmm =
+             _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm);
+         blended_0_to_255_vec.zmm =
+             _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm);
+         _mm512_mask_storeu_epi8(target, tail_mask, blended_0_to_255_vec.zmm);
+         source += tail_length, target += tail_length, length -= tail_length;
+     }
+ }
+
SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) {

    // Before initializing the AVX-512 vectors, we may want to run the sequential code for the first few bytes.
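For readers less familiar with the masked-blend trick above, here is a scalar model of what the kernel computes for every byte (illustrative only, not part of the patch): a single-register `vpermb` indexes by the low 6 bits of each byte, and the two top bits decide which 64-byte quarter of the table wins the blends.

    #include <stdint.h>

    /* Scalar equivalent of the per-byte selection performed by the two tests and three blends above. */
    static uint8_t lut_select(uint8_t const lut[256], uint8_t byte) {
        uint8_t low6 = byte & 63; /* what a single-register `vpermb` sees as the index */
        uint8_t const *quarter = (byte & 0x80)                                 /* first bit: upper or lower half */
                                     ? ((byte & 0x40) ? lut + 192 : lut + 128) /* second bit: which quarter      */
                                     : ((byte & 0x40) ? lut + 64 : lut + 0);
        return quarter[low6]; /* same value as lut[byte] */
    }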
@@ -5920,6 +6051,14 @@ SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) {
#endif
}

+ SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) {
+ #if SZ_USE_X86_AVX512
+     sz_look_up_transform_avx512(source, length, lut, target);
+ #else
+     sz_look_up_transform_serial(source, length, lut, target);
+ #endif
+ }
+
SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) {
#if SZ_USE_X86_AVX512
    return sz_find_byte_avx512(haystack, h_length, needle);
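Not part of the patch, but a natural companion to the dispatch above: a small consistency check that the dynamically dispatched kernel agrees with the serial reference, including the <=128-byte fallback path (the function name and the byte-reversing table are illustrative, and `length` is assumed to be non-zero).

    #include <assert.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stringzilla/stringzilla.h> /* assumed include path */

    static void check_look_up_transform(size_t length) {
        char lut[256];
        char *text = (char *)malloc(length), *expected = (char *)malloc(length), *actual = (char *)malloc(length);
        for (int i = 0; i < 256; ++i) lut[i] = (char)(255 - i); /* arbitrary byte-reversing table */
        for (size_t i = 0; i < length; ++i) text[i] = (char)rand();
        sz_look_up_transform_serial(text, length, lut, expected); /* reference */
        sz_look_up_transform(text, length, lut, actual);          /* dispatched */
        assert(memcmp(expected, actual, length) == 0);
        free(text), free(expected), free(actual);
    }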