@@ -453,6 +453,25 @@ SZ_DYNAMIC sz_ordering_t sz_order(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b,
/** @copydoc sz_order */
SZ_PUBLIC sz_ordering_t sz_order_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length);

+ /**
+  * @brief Look Up Table @b (LUT) transformation of a string. Equivalent to `for (char & c : text) c = lut[c]`.
+  *
+  * Can be used to implement some form of string normalization, partially masking punctuation marks,
+  * or converting between different character sets, like uppercase or lowercase. Surprisingly, also has
+  * broad implications in image processing, where image channel transformations are often done using LUTs.
+  *
+  * @param text String to be normalized.
+  * @param length Number of bytes in the string.
+  * @param lut Look Up Table to apply. Must be exactly @b 256 bytes long.
+  * @param result Output string, can point to the same address as ::text.
+  */
+ SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result);
+
+ typedef void (*sz_look_up_transform_t)(sz_cptr_t, sz_size_t, sz_cptr_t, sz_ptr_t);
+
+ /** @copydoc sz_look_up_transform */
+ SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result);
+
/**
 * @brief Equivalent to `for (char & c : text) c = tolower(c)`.
 *
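For context (not part of the patch): a minimal usage sketch of the public API declared above, assuming the usual `<stringzilla/stringzilla.h>` include path; the punctuation-masking table is only an illustration of the "partially masking punctuation marks" use-case mentioned in the docstring.

    #include <ctype.h>  /* ispunct */
    #include <stdio.h>
    #include <string.h>
    #include <stringzilla/stringzilla.h> /* assumed include path */

    int main(void) {
        /* Build a 256-byte table once: identity, except punctuation maps to a space. */
        char lut[256];
        for (int i = 0; i < 256; ++i) lut[i] = (char)(ispunct(i) ? ' ' : i);

        char text[] = "Hello, world!";
        /* `result` may alias `text`, so the transform can run in place. */
        sz_look_up_transform(text, strlen(text), lut, text);
        printf("%s\n", text); /* prints "Hello  world " */
        return 0;
    }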
@@ -1169,6 +1188,8 @@ SZ_PUBLIC void sz_copy_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t lengt
SZ_PUBLIC void sz_move_avx512(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
/** @copydoc sz_fill */
SZ_PUBLIC void sz_fill_avx512(sz_ptr_t target, sz_size_t length, sz_u8_t value);
+ /** @copydoc sz_look_up_transform */
+ SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target);
/** @copydoc sz_find_byte */
SZ_PUBLIC sz_cptr_t sz_find_byte_avx512(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
/** @copydoc sz_rfind_byte */
@@ -3095,6 +3116,14 @@ SZ_INTERNAL sz_u8_t sz_u8_divide(sz_u8_t number, sz_u8_t divisor) {
    return (sz_u8_t)(t >> shift);
}

+ SZ_PUBLIC void sz_look_up_transform_serial(sz_cptr_t text, sz_size_t length, sz_cptr_t lut, sz_ptr_t result) {
+     sz_u8_t const *unsigned_lut = (sz_u8_t const *)lut;
+     sz_u8_t const *unsigned_text = (sz_u8_t const *)text;
+     sz_u8_t *unsigned_result = (sz_u8_t *)result;
+     sz_u8_t const *end = unsigned_text + length;
+     for (; unsigned_text != end; ++unsigned_text, ++unsigned_result) *unsigned_result = unsigned_lut[*unsigned_text];
+ }
+
SZ_PUBLIC void sz_tolower_serial(sz_cptr_t text, sz_size_t length, sz_ptr_t result) {
    sz_u8_t *unsigned_result = (sz_u8_t *)result;
    sz_u8_t const *unsigned_text = (sz_u8_t const *)text;
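A side note, not part of the patch: because the transform is a pure byte-wise mapping, two consecutive passes can be fused into one by composing the tables first, since `second[first[c]]` is itself a 256-byte table. A hedged sketch, assuming `sz_cptr_t`/`sz_ptr_t` are plain `char` pointers and `sz_size_t` accepts a `size_t`; the helper names are made up for illustration.

    #include <stddef.h>
    #include <stringzilla/stringzilla.h> /* assumed include path */

    /* Applying `fused` once is equivalent to applying `first`, then `second`. */
    static void compose_luts(unsigned char const *first, unsigned char const *second, unsigned char *fused) {
        for (int i = 0; i < 256; ++i) fused[i] = second[first[i]];
    }

    static void transform_twice(char const *text, size_t length, char const *first, char const *second, char *result) {
        unsigned char fused[256];
        compose_luts((unsigned char const *)first, (unsigned char const *)second, fused);
        sz_look_up_transform_serial(text, length, (char const *)fused, result); /* one pass instead of two */
    }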
@@ -5106,6 +5135,108 @@ SZ_PUBLIC void sz_hashes_avx512(sz_cptr_t start, sz_size_t length, sz_size_t win
#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512vbmi,avx512vbmi2,bmi,bmi2"))), \
                             apply_to = function)

+ SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) {
+
+     // If the input is tiny (especially smaller than the look-up table itself), we may end up paying
+     // more for organizing the SIMD registers and changing the CPU state than for the actual computation.
+     // But if at least 3 cache lines are touched, the AVX-512 implementation should be faster.
+     if (length <= 128) {
+         sz_look_up_transform_serial(source, length, lut, target);
+         return;
+     }
+
+     // When the buffer is over 64 bytes, it's guaranteed to touch at least two cache lines - the head and tail -
+     // and may include more cache lines in-between. Knowing this, we can avoid expensive unaligned stores
+     // by computing two masks - one for the head and one for the tail - using masked stores for those
+     // and unmasked aligned stores for the body.
+     sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less.
+     sz_size_t tail_length = (sz_size_t)(target + length) % 64;    // 63 or less.
+     __mmask64 head_mask = _sz_u64_mask_until(head_length);
+     __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
+
+     // We need to pull the lookup table into 4x ZMM registers.
+     // We could use the `vpermi2b` instruction to perform the lookup across two ZMM registers with the
+     // `_mm512_permutex2var_epi8` intrinsic, but it has a 6-cycle latency on Sapphire Rapids and requires
+     // AVX512-VBMI. Assuming we need to operate on 4 registers, it might be cleaner to use 4x separate
+     // `_mm512_permutexvar_epi8` calls, combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards.
+     //
+     // - `_mm512_mask_blend_epi8` - 1 cycle latency, and generally 2x can run in parallel.
+     // - `_mm512_test_epi8_mask` - 3 cycles latency, same as most comparison functions in AVX-512.
+     sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec;
+     lut_0_to_63_vec.zmm = _mm512_loadu_si512(lut);
+     lut_64_to_127_vec.zmm = _mm512_loadu_si512(lut + 64);
+     lut_128_to_191_vec.zmm = _mm512_loadu_si512(lut + 128);
+     lut_192_to_255_vec.zmm = _mm512_loadu_si512(lut + 192);
+
+     sz_u512_vec_t first_bit_vec, second_bit_vec;
+     first_bit_vec.zmm = _mm512_set1_epi8((char)0x80);
+     second_bit_vec.zmm = _mm512_set1_epi8((char)0x40);
+
+     __mmask64 first_bit_mask, second_bit_mask;
+     sz_u512_vec_t source_vec;
+     // If the top bit is set in a byte of `source_vec`, then we use `lookup_128_to_191_vec` or `lookup_192_to_255_vec`.
+     // If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`.
+     sz_u512_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec;
+     sz_u512_vec_t blended_0_to_127_vec, blended_128_to_255_vec, blended_0_to_255_vec;
+
+     // Handling the head.
+     if (head_length) {
+         source_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, source);
+         lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm);
+         lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm);
+         lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm);
+         lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm);
+         first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm);
+         second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm);
+         blended_0_to_127_vec.zmm =
+             _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm);
+         blended_128_to_255_vec.zmm =
+             _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm);
+         blended_0_to_255_vec.zmm =
+             _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm);
+         _mm512_mask_storeu_epi8(target, head_mask, blended_0_to_255_vec.zmm);
+         source += head_length, target += head_length, length -= head_length;
+     }
+
+     // Handling the body in 64-byte chunks aligned to cache-line boundaries with respect to `target`.
+     while (length >= 64) {
+         source_vec.zmm = _mm512_loadu_si512(source);
+         lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm);
+         lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm);
+         lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm);
+         lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm);
+         first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm);
+         second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm);
+         blended_0_to_127_vec.zmm =
+             _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm);
+         blended_128_to_255_vec.zmm =
+             _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm);
+         blended_0_to_255_vec.zmm =
+             _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm);
+         _mm512_store_si512(target, blended_0_to_255_vec.zmm); //! Aligned store, our main weapon!
+         source += 64, target += 64, length -= 64;
+     }
+
+     // Handling the tail.
+     if (tail_length) {
+         source_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, source);
+         lookup_0_to_63_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_0_to_63_vec.zmm);
+         lookup_64_to_127_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_64_to_127_vec.zmm);
+         lookup_128_to_191_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_128_to_191_vec.zmm);
+         lookup_192_to_255_vec.zmm = _mm512_permutexvar_epi8(source_vec.zmm, lut_192_to_255_vec.zmm);
+         first_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, first_bit_vec.zmm);
+         second_bit_mask = _mm512_test_epi8_mask(source_vec.zmm, second_bit_vec.zmm);
+         blended_0_to_127_vec.zmm =
+             _mm512_mask_blend_epi8(second_bit_mask, lookup_0_to_63_vec.zmm, lookup_64_to_127_vec.zmm);
+         blended_128_to_255_vec.zmm =
+             _mm512_mask_blend_epi8(second_bit_mask, lookup_128_to_191_vec.zmm, lookup_192_to_255_vec.zmm);
+         blended_0_to_255_vec.zmm =
+             _mm512_mask_blend_epi8(first_bit_mask, blended_0_to_127_vec.zmm, blended_128_to_255_vec.zmm);
+         _mm512_mask_storeu_epi8(target, tail_mask, blended_0_to_255_vec.zmm);
+         source += tail_length, target += tail_length, length -= tail_length;
+     }
+ }
+
SZ_PUBLIC sz_cptr_t sz_find_charset_avx512(sz_cptr_t text, sz_size_t length, sz_charset_t const *filter) {

    // Before initializing the AVX-512 vectors, we may want to run the sequential code for the first few bytes.
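For readers less familiar with the masked-blend trick above, here is a scalar model of what the kernel computes for every byte (illustrative only, not part of the patch): a single-register `vpermb` indexes by the low 6 bits of each byte, and the two top bits decide which 64-byte quarter of the table wins the blends.

    #include <stdint.h>

    /* Scalar equivalent of the per-byte selection performed by the two tests and three blends above. */
    static uint8_t lut_select(uint8_t const lut[256], uint8_t byte) {
        uint8_t low6 = byte & 63; /* what a single-register `vpermb` sees as the index */
        uint8_t const *quarter = (byte & 0x80)                                 /* first bit: upper or lower half */
                                     ? ((byte & 0x40) ? lut + 192 : lut + 128) /* second bit: which quarter      */
                                     : ((byte & 0x40) ? lut + 64 : lut + 0);
        return quarter[low6]; /* same value as lut[byte] */
    }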
@@ -5920,6 +6051,14 @@ SZ_DYNAMIC void sz_fill(sz_ptr_t target, sz_size_t length, sz_u8_t value) {
#endif
}

+ SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) {
+ #if SZ_USE_X86_AVX512
+     sz_look_up_transform_avx512(source, length, lut, target);
+ #else
+     sz_look_up_transform_serial(source, length, lut, target);
+ #endif
+ }
+
SZ_DYNAMIC sz_cptr_t sz_find_byte(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle) {
#if SZ_USE_X86_AVX512
    return sz_find_byte_avx512(haystack, h_length, needle);
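Not part of the patch, but a natural companion to the dispatch above: a small consistency check that the dynamically dispatched kernel agrees with the serial reference, including the <=128-byte fallback path (the function name and the byte-reversing table are illustrative, and `length` is assumed to be non-zero).

    #include <assert.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stringzilla/stringzilla.h> /* assumed include path */

    static void check_look_up_transform(size_t length) {
        char lut[256];
        char *text = (char *)malloc(length), *expected = (char *)malloc(length), *actual = (char *)malloc(length);
        for (int i = 0; i < 256; ++i) lut[i] = (char)(255 - i); /* arbitrary byte-reversing table */
        for (size_t i = 0; i < length; ++i) text[i] = (char)rand();
        sz_look_up_transform_serial(text, length, lut, expected); /* reference */
        sz_look_up_transform(text, length, lut, actual);          /* dispatched */
        assert(memcmp(expected, actual, length) == 0);
        free(text), free(expected), free(actual);
    }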