@@ -1251,6 +1251,8 @@ SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length)
 SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length);
 /** @copydoc sz_fill */
 SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value);
+/** @copydoc sz_look_up_transform */
+SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t table, sz_ptr_t target);
 /** @copydoc sz_find_byte */
 SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t haystack, sz_size_t h_length, sz_cptr_t needle);
 /** @copydoc sz_rfind_byte */
@@ -5780,13 +5782,11 @@ SZ_PUBLIC sz_ordering_t sz_order_neon(sz_cptr_t a, sz_size_t a_length, sz_cptr_t
 
 SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) {
     sz_u128_vec_t a_vec, b_vec;
-
-    while (length >= 16) {
+    for (; length >= 16; a += 16, b += 16, length -= 16) {
         a_vec.u8x16 = vld1q_u8((sz_u8_t const *)a);
         b_vec.u8x16 = vld1q_u8((sz_u8_t const *)b);
         uint8x16_t cmp = vceqq_u8(a_vec.u8x16, b_vec.u8x16);
         if (vmaxvq_u8(cmp) != 255) { return sz_false_k; } // Check if all bytes match
-        a += 16, b += 16, length -= 16;
     }
 
     // Handle remaining bytes
@@ -5795,19 +5795,27 @@ SZ_PUBLIC sz_bool_t sz_equal_neon(sz_cptr_t a, sz_cptr_t b, sz_size_t length) {
 }
 
 SZ_PUBLIC void sz_copy_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
-    sz_u128_vec_t src_vec;
-
-    while (length >= 16) {
-        src_vec.u8x16 = vld1q_u8((sz_u8_t const *)source);
-        vst1q_u8((sz_u8_t *)target, src_vec.u8x16);
-        target += 16, source += 16, length -= 16;
-    }
-
-    // Handle remaining bytes
+    // In most cases the `source` and the `target` are not aligned, but we should
+    // at least make sure that writes don't touch many cache lines.
+    // NEON has an instruction to load and write 64 bytes at once.
+    //
+    //     sz_size_t head_length = (64 - ((sz_size_t)target % 64)) % 64; // 63 or less.
+    //     sz_size_t tail_length = (sz_size_t)(target + length) % 64;    // 63 or less.
+    //     for (; head_length; target += 1, source += 1, head_length -= 1) *target = *source;
+    //     length -= head_length;
+    //     for (; length >= 64; target += 64, source += 64, length -= 64)
+    //         vst1q_u8_x4((sz_u8_t *)target, vld1q_u8_x4((sz_u8_t const *)source));
+    //     for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = *source;
+    //
+    // Sadly, those instructions end up being 20% slower than the code processing 16 bytes at a time:
+    for (; length >= 16; target += 16, source += 16, length -= 16)
+        vst1q_u8((sz_u8_t *)target, vld1q_u8((sz_u8_t const *)source));
     if (length) sz_copy_serial(target, source, length);
 }
 
 SZ_PUBLIC void sz_move_neon(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
+    // When moving small buffers, using a small buffer on the stack as temporary storage is faster.
+
     if (target < source || target >= source + length) {
         // Non-overlapping, proceed forward
         sz_copy_neon(target, source, length);
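The comment introducing sz_move_neon above only hints at the stack-buffer idea. A minimal sketch of that approach, assuming a 64-byte cut-off enforced by the caller; the helper name and threshold are illustrative and not part of this patch:

    // Hypothetical helper, not in the patch: for short overlapping moves, staging the bytes
    // in a stack buffer makes the copy direction-agnostic. The caller must guarantee that
    // `length` does not exceed the buffer size.
    static void sz_move_small_via_stack_(sz_ptr_t target, sz_cptr_t source, sz_size_t length) {
        char buffer[64]; // assumed cut-off, roughly one cache line
        sz_size_t i;
        for (i = 0; i != length; ++i) buffer[i] = source[i]; // read everything first
        for (i = 0; i != length; ++i) target[i] = buffer[i]; // then write, overlap-safe
    }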
@@ -5843,6 +5851,56 @@ SZ_PUBLIC void sz_fill_neon(sz_ptr_t target, sz_size_t length, sz_u8_t value) {
     if (length) sz_fill_serial(target, length, value);
 }
 
+SZ_PUBLIC void sz_look_up_transform_neon(sz_cptr_t source, sz_size_t length, sz_cptr_t lut, sz_ptr_t target) {
+
+    // If the input is tiny (especially smaller than the look-up table itself), we may end up paying
+    // more for organizing the SIMD registers and changing the CPU state, than for the actual computation.
+    if (length <= 128) {
+        sz_look_up_transform_serial(source, length, lut, target);
+        return;
+    }
+
+    sz_size_t head_length = (16 - ((sz_size_t)target % 16)) % 16; // 15 or less.
+    sz_size_t tail_length = (sz_size_t)(target + length) % 16;    // 15 or less.
+
+    // We need to pull the lookup table into 16x NEON registers. We have a total of 32 such registers.
+    // According to the Neoverse V2 manual, the 4-table lookup has a latency of 6 cycles, and 4x throughput.
+    uint8x16x4_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec;
+    lut_0_to_63_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 0));
+    lut_64_to_127_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 64));
+    lut_128_to_191_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 128));
+    lut_192_to_255_vec = vld1q_u8_x4((sz_u8_t const *)(lut + 192));
+
+    sz_u128_vec_t source_vec;
+    // If the top bit is set in a byte of `source_vec`, then we use `lookup_128_to_191_vec` or
+    // `lookup_192_to_255_vec`. If the second bit is set, we use `lookup_64_to_127_vec` or `lookup_192_to_255_vec`.
+    sz_u128_vec_t lookup_0_to_63_vec, lookup_64_to_127_vec, lookup_128_to_191_vec, lookup_192_to_255_vec;
+    sz_u128_vec_t blended_0_to_255_vec;
+
+    // Process the head with serial code
+    for (; head_length; target += 1, source += 1, head_length -= 1) *target = lut[*(sz_u8_t const *)source];
+
+    // Table lookups on Arm are much simpler to use than on x86, as we can use the `vqtbl4q_u8` instruction
+    // to perform a 4-table lookup in a single instruction. The XORs are used to adjust the lookup position
+    // within each 64-byte range of the table.
+    // Details on the 4-table lookup: https://lemire.me/blog/2019/07/23/arbitrary-byte-to-byte-maps-using-arm-neon/
+    length -= head_length;
+    length -= tail_length;
+    for (; length >= 16; source += 16, target += 16, length -= 16) {
+        source_vec.u8x16 = vld1q_u8((sz_u8_t const *)source);
+        lookup_0_to_63_vec.u8x16 = vqtbl4q_u8(lut_0_to_63_vec, source_vec.u8x16);
+        lookup_64_to_127_vec.u8x16 = vqtbl4q_u8(lut_64_to_127_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x40)));
+        lookup_128_to_191_vec.u8x16 = vqtbl4q_u8(lut_128_to_191_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x80)));
+        lookup_192_to_255_vec.u8x16 = vqtbl4q_u8(lut_192_to_255_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0xc0)));
+        blended_0_to_255_vec.u8x16 = vorrq_u8(vorrq_u8(lookup_0_to_63_vec.u8x16, lookup_64_to_127_vec.u8x16),
+                                              vorrq_u8(lookup_128_to_191_vec.u8x16, lookup_192_to_255_vec.u8x16));
+        vst1q_u8((sz_u8_t *)target, blended_0_to_255_vec.u8x16);
+    }
+
+    // Process the tail with serial code
+    for (; tail_length; target += 1, source += 1, tail_length -= 1) *target = lut[*(sz_u8_t const *)source];
+}
+
 SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
     sz_u64_t matches;
     sz_u128_vec_t h_vec, n_vec, matches_vec;
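The XOR trick in the main loop above works because a 4-register table lookup like `vqtbl4q_u8` returns zero for indices outside 0..63, so for any source byte exactly one of the four lookups lands in range and the OR-blend keeps that value. A purely illustrative scalar model of a single byte's lookup (the helper name is made up and not part of the patch):

    // Scalar model of one lane of the NEON blend: XOR-ing the byte with the base of each
    // 64-byte range maps exactly one of the four indices into 0..63; the other three fall
    // out of range and contribute zeros, just as vqtbl4q_u8 would.
    static unsigned char sz_lut_lookup_model_(unsigned char const lut[256], unsigned char byte) {
        unsigned char i0 = byte ^ 0x00, i1 = byte ^ 0x40, i2 = byte ^ 0x80, i3 = byte ^ 0xC0;
        unsigned char r0 = i0 < 64 ? lut[0 + i0] : 0;   // source bytes 0x00..0x3F
        unsigned char r1 = i1 < 64 ? lut[64 + i1] : 0;  // source bytes 0x40..0x7F
        unsigned char r2 = i2 < 64 ? lut[128 + i2] : 0; // source bytes 0x80..0xBF
        unsigned char r3 = i3 < 64 ? lut[192 + i3] : 0; // source bytes 0xC0..0xFF
        return (unsigned char)(r0 | r1 | r2 | r3);      // equals lut[byte]
    }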
@@ -6276,6 +6334,8 @@ SZ_DYNAMIC void sz_look_up_transform(sz_cptr_t source, sz_size_t length, sz_cptr
     sz_look_up_transform_avx512(source, length, lut, target);
 #elif SZ_USE_X86_AVX2
     sz_look_up_transform_avx2(source, length, lut, target);
+#elif SZ_USE_ARM_NEON
+    sz_look_up_transform_neon(source, length, lut, target);
 #else
     sz_look_up_transform_serial(source, length, lut, target);
 #endif
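With the NEON branch wired into the dispatcher, callers keep using the same entry point. A hypothetical usage sketch; the include path and the wrapper function are assumptions, only the `sz_look_up_transform` signature comes from this diff:

    #include <stddef.h>
    #include <stringzilla/stringzilla.h> // assumed public header path

    // Uppercases ASCII text through a 256-byte table; the dispatcher picks the widest
    // backend available at runtime, now including sz_look_up_transform_neon on Arm.
    void to_upper_ascii(char const *text, size_t length, char *result) {
        unsigned char table[256];
        int i;
        for (i = 0; i < 256; ++i) table[i] = (unsigned char)i;
        for (i = 'a'; i <= 'z'; ++i) table[i] = (unsigned char)(i - 'a' + 'A');
        sz_look_up_transform((sz_cptr_t)text, (sz_size_t)length, (sz_cptr_t)table, (sz_ptr_t)result);
    }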