lib/x86/adler32: use dot product with ones in AVX512VNNI implementation
After recent changes this is now faster.
ebiggers committed Feb 24, 2024
1 parent dafc469 commit 8ab1074
Showing 1 changed file with 29 additions and 21 deletions.
50 changes: 29 additions & 21 deletions lib/x86/adler32_impl.h
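For context, a minimal sketch of the two byte-summation strategies the commit title refers to (illustrative only, not code from the commit; the function names are hypothetical, and it assumes AVX512BW and AVX512VNNI are enabled at compile time):

#include <immintrin.h>

/* Previous approach: vpsadbw sums each group of 8 bytes into the low bits of
 * a 64-bit lane, and a separate vpaddd folds those partial sums into the
 * 32-bit accumulators. */
static __m512i
sum_bytes_sad(__m512i v_s1, __m512i bytes)
{
	return _mm512_add_epi32(v_s1,
				_mm512_sad_epu8(bytes, _mm512_setzero_si512()));
}

/* New approach: vpdpbusd with an all-ones multiplier is a dot product with 1,
 * i.e. the sum of each group of 4 bytes, accumulated in a single instruction. */
static __m512i
sum_bytes_ones(__m512i v_s1, __m512i bytes)
{
	return _mm512_dpbusd_epi32(v_s1, bytes, _mm512_set1_epi8(1));
}

Per the in-code comment added below, the vpdpbusd form appears to be faster than vpsadbw + vpaddd as long as the accumulators it feeds are independent. It also leaves sums in every 32-bit lane rather than every other one, which is why the reduction macros gain a used_sad parameter.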
@@ -36,16 +36,22 @@
* of which halves the vector length, until just one counter remains.
*
* The s1 reductions don't depend on the s2 reductions and vice versa, so for
* efficiency they are interleaved. Also, every other s1 counter is 0 due to
* the 'psadbw' instruction (_mm_sad_epu8) summing groups of 8 bytes rather than
* 4; hence, one of the s1 reductions is skipped when going from 128 => 32 bits.
* efficiency they are interleaved.
*
* If used_sad=1, then the bytes were summed using one of the Sum of Absolute
* Differences instructions (psadbw or vpsadbw) before they were added to v_s1.
* In this case every other counter in v_s1 is 0, so we skip one of the s1
* reductions when going from 128 => 32 bits.
*/

#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2) \
#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, used_sad) \
{ \
__m128i /* __v4su */ s1_last = (v_s1), s2_last = (v_s2); \
\
/* 128 => 32 bits */ \
if (!(used_sad)) \
s1_last = _mm_add_epi32(s1_last, \
_mm_shuffle_epi32(s1_last, 0x31)); \
s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x31)); \
s1_last = _mm_add_epi32(s1_last, _mm_shuffle_epi32(s1_last, 0x02)); \
s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x02)); \
@@ -54,7 +60,7 @@
*(s2) += (u32)_mm_cvtsi128_si32(s2_last); \
}
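An illustrative aside (not part of the diff): the 128 => 32 bit reduction that ADLER32_FINISH_VEC_CHUNK_128 performs, written out for a single counter vector so the used_sad shortcut is visible. It assumes v holds 32-bit lanes [x0, x1, x2, x3]; the helper name is hypothetical.

#include <emmintrin.h>
#include <stdint.h>

static uint32_t
reduce_counters(__m128i v, int used_sad)
{
	/* With psadbw-produced sums, lanes 1 and 3 are already 0, so the
	 * first shuffle+add contributes nothing and can be skipped. */
	if (!used_sad)
		v = _mm_add_epi32(v, _mm_shuffle_epi32(v, 0x31)); /* lane0 = x0+x1, lane2 = x2+x3 */
	v = _mm_add_epi32(v, _mm_shuffle_epi32(v, 0x02)); /* lane0 += lane2 */
	return (uint32_t)_mm_cvtsi128_si32(v); /* total of all four lanes */
}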

#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2) \
#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, used_sad) \
{ \
__m128i /* __v4su */ s1_128bit, s2_128bit; \
\
@@ -64,10 +70,11 @@
s2_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s2), 0), \
_mm256_extracti128_si256((v_s2), 1)); \
\
ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \
ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit, \
(used_sad)); \
}

#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \
#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2, used_sad) \
{ \
__m256i /* __v8su */ s1_256bit, s2_256bit; \
\
@@ -77,7 +84,8 @@
s2_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s2), 0), \
_mm512_extracti64x4_epi64((v_s2), 1)); \
\
ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \
ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit, \
(used_sad)); \
}

/*
@@ -191,7 +199,7 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_d, mults_d));

/* Add the counters to the real s1 and s2. */
ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2);
ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, 1);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_SSE2_INTRIN */
@@ -271,7 +279,7 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_b, mults_b));
v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_c, mults_c));
v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_d, mults_d));
ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2);
ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, 1);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX2_INTRIN */
@@ -361,19 +369,14 @@ adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end,
v_s2_c = _mm256_add_epi32(v_s2_c, v_s2_d);
v_s2_a = _mm256_add_epi32(v_s2_a, v_s2_c);
v_s2_a = _mm256_add_epi32(v_s2_a, v_s1_sums_a);
ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a);
ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a, 1);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN */

/*
* AVX512BW/AVX512VNNI implementation. Uses the vpsadbw (sum of absolute
* differences) instruction from AVX512BW and the vpdpbusd (dot product)
* instruction from AVX512VNNI. Note that vpdpbusd with all-ones could be used
* instead of vpsadbw, but vpsadbw is faster.
*
* We currently don't include an implementation using AVX512BW or AVX512VNNI
* alone, since CPUs with good AVX-512 implementations tend to have both.
* AVX512BW/AVX512VNNI implementation. Uses the vpdpbusd (dot product)
* instruction from AVX512VNNI.
*/
#if HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN
# define adler32_avx512_vnni adler32_avx512_vnni
@@ -413,6 +416,7 @@ adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end,
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
};
const __m512i /* __v64qu */ mults_a = _mm512_load_si512((const __m512i *)mults);
const __m512i /* __v64qu */ ones = _mm512_set1_epi8(1);
const __m512i zeroes = _mm512_setzero_si512();
__m512i /* __v16su */ v_s1_a = zeroes;
__m512i /* __v16su */ v_s1_b = zeroes;
@@ -429,8 +433,12 @@
v_s2_b = _mm512_dpbusd_epi32(v_s2_b, bytes_b, mults_a);
v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_a);
v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1_b);
v_s1_a = _mm512_add_epi32(v_s1_a, _mm512_sad_epu8(bytes_a, zeroes));
v_s1_b = _mm512_add_epi32(v_s1_b, _mm512_sad_epu8(bytes_b, zeroes));
/*
* vpdpbusd with 1's seems to be faster than vpsadbw + vpaddd
* here, provided that the accumulators are independent.
*/
v_s1_a = _mm512_dpbusd_epi32(v_s1_a, bytes_a, ones);
v_s1_b = _mm512_dpbusd_epi32(v_s1_b, bytes_b, ones);
GCC_UPDATE_VARS(v_s1_a, v_s1_b, v_s1_sums_a, v_s1_sums_b,
v_s2_a, v_s2_b);
} while (p != end);
@@ -441,7 +449,7 @@
v_s1_a = _mm512_add_epi32(v_s1_a, v_s1_b);
v_s2_a = _mm512_add_epi32(v_s2_a, v_s2_b);
v_s2_a = _mm512_add_epi32(v_s2_a, v_s1_sums_a);
ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a);
ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a, 0);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN */
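For reference, the scalar recurrence that all of these chunk functions accelerate (the standard Adler-32 definition, not code from this file): s1 is 1 plus the sum of all bytes and s2 is the running sum of the successive s1 values, both modulo 65521. A straightforward, unoptimized version:

#include <stddef.h>
#include <stdint.h>

/* Standard Adler-32, one byte at a time; real implementations defer the
 * modulo, but this shows the s1/s2 recurrence that the vector code maintains
 * across 32/64-byte blocks. */
static uint32_t
adler32_reference(uint32_t adler, const uint8_t *p, size_t len)
{
	uint32_t s1 = adler & 0xFFFF;
	uint32_t s2 = adler >> 16;

	while (len--) {
		s1 = (s1 + *p++) % 65521;
		s2 = (s2 + s1) % 65521;
	}
	return (s2 << 16) | s1;
}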