lib/x86/adler32: use dot product with ones in AVX512VNNI implementation #344

Merged 1 commit on Feb 24, 2024
50 changes: 29 additions & 21 deletions lib/x86/adler32_impl.h
@@ -36,16 +36,22 @@
* of which halves the vector length, until just one counter remains.
*
* The s1 reductions don't depend on the s2 reductions and vice versa, so for
- * efficiency they are interleaved. Also, every other s1 counter is 0 due to
- * the 'psadbw' instruction (_mm_sad_epu8) summing groups of 8 bytes rather than
- * 4; hence, one of the s1 reductions is skipped when going from 128 => 32 bits.
+ * efficiency they are interleaved.
+ *
+ * If used_sad=1, then the bytes were summed using one of the Sum of Absolute
+ * Differences instructions (psadbw or vpsadbw) before they were added to v_s1.
+ * In this case every other counter in v_s1 is 0, so we skip one of the s1
+ * reductions when going from 128 => 32 bits.
*/

- #define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2) \
+ #define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, used_sad) \
{ \
__m128i /* __v4su */ s1_last = (v_s1), s2_last = (v_s2); \
\
/* 128 => 32 bits */ \
+ if (!(used_sad)) \
+ s1_last = _mm_add_epi32(s1_last, \
+ _mm_shuffle_epi32(s1_last, 0x31)); \
s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x31)); \
s1_last = _mm_add_epi32(s1_last, _mm_shuffle_epi32(s1_last, 0x02)); \
s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x02)); \
@@ -54,7 +60,7 @@
*(s2) += (u32)_mm_cvtsi128_si32(s2_last); \
}
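
For reference, here is a scalar model of this finishing step. It is an illustrative sketch only, not part of the diff; the helper name is made up, and u32 is libdeflate's 32-bit unsigned integer type. It shows why used_sad=1 allows skipping one s1 addition: psadbw/vpsadbw produce one 64-bit sum per 8 input bytes, so the odd 32-bit lanes of v_s1 stay zero.

/* Illustrative scalar model only; not part of this diff. */
static inline void
finish_chunk_scalar_model(u32 *s1, u32 *s2, const u32 v_s1[4],
                          const u32 v_s2[4], int used_sad)
{
        if (used_sad) {
                /* v_s1[1] and v_s1[3] are known to be zero. */
                *s1 += v_s1[0] + v_s1[2];
        } else {
                *s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
        }
        *s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
}

The real macro performs the same reduction with _mm_shuffle_epi32 and _mm_add_epi32, skipping the first s1 shuffle-add when used_sad is set.
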

- #define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2) \
+ #define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, used_sad) \
{ \
__m128i /* __v4su */ s1_128bit, s2_128bit; \
\
@@ -64,10 +70,11 @@
s2_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s2), 0), \
_mm256_extracti128_si256((v_s2), 1)); \
\
- ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \
+ ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit, \
+ (used_sad)); \
}

- #define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \
+ #define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2, used_sad) \
{ \
__m256i /* __v8su */ s1_256bit, s2_256bit; \
\
@@ -77,7 +84,8 @@
s2_256bit = _mm256_add_epi32(_mm512_extracti64x4_epi64((v_s2), 0), \
_mm512_extracti64x4_epi64((v_s2), 1)); \
\
- ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \
+ ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit, \
+ (used_sad)); \
}

/*
@@ -191,7 +199,7 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_d, mults_d));

/* Add the counters to the real s1 and s2. */
- ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2);
+ ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2, 1);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_SSE2_INTRIN */
@@ -271,7 +279,7 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_b, mults_b));
v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_c, mults_c));
v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_d, mults_d));
- ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2);
+ ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2, 1);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX2_INTRIN */
@@ -361,19 +369,14 @@ adler32_avx2_vnni_chunk(const __m256i *p, const __m256i *const end,
v_s2_c = _mm256_add_epi32(v_s2_c, v_s2_d);
v_s2_a = _mm256_add_epi32(v_s2_a, v_s2_c);
v_s2_a = _mm256_add_epi32(v_s2_a, v_s1_sums_a);
- ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a);
+ ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1_a, v_s2_a, 1);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX2_INTRIN && HAVE_AVXVNNI_INTRIN */

/*
- * AVX512BW/AVX512VNNI implementation. Uses the vpsadbw (sum of absolute
- * differences) instruction from AVX512BW and the vpdpbusd (dot product)
- * instruction from AVX512VNNI. Note that vpdpbusd with all-ones could be used
- * instead of vpsadbw, but vpsadbw is faster.
- *
- * We currently don't include an implementation using AVX512BW or AVX512VNNI
- * alone, since CPUs with good AVX-512 implementations tend to have both.
+ * AVX512BW/AVX512VNNI implementation. Uses the vpdpbusd (dot product)
+ * instruction from AVX512VNNI.
*/
#if HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN
# define adler32_avx512_vnni adler32_avx512_vnni
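
As background for the comment above, the following sketch (illustrative only, not part of the diff; the function name is made up) shows what vpdpbusd with an all-ones operand computes: each group of four unsigned bytes is reduced to its sum and added to the corresponding 32-bit accumulator lane, which is exactly the per-lane byte sum that the s1 accumulators need.

/* Illustrative only; requires <immintrin.h> and AVX512VNNI support. */
static inline __m512i
add_byte_sums_per_dword(__m512i acc, __m512i bytes)
{
        /* acc[i] += bytes[4i] + bytes[4i+1] + bytes[4i+2] + bytes[4i+3] */
        return _mm512_dpbusd_epi32(acc, bytes, _mm512_set1_epi8(1));
}
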
@@ -413,6 +416,7 @@ adler32_avx512_vnni_chunk(const __m512i *p, const __m512i *const end,
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
};
const __m512i /* __v64qu */ mults_a = _mm512_load_si512((const __m512i *)mults);
+ const __m512i /* __v64qu */ ones = _mm512_set1_epi8(1);
const __m512i zeroes = _mm512_setzero_si512();
__m512i /* __v16su */ v_s1_a = zeroes;
__m512i /* __v16su */ v_s1_b = zeroes;
@@ -429,8 +433,12 @@
v_s2_b = _mm512_dpbusd_epi32(v_s2_b, bytes_b, mults_a);
v_s1_sums_a = _mm512_add_epi32(v_s1_sums_a, v_s1_a);
v_s1_sums_b = _mm512_add_epi32(v_s1_sums_b, v_s1_b);
- v_s1_a = _mm512_add_epi32(v_s1_a, _mm512_sad_epu8(bytes_a, zeroes));
- v_s1_b = _mm512_add_epi32(v_s1_b, _mm512_sad_epu8(bytes_b, zeroes));
+ /*
+ * vpdpbusd with 1's seems to be faster than vpsadbw + vpaddd
+ * here, provided that the accumulators are independent.
+ */
+ v_s1_a = _mm512_dpbusd_epi32(v_s1_a, bytes_a, ones);
+ v_s1_b = _mm512_dpbusd_epi32(v_s1_b, bytes_b, ones);
GCC_UPDATE_VARS(v_s1_a, v_s1_b, v_s1_sums_a, v_s1_sums_b,
v_s2_a, v_s2_b);
} while (p != end);
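
For comparison, here is an illustrative sketch (not part of the diff; function names are made up, <immintrin.h> assumed) of the two byte-summing options. Both add the same total to the accumulator, but vpsadbw produces one 64-bit sum per 8 bytes, leaving every other 32-bit lane zero, whereas vpdpbusd with ones can populate every 32-bit lane. That difference is why the finish call below now passes used_sad=0.

/* Old approach: sum of absolute differences against zero, then add. */
static inline __m512i
sum_bytes_sad(__m512i acc, __m512i bytes)
{
        return _mm512_add_epi32(acc,
                                _mm512_sad_epu8(bytes, _mm512_setzero_si512()));
}

/* New approach: dot product with an all-ones vector, one instruction. */
static inline __m512i
sum_bytes_dot(__m512i acc, __m512i bytes)
{
        return _mm512_dpbusd_epi32(acc, bytes, _mm512_set1_epi8(1));
}

vpdpbusd folds the addition into a single instruction; its latency is hidden here because v_s1_a and v_s1_b form independent accumulation chains.
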
@@ -441,7 +449,7 @@
v_s1_a = _mm512_add_epi32(v_s1_a, v_s1_b);
v_s2_a = _mm512_add_epi32(v_s2_a, v_s2_b);
v_s2_a = _mm512_add_epi32(v_s2_a, v_s1_sums_a);
- ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a);
+ ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1_a, v_s2_a, 0);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX512BW_INTRIN && HAVE_AVX512VNNI_INTRIN */