lib/{adler32,crc32}: misc cleanups
Various cleanups, including tweaks to make the Adler-32 code more
consistent with the CRC-32 code and vice versa.  No behavior changes.
ebiggers committed Mar 12, 2024
1 parent 511893f commit 8ae3a19
Showing 8 changed files with 215 additions and 202 deletions.
4 changes: 2 additions & 2 deletions lib/arm/adler32_impl.h
@@ -43,7 +43,7 @@
 # endif
 # endif
 # include <arm_neon.h>
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 adler32_arm_neon(u32 adler, const u8 *p, size_t len)
 {
 static const u16 _aligned_attribute(16) mults[64] = {
@@ -225,7 +225,7 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)
 # endif
 # endif
 # include <arm_neon.h>
-static u32 ATTRIBUTES
+static ATTRIBUTES u32
 adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len)
 {
 static const u8 _aligned_attribute(16) mults[64] = {
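
Note on the recurring signature change in this commit: "static u32 ATTRIBUTES MAYBE_UNUSED" becomes "static ATTRIBUTES MAYBE_UNUSED u32". This only moves the attribute macros ahead of the return type; both orderings are valid declaration-specifier orders for GCC and Clang, and code generation is unaffected. A minimal sketch of the pattern, with placeholder macro expansions rather than libdeflate's own:

#include <stddef.h>
#include <stdint.h>

typedef uint8_t u8;
typedef uint32_t u32;

/* Placeholder expansions, for illustration only; the real macros are
 * defined near the top of each file. */
#define ATTRIBUTES   /* e.g. a _target_attribute(...) expansion */
#define MAYBE_UNUSED __attribute__((unused))

/* Old ordering: attribute macros split around the return type. */
static u32 ATTRIBUTES MAYBE_UNUSED
adler32_old_ordering(u32 adler, const u8 *p, size_t len)
{
	(void)p; (void)len;
	return adler;
}

/* New ordering: attributes first, then the return type, so the
 * adler32 and crc32 files all declare functions the same way. */
static ATTRIBUTES MAYBE_UNUSED u32
adler32_new_ordering(u32 adler, const u8 *p, size_t len)
{
	(void)p; (void)len;
	return adler;
}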
6 changes: 3 additions & 3 deletions lib/arm/crc32_impl.h
@@ -113,7 +113,7 @@ combine_crcs_slow(u32 crc0, u32 crc1, u32 crc2, u32 crc3)
 }
 
 #define crc32_arm_crc crc32_arm_crc
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 crc32_arm_crc(u32 crc, const u8 *p, size_t len)
 {
 if (len >= 64) {
@@ -289,7 +289,7 @@ combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i)
 }
 
 #define crc32_arm_crc_pmullcombine crc32_arm_crc_pmullcombine
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len)
 {
 const size_t align = -(uintptr_t)p & 7;
@@ -470,7 +470,7 @@ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len)
 # define ENABLE_EOR3 0
 # include "crc32_pmull_helpers.h"
 
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
 {
 static const u64 _aligned_attribute(16) mults[3][2] = {
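
The lines of the form "#define crc32_arm_crc crc32_arm_crc" are how these files advertise which implementations got compiled in: a macro that expands to itself is a no-op in the code, but dispatch logic can test for it with #ifdef. A simplified sketch of the idiom follows; this is not libdeflate's actual dispatcher, and the compile-time gate shown is just an example:

#include <stddef.h>
#include <stdint.h>

typedef uint8_t u8;
typedef uint32_t u32;
typedef u32 (*crc32_func_t)(u32 crc, const u8 *p, size_t len);

#ifdef __ARM_FEATURE_CRC32          /* example compile-time gate */
#  define crc32_arm_crc crc32_arm_crc  /* expands to itself: a marker */
static u32
crc32_arm_crc(u32 crc, const u8 *p, size_t len)
{
	/* Real code would use the __crc32b/__crc32d intrinsics here. */
	(void)p; (void)len;
	return crc;
}
#endif

static crc32_func_t
select_crc32_impl(void)
{
#ifdef crc32_arm_crc
	/* Only reachable if the implementation above was compiled in;
	 * a real dispatcher would also check CPU features at runtime. */
	return crc32_arm_crc;
#endif
	return NULL;  /* caller falls back to the generic implementation */
}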
2 changes: 1 addition & 1 deletion lib/arm/crc32_pmull_wide.h
@@ -52,7 +52,7 @@
 
 #include "crc32_pmull_helpers.h"
 
-static u32 ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED u32
 ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
 {
 uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
2 changes: 1 addition & 1 deletion lib/decompress_template.h
@@ -41,7 +41,7 @@
 # define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count)))
 #endif
 
-static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED
+static ATTRIBUTES MAYBE_UNUSED enum libdeflate_result
 FUNCNAME(struct libdeflate_decompressor * restrict d,
 const void * restrict in, size_t in_nbytes,
 void * restrict out, size_t out_nbytes_avail,
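
The hunk's context line defines EXTRACT_VARBITS8 in terms of BITMASK, which is defined elsewhere in the decompressor; the u8 cast on the count apparently lets the compiler see that the shift amount fits in a byte. A plausible shape of that macro, stated here as an assumption for illustration and not quoted from the file, is:

#include <stdint.h>

typedef uint64_t bitbuf_t;  /* assumed bit-buffer type */
typedef uint8_t u8;

/* Assumed definition, for illustration only: */
#define BITMASK(n)  (((bitbuf_t)1 << (n)) - 1)

/* Then EXTRACT_VARBITS8(word, 3) == (word & 0x7), i.e. the low
 * 3 bits of the bit buffer. */
#define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count)))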
20 changes: 10 additions & 10 deletions lib/x86/adler32_impl.h
@@ -33,19 +33,19 @@
 /* SSE2 and AVX2 implementations. Used on older CPUs. */
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 # define adler32_x86_sse2 adler32_x86_sse2
-# define SUFFIX _x86_sse2
+# define SUFFIX _sse2
 # define ATTRIBUTES _target_attribute("sse2")
 # define VL 16
 # define USE_VNNI 0
-# define USE_MASKING 0
+# define USE_AVX512 0
 # include "adler32_template.h"
 
 # define adler32_x86_avx2 adler32_x86_avx2
-# define SUFFIX _x86_avx2
+# define SUFFIX _avx2
 # define ATTRIBUTES _target_attribute("avx2")
 # define VL 32
 # define USE_VNNI 0
-# define USE_MASKING 0
+# define USE_AVX512 0
 # include "adler32_template.h"
 #endif
 
@@ -55,11 +55,11 @@
  */
 #if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930)
 # define adler32_x86_avx2_vnni adler32_x86_avx2_vnni
-# define SUFFIX _x86_avx2_vnni
+# define SUFFIX _avx2_vnni
 # define ATTRIBUTES _target_attribute("avx2,avxvnni")
 # define VL 32
 # define USE_VNNI 1
-# define USE_MASKING 0
+# define USE_AVX512 0
 # include "adler32_template.h"
 #endif
 
@@ -72,11 +72,11 @@
  * that support AVX10/256 but not AVX10/512.
  */
 # define adler32_x86_avx512_vl256_vnni adler32_x86_avx512_vl256_vnni
-# define SUFFIX _x86_avx512_vl256_vnni
+# define SUFFIX _avx512_vl256_vnni
 # define ATTRIBUTES _target_attribute("avx512bw,avx512vl,avx512vnni")
 # define VL 32
 # define USE_VNNI 1
-# define USE_MASKING 1
+# define USE_AVX512 1
 # include "adler32_template.h"
 
 /*
@@ -85,11 +85,11 @@
  * the optimal implementation on CPUs that support AVX10/512.
  */
 # define adler32_x86_avx512_vl512_vnni adler32_x86_avx512_vl512_vnni
-# define SUFFIX _x86_avx512_vl512_vnni
+# define SUFFIX _avx512_vl512_vnni
 # define ATTRIBUTES _target_attribute("avx512bw,avx512vnni")
 # define VL 64
 # define USE_VNNI 1
-# define USE_MASKING 1
+# define USE_AVX512 1
 # include "adler32_template.h"
 #endif
 
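
The SUFFIX renames above (_x86_sse2 to _sse2, and so on) pair with the adler32 to adler32_x86 rename in adler32_template.h below, so every generated function keeps its old name; only where the _x86 part comes from changes. A sketch of the token pasting involved, with CONCAT helpers that are illustrative and may differ from libdeflate's equivalents:

/* Illustrative token-pasting helpers (names are assumptions): */
#define CONCAT_IMPL(a, b) a##b
#define CONCAT(a, b)      CONCAT_IMPL(a, b)  /* expands a and b first */
#define ADD_SUFFIX(name)  CONCAT(name, SUFFIX)

/* Before this commit: SUFFIX = _x86_sse2, template base = adler32
 *   ADD_SUFFIX(adler32)     -> adler32_x86_sse2
 * After this commit:  SUFFIX = _sse2, template base = adler32_x86
 *   ADD_SUFFIX(adler32_x86) -> adler32_x86_sse2  (same symbol)
 */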
52 changes: 29 additions & 23 deletions lib/x86/adler32_template.h
@@ -34,20 +34,21 @@
  * ATTRIBUTES:
  * Target function attributes to use. Must satisfy the dependencies of the
  * other parameters as follows:
- * VL=16 && USE_VNNI=0 && USE_MASKING=0: at least sse2
- * VL=32 && USE_VNNI=0 && USE_MASKING=0: at least avx2
- * VL=32 && USE_VNNI=1 && USE_MASKING=0: at least avx2,avxvnni
- * VL=32 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vl,avx512vnni
- * VL=64 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vnni
+ * VL=16 && USE_VNNI=0 && USE_AVX512=0: at least sse2
+ * VL=32 && USE_VNNI=0 && USE_AVX512=0: at least avx2
+ * VL=32 && USE_VNNI=1 && USE_AVX512=0: at least avx2,avxvnni
+ * VL=32 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vl,avx512vnni
+ * VL=64 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vnni
  * (Other combinations are not useful and have not been tested.)
  * VL:
- * Vector length in bytes. Must be 16, 32, and 64.
+ * Vector length in bytes. Must be 16, 32, or 64.
  * USE_VNNI:
  * If 1, use the VNNI dot product based algorithm.
  * If 0, use the legacy SSE2 and AVX2 compatible algorithm.
- * USE_MASKING:
- * If 1, use AVX-512 features such as masking.
- * If 0, assume that the CPU might not support AVX-512.
+ * USE_AVX512:
+ * If 1, take advantage of AVX-512 features such as masking. This doesn't
+ * enable the use of 512-bit vectors; the vector length is controlled by
+ * VL. If 0, assume that the CPU might not support AVX-512.
  */
 
 #if VL == 16
@@ -57,7 +58,7 @@
 # define VADD8(a, b) _mm_add_epi8((a), (b))
 # define VADD16(a, b) _mm_add_epi16((a), (b))
 # define VADD32(a, b) _mm_add_epi32((a), (b))
-# if USE_MASKING
+# if USE_AVX512
 # define VDPBUSD(a, b, c) _mm_dpbusd_epi32((a), (b), (c))
 # else
 # define VDPBUSD(a, b, c) _mm_dpbusd_avx_epi32((a), (b), (c))
@@ -68,20 +69,20 @@
 # define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p))
 # define VMULLO32(a, b) _mm_mullo_epi32((a), (b))
 # define VSAD8(a, b) _mm_sad_epu8((a), (b))
-# define VSET1_32(a) _mm_set1_epi32(a)
 # define VSET1_8(a) _mm_set1_epi8(a)
+# define VSET1_32(a) _mm_set1_epi32(a)
 # define VSETZERO() _mm_setzero_si128()
 # define VSLL32(a, b) _mm_slli_epi32((a), (b))
-# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b))
 # define VUNPACKLO8(a, b) _mm_unpacklo_epi8((a), (b))
+# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b))
 #elif VL == 32
 # define vec_t __m256i
 # define mask_t u32
 # define LOG2_VL 5
 # define VADD8(a, b) _mm256_add_epi8((a), (b))
 # define VADD16(a, b) _mm256_add_epi16((a), (b))
 # define VADD32(a, b) _mm256_add_epi32((a), (b))
-# if USE_MASKING
+# if USE_AVX512
 # define VDPBUSD(a, b, c) _mm256_dpbusd_epi32((a), (b), (c))
 # else
 # define VDPBUSD(a, b, c) _mm256_dpbusd_avx_epi32((a), (b), (c))
@@ -92,27 +93,32 @@
 # define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p))
 # define VMULLO32(a, b) _mm256_mullo_epi32((a), (b))
 # define VSAD8(a, b) _mm256_sad_epu8((a), (b))
-# define VSET1_32(a) _mm256_set1_epi32(a)
 # define VSET1_8(a) _mm256_set1_epi8(a)
+# define VSET1_32(a) _mm256_set1_epi32(a)
 # define VSETZERO() _mm256_setzero_si256()
 # define VSLL32(a, b) _mm256_slli_epi32((a), (b))
-# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b))
 # define VUNPACKLO8(a, b) _mm256_unpacklo_epi8((a), (b))
+# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b))
 #elif VL == 64
 # define vec_t __m512i
 # define mask_t u64
 # define LOG2_VL 6
 # define VADD8(a, b) _mm512_add_epi8((a), (b))
 # define VADD16(a, b) _mm512_add_epi16((a), (b))
 # define VADD32(a, b) _mm512_add_epi32((a), (b))
 # define VDPBUSD(a, b, c) _mm512_dpbusd_epi32((a), (b), (c))
 # define VLOAD(p) _mm512_load_si512((const void *)(p))
 # define VLOADU(p) _mm512_loadu_si512((const void *)(p))
+# define VMADD16(a, b) _mm512_madd_epi16((a), (b))
 # define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p))
 # define VMULLO32(a, b) _mm512_mullo_epi32((a), (b))
-# define VSET1_32(a) _mm512_set1_epi32(a)
+# define VSAD8(a, b) _mm512_sad_epu8((a), (b))
+# define VSET1_8(a) _mm512_set1_epi8(a)
+# define VSET1_32(a) _mm512_set1_epi32(a)
 # define VSETZERO() _mm512_setzero_si512()
 # define VSLL32(a, b) _mm512_slli_epi32((a), (b))
+# define VUNPACKLO8(a, b) _mm512_unpacklo_epi8((a), (b))
+# define VUNPACKHI8(a, b) _mm512_unpackhi_epi8((a), (b))
 #else
 # error "unsupported vector length"
 #endif
@@ -173,8 +179,8 @@ ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p)
 }
 #define reduce_to_32bits ADD_SUFFIX(reduce_to_32bits)
 
-static u32 ATTRIBUTES
-ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
+static ATTRIBUTES u32
+ADD_SUFFIX(adler32_x86)(u32 adler, const u8 *p, size_t len)
 {
 #if USE_VNNI
 /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */
@@ -235,7 +241,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
 
 #if USE_VNNI
 /*
- * This is Adler-32 using the vpdpbusd instruction from AVX512-VNNI or
+ * This is Adler-32 using the vpdpbusd instruction from AVX512VNNI or
  * AVX-VNNI. vpdpbusd multiplies the unsigned bytes of one vector by
  * the signed bytes of another vector and adds the sums in groups of 4
  * to the 32-bit elements of a third vector. We use it in two ways:
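
For reference, the per-lane arithmetic of vpdpbusd described in the comment above can be modeled in scalar code as follows; this is a sketch of the instruction's semantics, not code from this file:

#include <stdint.h>

/* One 32-bit lane of vpdpbusd: multiply 4 unsigned bytes from 'a'
 * by the corresponding 4 signed bytes from 'b', then add all four
 * products to the 32-bit accumulator. */
static int32_t
vpdpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4])
{
	for (int i = 0; i < 4; i++)
		acc += (int32_t)a[i] * (int32_t)b[i];
	return acc;
}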
@@ -369,7 +375,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
  * Process the last 0 < n <= VL bytes of the chunk.
  * Utilize a masked load if it's available.
  */
-#if USE_MASKING
+#if USE_AVX512
 data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p);
 #else
 data = zeroes;
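
The mask expression (mask_t)-1 >> (VL - n) yields exactly n one bits, so the masked load reads only the n remaining bytes and zero-fills the rest of the vector. A worked instance, assuming VL = 32 (so mask_t is u32) and n = 5:

#include <immintrin.h>
#include <stdint.h>

/* (u32)-1 >> (32 - 5) == 0xFFFFFFFF >> 27 == 0x0000001F: five 1 bits. */
__m256i
load_tail_example(const uint8_t *p)
{
	/* Reads only p[0..4]; byte lanes 5..31 are zeroed, so the load
	 * cannot run off the end of the buffer. (Requires AVX512BW+VL.) */
	return _mm256_maskz_loadu_epi8(0x0000001Fu, p);
}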
@@ -414,7 +420,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
  * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX.
  * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are
  * used with pmaddwd which does signed multiplication. In the
- * SSE2 case this limits chunks to 4096 bytes instead of 5504.
+ * SSE2 case this limits chunks to 4096 bytes instead of 5536.
  */
 size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX),
 MAX_CHUNK_LEN) & ~(2*VL - 1));
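
The corrected figure of 5536 falls out of the expression at the end of the hunk above. Assuming MAX_CHUNK_LEN is the usual Adler-32 chunk bound of 5552 (it is defined elsewhere in the adler32 code), the arithmetic for the SSE2 case is:

/* VL = 16 in the SSE2 case:
 *   2 * VL * (INT16_MAX / UINT8_MAX) = 32 * (32767 / 255)
 *                                    = 32 * 128 = 4096
 *   MAX_CHUNK_LEN & ~(2*VL - 1)      = 5552 & ~31 = 5536
 * MIN() then picks 4096, i.e. the pmaddwd limit is what binds;
 * the old comment's 5504 did not match this rounding. */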
@@ -502,11 +508,11 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
 #undef VSET1_32
 #undef VSETZERO
 #undef VSLL32
-#undef VUNPACKHI8
 #undef VUNPACKLO8
+#undef VUNPACKHI8
 
 #undef SUFFIX
 #undef ATTRIBUTES
 #undef VL
 #undef USE_VNNI
-#undef USE_MASKING
+#undef USE_AVX512
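
The #undef block above is what makes this header safe to include more than once: adler32_impl.h sets one parameter set per inclusion, and the template cleans all of it up at the end. In outline, based on the adler32_impl.h diff shown earlier:

/* How adler32_impl.h drives this template: */
#define SUFFIX      _sse2
#define ATTRIBUTES  _target_attribute("sse2")
#define VL          16
#define USE_VNNI    0
#define USE_AVX512  0
#include "adler32_template.h"  /* defines adler32_x86_sse2(); #undefs
                                  SUFFIX, ATTRIBUTES, VL, USE_VNNI,
                                  and USE_AVX512 on the way out */

#define SUFFIX      _avx2
#define ATTRIBUTES  _target_attribute("avx2")
#define VL          32
#define USE_VNNI    0
#define USE_AVX512  0
#include "adler32_template.h"  /* defines adler32_x86_avx2() */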
41 changes: 24 additions & 17 deletions lib/x86/crc32_impl.h
@@ -36,8 +36,8 @@
 # define SUFFIX _pclmulqdq
 # define ATTRIBUTES _target_attribute("pclmul")
 # define VL 16
-# define FOLD_LESSTHAN16BYTES 0
-# define USE_TERNARYLOGIC 0
+# define USE_SSE4_1 0
+# define USE_AVX512 0
 # include "crc32_pclmul_template.h"
 
 /*
@@ -49,55 +49,62 @@
  * non-destructive VEX-encoded instructions. Second, AVX support implies SSSE3
  * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient
  * handling of partial blocks. (We *could* compile a variant with
- * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we don't currently bother.)
+ * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we currently don't bother.)
  */
 # define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx
 # define SUFFIX _pclmulqdq_avx
 # define ATTRIBUTES _target_attribute("pclmul,avx")
 # define VL 16
-# define FOLD_LESSTHAN16BYTES 1
-# define USE_TERNARYLOGIC 0
+# define USE_SSE4_1 1
+# define USE_AVX512 0
 # include "crc32_pclmul_template.h"
 #endif
 
 /*
- * VPCLMULQDQ/AVX2 implementation. Uses 256-bit vectors.
+ * VPCLMULQDQ/AVX2 implementation. This is used on CPUs that have AVX2 and
+ * VPCLMULQDQ but don't have AVX-512, for example Intel Alder Lake.
  *
 * Currently this can't be enabled with MSVC because MSVC has a bug where it
 * incorrectly assumes that VPCLMULQDQ implies AVX-512:
- * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest
+ * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785
  */
 #if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000)
 # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2
 # define SUFFIX _vpclmulqdq_avx2
 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2")
 # define VL 32
-# define FOLD_LESSTHAN16BYTES 1
-# define USE_TERNARYLOGIC 0
+# define USE_SSE4_1 1
+# define USE_AVX512 0
 # include "crc32_pclmul_template.h"
 #endif
 
 #if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)
 /*
- * VPCLMULQDQ/AVX512 implementation with 256-bit vectors. This takes advantage
- * of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit.
- * This can be useful on CPUs where 512-bit vectors cause downclocking.
+ * VPCLMULQDQ/AVX512 implementation using 256-bit vectors. This is very similar
+ * to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog
+ * instruction and more registers. This is used on CPUs that support AVX-512
+ * but where using 512-bit vectors causes downclocking. This should also be the
+ * optimal implementation on CPUs that support AVX10/256 but not AVX10/512.
  */
 # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256
 # define SUFFIX _vpclmulqdq_avx512_vl256
 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl")
 # define VL 32
-# define FOLD_LESSTHAN16BYTES 1
-# define USE_TERNARYLOGIC 1
+# define USE_SSE4_1 1
+# define USE_AVX512 1
 # include "crc32_pclmul_template.h"
 
-/* VPCLMULQDQ/AVX512 implementation with 512-bit vectors */
+/*
+ * VPCLMULQDQ/AVX512 implementation using 512-bit vectors. This is used on CPUs
+ * that have a good AVX-512 implementation including VPCLMULQDQ. This should
+ * also be the optimal implementation on CPUs that support AVX10/512.
+ */
 # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512
 # define SUFFIX _vpclmulqdq_avx512_vl512
 # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl")
 # define VL 64
-# define FOLD_LESSTHAN16BYTES 1
-# define USE_TERNARYLOGIC 1
+# define USE_SSE4_1 1
+# define USE_AVX512 1
 # include "crc32_pclmul_template.h"
 #endif
 
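
On the vpternlog advantage mentioned in the new comments: CRC folding repeatedly XORs three vectors together, which AVX-512's vpternlogd can do in a single instruction via its programmable truth table (0x96 encodes the 3-input XOR). A sketch of the difference; the helper names here are illustrative, not libdeflate's:

#include <immintrin.h>

/* Pre-AVX-512: a 3-way XOR takes two instructions. */
static inline __m256i
xor3_avx2(__m256i a, __m256i b, __m256i c)
{
	return _mm256_xor_si256(_mm256_xor_si256(a, b), c);
}

/* With AVX512VL: one vpternlogd. The immediate 0x96 is the truth
 * table of a ^ b ^ c. (Compile with AVX-512 enabled.) */
static inline __m256i
xor3_avx512(__m256i a, __m256i b, __m256i c)
{
	return _mm256_ternarylogic_epi32(a, b, c, 0x96);
}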