Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More CRC-32 and Adler-32 updates #353

Merged
merged 3 commits into from
Mar 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions lib/arm/adler32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
# endif
# endif
# include <arm_neon.h>
static u32 ATTRIBUTES MAYBE_UNUSED
static ATTRIBUTES MAYBE_UNUSED u32
adler32_arm_neon(u32 adler, const u8 *p, size_t len)
{
static const u16 _aligned_attribute(16) mults[64] = {
Expand Down Expand Up @@ -225,7 +225,7 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)
# endif
# endif
# include <arm_neon.h>
static u32 ATTRIBUTES
static ATTRIBUTES u32
adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len)
{
static const u8 _aligned_attribute(16) mults[64] = {
Expand Down
6 changes: 3 additions & 3 deletions lib/arm/crc32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ combine_crcs_slow(u32 crc0, u32 crc1, u32 crc2, u32 crc3)
}

#define crc32_arm_crc crc32_arm_crc
static u32 ATTRIBUTES MAYBE_UNUSED
static ATTRIBUTES MAYBE_UNUSED u32
crc32_arm_crc(u32 crc, const u8 *p, size_t len)
{
if (len >= 64) {
Expand Down Expand Up @@ -289,7 +289,7 @@ combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i)
}

#define crc32_arm_crc_pmullcombine crc32_arm_crc_pmullcombine
static u32 ATTRIBUTES MAYBE_UNUSED
static ATTRIBUTES MAYBE_UNUSED u32
crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len)
{
const size_t align = -(uintptr_t)p & 7;
Expand Down Expand Up @@ -470,7 +470,7 @@ crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len)
# define ENABLE_EOR3 0
# include "crc32_pmull_helpers.h"

static u32 ATTRIBUTES MAYBE_UNUSED
static ATTRIBUTES MAYBE_UNUSED u32
crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
{
static const u64 _aligned_attribute(16) mults[3][2] = {
Expand Down
2 changes: 1 addition & 1 deletion lib/arm/crc32_pmull_wide.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@

#include "crc32_pmull_helpers.h"

static u32 ATTRIBUTES MAYBE_UNUSED
static ATTRIBUTES MAYBE_UNUSED u32
ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
{
uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
Expand Down
2 changes: 1 addition & 1 deletion lib/decompress_template.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
# define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count)))
#endif

static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED
static ATTRIBUTES MAYBE_UNUSED enum libdeflate_result
FUNCNAME(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
Expand Down
20 changes: 10 additions & 10 deletions lib/x86/adler32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,19 @@
/* SSE2 and AVX2 implementations. Used on older CPUs. */
#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
# define adler32_x86_sse2 adler32_x86_sse2
# define SUFFIX _x86_sse2
# define SUFFIX _sse2
# define ATTRIBUTES _target_attribute("sse2")
# define VL 16
# define USE_VNNI 0
# define USE_MASKING 0
# define USE_AVX512 0
# include "adler32_template.h"

# define adler32_x86_avx2 adler32_x86_avx2
# define SUFFIX _x86_avx2
# define SUFFIX _avx2
# define ATTRIBUTES _target_attribute("avx2")
# define VL 32
# define USE_VNNI 0
# define USE_MASKING 0
# define USE_AVX512 0
# include "adler32_template.h"
#endif

Expand All @@ -55,11 +55,11 @@
*/
#if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930)
# define adler32_x86_avx2_vnni adler32_x86_avx2_vnni
# define SUFFIX _x86_avx2_vnni
# define SUFFIX _avx2_vnni
# define ATTRIBUTES _target_attribute("avx2,avxvnni")
# define VL 32
# define USE_VNNI 1
# define USE_MASKING 0
# define USE_AVX512 0
# include "adler32_template.h"
#endif

Expand All @@ -72,11 +72,11 @@
* that support AVX10/256 but not AVX10/512.
*/
# define adler32_x86_avx512_vl256_vnni adler32_x86_avx512_vl256_vnni
# define SUFFIX _x86_avx512_vl256_vnni
# define SUFFIX _avx512_vl256_vnni
# define ATTRIBUTES _target_attribute("avx512bw,avx512vl,avx512vnni")
# define VL 32
# define USE_VNNI 1
# define USE_MASKING 1
# define USE_AVX512 1
# include "adler32_template.h"

/*
Expand All @@ -85,11 +85,11 @@
* the optimal implementation on CPUs that support AVX10/512.
*/
# define adler32_x86_avx512_vl512_vnni adler32_x86_avx512_vl512_vnni
# define SUFFIX _x86_avx512_vl512_vnni
# define SUFFIX _avx512_vl512_vnni
# define ATTRIBUTES _target_attribute("avx512bw,avx512vnni")
# define VL 64
# define USE_VNNI 1
# define USE_MASKING 1
# define USE_AVX512 1
# include "adler32_template.h"
#endif

Expand Down
52 changes: 29 additions & 23 deletions lib/x86/adler32_template.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,21 @@
* ATTRIBUTES:
* Target function attributes to use. Must satisfy the dependencies of the
* other parameters as follows:
* VL=16 && USE_VNNI=0 && USE_MASKING=0: at least sse2
* VL=32 && USE_VNNI=0 && USE_MASKING=0: at least avx2
* VL=32 && USE_VNNI=1 && USE_MASKING=0: at least avx2,avxvnni
* VL=32 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vl,avx512vnni
* VL=64 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vnni
* VL=16 && USE_VNNI=0 && USE_AVX512=0: at least sse2
* VL=32 && USE_VNNI=0 && USE_AVX512=0: at least avx2
* VL=32 && USE_VNNI=1 && USE_AVX512=0: at least avx2,avxvnni
* VL=32 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vl,avx512vnni
* VL=64 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vnni
* (Other combinations are not useful and have not been tested.)
* VL:
* Vector length in bytes. Must be 16, 32, and 64.
* Vector length in bytes. Must be 16, 32, or 64.
* USE_VNNI:
* If 1, use the VNNI dot product based algorithm.
* If 0, use the legacy SSE2 and AVX2 compatible algorithm.
* USE_MASKING:
* If 1, use AVX-512 features such as masking.
* If 0, assume that the CPU might not support AVX-512.
* USE_AVX512:
* If 1, take advantage of AVX-512 features such as masking. This doesn't
* enable the use of 512-bit vectors; the vector length is controlled by
* VL. If 0, assume that the CPU might not support AVX-512.
*/

#if VL == 16
Expand All @@ -57,7 +58,7 @@
# define VADD8(a, b) _mm_add_epi8((a), (b))
# define VADD16(a, b) _mm_add_epi16((a), (b))
# define VADD32(a, b) _mm_add_epi32((a), (b))
# if USE_MASKING
# if USE_AVX512
# define VDPBUSD(a, b, c) _mm_dpbusd_epi32((a), (b), (c))
# else
# define VDPBUSD(a, b, c) _mm_dpbusd_avx_epi32((a), (b), (c))
Expand All @@ -68,20 +69,20 @@
# define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p))
# define VMULLO32(a, b) _mm_mullo_epi32((a), (b))
# define VSAD8(a, b) _mm_sad_epu8((a), (b))
# define VSET1_32(a) _mm_set1_epi32(a)
# define VSET1_8(a) _mm_set1_epi8(a)
# define VSET1_32(a) _mm_set1_epi32(a)
# define VSETZERO() _mm_setzero_si128()
# define VSLL32(a, b) _mm_slli_epi32((a), (b))
# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b))
# define VUNPACKLO8(a, b) _mm_unpacklo_epi8((a), (b))
# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b))
#elif VL == 32
# define vec_t __m256i
# define mask_t u32
# define LOG2_VL 5
# define VADD8(a, b) _mm256_add_epi8((a), (b))
# define VADD16(a, b) _mm256_add_epi16((a), (b))
# define VADD32(a, b) _mm256_add_epi32((a), (b))
# if USE_MASKING
# if USE_AVX512
# define VDPBUSD(a, b, c) _mm256_dpbusd_epi32((a), (b), (c))
# else
# define VDPBUSD(a, b, c) _mm256_dpbusd_avx_epi32((a), (b), (c))
Expand All @@ -92,27 +93,32 @@
# define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p))
# define VMULLO32(a, b) _mm256_mullo_epi32((a), (b))
# define VSAD8(a, b) _mm256_sad_epu8((a), (b))
# define VSET1_32(a) _mm256_set1_epi32(a)
# define VSET1_8(a) _mm256_set1_epi8(a)
# define VSET1_32(a) _mm256_set1_epi32(a)
# define VSETZERO() _mm256_setzero_si256()
# define VSLL32(a, b) _mm256_slli_epi32((a), (b))
# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b))
# define VUNPACKLO8(a, b) _mm256_unpacklo_epi8((a), (b))
# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b))
#elif VL == 64
# define vec_t __m512i
# define mask_t u64
# define LOG2_VL 6
# define VADD8(a, b) _mm512_add_epi8((a), (b))
# define VADD16(a, b) _mm512_add_epi16((a), (b))
# define VADD32(a, b) _mm512_add_epi32((a), (b))
# define VDPBUSD(a, b, c) _mm512_dpbusd_epi32((a), (b), (c))
# define VLOAD(p) _mm512_load_si512((const void *)(p))
# define VLOADU(p) _mm512_loadu_si512((const void *)(p))
# define VMADD16(a, b) _mm512_madd_epi16((a), (b))
# define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p))
# define VMULLO32(a, b) _mm512_mullo_epi32((a), (b))
# define VSET1_32(a) _mm512_set1_epi32(a)
# define VSAD8(a, b) _mm512_sad_epu8((a), (b))
# define VSET1_8(a) _mm512_set1_epi8(a)
# define VSET1_32(a) _mm512_set1_epi32(a)
# define VSETZERO() _mm512_setzero_si512()
# define VSLL32(a, b) _mm512_slli_epi32((a), (b))
# define VUNPACKLO8(a, b) _mm512_unpacklo_epi8((a), (b))
# define VUNPACKHI8(a, b) _mm512_unpackhi_epi8((a), (b))
#else
# error "unsupported vector length"
#endif
Expand Down Expand Up @@ -173,8 +179,8 @@ ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p)
}
#define reduce_to_32bits ADD_SUFFIX(reduce_to_32bits)

static u32 ATTRIBUTES
ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
static ATTRIBUTES u32
ADD_SUFFIX(adler32_x86)(u32 adler, const u8 *p, size_t len)
{
#if USE_VNNI
/* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */
Expand Down Expand Up @@ -235,7 +241,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)

#if USE_VNNI
/*
* This is Adler-32 using the vpdpbusd instruction from AVX512-VNNI or
* This is Adler-32 using the vpdpbusd instruction from AVX512VNNI or
* AVX-VNNI. vpdpbusd multiplies the unsigned bytes of one vector by
* the signed bytes of another vector and adds the sums in groups of 4
* to the 32-bit elements of a third vector. We use it in two ways:
Expand Down Expand Up @@ -369,7 +375,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
* Process the last 0 < n <= VL bytes of the chunk.
* Utilize a masked load if it's available.
*/
#if USE_MASKING
#if USE_AVX512
data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p);
#else
data = zeroes;
Expand Down Expand Up @@ -414,7 +420,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
* v_byte_sums_* counter is guaranteed to not exceed INT16_MAX.
* It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are
* used with pmaddwd which does signed multiplication. In the
* SSE2 case this limits chunks to 4096 bytes instead of 5504.
* SSE2 case this limits chunks to 4096 bytes instead of 5536.
*/
size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX),
MAX_CHUNK_LEN) & ~(2*VL - 1));
Expand Down Expand Up @@ -502,11 +508,11 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
#undef VSET1_32
#undef VSETZERO
#undef VSLL32
#undef VUNPACKHI8
#undef VUNPACKLO8
#undef VUNPACKHI8

#undef SUFFIX
#undef ATTRIBUTES
#undef VL
#undef USE_VNNI
#undef USE_MASKING
#undef USE_AVX512
3 changes: 0 additions & 3 deletions lib/x86/cpu_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
{X86_CPU_FEATURE_AVX2, "avx2"},
{X86_CPU_FEATURE_BMI2, "bmi2"},
{X86_CPU_FEATURE_ZMM, "zmm"},
{X86_CPU_FEATURE_AVX512F, "avx512f"},
{X86_CPU_FEATURE_AVX512BW, "avx512bw"},
{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
Expand Down Expand Up @@ -163,8 +162,6 @@ void libdeflate_init_x86_cpu_features(void)
if (((xcr0 & 0xe6) == 0xe6) &&
allow_512bit_vectors(manufacturer, family, model))
features |= X86_CPU_FEATURE_ZMM;
if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
features |= X86_CPU_FEATURE_AVX512F;
if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
features |= X86_CPU_FEATURE_AVX512BW;
if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
Expand Down
23 changes: 8 additions & 15 deletions lib/x86/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,16 @@
#define X86_CPU_FEATURE_BMI2 (1 << 4)
/*
* ZMM indicates whether 512-bit vectors (zmm registers) should be used. On
* some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU
* supports it, i.e. even if AVX512F is set. On these CPUs, we may still use
* AVX-512 instructions, but only with ymm and xmm registers.
* some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU and
* operating system support AVX-512. On these CPUs, we may still use AVX-512
* instructions, but only with xmm and ymm registers.
*/
#define X86_CPU_FEATURE_ZMM (1 << 5)
#define X86_CPU_FEATURE_AVX512F (1 << 6)
#define X86_CPU_FEATURE_AVX512BW (1 << 7)
#define X86_CPU_FEATURE_AVX512VL (1 << 8)
#define X86_CPU_FEATURE_VPCLMULQDQ (1 << 9)
#define X86_CPU_FEATURE_AVX512VNNI (1 << 10)
#define X86_CPU_FEATURE_AVXVNNI (1 << 11)
#define X86_CPU_FEATURE_AVX512BW (1 << 6)
#define X86_CPU_FEATURE_AVX512VL (1 << 7)
#define X86_CPU_FEATURE_VPCLMULQDQ (1 << 8)
#define X86_CPU_FEATURE_AVX512VNNI (1 << 9)
#define X86_CPU_FEATURE_AVXVNNI (1 << 10)

#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
/* Runtime x86 CPU feature detection is supported. */
Expand Down Expand Up @@ -135,12 +134,6 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_BMI2_NATIVE 0
#endif

#ifdef __AVX512F__
# define HAVE_AVX512F(features) 1
#else
# define HAVE_AVX512F(features) ((features) & X86_CPU_FEATURE_AVX512F)
#endif

#ifdef __AVX512BW__
# define HAVE_AVX512BW(features) 1
#else
Expand Down
Loading
Loading