lib/x86/crc32: more optimizations
- As was recently done in the Adler-32 code, take advantage of the fact
  that on recent x86 processors, vmovdqu with an aligned pointer is just
  as fast as vmovdqa.  Don't waste time aligning the pointer unless the
  length is very large, and at the same time, handle all cases of
  len >= 8*VL using the main loop so that the 4*VL wide loop isn't
  needed.  (Before, aligning the pointer was tied to whether the main
  loop was used or not, since the main loop used vmovdqa.)

- Handle short lengths more efficiently.  Instead of falling back to
  crc32_slice1() for all len < VL, use AVX-512 masking (when available)
  to handle 4 <= len <= 15, and use 128-bit vector instructions to
  handle 16 <= len < VL.

- Document why the main loop uses a width of 8*VL instead of 4*VL.
ebiggers committed Mar 12, 2024
1 parent 8ae3a19 commit 5d15bce
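
The first bullet above describes a control-flow change rather than a new API. The following is a minimal, hypothetical sketch of that length handling, not code from this commit: the crc32_outline name, the 4096-byte alignment threshold, and the elided fold steps are all assumptions for illustration.

#include <stddef.h>
#include <stdint.h>

#define VL 64	/* assumed vector length in bytes (the zmm case) */

/* Hypothetical outline of the length dispatch; the actual carryless-
 * multiply folds are elided and marked with comments. */
static uint32_t
crc32_outline(uint32_t crc, const uint8_t *p, size_t len)
{
	if (len >= 8 * VL) {
		/*
		 * Align the pointer only for very large inputs, where the
		 * one-time cost pays off; otherwise rely on vmovdqu, which
		 * is just as fast as vmovdqa on recent x86 CPUs when the
		 * pointer happens to be aligned.
		 */
		if (len >= 4096 /* assumed threshold */) {
			size_t n = -(uintptr_t)p % VL;

			/* ... fold the first n bytes into crc ... */
			p += n;
			len -= n;
		}
		do {
			/* ... eight VL-byte folds (vpclmulqdq) per pass ... */
			p += 8 * VL;
			len -= 8 * VL;
		} while (len >= 8 * VL);
	}
	/* ... narrower vector code finishes the len < 8*VL remainder ... */
	return crc;
}

Because every len >= 8*VL input now enters this one loop, the separate 4*VL-wide loop is no longer needed.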
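For the second bullet, the enabling trick is AVX-512 byte masking: a 4- to 15-byte input can be brought into an xmm register without reading past the end of the buffer, so there is no need to fall back to crc32_slice1(). Below is a minimal sketch of such a masked load, assuming AVX512BW/AVX512VL intrinsics are available; the helper name is made up, and the folding of the loaded bytes into the CRC is not shown.

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Build with AVX512BW and AVX512VL enabled (e.g. via target attributes
 * or -mavx512bw -mavx512vl). */
static __m128i
load_4_to_15_bytes(const uint8_t *p, size_t len)
{
	/*
	 * The low 'len' mask bits select real bytes; masked-off lanes are
	 * zeroed and their memory is never accessed, so the load cannot
	 * fault on a short buffer.
	 */
	__mmask16 mask = (__mmask16)((1u << len) - 1);

	return _mm_maskz_loadu_epi8(mask, p);
}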
Showing 5 changed files with 148 additions and 137 deletions.
3 changes: 0 additions & 3 deletions lib/x86/cpu_features.c
@@ -82,7 +82,6 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
{X86_CPU_FEATURE_AVX2, "avx2"},
{X86_CPU_FEATURE_BMI2, "bmi2"},
{X86_CPU_FEATURE_ZMM, "zmm"},
- {X86_CPU_FEATURE_AVX512F, "avx512f"},
{X86_CPU_FEATURE_AVX512BW, "avx512bw"},
{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
@@ -163,8 +162,6 @@ void libdeflate_init_x86_cpu_features(void)
if (((xcr0 & 0xe6) == 0xe6) &&
allow_512bit_vectors(manufacturer, family, model))
features |= X86_CPU_FEATURE_ZMM;
- if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
- features |= X86_CPU_FEATURE_AVX512F;
if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
features |= X86_CPU_FEATURE_AVX512BW;
if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))
23 changes: 8 additions & 15 deletions lib/x86/cpu_features.h
@@ -39,17 +39,16 @@
#define X86_CPU_FEATURE_BMI2 (1 << 4)
/*
* ZMM indicates whether 512-bit vectors (zmm registers) should be used. On
- * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU
- * supports it, i.e. even if AVX512F is set.  On these CPUs, we may still use
- * AVX-512 instructions, but only with ymm and xmm registers.
+ * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU and
+ * operating system support AVX-512.  On these CPUs, we may still use AVX-512
+ * instructions, but only with xmm and ymm registers.
*/
#define X86_CPU_FEATURE_ZMM (1 << 5)
- #define X86_CPU_FEATURE_AVX512F (1 << 6)
- #define X86_CPU_FEATURE_AVX512BW (1 << 7)
- #define X86_CPU_FEATURE_AVX512VL (1 << 8)
- #define X86_CPU_FEATURE_VPCLMULQDQ (1 << 9)
- #define X86_CPU_FEATURE_AVX512VNNI (1 << 10)
- #define X86_CPU_FEATURE_AVXVNNI (1 << 11)
+ #define X86_CPU_FEATURE_AVX512BW (1 << 6)
+ #define X86_CPU_FEATURE_AVX512VL (1 << 7)
+ #define X86_CPU_FEATURE_VPCLMULQDQ (1 << 8)
+ #define X86_CPU_FEATURE_AVX512VNNI (1 << 9)
+ #define X86_CPU_FEATURE_AVXVNNI (1 << 10)

#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
/* Runtime x86 CPU feature detection is supported. */
@@ -135,12 +134,6 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_BMI2_NATIVE 0
#endif

- #ifdef __AVX512F__
- # define HAVE_AVX512F(features) 1
- #else
- # define HAVE_AVX512F(features) ((features) & X86_CPU_FEATURE_AVX512F)
- #endif
-
#ifdef __AVX512BW__
# define HAVE_AVX512BW(features) 1
#else
21 changes: 17 additions & 4 deletions lib/x86/crc32_impl.h
@@ -30,6 +30,19 @@

#include "cpu_features.h"

+ /*
+  * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes.
+  * pshufb(x, shift_tab[len+16..len+31]) right shifts x by len bytes.
+  */
+ static const u8 MAYBE_UNUSED shift_tab[48] = {
+ 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ };
+
#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
/* PCLMULQDQ implementation */
# define crc32_x86_pclmulqdq crc32_x86_pclmulqdq
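
The shift_tab added above exists because pshufb has no variable-byte-shift form of its own: the shift amount is encoded in the control vector, so the control is simply loaded from an offset into this table. The following standalone sketch (not part of the diff) shows the "right shift by len bytes" case with SSE intrinsics; the shift_right_bytes helper name is made up for illustration.

#include <immintrin.h>	/* SSSE3: _mm_shuffle_epi8 */
#include <stddef.h>
#include <stdint.h>

/*
 * Shift the 16 bytes of x right by 'len' byte positions (0 <= len <= 16),
 * zero-filling the top, using the 48-byte table defined in the hunk above.
 */
static __m128i
shift_right_bytes(__m128i x, size_t len, const uint8_t shift_tab[48])
{
	/*
	 * shift_tab[len+16..len+31] maps result byte i to source byte
	 * len+i; the 0xff entries have their high bit set, so pshufb
	 * writes zero there instead of copying a byte.
	 */
	__m128i control =
		_mm_loadu_si128((const __m128i *)&shift_tab[len + 16]);

	return _mm_shuffle_epi8(x, control);
}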
@@ -88,7 +101,7 @@
*/
# define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256
# define SUFFIX _vpclmulqdq_avx512_vl256
- # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl")
+ # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
# define VL 32
# define USE_SSE4_1 1
# define USE_AVX512 1
@@ -101,7 +114,7 @@
*/
# define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512
# define SUFFIX _vpclmulqdq_avx512_vl512
- # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl")
+ # define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
# define VL 64
# define USE_SSE4_1 1
# define USE_AVX512 1
@@ -116,12 +129,12 @@ arch_select_crc32_func(void)
#ifdef crc32_x86_vpclmulqdq_avx512_vl512
if ((features & X86_CPU_FEATURE_ZMM) &&
HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
- HAVE_AVX512F(features) && HAVE_AVX512VL(features))
+ HAVE_AVX512BW(features) && HAVE_AVX512VL(features))
return crc32_x86_vpclmulqdq_avx512_vl512;
#endif
#ifdef crc32_x86_vpclmulqdq_avx512_vl256
if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
- HAVE_AVX512F(features) && HAVE_AVX512VL(features))
+ HAVE_AVX512BW(features) && HAVE_AVX512VL(features))
return crc32_x86_vpclmulqdq_avx512_vl256;
#endif
#ifdef crc32_x86_vpclmulqdq_avx2
[diffs for the remaining two changed files were not loaded]