Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lib/x86/crc32: add VPCLMULQDQ implementations of CRC-32 #341

Merged
merged 3 commits into from
Feb 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ jobs:
runs-on: ${{matrix.os}}
steps:
- uses: actions/checkout@v4
- uses: microsoft/setup-msbuild@v1.1
- uses: microsoft/setup-msbuild@v2
- run: vcpkg install zlib:${{matrix.vcpkg}}
- run: >
echo C:\vcpkg\packages\zlib_${{matrix.vcpkg}}\bin
Expand Down Expand Up @@ -145,7 +145,7 @@ jobs:
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
- uses: microsoft/setup-msbuild@v1.1
- uses: microsoft/setup-msbuild@v2
# Note: as per the CMake documentation, DESTDIR is unsupported on Windows.
- run: >
cmake -B build -G "Visual Studio 17 2022" -T ${{matrix.toolset}}
Expand Down
8 changes: 4 additions & 4 deletions lib/arm/crc32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -474,12 +474,12 @@ static u32 ATTRIBUTES MAYBE_UNUSED
crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
{
static const u64 _aligned_attribute(16) mults[3][2] = {
CRC32_1VECS_MULTS,
CRC32_4VECS_MULTS,
CRC32_2VECS_MULTS,
{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
};
static const u64 _aligned_attribute(16) final_mults[3][2] = {
{ CRC32_FINAL_MULT, 0 },
{ CRC32_X63_MODG, 0 },
{ CRC32_BARRETT_CONSTANT_1, 0 },
{ CRC32_BARRETT_CONSTANT_2, 0 },
};
Expand Down
10 changes: 7 additions & 3 deletions lib/arm/crc32_pmull_wide.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)

if (len < 3 * 192) {
static const u64 _aligned_attribute(16) mults[3][2] = {
CRC32_4VECS_MULTS, CRC32_2VECS_MULTS, CRC32_1VECS_MULTS,
{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
};
poly64x2_t multipliers_4, multipliers_2, multipliers_1;

Expand Down Expand Up @@ -97,8 +99,10 @@ ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
v0 = fold_vec(v0, v1, multipliers_1);
} else {
static const u64 _aligned_attribute(16) mults[4][2] = {
CRC32_12VECS_MULTS, CRC32_6VECS_MULTS,
CRC32_3VECS_MULTS, CRC32_1VECS_MULTS,
{ CRC32_X1567_MODG, CRC32_X1503_MODG }, /* 12 vecs */
{ CRC32_X799_MODG, CRC32_X735_MODG }, /* 6 vecs */
{ CRC32_X415_MODG, CRC32_X351_MODG }, /* 3 vecs */
{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
};
const poly64x2_t multipliers_12 = load_multipliers(mults[0]);
const poly64x2_t multipliers_6 = load_multipliers(mults[1]);
Expand Down
2 changes: 1 addition & 1 deletion lib/crc32.c
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@
* intermediate remainder (which we never actually store explicitly) is 96 bits.
*
* On CPUs that support fast carryless multiplication, CRCs can be computed even
* more quickly via "folding". See e.g. the x86 PCLMUL implementation.
* more quickly via "folding". See e.g. the x86 PCLMUL implementations.
*/

#include "lib_common.h"
Expand Down
122 changes: 85 additions & 37 deletions lib/crc32_multipliers.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,55 +4,103 @@
* THIS FILE WAS GENERATED BY gen_crc32_multipliers.c. DO NOT EDIT.
*/

#define CRC32_1VECS_MULT_1 0xae689191 /* x^159 mod G(x) */
#define CRC32_1VECS_MULT_2 0xccaa009e /* x^95 mod G(x) */
#define CRC32_1VECS_MULTS { CRC32_1VECS_MULT_1, CRC32_1VECS_MULT_2 }
#define CRC32_X159_MODG 0xae689191 /* x^159 mod G(x) */
#define CRC32_X95_MODG 0xccaa009e /* x^95 mod G(x) */

#define CRC32_2VECS_MULT_1 0xf1da05aa /* x^287 mod G(x) */
#define CRC32_2VECS_MULT_2 0x81256527 /* x^223 mod G(x) */
#define CRC32_2VECS_MULTS { CRC32_2VECS_MULT_1, CRC32_2VECS_MULT_2 }
#define CRC32_X287_MODG 0xf1da05aa /* x^287 mod G(x) */
#define CRC32_X223_MODG 0x81256527 /* x^223 mod G(x) */

#define CRC32_3VECS_MULT_1 0x3db1ecdc /* x^415 mod G(x) */
#define CRC32_3VECS_MULT_2 0xaf449247 /* x^351 mod G(x) */
#define CRC32_3VECS_MULTS { CRC32_3VECS_MULT_1, CRC32_3VECS_MULT_2 }
#define CRC32_X415_MODG 0x3db1ecdc /* x^415 mod G(x) */
#define CRC32_X351_MODG 0xaf449247 /* x^351 mod G(x) */

#define CRC32_4VECS_MULT_1 0x8f352d95 /* x^543 mod G(x) */
#define CRC32_4VECS_MULT_2 0x1d9513d7 /* x^479 mod G(x) */
#define CRC32_4VECS_MULTS { CRC32_4VECS_MULT_1, CRC32_4VECS_MULT_2 }
#define CRC32_X543_MODG 0x8f352d95 /* x^543 mod G(x) */
#define CRC32_X479_MODG 0x1d9513d7 /* x^479 mod G(x) */

#define CRC32_5VECS_MULT_1 0x1c279815 /* x^671 mod G(x) */
#define CRC32_5VECS_MULT_2 0xae0b5394 /* x^607 mod G(x) */
#define CRC32_5VECS_MULTS { CRC32_5VECS_MULT_1, CRC32_5VECS_MULT_2 }
#define CRC32_X671_MODG 0x1c279815 /* x^671 mod G(x) */
#define CRC32_X607_MODG 0xae0b5394 /* x^607 mod G(x) */

#define CRC32_6VECS_MULT_1 0xdf068dc2 /* x^799 mod G(x) */
#define CRC32_6VECS_MULT_2 0x57c54819 /* x^735 mod G(x) */
#define CRC32_6VECS_MULTS { CRC32_6VECS_MULT_1, CRC32_6VECS_MULT_2 }
#define CRC32_X799_MODG 0xdf068dc2 /* x^799 mod G(x) */
#define CRC32_X735_MODG 0x57c54819 /* x^735 mod G(x) */

#define CRC32_7VECS_MULT_1 0x31f8303f /* x^927 mod G(x) */
#define CRC32_7VECS_MULT_2 0x0cbec0ed /* x^863 mod G(x) */
#define CRC32_7VECS_MULTS { CRC32_7VECS_MULT_1, CRC32_7VECS_MULT_2 }
#define CRC32_X927_MODG 0x31f8303f /* x^927 mod G(x) */
#define CRC32_X863_MODG 0x0cbec0ed /* x^863 mod G(x) */

#define CRC32_8VECS_MULT_1 0x33fff533 /* x^1055 mod G(x) */
#define CRC32_8VECS_MULT_2 0x910eeec1 /* x^991 mod G(x) */
#define CRC32_8VECS_MULTS { CRC32_8VECS_MULT_1, CRC32_8VECS_MULT_2 }
#define CRC32_X1055_MODG 0x33fff533 /* x^1055 mod G(x) */
#define CRC32_X991_MODG 0x910eeec1 /* x^991 mod G(x) */

#define CRC32_9VECS_MULT_1 0x26b70c3d /* x^1183 mod G(x) */
#define CRC32_9VECS_MULT_2 0x3f41287a /* x^1119 mod G(x) */
#define CRC32_9VECS_MULTS { CRC32_9VECS_MULT_1, CRC32_9VECS_MULT_2 }
#define CRC32_X1183_MODG 0x26b70c3d /* x^1183 mod G(x) */
#define CRC32_X1119_MODG 0x3f41287a /* x^1119 mod G(x) */

#define CRC32_10VECS_MULT_1 0xe3543be0 /* x^1311 mod G(x) */
#define CRC32_10VECS_MULT_2 0x9026d5b1 /* x^1247 mod G(x) */
#define CRC32_10VECS_MULTS { CRC32_10VECS_MULT_1, CRC32_10VECS_MULT_2 }
#define CRC32_X1311_MODG 0xe3543be0 /* x^1311 mod G(x) */
#define CRC32_X1247_MODG 0x9026d5b1 /* x^1247 mod G(x) */

#define CRC32_11VECS_MULT_1 0x5a1bb05d /* x^1439 mod G(x) */
#define CRC32_11VECS_MULT_2 0xd1df2327 /* x^1375 mod G(x) */
#define CRC32_11VECS_MULTS { CRC32_11VECS_MULT_1, CRC32_11VECS_MULT_2 }
#define CRC32_X1439_MODG 0x5a1bb05d /* x^1439 mod G(x) */
#define CRC32_X1375_MODG 0xd1df2327 /* x^1375 mod G(x) */

#define CRC32_12VECS_MULT_1 0x596c8d81 /* x^1567 mod G(x) */
#define CRC32_12VECS_MULT_2 0xf5e48c85 /* x^1503 mod G(x) */
#define CRC32_12VECS_MULTS { CRC32_12VECS_MULT_1, CRC32_12VECS_MULT_2 }
#define CRC32_X1567_MODG 0x596c8d81 /* x^1567 mod G(x) */
#define CRC32_X1503_MODG 0xf5e48c85 /* x^1503 mod G(x) */

#define CRC32_FINAL_MULT 0xb8bc6765 /* x^63 mod G(x) */
#define CRC32_X1695_MODG 0x682bdd4f /* x^1695 mod G(x) */
#define CRC32_X1631_MODG 0x3c656ced /* x^1631 mod G(x) */

#define CRC32_X1823_MODG 0x4a28bd43 /* x^1823 mod G(x) */
#define CRC32_X1759_MODG 0xfe807bbd /* x^1759 mod G(x) */

#define CRC32_X1951_MODG 0x0077f00d /* x^1951 mod G(x) */
#define CRC32_X1887_MODG 0x1f0c2cdd /* x^1887 mod G(x) */

#define CRC32_X2079_MODG 0xce3371cb /* x^2079 mod G(x) */
#define CRC32_X2015_MODG 0xe95c1271 /* x^2015 mod G(x) */

#define CRC32_X2207_MODG 0xa749e894 /* x^2207 mod G(x) */
#define CRC32_X2143_MODG 0xb918a347 /* x^2143 mod G(x) */

#define CRC32_X2335_MODG 0x2c538639 /* x^2335 mod G(x) */
#define CRC32_X2271_MODG 0x71d54a59 /* x^2271 mod G(x) */

#define CRC32_X2463_MODG 0x32b0733c /* x^2463 mod G(x) */
#define CRC32_X2399_MODG 0xff6f2fc2 /* x^2399 mod G(x) */

#define CRC32_X2591_MODG 0x0e9bd5cc /* x^2591 mod G(x) */
#define CRC32_X2527_MODG 0xcec97417 /* x^2527 mod G(x) */

#define CRC32_X2719_MODG 0x76278617 /* x^2719 mod G(x) */
#define CRC32_X2655_MODG 0x1c63267b /* x^2655 mod G(x) */

#define CRC32_X2847_MODG 0xc51b93e3 /* x^2847 mod G(x) */
#define CRC32_X2783_MODG 0xf183c71b /* x^2783 mod G(x) */

#define CRC32_X2975_MODG 0x7eaed122 /* x^2975 mod G(x) */
#define CRC32_X2911_MODG 0x9b9bdbd0 /* x^2911 mod G(x) */

#define CRC32_X3103_MODG 0x2ce423f1 /* x^3103 mod G(x) */
#define CRC32_X3039_MODG 0xd31343ea /* x^3039 mod G(x) */

#define CRC32_X3231_MODG 0x8b8d8645 /* x^3231 mod G(x) */
#define CRC32_X3167_MODG 0x4470ac44 /* x^3167 mod G(x) */

#define CRC32_X3359_MODG 0x4b700aa8 /* x^3359 mod G(x) */
#define CRC32_X3295_MODG 0xeea395c4 /* x^3295 mod G(x) */

#define CRC32_X3487_MODG 0xeff5e99d /* x^3487 mod G(x) */
#define CRC32_X3423_MODG 0xf9d9c7ee /* x^3423 mod G(x) */

#define CRC32_X3615_MODG 0xad0d2bb2 /* x^3615 mod G(x) */
#define CRC32_X3551_MODG 0xcd669a40 /* x^3551 mod G(x) */

#define CRC32_X3743_MODG 0x9fb66bd3 /* x^3743 mod G(x) */
#define CRC32_X3679_MODG 0x6d40f445 /* x^3679 mod G(x) */

#define CRC32_X3871_MODG 0xc2dcc467 /* x^3871 mod G(x) */
#define CRC32_X3807_MODG 0x9ee62949 /* x^3807 mod G(x) */

#define CRC32_X3999_MODG 0x398e2ff2 /* x^3999 mod G(x) */
#define CRC32_X3935_MODG 0x145575d5 /* x^3935 mod G(x) */

#define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */
#define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */

#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */
#define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
#define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }
Expand Down
54 changes: 50 additions & 4 deletions lib/x86/cpu_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,32 +86,71 @@ read_xcr(u32 index)

static const struct cpu_feature x86_cpu_feature_table[] = {
{X86_CPU_FEATURE_SSE2, "sse2"},
{X86_CPU_FEATURE_PCLMUL, "pclmul"},
{X86_CPU_FEATURE_PCLMULQDQ, "pclmulqdq"},
{X86_CPU_FEATURE_AVX, "avx"},
{X86_CPU_FEATURE_AVX2, "avx2"},
{X86_CPU_FEATURE_BMI2, "bmi2"},
{X86_CPU_FEATURE_AVX512F, "avx512f"},
{X86_CPU_FEATURE_AVX512VL, "avx512vl"},
{X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"},
};

volatile u32 libdeflate_x86_cpu_features = 0;

/*
 * Report whether 512-bit vectors may be used on the CPU identified by the
 * given CPUID manufacturer string, family, and model.
 *
 * The answer is 'false' only for the family-6 "GenuineIntel" models listed in
 * the table below, which are excluded due to the downclocking penalty.  Every
 * other CPU gets 'true'.  Builds with TEST_SUPPORT__DO_NOT_USE defined always
 * get 'true'.
 */
static inline bool
allow_512bit_vectors(const u32 manufacturer[3], u32 family, u32 model)
{
	/* Family-6 Intel models for which 512-bit vectors are not allowed */
	static const u32 denied_models[] = {
		85,	/* Skylake (Server), Cascade Lake, Cooper Lake */
		106,	/* Ice Lake (Server) */
		108,	/* Ice Lake (Server) */
		126,	/* Ice Lake (Client) */
		140,	/* Tiger Lake */
		141,	/* Tiger Lake */
	};
	u32 i;

#ifdef TEST_SUPPORT__DO_NOT_USE
	return true;
#endif
	/* The deny list applies only to Intel CPUs of family 6. */
	if (family != 6 || memcmp(manufacturer, "GenuineIntel", 12) != 0)
		return true;

	for (i = 0; i < sizeof(denied_models) / sizeof(denied_models[0]); i++) {
		if (denied_models[i] == model)
			return false;
	}
	return true;
}

/* Initialize libdeflate_x86_cpu_features. */
void libdeflate_init_x86_cpu_features(void)
{
u32 max_leaf, a, b, c, d;
u32 max_leaf;
u32 manufacturer[3];
u32 family, model;
u32 a, b, c, d;
u64 xcr0 = 0;
u32 features = 0;

/* EAX=0: Highest Function Parameter and Manufacturer ID */
cpuid(0, 0, &max_leaf, &b, &c, &d);
cpuid(0, 0, &max_leaf, &manufacturer[0], &manufacturer[2],
&manufacturer[1]);
if (max_leaf < 1)
goto out;

/* EAX=1: Processor Info and Feature Bits */
cpuid(1, 0, &a, &b, &c, &d);
family = (a >> 8) & 0xf;
model = (a >> 4) & 0xf;
if (family == 6 || family == 0xf)
model += (a >> 12) & 0xf0;
if (family == 0xf)
family += (a >> 20) & 0xff;
if (d & (1 << 26))
features |= X86_CPU_FEATURE_SSE2;
if (c & (1 << 1))
features |= X86_CPU_FEATURE_PCLMUL;
features |= X86_CPU_FEATURE_PCLMULQDQ;
if (c & (1 << 27))
xcr0 = read_xcr(0);
if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6))
Expand All @@ -126,6 +165,13 @@ void libdeflate_init_x86_cpu_features(void)
features |= X86_CPU_FEATURE_AVX2;
if (b & (1 << 8))
features |= X86_CPU_FEATURE_BMI2;
if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6) &&
allow_512bit_vectors(manufacturer, family, model))
features |= X86_CPU_FEATURE_AVX512F;
if ((b & (1U << 31)) && ((xcr0 & 0xa6) == 0xa6))
features |= X86_CPU_FEATURE_AVX512VL;
if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6))
features |= X86_CPU_FEATURE_VPCLMULQDQ;

out:
disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
Expand Down
65 changes: 55 additions & 10 deletions lib/x86/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,22 @@
#endif

#define X86_CPU_FEATURE_SSE2 0x00000001
#define X86_CPU_FEATURE_PCLMUL 0x00000002
#define X86_CPU_FEATURE_PCLMULQDQ 0x00000002
#define X86_CPU_FEATURE_AVX 0x00000004
#define X86_CPU_FEATURE_AVX2 0x00000008
#define X86_CPU_FEATURE_BMI2 0x00000010
#define X86_CPU_FEATURE_AVX512F 0x00000020
#define X86_CPU_FEATURE_AVX512VL 0x00000040
#define X86_CPU_FEATURE_VPCLMULQDQ 0x00000080

#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2))
#define HAVE_PCLMUL(features) (HAVE_PCLMUL_NATIVE || ((features) & X86_CPU_FEATURE_PCLMUL))
#define HAVE_PCLMULQDQ(features) (HAVE_PCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_PCLMULQDQ))
#define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX))
#define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2))
#define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2))
#define HAVE_AVX512F(features) (HAVE_AVX512F_NATIVE || ((features) & X86_CPU_FEATURE_AVX512F))
#define HAVE_AVX512VL(features) (HAVE_AVX512VL_NATIVE || ((features) & X86_CPU_FEATURE_AVX512VL))
#define HAVE_VPCLMULQDQ(features) (HAVE_VPCLMULQDQ_NATIVE || ((features) & X86_CPU_FEATURE_VPCLMULQDQ))

#if HAVE_DYNAMIC_X86_CPU_FEATURES
#define X86_CPU_FEATURES_KNOWN 0x80000000
Expand Down Expand Up @@ -90,18 +96,18 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
#endif
#define HAVE_SSE2_INTRIN (HAVE_SSE2_NATIVE || HAVE_TARGET_INTRINSICS)

/* PCLMUL */
/* PCLMULQDQ */
#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__))
# define HAVE_PCLMUL_NATIVE 1
# define HAVE_PCLMULQDQ_NATIVE 1
#else
# define HAVE_PCLMUL_NATIVE 0
# define HAVE_PCLMULQDQ_NATIVE 0
#endif
#if HAVE_PCLMUL_NATIVE || (HAVE_TARGET_INTRINSICS && \
(GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \
defined(_MSC_VER)))
# define HAVE_PCLMUL_INTRIN 1
#if HAVE_PCLMULQDQ_NATIVE || (HAVE_TARGET_INTRINSICS && \
(GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \
defined(_MSC_VER)))
# define HAVE_PCLMULQDQ_INTRIN 1
#else
# define HAVE_PCLMUL_INTRIN 0
# define HAVE_PCLMULQDQ_INTRIN 0
#endif

/* AVX */
Expand Down Expand Up @@ -156,6 +162,45 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
# define HAVE_BMI2_INTRIN 0
#endif

/*
 * AVX-512F
 *
 * HAVE_AVX512F_NATIVE is 1 when the compiler predefines __AVX512F__
 * (presumably meaning AVX-512F is enabled for the whole build), else 0.
 *
 * HAVE_AVX512F_INTRIN is 1 when HAVE_AVX512F_NATIVE is set, or when the
 * GCC_PREREQ(5, 1) / CLANG_PREREQ(3, 8, 0) checks pass, or when _MSC_VER is
 * defined; else 0.
 *
 * NOTE(review): unlike the PCLMULQDQ check in this header, this condition does
 * not test HAVE_TARGET_INTRINSICS -- confirm that this is intentional.
 */
#ifdef __AVX512F__
# define HAVE_AVX512F_NATIVE 1
#else
# define HAVE_AVX512F_NATIVE 0
#endif
#if HAVE_AVX512F_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0) || \
defined(_MSC_VER)
# define HAVE_AVX512F_INTRIN 1
#else
# define HAVE_AVX512F_INTRIN 0
#endif

/*
 * AVX-512VL
 *
 * HAVE_AVX512VL_NATIVE is 1 when the compiler predefines __AVX512VL__
 * (presumably meaning AVX-512VL is enabled for the whole build), else 0.
 *
 * HAVE_AVX512VL_INTRIN is 1 when HAVE_AVX512VL_NATIVE is set, or when the
 * GCC_PREREQ(5, 1) / CLANG_PREREQ(3, 8, 0) checks pass, or when _MSC_VER is
 * defined; else 0.  These are the same compiler version checks as for
 * AVX-512F.
 *
 * NOTE(review): unlike the PCLMULQDQ check in this header, this condition does
 * not test HAVE_TARGET_INTRINSICS -- confirm that this is intentional.
 */
#ifdef __AVX512VL__
# define HAVE_AVX512VL_NATIVE 1
#else
# define HAVE_AVX512VL_NATIVE 0
#endif
#if HAVE_AVX512VL_NATIVE || GCC_PREREQ(5, 1) || CLANG_PREREQ(3, 8, 0) || \
defined(_MSC_VER)
# define HAVE_AVX512VL_INTRIN 1
#else
# define HAVE_AVX512VL_INTRIN 0
#endif

/*
 * VPCLMULQDQ
 *
 * HAVE_VPCLMULQDQ_NATIVE is 1 when the compiler predefines __VPCLMULQDQ__
 * (presumably meaning VPCLMULQDQ is enabled for the whole build), else 0.
 *
 * HAVE_VPCLMULQDQ_INTRIN is 1 when HAVE_VPCLMULQDQ_NATIVE is set, or when the
 * GCC_PREREQ(8, 1) / CLANG_PREREQ(6, 0, 0) checks pass, or when _MSC_VER is
 * defined; else 0.
 *
 * NOTE(review): unlike the PCLMULQDQ check in this header, this condition does
 * not test HAVE_TARGET_INTRINSICS -- confirm that this is intentional.
 */
#ifdef __VPCLMULQDQ__
# define HAVE_VPCLMULQDQ_NATIVE 1
#else
# define HAVE_VPCLMULQDQ_NATIVE 0
#endif
#if HAVE_VPCLMULQDQ_NATIVE || (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 0) || \
defined(_MSC_VER))
# define HAVE_VPCLMULQDQ_INTRIN 1
#else
# define HAVE_VPCLMULQDQ_INTRIN 0
#endif

#endif /* ARCH_X86_32 || ARCH_X86_64 */

#endif /* LIB_X86_CPU_FEATURES_H */
Loading
Loading