lib/adler32: replace adler32_generic_noreduce() with ADLER32_CHUNK()
Make it a macro to avoid any issues with target option mismatches.
Also make it do the reduction mod DIVISOR, as all callers want that.
ebiggers committed Mar 9, 2024
1 parent ca8607e, commit b22a482
Showing 3 changed files with 64 additions and 91 deletions.
lib/adler32.c: 99 changes (51 additions, 48 deletions)
@@ -53,47 +53,54 @@
*/
#define MAX_CHUNK_LEN 5552

static forceinline void MAYBE_UNUSED
adler32_generic_noreduce(u32 *s1_p, u32 *s2_p, const u8 *p, size_t len)
{
u32 s1 = *s1_p;
u32 s2 = *s2_p;

/*
* This loop processes four bytes at a time with increased instruction-
* level parallelism when compared to the traditional approach of
* repeatedly doing 's1 += *p++; s2 += s1'. It is very similar to how
* vectorized implementations (e.g. AVX2) of Adler-32 commonly work.
*/
if (len >= 4) {
u32 s1_sum = 0;
u32 byte_0_sum = 0;
u32 byte_1_sum = 0;
u32 byte_2_sum = 0;
u32 byte_3_sum = 0;

do {
s1_sum += s1;
s1 += p[0] + p[1] + p[2] + p[3];
byte_0_sum += p[0];
byte_1_sum += p[1];
byte_2_sum += p[2];
byte_3_sum += p[3];
p += 4;
len -= 4;
} while (len >= 4);
s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) +
(2 * byte_2_sum) + byte_3_sum;
}

/* Process any remainder. */
for (; len; len--, p++) {
s1 += *p;
s2 += s1;
}
*s1_p = s1;
*s2_p = s2;
}
/*
* Update the Adler-32 values s1 and s2 using n bytes from p, update p to p + n,
* update n to 0, and reduce s1 and s2 mod DIVISOR. It is assumed that neither
* s1 nor s2 can overflow before the reduction at the end, i.e. n plus any bytes
* already processed after the last reduction must not exceed MAX_CHUNK_LEN.
*
* This uses only portable C code. This is used as a fallback when a vectorized
* implementation of Adler-32 (e.g. AVX2) is unavailable on the platform.
*
* Some of the vectorized implementations also use this to handle the end of the
* data when the data isn't evenly divisible by the length the vectorized code
* works on. To avoid compiler errors about target-specific option mismatches
* when this is used in that way, this is a macro rather than a function.
*
* Although this is unvectorized, this does include an optimization where the
* main loop processes four bytes at a time using a strategy similar to that
* used by vectorized implementations. This provides increased instruction-
* level parallelism compared to the traditional 's1 += *p++; s2 += s1;'.
*/
#define ADLER32_CHUNK(s1, s2, p, n) \
do { \
if (n >= 4) { \
u32 s1_sum = 0; \
u32 byte_0_sum = 0; \
u32 byte_1_sum = 0; \
u32 byte_2_sum = 0; \
u32 byte_3_sum = 0; \
\
do { \
s1_sum += s1; \
s1 += p[0] + p[1] + p[2] + p[3]; \
byte_0_sum += p[0]; \
byte_1_sum += p[1]; \
byte_2_sum += p[2]; \
byte_3_sum += p[3]; \
p += 4; \
n -= 4; \
} while (n >= 4); \
s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) + \
(2 * byte_2_sum) + byte_3_sum; \
} \
for (; n; n--, p++) { \
s1 += *p; \
s2 += s1; \
} \
s1 %= DIVISOR; \
s2 %= DIVISOR; \
} while (0)
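
For reference on the four-bytes-at-a-time update: for one group of bytes b0..b3 starting from a running value s1_old, the scalar recurrence 's1 += *p++; s2 += s1;' adds exactly 4*s1_old + 4*b0 + 3*b1 + 2*b2 + b3 to s2, and the macro's final 's2 +=' line simply accumulates those terms across all groups. The standalone sketch below (not part of the commit; the test buffer contents and length are arbitrary) runs both forms and asserts that they agree:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

int main(void)
{
        uint8_t buf[37];
        uint32_t s1_ref = 1, s2_ref = 0;
        uint32_t s1 = 1, s2 = 0;
        const uint8_t *p = buf;
        size_t n = sizeof(buf);
        size_t i;

        for (i = 0; i < sizeof(buf); i++)
                buf[i] = (uint8_t)(37 * i + 5);

        /* Reference: 's1 += *p++; s2 += s1;' for every byte. */
        for (i = 0; i < sizeof(buf); i++) {
                s1_ref += buf[i];
                s2_ref += s1_ref;
        }

        /*
         * Unrolled form, mirroring the macro body (the reduction is omitted
         * here since 37 bytes cannot overflow a 32-bit accumulator).
         */
        if (n >= 4) {
                uint32_t s1_sum = 0;
                uint32_t byte_0_sum = 0, byte_1_sum = 0;
                uint32_t byte_2_sum = 0, byte_3_sum = 0;

                do {
                        s1_sum += s1;
                        s1 += p[0] + p[1] + p[2] + p[3];
                        byte_0_sum += p[0];
                        byte_1_sum += p[1];
                        byte_2_sum += p[2];
                        byte_3_sum += p[3];
                        p += 4;
                        n -= 4;
                } while (n >= 4);
                s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) +
                      (2 * byte_2_sum) + byte_3_sum;
        }
        for (; n; n--, p++) {
                s1 += *p;
                s2 += s1;
        }

        assert(s1 == s1_ref && s2 == s2_ref);
        return 0;
}
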

static u32 MAYBE_UNUSED
adler32_generic(u32 adler, const u8 *p, size_t len)
@@ -102,14 +109,10 @@ adler32_generic(u32 adler, const u8 *p, size_t len)
u32 s2 = adler >> 16;

while (len) {
size_t chunk_len = MIN(len, MAX_CHUNK_LEN);

adler32_generic_noreduce(&s1, &s2, p, chunk_len);
p += chunk_len;
len -= chunk_len;
size_t n = MIN(len, MAX_CHUNK_LEN & ~3);

s1 %= DIVISOR;
s2 %= DIVISOR;
len -= n;
ADLER32_CHUNK(s1, s2, p, n);
}

return (s2 << 16) | s1;
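
A quick check on the chunking bound (not part of the commit): starting a chunk from s1 = s2 = DIVISOR - 1 and feeding n bytes of 0xff, s2 just before the reduction reaches (DIVISOR - 1) * (n + 1) + 255 * n * (n + 1) / 2, and 5552 is the largest n for which that still fits in a u32, which is why MAX_CHUNK_LEN is 5552. The '& ~3' presumably just keeps full-size chunks a multiple of four bytes for the unrolled loop; since 5552 is already a multiple of four, it does not change the value. A small standalone sketch evaluating the bound:

#include <stdint.h>
#include <stdio.h>

#define DIVISOR 65521

/* Worst-case s2 before the reduction: a chunk of n 0xff bytes starting
 * from s1 = s2 = DIVISOR - 1. */
static uint64_t worst_case_s2(uint64_t n)
{
        return (uint64_t)(DIVISOR - 1) * (n + 1) + 255 * n * (n + 1) / 2;
}

int main(void)
{
        /* Prints 4294690200 for n=5552 (fits in 32 bits, UINT32_MAX is
         * 4294967295) and 4296171735 for n=5553 (does not fit). */
        printf("n=5552: %llu\n", (unsigned long long)worst_case_s2(5552));
        printf("n=5553: %llu\n", (unsigned long long)worst_case_s2(5553));
        printf("UINT32_MAX: %llu\n", (unsigned long long)UINT32_MAX);
        return 0;
}
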
lib/arm/adler32_impl.h: 47 changes (10 additions, 37 deletions)
@@ -65,7 +65,7 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)

/*
* If the length is large and the pointer is misaligned, align it.
* For smaller lengths, just take the unaligned load penalty.
* For smaller lengths, just take the misaligned load penalty.
*/
if (unlikely(len > 32768 && ((uintptr_t)p & 15))) {
do {
@@ -194,10 +194,11 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)
s2 += vaddvq_u32(v_s2);
#endif
}
adler32_generic_noreduce(&s1, &s2, p, n);
p += n;
s1 %= DIVISOR;
s2 %= DIVISOR;
/*
* Process the last 0 <= n < 64 bytes of the chunk using
* scalar instructions and reduce s1 and s2 mod DIVISOR.
*/
ADLER32_CHUNK(s1, s2, p, n);
}
return (s2 << 16) | s1;
}
@@ -243,7 +244,7 @@ adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len)

/*
* If the length is large and the pointer is misaligned, align it.
* For smaller lengths, just take the unaligned load penalty.
* For smaller lengths, just take the misaligned load penalty.
*/
if (unlikely(len > 32768 && ((uintptr_t)p & 15))) {
do {
@@ -323,38 +324,10 @@ adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len)
s2 += vaddvq_u32(v_s2);
}
/*
* Process the last 0 <= n < 64 bytes of the chunk. This is a
* copy of adler32_generic_noreduce(). We can't just call it
* directly here because in some cases the compiler errors out
* when inlining it due to a target specific option mismatch due
* to the use of arch=armv8.2 above.
* Process the last 0 <= n < 64 bytes of the chunk using
* scalar instructions and reduce s1 and s2 mod DIVISOR.
*/
if (n >= 4) {
u32 s1_sum = 0;
u32 byte_0_sum = 0;
u32 byte_1_sum = 0;
u32 byte_2_sum = 0;
u32 byte_3_sum = 0;

do {
s1_sum += s1;
s1 += p[0] + p[1] + p[2] + p[3];
byte_0_sum += p[0];
byte_1_sum += p[1];
byte_2_sum += p[2];
byte_3_sum += p[3];
p += 4;
n -= 4;
} while (n >= 4);
s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) +
(2 * byte_2_sum) + byte_3_sum;
}
for (; n; n--, p++) {
s1 += *p;
s2 += s1;
}
s1 %= DIVISOR;
s2 %= DIVISOR;
ADLER32_CHUNK(s1, s2, p, n);
}
return (s2 << 16) | s1;
}
lib/x86/adler32_template.h: 9 changes (3 additions, 6 deletions)
@@ -221,7 +221,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)

/*
* If the length is large and the pointer is misaligned, align it.
* For smaller lengths, just take the unaligned load penalty.
* For smaller lengths, just take the misaligned load penalty.
*/
if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) {
do {
@@ -477,12 +477,9 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
}
/*
* Process the last 0 <= n < 2*VL bytes of the chunk using
* scalar instructions, then reduce s1 and s2 mod DIVISOR.
* scalar instructions and reduce s1 and s2 mod DIVISOR.
*/
adler32_generic_noreduce(&s1, &s2, p, n);
p += n;
s1 %= DIVISOR;
s2 %= DIVISOR;
ADLER32_CHUNK(s1, s2, p, n);
}
#endif /* !USE_VNNI */
return (s2 << 16) | s1;
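
As an end-to-end reference for the (s2 << 16) | s1 packing returned by all of these implementations, the following standalone sketch (not part of the commit) computes Adler-32 with a plain byte-at-a-time loop and checks it against the well-known test value for the ASCII string "Wikipedia", 0x11E60398:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Byte-at-a-time Adler-32, reducing mod 65521 on every byte.  Slow, but it
 * needs no chunking and serves as a reference for the packed return value. */
static uint32_t adler32_ref(uint32_t adler, const uint8_t *p, size_t len)
{
        uint32_t s1 = adler & 0xFFFF;
        uint32_t s2 = adler >> 16;

        while (len--) {
                s1 = (s1 + *p++) % 65521;
                s2 = (s2 + s1) % 65521;
        }
        return (s2 << 16) | s1;
}

int main(void)
{
        const char *s = "Wikipedia";

        assert(adler32_ref(1, (const uint8_t *)s, strlen(s)) == 0x11E60398);
        return 0;
}
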
