From 3779fe88fea034922f808586b2564fbcab31efdc Mon Sep 17 00:00:00 2001
From: Nissa
Date: Sun, 23 Apr 2023 16:42:51 +0000
Subject: [PATCH] fix incorrect results on big endian targets (#35)

---
 src/hw_aarch64.rs | 14 +++++++-------
 src/hw_x86_64.rs  | 14 +++++++-------
 src/sw.rs         |  6 +++---
 src/util.rs       | 27 +++++++++++++++++++++++----
 4 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/src/hw_aarch64.rs b/src/hw_aarch64.rs
index 3d567d2..7f84114 100644
--- a/src/hw_aarch64.rs
+++ b/src/hw_aarch64.rs
@@ -1,5 +1,5 @@
 use crate::hw_tables;
-use crate::util;
+use crate::util::{self, U64Le};
 
 use std::arch::aarch64 as simd;
 use std::arch::asm;
@@ -61,10 +61,10 @@ unsafe fn crc_u8(crc: u32, buffer: &[u8]) -> u32 {
 }
 
 #[inline(always)]
-unsafe fn crc_u64(crc: u32, words: &[u64]) -> u32 {
+unsafe fn crc_u64(crc: u32, words: &[U64Le]) -> u32 {
     words
         .iter()
-        .fold(crc, |crc, &next| crc_u64_append(crc, next))
+        .fold(crc, |crc, &next| crc_u64_append(crc, next.get()))
 }
 
 #[inline(always)]
@@ -77,7 +77,7 @@ unsafe fn crc_u64_parallel3(
     crc: u32,
     chunk_size: usize,
     table: &hw_tables::CrcTable,
-    buffer: &[u64],
+    buffer: &[U64Le],
 ) -> u32 {
     buffer.chunks(chunk_size).fold(crc, |mut crc0, chunk| {
         let mut crc1 = 0;
@@ -92,9 +92,9 @@
         let c = blocks.next().unwrap();
 
         for i in 0..block_size {
-            crc0 = crc_u64_append(crc0, a[i]);
-            crc1 = crc_u64_append(crc1, b[i]);
-            crc2 = crc_u64_append(crc2, c[i]);
+            crc0 = crc_u64_append(crc0, a[i].get());
+            crc1 = crc_u64_append(crc1, b[i].get());
+            crc2 = crc_u64_append(crc2, c[i].get());
         }
 
         crc0 = table.shift_u32(crc0) ^ crc1;
diff --git a/src/hw_x86_64.rs b/src/hw_x86_64.rs
index 72535d5..8be304f 100644
--- a/src/hw_x86_64.rs
+++ b/src/hw_x86_64.rs
@@ -1,7 +1,7 @@
 //! Implements crc32c with SSE 4.2 support.
 
 use crate::hw_tables;
-use crate::util;
+use crate::util::{self, U64Le};
 use std::arch::x86_64 as simd;
 
 /// Computes CRC-32C using the SSE 4.2 hardware instruction.
@@ -62,10 +62,10 @@ unsafe fn crc_u8(crc: u64, buffer: &[u8]) -> u64 {
 }
 
 #[inline]
-unsafe fn crc_u64(crc: u64, buffer: &[u64]) -> u64 {
+unsafe fn crc_u64(crc: u64, buffer: &[U64Le]) -> u64 {
     buffer
         .iter()
-        .fold(crc, |crc, &next| crc_u64_append(crc, next))
+        .fold(crc, |crc, &next| crc_u64_append(crc, next.get()))
 }
 
 /// Hardware-parallel version of the algorithm.
@@ -79,7 +79,7 @@ unsafe fn crc_u64_parallel3(
     crc: u64,
     chunk_size: usize,
     table: &hw_tables::CrcTable,
-    buffer: &[u64],
+    buffer: &[U64Le],
 ) -> u64 {
     buffer.chunks(chunk_size).fold(crc, |mut crc0, chunk| {
         let mut crc1 = 0;
@@ -94,9 +94,9 @@
         let c = blocks.next().unwrap();
 
         for i in 0..block_size {
-            crc0 = crc_u64_append(crc0, a[i]);
-            crc1 = crc_u64_append(crc1, b[i]);
-            crc2 = crc_u64_append(crc2, c[i]);
+            crc0 = crc_u64_append(crc0, a[i].get());
+            crc1 = crc_u64_append(crc1, b[i].get());
+            crc2 = crc_u64_append(crc2, c[i].get());
         }
 
         crc0 = table.shift_u64(crc0) ^ crc1;
diff --git a/src/sw.rs b/src/sw.rs
index 2c9845b..2493b94 100644
--- a/src/sw.rs
+++ b/src/sw.rs
@@ -1,6 +1,6 @@
 //! Implements crc32c without hardware support.
 
-use crate::util;
+use crate::util::{self, U64Le};
 
 /// 8-KiB lookup table.
 pub struct CrcTable([[u32; 256]; 8]);
@@ -41,9 +41,9 @@ fn crc_u8(crc: u64, buffer: &[u8]) -> u64 {
 }
 
 #[inline]
-fn crc_u64(crci: u64, buffer: &[u64]) -> u64 {
+fn crc_u64(crci: u64, buffer: &[U64Le]) -> u64 {
     buffer.iter().fold(crci, |crc, &next| {
-        let crc = crc ^ next;
+        let crc = crc ^ next.get();
 
         // Note: I've tried refactoring this to a for-loop,
         // but then it gets worse performance.
diff --git a/src/util.rs b/src/util.rs
index 4579f50..e359786 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -1,11 +1,29 @@
 use std::ptr::NonNull;
-use std::{cmp, mem, slice};
+use std::{cmp, slice};
+
+/// A newtype wrapper for a little endian `u64`.
+///
+/// It is safe to transmute between a `u64` and `U64Le`.
+#[repr(transparent)]
+#[derive(Clone, Copy)]
+pub(crate) struct U64Le(u64);
+
+impl U64Le {
+    /// Returns a `u64` with correct endianness for the target.
+    ///
+    /// On little endian targets, this is a no-op.
+    #[allow(clippy::inline_always)]
+    #[inline(always)]
+    pub const fn get(self) -> u64 {
+        u64::from_le(self.0)
+    }
+}
 
 /// Splits a buffer into three subslices:
 /// - the first one is up to the first 8-byte aligned address.
 /// - the second one is 8-byte aligned and its length is a multiple of 8.
 /// - the third one is 8-byte aligned but its length is less than 8.
-pub fn split(buffer: &[u8]) -> (&[u8], &[u64], &[u8]) {
+pub(crate) fn split(buffer: &[u8]) -> (&[u8], &[U64Le], &[u8]) {
     let (start, mid) = {
         let split_index = {
             let addr = buffer.as_ptr() as usize;
@@ -35,9 +53,10 @@ pub fn split(buffer: &[u8]) -> (&[u8], &[u64], &[u8]) {
     let ptr = if length == 0 {
         // `slice::from_raw_parts` requires that pointers be nonnull and
         // aligned even for zero-length slices.
-        NonNull::<u64>::dangling().as_ptr()
+        NonNull::<U64Le>::dangling().as_ptr()
     } else {
-        mem::transmute(mid.as_ptr())
+        #[allow(clippy::cast_ptr_alignment)]
+        mid.as_ptr().cast::<U64Le>()
     };
 
     slice::from_raw_parts(ptr, length)
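
Note (not part of the patch): `split` reinterprets the aligned middle of the
byte buffer as 64-bit words, and the CRC word loops assume those words hold
the buffer's bytes in little-endian order. The following standalone sketch,
in which `U64Le` merely mirrors the type added in src/util.rs and the other
names are illustrative, shows why normalizing through `u64::from_le` yields
the same word value on every target:

    // Mirror of the `U64Le` wrapper from src/util.rs.
    #[repr(transparent)]
    #[derive(Clone, Copy)]
    struct U64Le(u64);

    impl U64Le {
        fn get(self) -> u64 {
            // No-op on little endian targets, a byte swap on big endian ones.
            u64::from_le(self.0)
        }
    }

    fn main() {
        let bytes = [0x01u8, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];

        // Reading the bytes in native order models what the old `&[u64]`
        // slices did implicitly: correct on little endian targets, byte
        // reversed (and therefore wrong) on big endian ones.
        let word = U64Le(u64::from_ne_bytes(bytes));

        // After normalization, the value is identical on every target.
        assert_eq!(word.get(), u64::from_le_bytes(bytes));
        println!("{:#018x}", word.get()); // 0x0807060504030201 everywhere
    }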
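
A known-answer test along these lines (hypothetical, not part of the commit;
it assumes the crate's public `crc32c::crc32c(data: &[u8]) -> u32` function)
would catch this class of bug on a big endian CI target, since the 32-byte
input is long enough to exercise the word-at-a-time paths the fix touches:

    // Hypothetical regression test against standard CRC-32C vectors.
    #[test]
    fn known_answer_vectors() {
        // The standard CRC-32C check value for the ASCII bytes "123456789".
        assert_eq!(crc32c::crc32c(b"123456789"), 0xE306_9283);

        // iSCSI test vector (RFC 3720, appendix B.4): 32 bytes of zeroes.
        assert_eq!(crc32c::crc32c(&[0u8; 32]), 0x8A91_36AA);
    }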