From 9c204761dd5c1409d916dbaa4a1a25b37aebb8f6 Mon Sep 17 00:00:00 2001
From: LightQuantum
Date: Fri, 1 Sep 2023 05:56:45 +0800
Subject: [PATCH] perf(fetcher/checksum): SIMD

---
 Cargo.lock                                    |  45 ++
 NOTICE.md                                     | 195 +++++++
 rsync-fetcher/Cargo.toml                      |   5 +
 rsync-fetcher/src/rsync/checksum.rs           | 154 ++++-
 .../md4_simd/aarch64_simd_transpose.rs        |  66 +++
 .../src/rsync/checksum/md4_simd/mod.rs        | 550 ++++++++++++++++++
 .../checksum/md4_simd/x86_simd_transpose.rs   | 131 +++++
 rsync-fetcher/src/rsync/generator.rs          |  34 +-
 8 files changed, 1129 insertions(+), 51 deletions(-)
 create mode 100644 NOTICE.md
 create mode 100755 rsync-fetcher/src/rsync/checksum/md4_simd/aarch64_simd_transpose.rs
 create mode 100755 rsync-fetcher/src/rsync/checksum/md4_simd/mod.rs
 create mode 100755 rsync-fetcher/src/rsync/checksum/md4_simd/x86_simd_transpose.rs

diff --git a/Cargo.lock b/Cargo.lock
index 8f8f23d..1e95116 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -399,6 +399,18 @@ version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6"
 
+[[package]]
+name = "array-init"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d62b7694a562cdf5a74227903507c56ab2cc8bdd1f781ed5cb4cf9c9f810bfc"
+
+[[package]]
+name = "arrayref"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545"
+
 [[package]]
 name = "async-compat"
 version = "0.2.1"
@@ -2416,6 +2428,28 @@ dependencies = [
  "uuid",
 ]
 
+[[package]]
+name = "multiversion"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2c7b9d7fe61760ce5ea19532ead98541f6b4c495d87247aff9826445cf6872a"
+dependencies = [
+ "multiversion-macros",
+ "target-features",
+]
+
+[[package]]
+name = "multiversion-macros"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26a83d8500ed06d68877e9de1dde76c1dbb83885dcdbda4ef44ccbc3fbda2ac8"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+ "target-features",
+]
+
 [[package]]
 name = "nanorand"
 version = "0.7.0"
@@ -3364,6 +3398,8 @@ dependencies = [
 name = "rsync-fetcher"
 version = "0.1.0"
 dependencies = [
+ "array-init",
+ "arrayref",
  "async-trait",
  "base64",
  "blake2",
@@ -3380,14 +3416,17 @@ dependencies = [
  "indicatif",
  "itertools 0.11.0",
  "md4",
+ "multiversion",
  "num",
  "opendal",
  "percent-encoding",
+ "proptest",
  "rsync-core",
  "scan_fmt",
  "sqlx",
  "tap",
  "tempfile",
+ "test-strategy",
  "tokio",
  "tokio-util",
  "tracing",
@@ -4327,6 +4366,12 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
 
+[[package]]
+name = "target-features"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06f6b473c37f9add4cf1df5b4d66a8ef58ab6c895f1a3b3f949cf3e21230140e"
+
 [[package]]
 name = "tempfile"
 version = "3.8.0"
diff --git a/NOTICE.md b/NOTICE.md
new file mode 100644
index 0000000..cd4f715
--- /dev/null
+++ b/NOTICE.md
@@ -0,0 +1,195 @@
+# Third-party code
+
+## fast_rsync
+
+Location: `rsync-fetcher/src/rsync/checksum/md4_simd`
+
+### Modifications
+
+Modified the crate paths to fit this project's module structure.
+
+### License
+
+```
+Copyright (c) 2019 Dropbox, Inc.
+Copyright (c) 2016 bacher09, Artyom Pavlov (RustCrypto/hashes/MD4).
+```
+
+```
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+   "License" shall mean the terms and conditions for use, reproduction,
+   and distribution as defined by Sections 1 through 9 of this document.
+
+   "Licensor" shall mean the copyright owner or entity authorized by
+   the copyright owner that is granting the License.
+
+   "Legal Entity" shall mean the union of the acting entity and all
+   other entities that control, are controlled by, or are under common
+   control with that entity. For the purposes of this definition,
+   "control" means (i) the power, direct or indirect, to cause the
+   direction or management of such entity, whether by contract or
+   otherwise, or (ii) ownership of fifty percent (50%) or more of the
+   outstanding shares, or (iii) beneficial ownership of such entity.
+
+   "You" (or "Your") shall mean an individual or Legal Entity
+   exercising permissions granted by this License.
+
+   "Source" form shall mean the preferred form for making modifications,
+   including but not limited to software source code, documentation
+   source, and configuration files.
+
+   "Object" form shall mean any form resulting from mechanical
+   transformation or translation of a Source form, including but
+   not limited to compiled object code, generated documentation,
+   and conversions to other media types.
+
+   "Work" shall mean the work of authorship, whether in Source or
+   Object form, made available under the License, as indicated by a
+   copyright notice that is included in or attached to the work
+   (an example is provided in the Appendix below).
+
+   "Derivative Works" shall mean any work, whether in Source or Object
+   form, that is based on (or derived from) the Work and for which the
+   editorial revisions, annotations, elaborations, or other modifications
+   represent, as a whole, an original work of authorship. For the purposes
+   of this License, Derivative Works shall not include works that remain
+   separable from, or merely link (or bind by name) to the interfaces of,
+   the Work and Derivative Works thereof.
+
+   "Contribution" shall mean any work of authorship, including
+   the original version of the Work and any modifications or additions
+   to that Work or Derivative Works thereof, that is intentionally
+   submitted to Licensor for inclusion in the Work by the copyright owner
+   or by an individual or Legal Entity authorized to submit on behalf of
+   the copyright owner. For the purposes of this definition, "submitted"
+   means any form of electronic, verbal, or written communication sent
+   to the Licensor or its representatives, including but not limited to
+   communication on electronic mailing lists, source code control systems,
+   and issue tracking systems that are managed by, or on behalf of, the
+   Licensor for the purpose of discussing and improving the Work, but
+   excluding communication that is conspicuously marked or otherwise
+   designated in writing by the copyright owner as "Not a Contribution."
+
+   "Contributor" shall mean Licensor and any individual or Legal Entity
+   on behalf of whom a Contribution has been received by Licensor and
+   subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   copyright license to reproduce, prepare Derivative Works of,
+   publicly display, publicly perform, sublicense, and distribute the
+   Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   (except as stated in this section) patent license to make, have made,
+   use, offer to sell, sell, import, and otherwise transfer the Work,
+   where such license applies only to those patent claims licensable
+   by such Contributor that are necessarily infringed by their
+   Contribution(s) alone or by combination of their Contribution(s)
+   with the Work to which such Contribution(s) was submitted. If You
+   institute patent litigation against any entity (including a
+   cross-claim or counterclaim in a lawsuit) alleging that the Work
+   or a Contribution incorporated within the Work constitutes direct
+   or contributory patent infringement, then any patent licenses
+   granted to You under this License for that Work shall terminate
+   as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+   Work or Derivative Works thereof in any medium, with or without
+   modifications, and in Source or Object form, provided that You
+   meet the following conditions:
+
+   (a) You must give any other recipients of the Work or
+       Derivative Works a copy of this License; and
+
+   (b) You must cause any modified files to carry prominent notices
+       stating that You changed the files; and
+
+   (c) You must retain, in the Source form of any Derivative Works
+       that You distribute, all copyright, patent, trademark, and
+       attribution notices from the Source form of the Work,
+       excluding those notices that do not pertain to any part of
+       the Derivative Works; and
+
+   (d) If the Work includes a "NOTICE" text file as part of its
+       distribution, then any Derivative Works that You distribute must
+       include a readable copy of the attribution notices contained
+       within such NOTICE file, excluding those notices that do not
+       pertain to any part of the Derivative Works, in at least one
+       of the following places: within a NOTICE text file distributed
+       as part of the Derivative Works; within the Source form or
+       documentation, if provided along with the Derivative Works; or,
+       within a display generated by the Derivative Works, if and
+       wherever such third-party notices normally appear. The contents
+       of the NOTICE file are for informational purposes only and
+       do not modify the License. You may add Your own attribution
+       notices within Derivative Works that You distribute, alongside
+       or as an addendum to the NOTICE text from the Work, provided
+       that such additional attribution notices cannot be construed
+       as modifying the License.
+
+   You may add Your own copyright statement to Your modifications and
+   may provide additional or different license terms and conditions
+   for use, reproduction, or distribution of Your modifications, or
+   for any such Derivative Works as a whole, provided Your use,
+   reproduction, and distribution of the Work otherwise complies with
+   the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+   any Contribution intentionally submitted for inclusion in the Work
+   by You to the Licensor shall be under the terms and conditions of
+   this License, without any additional terms or conditions.
+   Notwithstanding the above, nothing herein shall supersede or modify
+   the terms of any separate license agreement you may have executed
+   with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+   names, trademarks, service marks, or product names of the Licensor,
+   except as required for reasonable and customary use in describing the
+   origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+   agreed to in writing, Licensor provides the Work (and each
+   Contributor provides its Contributions) on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+   implied, including, without limitation, any warranties or conditions
+   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+   PARTICULAR PURPOSE. You are solely responsible for determining the
+   appropriateness of using or redistributing the Work and assume any
+   risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+   whether in tort (including negligence), contract, or otherwise,
+   unless required by applicable law (such as deliberate and grossly
+   negligent acts) or agreed to in writing, shall any Contributor be
+   liable to You for damages, including any direct, indirect, special,
+   incidental, or consequential damages of any character arising as a
+   result of this License or out of the use or inability to use the
+   Work (including but not limited to damages for loss of goodwill,
+   work stoppage, computer failure or malfunction, or any and all
+   other commercial damages or losses), even if such Contributor
+   has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+   the Work or Derivative Works thereof, You may choose to offer,
+   and charge a fee for, acceptance of support, warranty, indemnity,
+   or other liability obligations and/or rights consistent with this
+   License. However, in accepting such obligations, You may act only
+   on Your own behalf and on Your sole responsibility, not on behalf
+   of any other Contributor, and only if You agree to indemnify,
+   defend, and hold each Contributor harmless for any liability
+   incurred by, or claims asserted against, such Contributor by reason
+   of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+```
\ No newline at end of file
diff --git a/rsync-fetcher/Cargo.toml b/rsync-fetcher/Cargo.toml
index 1fad0a9..39b4b53 100644
--- a/rsync-fetcher/Cargo.toml
+++ b/rsync-fetcher/Cargo.toml
@@ -5,6 +5,8 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
+array-init = "2.1"
+arrayref = "0.3"
 async-trait = "0.1"
 base64 = "0.21"
 blake2 = "0.10"
@@ -21,6 +23,7 @@ futures = "0.3"
 indicatif = "0.17"
 itertools = "0.11"
 md4 = "0.10"
+multiversion = "0.7"
 num = "0.4"
 opendal = { version = "0.39", default-features = false }
 percent-encoding = "2.2"
@@ -37,4 +40,6 @@ url = "2.3"
 zeroize = { version = "1.6", features = ["derive"] }
 
 [dev-dependencies]
+proptest = "1.2"
 rsync-core = { path = "../rsync-core", features = ["tests"] }
+test-strategy = "0.3"
diff --git a/rsync-fetcher/src/rsync/checksum.rs b/rsync-fetcher/src/rsync/checksum.rs
index 849c0fc..5420221 100644
--- a/rsync-fetcher/src/rsync/checksum.rs
+++ b/rsync-fetcher/src/rsync/checksum.rs
@@ -1,9 +1,16 @@
+use std::cmp::min;
+use std::fs::File;
+use std::io::Read;
+
 use eyre::Result;
 use md4::{Digest, Md4};
+use multiversion::multiversion;
 use num::integer::Roots;
 use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
 
-#[derive(Debug, Default)]
+mod md4_simd;
+
+#[derive(Debug, Copy, Clone, Default)]
 pub struct SumHead {
     pub checksum_count: i32,
     pub block_len: i32,
@@ -68,29 +75,22 @@ const fn sign_extend(x: u8) -> u32 {
     x as i8 as u32
 }
 
+#[multiversion(targets = "simd")]
 pub fn checksum_1(buf: &[u8]) -> u32 {
-    // Reference: https://github.com/kristapsdz/openrsync/blob/c83683ab5d4fb8d4c466fc74affe005784220d23/hash.c
     let mut s1: u32 = 0;
     let mut s2: u32 = 0;
-    let mut it = buf.chunks_exact(4);
-    for chunk in &mut it {
-        s2 = s2.wrapping_add(
-            (s1.wrapping_add(sign_extend(chunk[0])).wrapping_mul(4))
-                .wrapping_add(sign_extend(chunk[1]).wrapping_mul(3))
-                .wrapping_add(sign_extend(chunk[2]).wrapping_mul(2))
-                .wrapping_add(sign_extend(chunk[3])),
-        );
-        s1 = s1.wrapping_add(
-            sign_extend(chunk[0])
-                .wrapping_add(sign_extend(chunk[1]))
-                .wrapping_add(sign_extend(chunk[2]))
-                .wrapping_add(sign_extend(chunk[3])),
-        );
-    }
-    for b in it.remainder() {
-        s1 = s1.wrapping_add(sign_extend(*b));
-        s2 = s2.wrapping_add(s1);
+    let len = u32::try_from(buf.len()).expect("overflow");
+
+    for (idx, b) in buf.iter().enumerate() {
+        let b = sign_extend(*b);
+        s1 = s1.wrapping_add(b);
+        s2 = s2.wrapping_add(b.wrapping_mul(len.wrapping_sub(
+            #[allow(clippy::cast_possible_truncation)] // checked above
+            {
+                idx as u32
+            },
+        )));
     }
 
     (s1 & 0xffff) + (s2 << 16)
 }
@@ -102,3 +102,117 @@ pub fn checksum_2(seed: i32, buf: &[u8]) -> Vec<u8> {
     hasher.update(seed.to_le_bytes());
     hasher.finalize().to_vec()
 }
+
+pub fn checksum_payload(sum_head: SumHead, seed: i32, file: &mut File, file_len: u64) -> Vec<u8> {
+    checksum_payload_(sum_head, seed, file, file_len, true)
+}
+
+#[cfg(test)]
+pub fn checksum_payload_basic(
+    sum_head: SumHead,
+    seed: i32,
+    file: &mut File,
+    file_len: u64,
+) -> Vec<u8> {
+    checksum_payload_(sum_head, seed, file, file_len, false)
+}
+
+#[inline]
+#[allow(clippy::cast_sign_loss)] // block_len and checksum_count are always positive.
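+// Shared implementation behind `checksum_payload` (SIMD enabled) and the
+// test-only `checksum_payload_basic` (scalar reference): while enough blocks
+// remain it hashes whole lanes of blocks in parallel, then falls through to
+// the scalar loop below for the tail.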
+fn checksum_payload_(
+    sum_head: SumHead,
+    seed: i32,
+    file: &mut File,
+    file_len: u64,
+    enable_simd: bool,
+) -> Vec<u8> {
+    let mut file_remaining = file_len;
+
+    let mut buf_sum = Vec::with_capacity(sum_head.checksum_count as usize * 20);
+    let mut block_remaining = sum_head.checksum_count as usize;
+
+    if enable_simd {
+        if let Some(simd_impl) = md4_simd::simd::Md4xN::select() {
+            // Sqrt of usize can't be negative.
+            let mut bufs: [_; md4_simd::simd::MAX_LANES] =
+                array_init::array_init(|_| vec![0u8; sum_head.block_len as usize + 4]);
+
+            while block_remaining >= simd_impl.lanes() {
+                let mut datas: [&[u8]; md4_simd::simd::MAX_LANES] =
+                    [&[]; md4_simd::simd::MAX_LANES];
+                for (idx, buf) in bufs[0..simd_impl.lanes()].iter_mut().enumerate() {
+                    // Sqrt of usize must be in u32 range.
+                    #[allow(clippy::cast_possible_truncation)]
+                    let n1 = min(sum_head.block_len as u64, file_remaining) as usize;
+
+                    file.read_exact(&mut buf[..n1]).expect("IO error");
+
+                    file_remaining -= n1 as u64;
+                    let buf_slice = &mut buf[..n1 + 4];
+                    buf_slice[n1..].copy_from_slice(&seed.to_le_bytes());
+
+                    datas[idx] = buf_slice;
+                }
+
+                let md4_hashes = simd_impl.md4(&datas);
+
+                for (idx, block) in datas[..simd_impl.lanes()].iter().enumerate() {
+                    // Fast checksum (the trailing seed bytes must be excluded from the buffer).
+                    buf_sum.extend_from_slice(&checksum_1(&block[..block.len() - 4]).to_le_bytes());
+                    // Slow checksum.
+                    buf_sum.extend_from_slice(&md4_hashes[idx]);
+                }
+
+                block_remaining -= simd_impl.lanes();
+            }
+            // The final blocks share the code path with the non-SIMD implementation below.
+        }
+    }
+
+    // Sqrt of usize can't be negative.
+    let mut buf = vec![0u8; sum_head.block_len as usize + 4];
+    for _ in 0..block_remaining {
+        // Sqrt of usize must be in u32 range.
+        #[allow(clippy::cast_possible_truncation)]
+        let n1 = min(sum_head.block_len as u64, file_remaining) as usize;
+        let buf_slice = &mut buf[..n1];
+
+        file.read_exact(buf_slice).expect("IO error");
+
+        file_remaining -= n1 as u64;
+
+        buf_sum.extend_from_slice(&checksum_1(buf_slice).to_le_bytes());
+        buf_sum.extend_from_slice(&checksum_2(seed, buf_slice));
+    }
+
+    buf_sum
+}
+
+#[cfg(test)]
+mod tests {
+    use std::io::{Seek, SeekFrom, Write};
+
+    use proptest::prop_assert_eq;
+    use tempfile::tempfile;
+    use test_strategy::proptest;
+
+    use crate::rsync::checksum::{checksum_payload, checksum_payload_basic, SumHead};
+
+    #[proptest]
+    fn must_checksum_payload_basic_eq_simd(data: Vec<u8>) {
+        let file_len = data.len() as u64;
+
+        let mut f = tempfile().expect("tempfile");
+        f.write_all(&data).expect("write_all");
+        f.seek(SeekFrom::Start(0)).expect("seek");
+
+        let sum_head = SumHead::sum_sizes_sqroot(file_len);
+
+        let chksum_simd = checksum_payload(sum_head, 0, &mut f, file_len);
+        f.seek(SeekFrom::Start(0)).expect("seek");
+        let chksum_basic = checksum_payload_basic(sum_head, 0, &mut f, file_len);
+        prop_assert_eq!(chksum_simd, chksum_basic);
+    }
+}
diff --git a/rsync-fetcher/src/rsync/checksum/md4_simd/aarch64_simd_transpose.rs b/rsync-fetcher/src/rsync/checksum/md4_simd/aarch64_simd_transpose.rs
new file mode 100755
index 0000000..46c8288
--- /dev/null
+++ b/rsync-fetcher/src/rsync/checksum/md4_simd/aarch64_simd_transpose.rs
@@ -0,0 +1,66 @@
+//! Utilities for loading and transposing data from memory on AArch64.
+//! This is useful for SPMD-style operations.
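+//!
+//! `load_16x4` reads one 64-byte MD4 block per lane and returns the sixteen
+//! message words as `uint32x4_t` vectors, one word across all four lanes.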
+#![cfg(target_endian = "little")] // only little-endian supported
+
+use arrayref::array_ref;
+
+use std::arch::aarch64::{uint32x4_t, vtrnq_u32, vzipq_u32};
+
+#[inline(always)]
+/// Loads four u32s (little-endian), potentially unaligned.
+unsafe fn load_u32x4(slice: &[u8; 16]) -> uint32x4_t {
+    core::mem::transmute(*slice)
+}
+
+/// Load 16 bytes (1 u32x4) out of each lane of `data`, transposed.
+#[inline]
+#[target_feature(enable = "neon")]
+unsafe fn load_transpose4(data: [&[u8; 16]; 4]) -> [uint32x4_t; 4] {
+    let i0 = load_u32x4(data[0]);
+    let i1 = load_u32x4(data[1]);
+    let i2 = load_u32x4(data[2]);
+    let i3 = load_u32x4(data[3]);
+    // [[data[0][0], data[2][0], data[0][2], data[2][2]], [data[0][1], data[2][1], data[0][3], data[2][3]]]
+    let tr02 = vtrnq_u32(i0, i2);
+    // [[data[1][0], data[3][0], data[1][2], data[3][2]], [data[1][1], data[3][1], data[1][3], data[3][3]]]
+    let tr13 = vtrnq_u32(i1, i3);
+    // [[data[0][0], data[1][0], data[2][0], data[3][0]], [data[0][2], data[1][2], data[2][2], data[3][2]]]
+    let zip02 = vzipq_u32(tr02.0, tr13.0);
+    let zip13 = vzipq_u32(tr02.1, tr13.1);
+    [zip02.0, zip13.0, zip02.1, zip13.1]
+}
+
+macro_rules! get_blocks {
+    ($data: ident, ($($lane: tt)*), $from: expr, $width: expr) => ([$(array_ref![&$data($lane), $from, $width]),*]);
+}
+
+#[inline]
+#[target_feature(enable = "neon")]
+pub unsafe fn load_16x4<'a, F: Fn(usize) -> &'a [u8; 64]>(data: F) -> [uint32x4_t; 16] {
+    core::mem::transmute::<[[uint32x4_t; 4]; 4], [uint32x4_t; 16]>([
+        load_transpose4(get_blocks!(data, (0 1 2 3), 0, 16)),
+        load_transpose4(get_blocks!(data, (0 1 2 3), 16, 16)),
+        load_transpose4(get_blocks!(data, (0 1 2 3), 32, 16)),
+        load_transpose4(get_blocks!(data, (0 1 2 3), 48, 16)),
+    ])
+}
+
+#[test]
+fn test_transpose() {
+    let mut input = [[0; 64]; 4];
+    for lane in 0..4 {
+        for i in 0..16 {
+            let value = (lane * 16 + i) as u32;
+            input[lane][i * 4..i * 4 + 4].copy_from_slice(&value.to_le_bytes());
+        }
+    }
+    unsafe {
+        let output = load_16x4(|lane| &input[lane]);
+        let transmuted = core::mem::transmute::<_, [[u32; 4]; 16]>(output);
+        for lane in 0..4 {
+            for i in 0..16 {
+                assert_eq!(transmuted[i][lane], (lane * 16 + i) as u32);
+            }
+        }
+    }
+}
diff --git a/rsync-fetcher/src/rsync/checksum/md4_simd/mod.rs b/rsync-fetcher/src/rsync/checksum/md4_simd/mod.rs
new file mode 100755
index 0000000..8ce0cb0
--- /dev/null
+++ b/rsync-fetcher/src/rsync/checksum/md4_simd/mod.rs
@@ -0,0 +1,550 @@
+//! A SIMD-ized implementation of MD4 designed to hash many blocks in parallel.
+//! The base implementation is derived from https://github.com/RustCrypto/hashes/tree/master/md4.
+#![allow(clippy::all, warnings)]
+#![allow(clippy::ptr_offset_with_cast)]
+
+use arrayref::{array_mut_ref, array_ref, array_refs, mut_array_refs};
+
+#[cfg(target_arch = "aarch64")]
+mod aarch64_simd_transpose;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod x86_simd_transpose;
+
+pub const MD4_SIZE: usize = 16;
+
+// Initial values for Md4State.
+const S: [u32; 4] = [0x6745_2301, 0xEFCD_AB89, 0x98BA_DCFE, 0x1032_5476];
+
+macro_rules! md4 {
+    (
+        ($($prefix:tt)*),
+        $u32:ty,
+        add = $add:path,
+        and = $and:path,
+        or = $or:path,
+        andnot = $andnot:path,
+        xor = $xor:path,
+        rol = ($($rol:tt)*),
+        splat = $splat:path,
+    ) => {
+        #[derive(Copy, Clone)]
+        struct Md4State {
+            s: [$u32; 4],
+        }
+
+        #[allow(unused_parens)]
+        impl Md4State {
+            $($prefix)*
+            fn process_block(&mut self, data: &[$u32; 16]) {
+                macro_rules! f {
+                    ($x:expr, $y:expr, $z:expr) => ($or($and($x, $y), $andnot($x, $z)));
+                }
+                macro_rules! g {
+                    ($x:expr, $y:expr, $z:expr) => ($or($or($and($x, $y), $and($x, $z)), $and($y, $z)));
+                }
+                macro_rules! h {
+                    ($x:expr, $y:expr, $z:expr) => ($xor($xor($x, $y), $z));
+                }
+                macro_rules! op1 {
+                    ($a:expr, $b:expr, $c:expr, $d:expr, $k:expr, $s:expr) => (
+                        $($rol)*($add($add($a, f!($b, $c, $d)), $k), $s)
+                    );
+                }
+                macro_rules! op2 {
+                    ($a:expr, $b:expr, $c:expr, $d:expr, $k:expr, $s:expr) => (
+                        $($rol)*($add($add($add($a, g!($b, $c, $d)), $k), $splat(0x5A82_7999)), $s)
+                    );
+                }
+                macro_rules! op3 {
+                    ($a:expr, $b:expr, $c:expr, $d:expr, $k:expr, $s:expr) => (
+                        $($rol)*($add($add($add($a, h!($b, $c, $d)), $k), $splat(0x6ED9_EBA1)), $s)
+                    );
+                }
+
+                let mut a = self.s[0];
+                let mut b = self.s[1];
+                let mut c = self.s[2];
+                let mut d = self.s[3];
+
+                // Manually unrolling these loops avoids bounds checking on `data` accesses.
+                macro_rules! round1 {
+                    ($i:expr) => {
+                        a = op1!(a, b, c, d, data[$i], 3);
+                        d = op1!(d, a, b, c, data[$i + 1], 7);
+                        c = op1!(c, d, a, b, data[$i + 2], 11);
+                        b = op1!(b, c, d, a, data[$i + 3], 19);
+                    }
+                }
+                round1!(0);
+                round1!(4);
+                round1!(8);
+                round1!(12);
+
+                macro_rules! round2 {
+                    ($i:expr) => {
+                        a = op2!(a, b, c, d, data[$i], 3);
+                        d = op2!(d, a, b, c, data[$i + 4], 5);
+                        c = op2!(c, d, a, b, data[$i + 8], 9);
+                        b = op2!(b, c, d, a, data[$i + 12], 13);
+                    }
+                }
+                round2!(0);
+                round2!(1);
+                round2!(2);
+                round2!(3);
+
+                macro_rules! round3 {
+                    ($i:expr) => {
+                        a = op3!(a, b, c, d, data[$i], 3);
+                        d = op3!(d, a, b, c, data[$i + 8], 9);
+                        c = op3!(c, d, a, b, data[$i + 4], 11);
+                        b = op3!(b, c, d, a, data[$i + 12], 15);
+                    }
+                }
+                round3!(0);
+                round3!(2);
+                round3!(1);
+                round3!(3);
+
+                self.s[0] = $add(self.s[0], a);
+                self.s[1] = $add(self.s[1], b);
+                self.s[2] = $add(self.s[2], c);
+                self.s[3] = $add(self.s[3], d);
+            }
+        }
+    };
+}
+
+use std::convert::identity;
+use std::ops::{BitAnd, BitOr, BitXor};
+fn andnot(x: u32, y: u32) -> u32 {
+    !x & y
+}
+md4!(
+    (),
+    u32,
+    add = u32::wrapping_add,
+    and = u32::bitand,
+    or = u32::bitor,
+    andnot = andnot,
+    xor = u32::bitxor,
+    rol = (u32::rotate_left),
+    splat = identity,
+);
+
+fn load_block(input: &[u8; 64]) -> [u32; 16] {
+    macro_rules! split {
+        ($($name: ident $(. $dummy:tt)*)*) => ({
+            let ($($name),*) = array_refs![input, $(4 $($dummy)*),*];
+            [$(u32::from_le_bytes(*$name)),*]
+        });
+    }
+    split!(x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15)
+}
+
+pub fn md4(data: &[u8]) -> [u8; 16] {
+    let mut state = Md4State { s: S };
+    let mut chunks = data.chunks_exact(64);
+    for block in &mut chunks {
+        state.process_block(&load_block(array_ref![block, 0, 64]));
+    }
+    let remainder = chunks.remainder();
+    let mut last_blocks = [0; 128];
+    last_blocks[..remainder.len()].copy_from_slice(remainder);
+    last_blocks[remainder.len()] = 0x80;
+    let end = if remainder.len() >= 56 { 128 } else { 64 };
+    *array_mut_ref![&mut last_blocks, end - 8, 8] = (data.len() as u64 * 8).to_le_bytes();
+    let (last_block_0, last_block_1) = array_refs![&last_blocks, 64, 64];
+    state.process_block(&load_block(last_block_0));
+    if end == 128 {
+        state.process_block(&load_block(last_block_1));
+    }
+    let mut digest = [0; 16];
+    let (a, b, c, d) = mut_array_refs!(&mut digest, 4, 4, 4, 4);
+    *a = state.s[0].to_le_bytes();
+    *b = state.s[1].to_le_bytes();
+    *c = state.s[2].to_le_bytes();
+    *d = state.s[3].to_le_bytes();
+    digest
+}
+
+pub mod simd {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    pub const MAX_LANES: usize = 8;
+    #[cfg(any(target_arch = "aarch64"))]
+    pub const MAX_LANES: usize = 4;
+    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
+    pub const MAX_LANES: usize = 0;
+
+    pub struct Md4xN {
+        lanes: usize,
+        fun: fn(&[&[u8]]) -> [[u8; 16]; MAX_LANES],
+    }
+
+    impl Md4xN {
+        /// The number of digests this implementation calculates at once.
+        pub fn lanes(&self) -> usize {
+            self.lanes
+        }
+
+        /// Calculate the digest of `self.lanes()` equally-sized blocks of data.
+        pub fn md4(&self, data: &[&[u8]]) -> [[u8; 16]; MAX_LANES] {
+            (self.fun)(data)
+        }
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
+    mod real_impl {
+        #[cfg(target_arch = "aarch64")]
+        use std::arch::aarch64 as arch;
+        #[cfg(target_arch = "x86")]
+        use std::arch::x86 as arch;
+        #[cfg(target_arch = "x86_64")]
+        use std::arch::x86_64 as arch;
+
+        macro_rules! n_lanes {
+            (
+                $u32xN:path,
+                $feature:tt,
+                $feature_enabled:expr,
+                load = $load:path,
+                add = $add:path,
+                and = $and:path,
+                or = $or:path,
+                andnot = $andnot:path,
+                xor = $xor:path,
+                rol = $rol:tt,
+                splat = $splat:path,
+            ) => (
+                use crate::rsync::checksum::md4_simd::S;
+                use crate::rsync::checksum::md4_simd::simd::{Md4xN, MAX_LANES};
+                use arrayref::{array_ref, mut_array_refs};
+                use std::mem;
+
+                #[allow(non_camel_case_types)]
+                type u32xN = $u32xN;
+                pub const LANES: usize = mem::size_of::<u32xN>() / mem::size_of::<u32>();
+
+                md4!(
+                    (#[target_feature(enable = $feature)] unsafe),
+                    u32xN,
+                    add = $add,
+                    and = $and,
+                    or = $or,
+                    andnot = $andnot,
+                    xor = $xor,
+                    rol = $rol,
+                    splat = $splat,
+                );
+
+                /// Compute the MD4 sum of multiple equally-sized blocks of data.
+                /// Unsafety: This function requires $feature to be available.
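+                /// Callers must verify support for $feature first, e.g. via the
+                /// feature check performed in `select()`.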
+                #[allow(non_snake_case)]
+                #[target_feature(enable = $feature)]
+                unsafe fn md4xN(data: &[&[u8]; LANES]) -> [[u8; 16]; LANES] {
+                    let mut state = Md4State {
+                        s: [
+                            $splat(S[0]),
+                            $splat(S[1]),
+                            $splat(S[2]),
+                            $splat(S[3]),
+                        ],
+                    };
+                    let len = data[0].len();
+                    for ix in 1..LANES {
+                        assert_eq!(len, data[ix].len());
+                    }
+                    for block in 0..(len / 64) {
+                        let blocks = $load(|lane| array_ref![&data[lane], 64 * block, 64]);
+                        state.process_block(&blocks);
+                    }
+                    let remainder = len % 64;
+                    let bit_len = len as u64 * 8;
+                    {
+                        let mut padded = [[0; 64]; LANES];
+                        for lane in 0..LANES {
+                            padded[lane][..remainder].copy_from_slice(&data[lane][len - remainder..]);
+                            padded[lane][remainder] = 0x80;
+                        }
+                        let mut blocks = $load(|lane| &padded[lane]);
+                        if remainder < 56 {
+                            blocks[14] = $splat(bit_len as u32);
+                            blocks[15] = $splat((bit_len >> 32) as u32);
+                        }
+                        state.process_block(&blocks);
+                    }
+                    if remainder >= 56 {
+                        let mut blocks = [$splat(0); 16];
+                        blocks[14] = $splat(bit_len as u32);
+                        blocks[15] = $splat((bit_len >> 32) as u32);
+                        state.process_block(&blocks);
+                    }
+                    let mut digests = [[0; 16]; LANES];
+                    // Safety: `u32xN` and `[u32; LANES]` are always safely transmutable.
+                    let final_state = mem::transmute::<[u32xN; 4], [[u32; LANES]; 4]>(state.s);
+                    for lane in 0..LANES {
+                        let (a, b, c, d) = mut_array_refs!(&mut digests[lane], 4, 4, 4, 4);
+                        *a = final_state[0][lane].to_le_bytes();
+                        *b = final_state[1][lane].to_le_bytes();
+                        *c = final_state[2][lane].to_le_bytes();
+                        *d = final_state[3][lane].to_le_bytes();
+                    }
+                    digests
+                }
+
+                pub fn select() -> Option<Md4xN> {
+                    if $feature_enabled {
+                        Some(Md4xN {
+                            lanes: LANES,
+                            fun: |data| {
+                                let mut ret = [[0; 16]; MAX_LANES];
+                                let (prefix, _) = mut_array_refs!(&mut ret, LANES, MAX_LANES - LANES);
+                                // Safety: We just checked that $feature is available.
+                                *prefix = unsafe { md4xN(array_ref![data, 0, LANES]) };
+                                ret
+                            }
+                        })
+                    } else {
+                        None
+                    }
+                }
+            );
+        }
+
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        mod lanes_4 {
+            #[inline(always)]
+            unsafe fn splat(x: u32) -> super::arch::__m128i {
+                super::arch::_mm_set1_epi32(x as i32)
+            }
+            macro_rules! rotate_left {
+                ($x: expr, $shift: expr) => {{
+                    let x = $x;
+                    // (x << shift) | (x >> (32 - shift))
+                    super::arch::_mm_or_si128(
+                        super::arch::_mm_slli_epi32(x, $shift as i32),
+                        super::arch::_mm_srli_epi32(x, 32 - $shift as i32),
+                    )
+                }};
+            }
+            n_lanes!(
+                super::arch::__m128i,
+                "sse2",
+                is_x86_feature_detected!("sse2"),
+                load = crate::rsync::checksum::md4_simd::x86_simd_transpose::load_16x4_sse2,
+                add = super::arch::_mm_add_epi32,
+                and = super::arch::_mm_and_si128,
+                or = super::arch::_mm_or_si128,
+                andnot = super::arch::_mm_andnot_si128,
+                xor = super::arch::_mm_xor_si128,
+                rol = (rotate_left!),
+                splat = splat,
+            );
+        }
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        mod lanes_8 {
+            #[inline(always)]
+            unsafe fn splat(x: u32) -> super::arch::__m256i {
+                super::arch::_mm256_set1_epi32(x as i32)
+            }
+            macro_rules! rotate_left {
+                ($x: expr, $shift: expr) => {{
+                    let x = $x;
+                    // (x << shift) | (x >> (32 - shift))
+                    super::arch::_mm256_or_si256(
+                        super::arch::_mm256_slli_epi32(x, $shift as i32),
+                        super::arch::_mm256_srli_epi32(x, 32 - $shift as i32),
+                    )
+                }};
+            }
+            n_lanes!(
+                super::arch::__m256i,
+                "avx2",
+                is_x86_feature_detected!("avx2"),
+                load = crate::rsync::checksum::md4_simd::x86_simd_transpose::load_16x8_avx2,
+                add = super::arch::_mm256_add_epi32,
+                and = super::arch::_mm256_and_si256,
+                or = super::arch::_mm256_or_si256,
+                andnot = super::arch::_mm256_andnot_si256,
+                xor = super::arch::_mm256_xor_si256,
+                rol = (rotate_left!),
+                splat = splat,
+            );
+        }
+        #[cfg(target_arch = "aarch64")]
+        mod lanes_4 {
+            macro_rules! rotate_left {
+                ($x: expr, $shift: expr) => {{
+                    let x = $x;
+                    // (x << shift) | (x >> (32 - shift))
+                    super::arch::vorrq_u32(
+                        super::arch::vshlq_n_u32::<{ $shift as i32 }>(x),
+                        super::arch::vshrq_n_u32::<{ 32 - $shift as i32 }>(x),
+                    )
+                }};
+            }
+            #[inline(always)]
+            unsafe fn andnot(
+                a: super::arch::uint32x4_t,
+                b: super::arch::uint32x4_t,
+            ) -> super::arch::uint32x4_t {
+                // "Bit clear": the order of arguments is reversed compared to Intel's andnot.
+                super::arch::vbicq_u32(b, a)
+            }
+            n_lanes!(
+                super::arch::uint32x4_t,
+                "neon",
+                std::arch::is_aarch64_feature_detected!("neon"),
+                load = crate::rsync::checksum::md4_simd::aarch64_simd_transpose::load_16x4,
+                add = super::arch::vaddq_u32,
+                and = super::arch::vandq_u32,
+                or = super::arch::vorrq_u32,
+                andnot = andnot,
+                xor = super::arch::veorq_u32,
+                rol = (rotate_left!),
+                splat = super::arch::vdupq_n_u32,
+            );
+        }
+
+        use super::Md4xN;
+
+        impl Md4xN {
+            /// Returns a SIMD implementation if one is available.
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            pub fn select() -> Option<Md4xN> {
+                lanes_8::select().or_else(lanes_4::select)
+            }
+            #[cfg(target_arch = "aarch64")]
+            pub fn select() -> Option<Md4xN> {
+                lanes_4::select()
+            }
+        }
+    }
+    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
+    mod no_simd {
+        use super::Md4xN;
+
+        impl Md4xN {
+            /// Returns a SIMD implementation if one is available.
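+            /// Always `None` here, since no SIMD backend exists for this architecture.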
+            pub fn select() -> Option<Md4xN> {
+                None
+            }
+        }
+    }
+}
+
+pub fn md4_many<'a>(
+    datas: impl ExactSizeIterator<Item = &'a [u8]>,
+) -> impl ExactSizeIterator<Item = (&'a [u8], [u8; 16])> {
+    struct SimdImpl<'a> {
+        simd_impl: simd::Md4xN,
+        buf: [(&'a [u8], [u8; 16]); simd::MAX_LANES],
+        buf_len: usize,
+    }
+    struct It<'a, I: Iterator<Item = &'a [u8]>> {
+        len: usize,
+        inner: I,
+        simd: Option<SimdImpl<'a>>,
+    }
+    impl<'a, I: Iterator<Item = &'a [u8]>> Iterator for It<'a, I> {
+        type Item = (&'a [u8], [u8; 16]);
+        #[allow(clippy::needless_range_loop)]
+        fn next(&mut self) -> Option<Self::Item> {
+            if let Some(simd) = &mut self.simd {
+                if simd.buf_len == 0 && self.len >= simd.simd_impl.lanes() {
+                    let mut datas: [&[u8]; simd::MAX_LANES] = [&[]; simd::MAX_LANES];
+                    for ix in 0..simd.simd_impl.lanes() {
+                        datas[ix] = self.inner.next().unwrap();
+                    }
+                    self.len -= simd.simd_impl.lanes();
+                    let digests = simd.simd_impl.md4(&datas);
+                    simd.buf_len = simd.simd_impl.lanes();
+                    for lane in 0..simd.simd_impl.lanes() {
+                        simd.buf[lane] = (datas[lane], digests[lane]);
+                    }
+                }
+                if simd.buf_len > 0 {
+                    let digest = simd.buf[simd.simd_impl.lanes() - simd.buf_len];
+                    simd.buf_len -= 1;
+                    return Some(digest);
+                }
+            }
+            self.inner.next().map(|data| {
+                self.len -= 1;
+                (data, md4(data))
+            })
+        }
+        fn size_hint(&self) -> (usize, Option<usize>) {
+            (self.len, Some(self.len))
+        }
+    }
+    impl<'a, I: Iterator<Item = &'a [u8]>> ExactSizeIterator for It<'a, I> {
+        fn len(&self) -> usize {
+            self.len
+        }
+    }
+    It {
+        len: datas.len(),
+        inner: datas,
+        simd: simd::Md4xN::select().map(|simd_impl| SimdImpl {
+            simd_impl,
+            buf: [(&[] as &[_], [0; 16]); simd::MAX_LANES],
+            buf_len: 0,
+        }),
+    }
+}
+
+#[test]
+fn tests() {
+    let test_vectors: &[(&[u8], [u8; 16])] = &[
+        (
+            b"",
+            *b"\x31\xd6\xcf\xe0\xd1\x6a\xe9\x31\xb7\x3c\x59\xd7\xe0\xc0\x89\xc0",
+        ),
+        (
+            b"a",
+            *b"\xbd\xe5\x2c\xb3\x1d\xe3\x3e\x46\x24\x5e\x05\xfb\xdb\xd6\xfb\x24",
+        ),
+        (
+            b"abc",
+            *b"\xa4\x48\x01\x7a\xaf\x21\xd8\x52\x5f\xc1\x0a\xe8\x7a\xa6\x72\x9d",
+        ),
+        (
+            b"message digest",
+            *b"\xd9\x13\x0a\x81\x64\x54\x9f\xe8\x18\x87\x48\x06\xe1\xc7\x01\x4b",
+        ),
+        (
+            b"abcdefghijklmnopqrstuvwxyz",
+            *b"\xd7\x9e\x1c\x30\x8a\xa5\xbb\xcd\xee\xa8\xed\x63\xdf\x41\x2d\xa9",
+        ),
+        (
+            b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
+            *b"\x04\x3f\x85\x82\xf2\x41\xdb\x35\x1c\xe6\x27\xe1\x53\xe7\xf0\xe4",
+        ),
+        (
+            b"12345678901234567890123456789012345678901234567890123456789012345678901234567890",
+            *b"\xe3\x3b\x4d\xdc\x9c\x38\xf2\x19\x9c\x3e\x7b\x16\x4f\xcc\x05\x36",
+        ),
+    ];
+
+    for &(msg, expected) in test_vectors {
+        assert_eq!(md4(msg), expected);
+        if let Some(simd_impl) = simd::Md4xN::select() {
+            assert_eq!(
+                simd_impl.md4(&vec![msg; simd_impl.lanes()])[..simd_impl.lanes()],
+                vec![expected; simd_impl.lanes()][..]
+            );
+        }
+        // Make sure it also works for unaligned input.
+        if msg.len() > 0 {
+            let tail = &msg[1..];
+            let tail_md4 = md4(tail);
+            if let Some(simd_impl) = simd::Md4xN::select() {
+                assert_eq!(
+                    simd_impl.md4(&vec![tail; simd_impl.lanes()])[..simd_impl.lanes()],
+                    vec![tail_md4; simd_impl.lanes()][..]
+                );
+            }
+        }
+    }
+}
diff --git a/rsync-fetcher/src/rsync/checksum/md4_simd/x86_simd_transpose.rs b/rsync-fetcher/src/rsync/checksum/md4_simd/x86_simd_transpose.rs
new file mode 100755
index 0000000..d93d7cd
--- /dev/null
+++ b/rsync-fetcher/src/rsync/checksum/md4_simd/x86_simd_transpose.rs
@@ -0,0 +1,131 @@
+//! Utilities for loading and transposing data from memory on x86 architectures.
+//! This is useful for SPMD-style operations.
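+//!
+//! `load_16x4_sse2` and `load_16x8_avx2` each read one 64-byte MD4 block per
+//! lane and return the sixteen message words as word-across-lanes vectors.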
+use arrayref::array_ref;
+
+use self::arch::{
+    __m128i, __m256i, _mm256_castsi128_si256, _mm256_inserti128_si256, _mm256_unpackhi_epi32,
+    _mm256_unpackhi_epi64, _mm256_unpacklo_epi32, _mm256_unpacklo_epi64, _mm_loadu_si128,
+    _mm_unpackhi_epi32, _mm_unpackhi_epi64, _mm_unpacklo_epi32, _mm_unpacklo_epi64,
+};
+#[cfg(target_arch = "x86")]
+use std::arch::x86 as arch;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64 as arch;
+
+#[inline(always)]
+/// Loads four u32s (little-endian), potentially unaligned.
+unsafe fn load_u32x4(slice: &[u8; 16]) -> __m128i {
+    _mm_loadu_si128(slice as *const [u8; 16] as *const __m128i)
+}
+
+/// Load 32 bytes (1 u32x8) out of each lane of `data`, transposed.
+#[inline]
+#[target_feature(enable = "avx2")]
+unsafe fn load_transpose8(data: [&[u8; 32]; 8]) -> [__m256i; 8] {
+    #[inline(always)]
+    /// Concatenate two u32x4s into a single u32x8.
+    unsafe fn cat2x4(a: __m128i, b: __m128i) -> __m256i {
+        // `vinserti128`
+        _mm256_inserti128_si256(_mm256_castsi128_si256(a), b, 1)
+    }
+
+    let l04 = cat2x4(
+        load_u32x4(array_ref![data[0], 0, 16]),
+        load_u32x4(array_ref![data[4], 0, 16]),
+    );
+    let l15 = cat2x4(
+        load_u32x4(array_ref![data[1], 0, 16]),
+        load_u32x4(array_ref![data[5], 0, 16]),
+    );
+    let l26 = cat2x4(
+        load_u32x4(array_ref![data[2], 0, 16]),
+        load_u32x4(array_ref![data[6], 0, 16]),
+    );
+    let l37 = cat2x4(
+        load_u32x4(array_ref![data[3], 0, 16]),
+        load_u32x4(array_ref![data[7], 0, 16]),
+    );
+    let h04 = cat2x4(
+        load_u32x4(array_ref![data[0], 16, 16]),
+        load_u32x4(array_ref![data[4], 16, 16]),
+    );
+    let h15 = cat2x4(
+        load_u32x4(array_ref![data[1], 16, 16]),
+        load_u32x4(array_ref![data[5], 16, 16]),
+    );
+    let h26 = cat2x4(
+        load_u32x4(array_ref![data[2], 16, 16]),
+        load_u32x4(array_ref![data[6], 16, 16]),
+    );
+    let h37 = cat2x4(
+        load_u32x4(array_ref![data[3], 16, 16]),
+        load_u32x4(array_ref![data[7], 16, 16]),
+    );
+    // [data[0][0], data[1][0], data[0][1], data[1][1], data[4][0], data[5][0], data[4][1], data[5][1]]
+    let a0145 = _mm256_unpacklo_epi32(l04, l15);
+    // [data[0][2], data[1][2], data[0][3], data[1][3], data[4][2], data[5][2], data[4][3], data[5][3]]
+    let b0145 = _mm256_unpackhi_epi32(l04, l15);
+    let a2367 = _mm256_unpacklo_epi32(l26, l37);
+    let b2367 = _mm256_unpackhi_epi32(l26, l37);
+    let c0145 = _mm256_unpacklo_epi32(h04, h15);
+    let d0145 = _mm256_unpackhi_epi32(h04, h15);
+    let c2367 = _mm256_unpacklo_epi32(h26, h37);
+    let d2367 = _mm256_unpackhi_epi32(h26, h37);
+    [
+        _mm256_unpacklo_epi64(a0145, a2367),
+        _mm256_unpackhi_epi64(a0145, a2367),
+        _mm256_unpacklo_epi64(b0145, b2367),
+        _mm256_unpackhi_epi64(b0145, b2367),
+        _mm256_unpacklo_epi64(c0145, c2367),
+        _mm256_unpackhi_epi64(c0145, c2367),
+        _mm256_unpacklo_epi64(d0145, d2367),
+        _mm256_unpackhi_epi64(d0145, d2367),
+    ]
+}
+
+macro_rules! get_blocks {
+    ($data: ident, ($($lane: tt)*), $from: expr, $width: expr) => ([$(array_ref![&$data($lane), $from, $width]),*]);
+}
+
+#[inline]
+#[target_feature(enable = "avx2")]
+pub unsafe fn load_16x8_avx2<'a, F: Fn(usize) -> &'a [u8; 64]>(data: F) -> [__m256i; 16] {
+    // We're using transmute to concatenate arrays; not ideal, but it works.
+    core::mem::transmute::<[[__m256i; 8]; 2], [__m256i; 16]>([
+        load_transpose8(get_blocks!(data, (0 1 2 3 4 5 6 7), 0, 32)),
+        load_transpose8(get_blocks!(data, (0 1 2 3 4 5 6 7), 32, 32)),
+    ])
+}
+
+/// Load 16 bytes (1 u32x4) out of each lane of `data`, transposed.
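+/// This is the SSE2 counterpart of `load_transpose8`, used for the four-lane path.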
+#[inline]
+#[target_feature(enable = "sse2")]
+unsafe fn load_transpose4(data: [&[u8; 16]; 4]) -> [__m128i; 4] {
+    let i0 = load_u32x4(data[0]);
+    let i1 = load_u32x4(data[1]);
+    let i2 = load_u32x4(data[2]);
+    let i3 = load_u32x4(data[3]);
+    // [data[0][0], data[1][0], data[0][1], data[1][1]]
+    let l01 = _mm_unpacklo_epi32(i0, i1);
+    // [data[0][2], data[1][2], data[0][3], data[1][3]]
+    let h01 = _mm_unpackhi_epi32(i0, i1);
+    let l23 = _mm_unpacklo_epi32(i2, i3);
+    let h23 = _mm_unpackhi_epi32(i2, i3);
+    [
+        _mm_unpacklo_epi64(l01, l23),
+        _mm_unpackhi_epi64(l01, l23),
+        _mm_unpacklo_epi64(h01, h23),
+        _mm_unpackhi_epi64(h01, h23),
+    ]
+}
+
+#[inline]
+#[target_feature(enable = "sse2")]
+pub unsafe fn load_16x4_sse2<'a, F: Fn(usize) -> &'a [u8; 64]>(data: F) -> [__m128i; 16] {
+    core::mem::transmute::<[[__m128i; 4]; 4], [__m128i; 16]>([
+        load_transpose4(get_blocks!(data, (0 1 2 3), 0, 16)),
+        load_transpose4(get_blocks!(data, (0 1 2 3), 16, 16)),
+        load_transpose4(get_blocks!(data, (0 1 2 3), 32, 16)),
+        load_transpose4(get_blocks!(data, (0 1 2 3), 48, 16)),
+    ])
+}
diff --git a/rsync-fetcher/src/rsync/generator.rs b/rsync-fetcher/src/rsync/generator.rs
index d903a42..e434ed3 100644
--- a/rsync-fetcher/src/rsync/generator.rs
+++ b/rsync-fetcher/src/rsync/generator.rs
@@ -1,7 +1,4 @@
-use std::cmp::min;
 use std::ffi::OsStr;
-use std::io;
-use std::io::Read;
 use std::ops::{Deref, DerefMut};
 use std::os::unix::ffi::OsStrExt;
 use std::os::unix::fs::MetadataExt;
@@ -17,7 +14,7 @@ use tokio::sync::{mpsc, Semaphore};
 use tracing::{debug, info};
 
 use crate::plan::TransferItem;
-use crate::rsync::checksum::{checksum_1, checksum_2, SumHead};
+use crate::rsync::checksum::{checksum_payload, SumHead};
 use crate::rsync::file_list::FileEntry;
 use crate::rsync::progress_display::ProgressDisplay;
 use crate::utils::ignore_mode;
@@ -131,7 +128,6 @@ impl Generator {
         Ok(())
     }
 
-    #[allow(clippy::cast_sign_loss)] // block_len and checksum_count are always positive.
     async fn generate_and_send_sums(&mut self, file: File) -> Result<()> {
         let file_len = file.metadata().await?.size();
         let sum_head = SumHead::sum_sizes_sqroot(file_len);
@@ -141,33 +137,9 @@ impl Generator {
         let seed = self.seed;
 
         let sum_bytes = tokio::task::spawn_blocking(move || {
-            // Sqrt of usize can't be negative.
-            let mut buf = vec![0u8; sum_head.block_len as usize];
-            let mut remaining = file_len;
-
-            let mut buf_sum = Vec::with_capacity(sum_head.checksum_count as usize * 20);
-            for _ in 0..sum_head.checksum_count {
-                // Sqrt of usize must be in u32 range.
-                #[allow(clippy::cast_possible_truncation)]
-                let n1 = min(sum_head.block_len as u64, remaining) as usize;
-                let buf_slice = &mut buf[..n1];
-                file.read_exact(buf_slice)?;
-
-                let sum1 = checksum_1(buf_slice);
-                let sum2 = checksum_2(seed, buf_slice);
-                // The original implementation reads sum1 as i32, but casting it to i32 is a no-op anyway.
-                buf_sum.extend_from_slice(&sum1.to_le_bytes());
-                buf_sum.extend_from_slice(&sum2);
-
-                remaining -= n1 as u64;
-            }
-            debug_assert!(
-                buf_sum.len() <= sum_head.checksum_count as usize * 20,
-                "pre-allocated buffer is too small"
-            );
-            Ok::<_, io::Error>(buf_sum)
+            checksum_payload(sum_head, seed, &mut file, file_len)
         })
-        .await??;
+        .await?;
 
         self.write_all(&sum_bytes).await?;