From 9c7a2c2c339e4282c7d49df1b77bf717ee1fb37d Mon Sep 17 00:00:00 2001 From: Leo Balduf Date: Wed, 1 Mar 2023 15:28:24 +0100 Subject: [PATCH 1/3] Improve doc --- src/lib.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 5d5e1b1..af8953d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -135,6 +135,9 @@ use std::num::Wrapping; /// hash, etc). pub trait ChunkerImpl { /// Look at the new bytes to maybe find a boundary. + /// The boundary is an index within `data`, after which the cut-point is set. + /// I.e., a return value of `Some(0)` indicates that only the first byte of this block should be + /// included in the current chunk. fn find_boundary(&mut self, data: &[u8]) -> Option<usize>; /// Reset the internal state after a chunk has been emitted @@ -429,6 +432,12 @@ impl<'a, I: ChunkerImpl> Iterator for Slices<'a, I> { } } +/// A wrapper that limits the size of produced chunks. +/// +/// Note that the inner chunking implementation is reset when a chunk boundary is +/// emitted because of the size limit. This will generally reduce content-dependence, +/// and thus deduplication ratio, because the boundary is set by size rather than by +/// content. pub struct SizeLimited<I: ChunkerImpl> { inner: I, pos: usize, From b4a91d4516330b812f922363c5968934475342ab Mon Sep 17 00:00:00 2001 From: Leo Balduf Date: Thu, 9 Mar 2023 10:01:55 +0100 Subject: [PATCH 2/3] Derive Debug for ChunkInput --- src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib.rs b/src/lib.rs index af8953d..d6898c8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -291,6 +291,7 @@ impl Iterator for WholeChunks { /// Objects returned from the ChunkStream iterator. /// /// This is either more data in the current chunk, or a chunk boundary. 
+#[derive(Debug)] pub enum ChunkInput<'a> { Data(&'a [u8]), End, From 40cccf198be587bb148657b6b463c523a8fdcc74 Mon Sep 17 00:00:00 2001 From: Leo Balduf Date: Mon, 13 Mar 2023 10:25:47 +0100 Subject: [PATCH 3/3] Factor SizeLimited constructor --- src/lib.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index d6898c8..b338e28 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -255,11 +255,7 @@ impl<I: ChunkerImpl> Chunker<I> { pub fn max_size(self, max: usize) -> Chunker<SizeLimited<I>> { assert!(max > 0); Chunker { - inner: SizeLimited { - inner: self.inner, - pos: 0, - max_size: max, - }, + inner: SizeLimited::new(self.inner, max), } } } @@ -445,6 +441,17 @@ pub struct SizeLimited<I: ChunkerImpl> { max_size: usize, } +impl<I: ChunkerImpl> SizeLimited<I> { + /// Wraps the given chunker implementation to limit the size of produced chunks. + pub fn new(inner: I, max_size: usize) -> Self { + SizeLimited { + inner, + pos: 0, + max_size, + } + } +} + impl<I: ChunkerImpl> ChunkerImpl for SizeLimited<I> { fn find_boundary(&mut self, data: &[u8]) -> Option<usize> { assert!(self.max_size > self.pos);