diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..fae3e19 --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,24 @@ +name: Build and test + +on: [ push ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Formatting + run: cargo fmt -- --check + - name: Clippy + run: cargo clippy --all-targets --tests -- -D warnings + - name: Build + run: cargo build --all-features --verbose + - name: Run tests + run: cargo test --verbose + - name: Run binary + run: cargo run -p runner --verbose --release diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..771f2e2 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,21 @@ +[workspace] +members = ["runner"] + +[package] +name = "sbc_algorithm" +version = "0.1.0" +edition = "2021" + +[dependencies] +chunkfs = { git = "https://github.com/Piletskii-Oleg/chunkfs.git" } +rayon = "1.10" +zstd = "0.13" +thiserror = "2.0.12" +huffman-compress = "0.6.1" +bit-vec = "0.6.3" +log = "0.4.27" +fasthash = "0.4.0" + +[dev-dependencies] +chunkfs = { git = "https://github.com/Piletskii-Oleg/chunkfs.git", features = ["chunkers", "hashers"]} +rand = "0.8.5" \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..18a0ae6 --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +## Similarity Based Chunking Scrubber +SBC Scrubber is a scrubber that can be used to implement different SBC algorithms with ChunkFS + +SBC Scrubber is currently under active development, breaking changes can always happen. 
+ +## Usage + +Add the following dependency to your `Cargo.toml`: + +```toml +[dependencies] +chunkfs = { version = "0.1", features = ["chunkers", "hashers"] } +sbc_algorithm = { git = "https://github.com/maxscherbakov/sbc_algorithm.git" } +``` + +## Example + +```rust +extern crate chunkfs; +extern crate sbc_algorithm; + +use chunkfs::chunkers::{SizeParams, SuperChunker}; +use chunkfs::hashers::Sha256Hasher; +use chunkfs::FileSystem; +use sbc_algorithm::{clusterer, decoder, encoder, hasher}; +use sbc_algorithm::{SBCMap, SBCScrubber}; +use std::collections::HashMap; +use std::{fs, io}; + +fn main() -> io::Result<()> { + let data = vec![10; 1024 * 1024]; + let chunk_size = SizeParams::new(2 * 1024, 8 * 1024, 16 * 1024); + let mut fs = FileSystem::new_with_scrubber( + HashMap::default(), + SBCMap::new(decoder::GdeltaDecoder::new(false)), + Box::new(SBCScrubber::new( + hasher::AronovichHasher, + clusterer::GraphClusterer::default(), + encoder::GdeltaEncoder::new(false), + )), + Sha256Hasher::default(), + ); + let mut handle = fs.create_file("file".to_string(), SuperChunker::new(chunk_size))?; + fs.write_to_file(&mut handle, &data)?; + fs.close_file(handle)?; + + let read_handle = fs.open_file_readonly("file")?; + let read = fs.read_file_complete(&read_handle)?; + + let cdc_dedup_ratio = fs.cdc_dedup_ratio(); + let res = fs.scrub().unwrap(); + let sbc_dedup_ratio = fs.total_dedup_ratio(); + println!("CDC dedup ratio: {}", cdc_dedup_ratio); + println!("SBC dedup ratio: {}", sbc_dedup_ratio); + println!("ScrubMeasure: {:?}", res); + assert_eq!(read.len(), data.len()); + Ok(()) +} +``` diff --git a/runner/Cargo.toml b/runner/Cargo.toml new file mode 100644 index 0000000..0d30885 --- /dev/null +++ b/runner/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "runner" +version = "0.1.0" +edition = "2021" + + +[dependencies] +sbc_algorithm = { path = ".." 
} +chunkfs = { git = "https://github.com/Piletskii-Oleg/chunkfs.git", features = ["chunkers", "hashers"]} diff --git a/runner/files/ferris.png b/runner/files/ferris.png new file mode 100644 index 0000000..d1e9f80 Binary files /dev/null and b/runner/files/ferris.png differ diff --git a/runner/files/my_data b/runner/files/my_data new file mode 100644 index 0000000..ec3f6ec Binary files /dev/null and b/runner/files/my_data differ diff --git a/runner/src/main.rs b/runner/src/main.rs new file mode 100644 index 0000000..3893960 --- /dev/null +++ b/runner/src/main.rs @@ -0,0 +1,40 @@ +extern crate chunkfs; +extern crate sbc_algorithm; + +use chunkfs::chunkers::{SizeParams, SuperChunker}; +use chunkfs::hashers::Sha256Hasher; +use chunkfs::FileSystem; +use sbc_algorithm::{clusterer, decoder, encoder, hasher}; +use sbc_algorithm::{SBCMap, SBCScrubber}; +use std::collections::HashMap; +use std::{fs, io}; + +fn main() -> io::Result<()> { + let data = fs::read("runner/files/my_data")?; + let chunk_size = SizeParams::new(2 * 1024, 8 * 1024, 16 * 1024); + let mut fs = FileSystem::new_with_scrubber( + HashMap::default(), + SBCMap::new(decoder::GdeltaDecoder::new(false)), + Box::new(SBCScrubber::new( + hasher::AronovichHasher, + clusterer::GraphClusterer::default(), + encoder::GdeltaEncoder::new(false), + )), + Sha256Hasher::default(), + ); + let mut handle = fs.create_file("file".to_string(), SuperChunker::new(chunk_size))?; + fs.write_to_file(&mut handle, &data)?; + fs.close_file(handle)?; + + let read_handle = fs.open_file_readonly("file")?; + let read = fs.read_file_complete(&read_handle)?; + + let cdc_dedup_ratio = fs.cdc_dedup_ratio(); + let res = fs.scrub().unwrap(); + let sbc_dedup_ratio = fs.total_dedup_ratio(); + println!("CDC dedup ratio: {}", cdc_dedup_ratio); + println!("SBC dedup ratio: {}", sbc_dedup_ratio); + println!("ScrubMeasure: {:?}", res); + assert_eq!(read.len(), data.len()); + Ok(()) +} diff --git a/src/chunkfs_sbc.rs b/src/chunkfs_sbc.rs new file mode 
100644 index 0000000..4325f52 --- /dev/null +++ b/src/chunkfs_sbc.rs @@ -0,0 +1,321 @@ +use crate::clusterer::Clusterer; +use crate::decoder::Decoder; +use crate::encoder::Encoder; +use crate::hasher::SBCHasher; +use crate::{ChunkType, SBCHash, SBCKey, SBCMap}; +use chunkfs::{ + ChunkHash, Data, DataContainer, Database, IterableDatabase, Scrub, ScrubMeasurements, +}; +use rayon::prelude::*; +use rayon::ThreadPoolBuilder; +use std::collections::HashMap; +use std::io; +use std::io::{Error, ErrorKind}; +use std::sync::Mutex; +use std::time::Instant; + +const NUM_THREADS_FOR_HASHING: usize = 1; + +pub type ClusterPoint<'a, Hash> = (Hash, &'a mut &'a mut DataContainer>); +pub type Clusters<'a, Hash> = HashMap>>; + +/// Implements the `Database` trait for `SBCMap`, enabling it to act as a storage backend +/// for chunk-based filesystems (`chunkfs`). +/// +/// This implementation provides methods to insert, retrieve, and check for chunks +/// identified by `SBCKey`. It handles both simple chunks stored as raw data and delta chunks +/// which are decoded on retrieval using the provided decoder. +/// +/// # Type Parameters +/// +/// * `D` - The decoder type implementing the `Decoder` trait, used to decode delta chunks. +/// * `Hash` - The hash type implementing the `SBCHash` trait, identifying chunks. +/// +/// # Behavior +/// +/// - `insert` stores the raw chunk bytes keyed by their `SBCKey`. +/// - `get` retrieves the chunk data: +/// - For `Simple` chunks, returns the stored bytes directly. +/// - For `Delta` chunks, recursively retrieves the parent chunk and applies the decoder to reconstruct the full chunk. +/// - `contains` checks if a chunk key exists in the storage. +impl Database, Vec> for SBCMap { + /// Inserts a chunk into the storage. + /// + /// # Arguments + /// + /// * `sbc_hash` - The key identifying the chunk. + /// * `chunk` - The raw byte content of the chunk. 
+ fn insert(&mut self, sbc_hash: SBCKey, chunk: Vec) -> io::Result<()> { + self.sbc_hashmap.insert(sbc_hash, chunk); + Ok(()) + } + + /// Retrieves a chunk by its key. + /// + /// For `Simple` chunks, returns the stored bytes directly. + /// For `Delta` chunks, recursively retrieves the parent chunk and decodes the delta + /// to reconstruct the full chunk. + /// + /// # Arguments + /// + /// * `sbc_hash` - Reference to the chunk key to retrieve. + /// + /// # Returns + /// + /// The full chunk bytes as a `Vec`. + fn get(&self, sbc_hash: &SBCKey) -> io::Result> { + let sbc_value = self + .sbc_hashmap + .get(sbc_hash) + .ok_or(Error::new(ErrorKind::NotFound, "Chunk not found"))?; + + let chunk = match &sbc_hash.chunk_type { + ChunkType::Simple => sbc_value.clone(), + ChunkType::Delta { + parent_hash, + number: _, + } => { + // Recursively get the parent chunk as a simple chunk + let parent_data = self.get(&SBCKey { + hash: parent_hash.clone(), + chunk_type: ChunkType::Simple, + })?; + + // Decode the delta chunk using the decoder + self.decoder.decode_chunk(parent_data, sbc_value.as_slice()) + } + }; + Ok(chunk) + } + + /// Checks if the storage contains a chunk with the given key. + /// + /// # Arguments + /// + /// * `key` - Reference to the chunk key. + /// + /// # Returns + /// + /// `true` if the chunk exists, `false` otherwise. + fn contains(&self, key: &SBCKey) -> bool { + self.sbc_hashmap.contains_key(key) + } +} + +impl IterableDatabase, Vec> for SBCMap { + fn iterator(&self) -> Box, &Vec)> + '_> { + Box::new(self.sbc_hashmap.iter()) + } + fn iterator_mut(&mut self) -> Box, &mut Vec)> + '_> { + Box::new(self.sbc_hashmap.iter_mut()) + } + + fn clear(&mut self) -> io::Result<()> { + HashMap::clear(&mut self.sbc_hashmap); + Ok(()) + } +} +/// Applies the Similarity-Based Chunking (SBC) algorithm to chunks obtained from +/// Content Defined Chunking (CDC). 
+/// +/// `SBCScrubber` orchestrates the process of hashing, clustering, and encoding chunks +/// to optimize storage by exploiting similarity between chunks. +/// +/// # Type Parameters +/// +/// * `Hash` - The hash type implementing `SBCHash`, representing the hash of chunks. +/// * `H` - The hasher type implementing `Hasher` producing `Hash`. +/// * `C` - The clusterer type implementing `Clusterer` for grouping similar chunks. +/// * `E` - The encoder type implementing `Encoder` for encoding clusters into delta or simple chunks. +/// +/// # Fields +/// +/// * `hasher` - Responsible for computing similarity hashes of chunks. +/// * `clusterer` - Responsible for grouping chunks based on similarity hashes. +/// * `encoder` - Responsible for encoding clusters into delta-encoded or simple chunks. +/// +/// # Overview +/// +/// The scrubber performs the following steps: +/// 1. **Hashing**: Computes similarity hashes of all chunks in parallel. +/// 2. **Clustering**: Groups chunks by similarity using the clusterer. +/// 3. **Encoding**: Encodes the clusters into delta or simple chunks and stores them in the target map. 
+/// +/// # Example +/// +/// ``` +/// extern crate chunkfs; +/// extern crate sbc_algorithm; +/// +/// use chunkfs::chunkers::{SizeParams, SuperChunker}; +/// use chunkfs::hashers::Sha256Hasher; +/// use chunkfs::FileSystem; +/// use sbc_algorithm::{SBCMap, SBCScrubber}; +/// use sbc_algorithm::{decoder, encoder, hasher, clusterer}; +/// use std::collections::HashMap; +/// use std::io; +/// +/// fn main() -> io::Result<()> { +/// let data = vec![10; 1024 * 1024]; +/// let chunk_size = SizeParams::new(2 * 1024, 8 * 1024, 16 * 1024); +/// let mut fs = FileSystem::new_with_scrubber( +/// HashMap::default(), +/// SBCMap::new(decoder::GdeltaDecoder::default()), +/// Box::new(SBCScrubber::new( +/// hasher::AronovichHasher, +/// clusterer::GraphClusterer::default(), +/// encoder::GdeltaEncoder::default(), +/// )), +/// Sha256Hasher::default(), +/// ); +/// let mut handle = fs.create_file("file".to_string(), SuperChunker::new(chunk_size))?; +/// fs.write_to_file(&mut handle, &data)?; +/// fs.close_file(handle)?; +/// +/// let read_handle = fs.open_file_readonly("file")?; +/// let read = fs.read_file_complete(&read_handle)?; +/// +/// let cdc_dedup_ratio = fs.cdc_dedup_ratio(); +/// let res = fs.scrub().unwrap(); +/// let sbc_dedup_ratio = fs.total_dedup_ratio(); +/// println!("CDC dedup ratio: {}", cdc_dedup_ratio); +/// println!("SBC dedup ratio: {}", cdc_dedup_ratio); +/// println!("ScrubMeasure: {:?}", res); +/// assert_eq!(read.len(), data.len()); +/// Ok(()) +/// } +/// ``` +/// +pub struct SBCScrubber +where + Hash: SBCHash, + H: SBCHasher, + C: Clusterer, + E: Encoder, +{ + /// Hasher used to compute similarity hashes of chunks. + hasher: H, + + /// Clusterer used to group chunks based on similarity. + clusterer: C, + + /// Encoder used to encode clusters into delta or simple chunks. 
+ encoder: E, +} + +impl SBCScrubber +where + Hash: SBCHash, + H: SBCHasher, + C: Clusterer, + E: Encoder, +{ + /// Creates a new `SBCScrubber` with the given hasher, clusterer, and encoder. + /// + /// # Arguments + /// + /// * `hasher` - The hasher instance. + /// * `clusterer` - The clusterer instance. + /// * `encoder` - The encoder instance. + /// + /// # Returns + /// + /// A new `SBCScrubber` ready to process chunks. + pub fn new(hasher: H, clusterer: C, encoder: E) -> Self { + SBCScrubber { + hasher, + clusterer, + encoder, + } + } +} + +impl Scrub, SBCMap> + for SBCScrubber +where + CDCHash: ChunkHash, + for<'data> B: + IterableDatabase>> + IntoParallelRefMutIterator<'data>, + H: SBCHasher + Sync, + C: Clusterer, + D: Decoder + Send, + E: Encoder + Sync, + Hash: SBCHash, +{ + /// Applies the SBC algorithm to the chunks in the given database, storing results in the target map. + /// + /// This method performs hashing, clustering, and encoding in sequence, measuring the time taken by each step. + /// + /// # Arguments + /// + /// * `database` - The source database containing CDC chunks wrapped in `DataContainer`. + /// * `target_map` - The target storage map to store processed chunks. + /// + /// # Returns + /// + /// A `ScrubMeasurements` struct containing metrics about the operation. + fn scrub<'a>( + &mut self, + database: &mut B, + target_map: &mut SBCMap, + ) -> io::Result + where + CDCHash: 'a, + { + // Create a thread pool with a fixed number of threads for hashing + let pool = ThreadPoolBuilder::new() + .num_threads(NUM_THREADS_FOR_HASHING) + .build() + .unwrap(); + + // Collect mutable references to all data containers from the database + let mut mut_refs_database: Vec<_> = database.values_mut().collect(); + + // Mutex-protected vector to accumulate (hash, data_container) pairs after hashing + let sbc_hash_chunk: Mutex> = Mutex::default(); + + // 1. 
Hashing: compute similarity hashes in parallel + let time_start = Instant::now(); + pool.install(|| { + mut_refs_database.par_iter_mut().for_each(|data_container| { + match data_container.extract() { + Data::Chunk(data) => { + let sbc_hash = self.hasher.calculate_hash(data.as_slice()); + let mut chunk_sbc_hash_lock = sbc_hash_chunk.lock().unwrap(); + chunk_sbc_hash_lock.push((sbc_hash, data_container)); + } + Data::TargetChunk(_) => { + // Handling for target chunks not implemented yet + todo!() + } + } + }); + }); + let time_hashing = time_start.elapsed().as_secs_f64(); + print!("{time_hashing:.4};"); + + // 2. Clustering: group chunks by similarity + let time_clusterize_start = time_start.elapsed(); + let (mut clusters, clusterization_report) = self + .clusterer + .clusterize(sbc_hash_chunk.into_inner().unwrap()); + let time_clusterize = + time_start.elapsed().as_secs_f64() - time_clusterize_start.as_secs_f64(); + print!("{time_clusterize:.4};"); + + // 3. Encoding: encode clusters and store in target map + let time_encode_start = time_start.elapsed(); + let (data_left, processed_data) = self.encoder.encode_clusters(&mut clusters, target_map); + let time_encode = time_start.elapsed().as_secs_f64() - time_encode_start.as_secs_f64(); + print!("{time_encode:.4};"); + + let running_time = time_start.elapsed(); + + Ok(ScrubMeasurements { + processed_data, + running_time, + data_left, + clusterization_report, + }) + } +} diff --git a/src/clusterer.rs b/src/clusterer.rs new file mode 100644 index 0000000..4d4b9ed --- /dev/null +++ b/src/clusterer.rs @@ -0,0 +1,59 @@ +mod eq_clusterer; +mod graph_clusterer; + +use crate::chunkfs_sbc::{ClusterPoint, Clusters}; +use crate::SBCHash; +use chunkfs::ClusteringMeasurements; +pub use eq_clusterer::EqClusterer; +pub use graph_clusterer::GraphClusterer; +use std::collections::HashMap; + +/// A trait defining the clustering behavior for similarity-based chunking. 
+/// +/// The `Clusterer` trait groups chunks, identified by their similarity hashes, +/// into clusters of related chunks. This is a key step in similarity-based chunking +/// workflows to identify chunks that can be efficiently encoded as deltas. +/// +/// # Type Parameters +/// +/// * `Hash` - The hash type implementing `SBCHash` that identifies chunks. +/// +/// # Methods +/// +/// * `clusterize` - Takes a vector of chunk similarity points and returns clusters grouping them. +pub trait Clusterer { + /// Groups chunks into clusters based on their similarity hashes. + /// + /// # Arguments + /// + /// * `chunk_sbc_hash` - A vector of `ClusterPoint` items representing chunks and their hashes. + /// + /// # Returns + /// + /// A collection of clusters, where each cluster is a grouping of related chunks. + fn clusterize<'a>( + &mut self, + chunk_sbc_hash: Vec>, + ) -> (Clusters<'a, Hash>, ClusteringMeasurements); +} + +/// Accepts a vector consisting of vertices between which it is necessary to calculate the distances. 
+/// Returns a table with a list of distances corresponding to each vertex +fn calculate_distance_to_other_vertices(vertices: Vec) -> HashMap> { + let mut distance_to_other_vertices = HashMap::new(); + + for i in 0..vertices.len() { + let mut distances = Vec::new(); + + for j in 0..vertices.len() { + if i != j { + let distance = vertices[i].abs_diff(vertices[j]) as usize; + distances.push(distance); + } + } + + distance_to_other_vertices.insert(vertices[i], distances); + } + + distance_to_other_vertices +} diff --git a/src/clusterer/eq_clusterer.rs b/src/clusterer/eq_clusterer.rs new file mode 100644 index 0000000..ee6c484 --- /dev/null +++ b/src/clusterer/eq_clusterer.rs @@ -0,0 +1,103 @@ +use crate::chunkfs_sbc::{ClusterPoint, Clusters}; +use crate::clusterer::{calculate_distance_to_other_vertices, Clusterer}; +use crate::SBCHash; +use chunkfs::ClusteringMeasurements; +use std::collections::HashMap; + +pub struct EqClusterer; + +impl Clusterer for EqClusterer { + fn clusterize<'a>( + &mut self, + chunk_sbc_hash: Vec>, + ) -> (Clusters<'a, Hash>, ClusteringMeasurements) { + let mut clusters: Clusters = HashMap::default(); + + let mut total_cluster_size: usize = 0; + let mut number_of_vertices_in_cluster = HashMap::new(); + let mut parent_vertices: Vec = Vec::new(); + + for (sbc_hash, data_container) in chunk_sbc_hash { + let key = sbc_hash.get_key_for_graph_clusterer(); + parent_vertices.push(key); + number_of_vertices_in_cluster.insert(key, 1); + + let cluster = clusters.entry(sbc_hash.clone()).or_default(); + cluster.push((sbc_hash, data_container)); + + total_cluster_size += 1; + } + + let distance_to_other_clusters = calculate_distance_to_other_vertices(parent_vertices); + let distance_to_vertices_in_cluster = HashMap::new(); + let cluster_dedup_ratio = HashMap::new(); + let number_of_clusters = total_cluster_size; + + let clusterization_report = ClusteringMeasurements { + total_cluster_size, + number_of_clusters, + number_of_vertices_in_cluster, + 
distance_to_vertices_in_cluster, + distance_to_other_clusters, + cluster_dedup_ratio, + }; + + (clusters, clusterization_report) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{decoder, encoder, hasher, SBCMap, SBCScrubber}; + use chunkfs::chunkers::{SizeParams, SuperChunker}; + use chunkfs::hashers::Sha256Hasher; + use chunkfs::FileSystem; + + fn generate_test_data() -> Vec { + const TEST_DATA_SIZE: usize = 16000; + (0..TEST_DATA_SIZE).map(|_| rand::random::()).collect() + } + + #[test] + fn scrub_should_return_correct_scrub_measurements_for_eq_clusterer() { + let test_data = generate_test_data(); + let chunk_size = SizeParams::new(2 * 1024, 8 * 1024, 16 * 1024); + + let mut fs = FileSystem::new_with_scrubber( + HashMap::default(), + SBCMap::new(decoder::GdeltaDecoder::new(false)), + Box::new(SBCScrubber::new( + hasher::AronovichHasher, + EqClusterer, + encoder::GdeltaEncoder::new(false), + )), + Sha256Hasher::default(), + ); + + let mut handle = fs + .create_file("file".to_string(), SuperChunker::new(chunk_size)) + .unwrap(); + fs.write_to_file(&mut handle, &test_data).unwrap(); + fs.close_file(handle).unwrap(); + + let scrub_report = fs.scrub().unwrap(); + + let cluster_report = &scrub_report.clusterization_report; + assert!(cluster_report.total_cluster_size > 0); + assert!(cluster_report.number_of_clusters > 0); + assert!(cluster_report + .number_of_vertices_in_cluster + .values() + .all(|&v| v == 1)); + assert!(cluster_report.distance_to_vertices_in_cluster.is_empty()); + assert!(cluster_report + .distance_to_other_clusters + .values() + .all(|v| !v.is_empty())); + assert!(cluster_report + .cluster_dedup_ratio + .values() + .all(|&v| v == 0.0)); + } +} diff --git a/src/clusterer/graph_clusterer.rs b/src/clusterer/graph_clusterer.rs new file mode 100644 index 0000000..d894d72 --- /dev/null +++ b/src/clusterer/graph_clusterer.rs @@ -0,0 +1,283 @@ +use crate::chunkfs_sbc::{ClusterPoint, Clusters}; +use 
crate::clusterer::{calculate_distance_to_other_vertices, Clusterer}; +use crate::SBCHash; +use chunkfs::ClusteringMeasurements; +use std::collections::HashMap; + +/// A vertex in the graph used for clustering. +/// +/// Each vertex tracks its parent for union-find operations during MST construction. +struct Vertex { + /// The parent vertex key in the union-find structure. + parent: u32, +} + +impl Vertex { + /// Creates a new vertex with itself as its own parent. + /// + /// # Arguments + /// + /// * `key` - The unique key identifying this vertex. + /// + /// # Returns + /// + /// A new `Vertex` instance. + pub fn new(key: u32) -> Vertex { + Vertex { parent: key } + } +} + +/// A clusterer that groups chunks using Kruskal's algorithm to build a minimum spanning tree (MST). +/// +/// `GraphClusterer` uses a union-find data structure to cluster chunks based on their hash keys, +/// grouping chunks whose keys are close within a certain threshold (`max_weight_edge`). +/// +/// # Details +/// +/// The clustering is performed by assigning each chunk to a cluster represented by the root parent +/// found via union-find. The `set_parent_vertex` method attempts to find a nearby parent vertex +/// within the allowed edge weight to merge clusters. +/// +/// # Type Parameters +/// +/// * `Hash` - The hash type implementing `SBCHash`. +/// +/// # Example +/// +/// ``` +/// # use sbc_algorithm::clusterer::GraphClusterer; +/// +/// let mut clusterer = GraphClusterer::default(); +/// // Use clusterer.clusterize(...) to cluster chunks. +/// ``` +pub struct GraphClusterer { + /// Map of vertex keys to their union-find vertex data. + vertices: HashMap, + max_weight_edge: u32, +} + +impl Default for GraphClusterer { + /// Creates a new, empty `GraphClusterer`. + fn default() -> Self { + Self::new(10) + } +} + +impl GraphClusterer { + /// Constructs a new `GraphClusterer`. + /// + /// # Returns + /// + /// An empty `GraphClusterer`. 
+ pub fn new(_max_weight_edge: u32) -> GraphClusterer { + GraphClusterer { + max_weight_edge: _max_weight_edge, + vertices: HashMap::new(), + } + } + + /// Finds the root parent of the given vertex key using path compression. + /// + /// # Arguments + /// + /// * `hash_set` - The vertex key to find the parent for. + /// + /// # Returns + /// + /// The root parent's key. + fn find_set(&mut self, hash_set: u32) -> u32 { + let parent = self.vertices.get(&hash_set).unwrap().parent; + if hash_set != parent { + let parent = self.find_set(parent); + self.vertices.get_mut(&hash_set).unwrap().parent = parent; + parent + } else { + parent + } + } + + /// Attempts to find a nearby parent vertex within `max_weight_edge` distance to cluster with. + /// If no suitable parent is found, the vertex becomes its own parent. + /// + /// # Arguments + /// + /// * `hash` - The vertex key to assign a parent for. + /// + /// # Returns + /// + /// The parent vertex key assigned. + fn set_parent_vertex(&mut self, hash: u32) -> u32 { + let mut min_dist = u32::MAX; + let mut parent_hash = hash; + + // Search in the range [hash - MAX_WEIGHT_EDGE, hash + MAX_WEIGHT_EDGE] + let start = hash.saturating_sub(self.max_weight_edge); + let end = hash.saturating_add(self.max_weight_edge); + + for other_hash in start..=end { + if self.vertices.contains_key(&other_hash) { + let other_parent_hash = self.find_set(other_hash); + let dist = other_parent_hash.abs_diff(hash); + if dist < min_dist && dist <= self.max_weight_edge { + min_dist = dist; + parent_hash = other_parent_hash; + } + } + } + + self.vertices.insert(hash, Vertex::new(parent_hash)); + parent_hash + } +} + +impl Clusterer for GraphClusterer { + /// Clusters chunks by grouping them based on proximity of their hash keys using MST logic. + /// + /// # Arguments + /// + /// * `chunk_sbc_hash` - A vector of chunk points with their similarity hashes. 
+ /// + /// # Returns + /// + /// A map of clusters keyed by the root hash, each containing grouped chunk points. + fn clusterize<'a>( + &mut self, + chunk_sbc_hash: Vec>, + ) -> (Clusters<'a, Hash>, ClusteringMeasurements) { + let mut clusters: Clusters = HashMap::default(); + let mut total_cluster_size = 0; + let mut number_of_clusters = 0; + let mut number_of_vertices_in_cluster = HashMap::new(); + let mut distance_to_vertices_in_cluster: HashMap> = HashMap::new(); + let mut parent_vertices: Vec = Vec::new(); + + for (sbc_hash, data_container) in chunk_sbc_hash { + total_cluster_size += 1; + + // Obtain u32 key for graph clustering from the hash + let key = sbc_hash.get_key_for_graph_clusterer(); + + // Find or assign the parent vertex for this key + let parent_key = self.set_parent_vertex(key); + + number_of_vertices_in_cluster + .entry(parent_key) + .and_modify(|value| *value += 1) + .or_insert(1); + if key == parent_key { + parent_vertices.push(key); + distance_to_vertices_in_cluster.insert(key, vec![]); + number_of_clusters += 1; + } else { + distance_to_vertices_in_cluster + .entry(parent_key) + .and_modify(|value| value.push(key.abs_diff(parent_key) as usize)); + } + + // Group the chunk into the cluster identified by the parent's hash + let cluster = clusters.entry(Hash::new_with_u32(parent_key)).or_default(); + cluster.push((sbc_hash, data_container)); + } + + let distance_to_other_clusters = calculate_distance_to_other_vertices(parent_vertices); + + // Stub. The calculation cannot be performed at this stage. 
+ let cluster_dedup_ratio = HashMap::new(); + + let clusterization_report = ClusteringMeasurements { + total_cluster_size, + number_of_clusters, + number_of_vertices_in_cluster, + distance_to_vertices_in_cluster, + distance_to_other_clusters, + cluster_dedup_ratio, + }; + + (clusters, clusterization_report) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{decoder, encoder, hasher, SBCMap, SBCScrubber}; + use chunkfs::chunkers::{SizeParams, SuperChunker}; + use chunkfs::hashers::Sha256Hasher; + use chunkfs::{FileSystem, ScrubMeasurements}; + + fn generate_test_data() -> Vec { + const TEST_DATA_SIZE: usize = 32000; + (0..TEST_DATA_SIZE).map(|_| rand::random::()).collect() + } + + fn create_scrub_report(data: Vec) -> ScrubMeasurements { + let chunk_size = SizeParams::new(2 * 1024, 8 * 1024, 16 * 1024); + + let mut fs = FileSystem::new_with_scrubber( + HashMap::default(), + SBCMap::new(decoder::GdeltaDecoder::new(false)), + Box::new(SBCScrubber::new( + hasher::AronovichHasher, + GraphClusterer::default(), + encoder::GdeltaEncoder::new(false), + )), + Sha256Hasher::default(), + ); + + let mut handle = fs + .create_file("file".to_string(), SuperChunker::new(chunk_size)) + .unwrap(); + fs.write_to_file(&mut handle, &data).unwrap(); + fs.close_file(handle).unwrap(); + fs.scrub().unwrap() + } + + #[test] + fn scrub_should_return_non_empty_scrub_measurements_for_graph_clusterer() { + let test_data = generate_test_data(); + let scrub_report = create_scrub_report(test_data); + + let cluster_report = &scrub_report.clusterization_report; + assert!(cluster_report.total_cluster_size > 0); + assert!(cluster_report.number_of_clusters > 0); + assert!(cluster_report + .number_of_vertices_in_cluster + .values() + .all(|&v| v >= 1)); + assert!(!cluster_report.distance_to_vertices_in_cluster.is_empty()); + assert!(cluster_report + .distance_to_other_clusters + .values() + .all(|v| !v.is_empty())); + } + + #[test] + fn 
scrub_should_return_scrub_measurements_with_correct_distance_to_vertices_in_cluster() { + let test_data = generate_test_data(); + let scrub_report = create_scrub_report(test_data); + + let cluster_report = &scrub_report.clusterization_report; + + for (parent_key, &cluster_size) in &cluster_report.number_of_vertices_in_cluster { + assert!(cluster_size > 0); + + let cluster_points = &scrub_report + .clusterization_report + .distance_to_vertices_in_cluster[parent_key]; + + // The parent vertex is ignored. + assert_eq!(cluster_points.len(), cluster_size - 1); + } + } + + #[test] + fn total_cluster_size_matches_sum_of_cluster_vertices() { + let test_data = generate_test_data(); + let scrub_report = create_scrub_report(test_data); + let cluster_report = &scrub_report.clusterization_report; + + let sum_vertices = cluster_report.number_of_vertices_in_cluster.values().sum(); + + assert_eq!(cluster_report.total_cluster_size, sum_vertices); + } +} diff --git a/src/decoder.rs b/src/decoder.rs new file mode 100644 index 0000000..f7de3df --- /dev/null +++ b/src/decoder.rs @@ -0,0 +1,22 @@ +mod gdelta_decoder; +mod levenshtein_decoder; +mod zdelta_decoder; + +pub use gdelta_decoder::GdeltaDecoder; +pub use levenshtein_decoder::LevenshteinDecoder; +pub use zdelta_decoder::ZdeltaDecoder; +/// A trait for decoding delta codes generated by Similarity Based Chunking. +/// +/// Implementors of this trait provide a method to decode a delta code into its original form, +/// given the parent data from which the delta was derived. +pub trait Decoder { + /// Decodes a delta code into its original form using the provided parent data. + /// + /// # Parameters + /// - `parent_data`: The original data from which the delta code was generated. + /// - `delta_code`: The delta code to be decoded. + /// + /// # Returns + /// The decoded data in its original form. 
+ fn decode_chunk(&self, parent_data: Vec, delta_code: &[u8]) -> Vec; +} diff --git a/src/decoder/gdelta_decoder.rs b/src/decoder/gdelta_decoder.rs new file mode 100644 index 0000000..cda7319 --- /dev/null +++ b/src/decoder/gdelta_decoder.rs @@ -0,0 +1,53 @@ +use crate::decoder::Decoder; + +/// Decoder based on Gdelta compression algorithm. +pub struct GdeltaDecoder { + zstd_flag: bool, +} + +impl GdeltaDecoder { + pub fn new(zstd_flag: bool) -> Self { + GdeltaDecoder { zstd_flag } + } +} + +impl Default for GdeltaDecoder { + fn default() -> Self { + Self::new(false) + } +} + +/// The method is based on copy and paste constructions. +/// The insert contains a handler of 3 bytes with the length of the insert and the data to insert. +/// To copy a handler of 6 bytes with a length and a shift in the parent chunk. +impl Decoder for GdeltaDecoder { + fn decode_chunk(&self, parent_data: Vec, delta_code: &[u8]) -> Vec { + let delta_code = if self.zstd_flag { + zstd::decode_all(delta_code).unwrap() + } else { + delta_code.to_vec() + }; + + let mut chunk_data = Vec::new(); + let mut byte_id = 0; + + while byte_id < delta_code.len() { + let mut buf = [0u8; 8]; + buf[..3].copy_from_slice(&delta_code[byte_id..byte_id + 3]); + + if buf[2] >= 128 { + buf[2] -= 128; + let insert_len = usize::from_ne_bytes(buf); + chunk_data.extend_from_slice(&delta_code[byte_id + 3..byte_id + 3 + insert_len]); + byte_id += 3 + insert_len + } else { + let copy_len = usize::from_ne_bytes(buf); + buf[..3].copy_from_slice(&delta_code[byte_id + 3..byte_id + 6]); + let copy_offset = usize::from_ne_bytes(buf); + chunk_data.extend_from_slice(&parent_data[copy_offset..copy_offset + copy_len]); + byte_id += 6 + } + } + chunk_data + } +} diff --git a/src/decoder/levenshtein_decoder.rs b/src/decoder/levenshtein_decoder.rs new file mode 100644 index 0000000..120d70a --- /dev/null +++ b/src/decoder/levenshtein_decoder.rs @@ -0,0 +1,106 @@ +use crate::decoder::Decoder; +use crate::encoder::Action; + +/// 
Decoder based on Levenshtein compression algorithm. +pub struct LevenshteinDecoder { + zstd_flag: bool, +} + +impl Default for LevenshteinDecoder { + fn default() -> Self { + Self::new(false) + } +} +impl LevenshteinDecoder { + pub fn new(zstd_flag: bool) -> Self { + LevenshteinDecoder { zstd_flag } + } +} + +impl Decoder for LevenshteinDecoder { + /// Decodes a chunk by applying delta actions to the given parent data. + /// + /// # Arguments + /// + /// * `parent_data` - The original chunk data to be modified. + /// * `delta_code` - A byte slice encoding the delta actions to apply. + /// + /// # Returns + /// + /// A new `Vec` containing the fully decoded chunk. + fn decode_chunk(&self, mut parent_data: Vec, delta_code: &[u8]) -> Vec { + let delta_code = if self.zstd_flag { + zstd::decode_all(delta_code).unwrap() + } else { + delta_code.to_vec() + }; + + let mut buf = [0u8; 4]; + let mut byte_index = 0; + + while byte_index < delta_code.len() { + // Read next 4 bytes as a big-endian u32 delta action code + buf.copy_from_slice(&delta_code[byte_index..byte_index + 4]); + let delta_action = u32::from_be_bytes(buf); + + // Decode the delta action into operation, index, and byte value + let (action, index, byte_value) = get_delta_action(delta_action); + + // Apply the delta action to the parent data + match action { + Action::Del => { + parent_data.remove(index); + } + Action::Add => { + parent_data.insert(index, byte_value); + } + Action::Rep => { + parent_data[index] = byte_value; + } + } + byte_index += 4; + } + parent_data + } +} + +/// Decodes a delta action packed into a 32-bit integer. +/// +/// This function extracts three components from a packed `u32` value: +/// 1. The delta operation type ([`Action`]) +/// 2. The byte index in the chunk where the operation applies +/// 3. 
The byte value (for `Rep` and `Add` operations) +/// +/// # Bit Layout +/// The 32-bit value is divided as follows: +/// - Bits 30-31 (2 bits): Action type +/// - Bits 22-29 (8 bits): Byte value +/// - Bits 0-21 (22 bits): Index in chunk +/// +/// ```text +/// 3 2 1 0 +/// 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 +/// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +/// | Action | Byte Value | Index | +/// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +/// ``` +/// +/// # Arguments +/// * `code` - Packed 32-bit value containing action metadata +/// +/// # Returns +/// A tuple containing: +/// - [`Action`] variant (operation type) +/// - `usize` index in the chunk +/// - `u8` byte value (for replacement/insertion) +pub(crate) fn get_delta_action(code: u32) -> (Action, usize, u8) { + let action = match code / (1 << 30) { + 0 => Action::Rep, + 1 => Action::Add, + 2 => Action::Del, + _ => panic!("Invalid action code in delta encoding"), + }; + let byte_value = ((code % (1 << 30)) >> 22) as u8; + let index = (code % (1 << 22)) as usize; + (action, index, byte_value) +} diff --git a/src/decoder/zdelta_decoder.rs b/src/decoder/zdelta_decoder.rs new file mode 100644 index 0000000..f2b519f --- /dev/null +++ b/src/decoder/zdelta_decoder.rs @@ -0,0 +1,923 @@ +use crate::decoder::Decoder; +use crate::encoder::zdelta_encoder; +use crate::encoder::zdelta_match_pointers::{MatchPointers, ReferencePointerType}; +use bit_vec::BitVec; +use huffman_compress::Tree; +use thiserror::Error; + +/// Flag indicating a literal byte follows in the delta stream. +const LITERAL_FLAG: u8 = 0x00; +/// Bytes needed for a match instruction: flag, length_remainder, offset_high, offset_low. +const MATCH_INSTRUCTION_SIZE: usize = 4; +/// Minimum length of a match in the zdelta algorithm. +const MIN_MATCH_LENGTH: usize = 3; +/// Maximum length of a match in the zdelta algorithm. 
+const MAX_MATCH_LENGTH: usize = 1026; +/// Size of length block for match length encoding. +const LENGTH_BLOCK_SIZE: usize = 256; + +/// Represents the decoder for zdelta-compressed data, capable of handling both raw and Huffman-encoded streams. +pub struct ZdeltaDecoder { + huffman_tree: Option>, +} + +impl ZdeltaDecoder { + /// Creates a new `ZdeltaDecoder` instance. + /// + /// # Arguments + /// * `use_huffman_encoding` - If true, enables Huffman decoding; otherwise, uses raw data. + /// + /// # Returns + /// A new `ZdeltaDecoder` instance with the specified configuration. + pub fn new(use_huffman_encoding: bool) -> Self { + if use_huffman_encoding { + let (_, huffman_tree) = zdelta_encoder::create_default_huffman_book_and_tree(); + Self { + huffman_tree: Some(huffman_tree), + } + } else { + Self { huffman_tree: None } + } + } + + /// Converts Huffman-encoded data into raw bytes using the Huffman tree. + /// + /// # Arguments + /// * `data` - The Huffman-encoded byte slice. + /// + /// # Returns + /// A vector of raw bytes decoded from the Huffman stream, or the input data if Huffman is disabled. + /// + /// # Notes + /// Assumes the Huffman tree is initialized if Huffman encoding is enabled. Returns the input + /// data as-is if no tree is present. 
+ pub fn huffman_to_raw(&self, data: &[u8]) -> Vec { + let Some(tree) = &self.huffman_tree else { + return data.to_vec(); + }; + + let bit_buffer = BitVec::from_bytes(data); + let mut decoder = tree.unbounded_decoder(bit_buffer); + let mut output = Vec::new(); + let mut bits_processed = 0; + + while let Some(flag) = decoder.next() { + bits_processed += 1; + + if flag == LITERAL_FLAG { + if let Some(literal) = decoder.next() { + bits_processed += 1; + output.push(LITERAL_FLAG); + output.push(literal); + } else { + log::warn!("Incomplete literal at bit {bits_processed}"); + continue; + } + } else if (1..=20).contains(&flag) { + if let (Some(length_remainder), Some(offset_high), Some(offset_low)) = + (decoder.next(), decoder.next(), decoder.next()) + { + output.push(flag); + output.push(length_remainder); + output.push(offset_high); + output.push(offset_low); + } else { + log::warn!("Incomplete match at bit {bits_processed}"); + continue; + } + } else { + log::warn!("Unexpected flag {flag} at bit {bits_processed}"); + continue; + } + } + + output + } +} + +impl Default for ZdeltaDecoder { + fn default() -> Self { + Self::new(true) + } +} + +impl Decoder for ZdeltaDecoder { + /// Decodes a chunk of delta-encoded data into the original target data. + /// + /// # Arguments + /// * `parent_data` - The reference data used for match instructions. + /// * `delta_code` - The delta-encoded data containing literals and matches. + /// + /// # Returns + /// A vector of bytes representing the decoded target data. + /// + /// # Description + /// Iterates through the delta-encoded data, processing literals (marked by LITERAL_FLAG) + /// and matches (marked by flags 1–20). + /// Errors in match processing are logged and skipped. 
+ fn decode_chunk(&self, parent_data: Vec, delta_code: &[u8]) -> Vec { + let mut output: Vec = Vec::new(); + let mut pointers = MatchPointers::new(0, 0, 0); + let mut previous_offset: Option = None; + + let data_to_decode = self.huffman_to_raw(delta_code); + + let mut index_in_data_to_decode = 0; + while index_in_data_to_decode < data_to_decode.len() { + if data_to_decode[index_in_data_to_decode] == LITERAL_FLAG { + if index_in_data_to_decode + 1 >= data_to_decode.len() { + break; + } + output.push(data_to_decode[index_in_data_to_decode + 1]); + index_in_data_to_decode += 2; + continue; + } + + if index_in_data_to_decode + MATCH_INSTRUCTION_SIZE > data_to_decode.len() { + log::warn!("Incomplete match data at index {index_in_data_to_decode}"); + index_in_data_to_decode += 1; + continue; + } + + let flag = data_to_decode[index_in_data_to_decode]; + let length_remainder = data_to_decode[index_in_data_to_decode + 1]; + let offset_high = data_to_decode[index_in_data_to_decode + 2]; + let offset_low = data_to_decode[index_in_data_to_decode + 3]; + index_in_data_to_decode += MATCH_INSTRUCTION_SIZE; + + let (length_coefficient, pointer_type, is_positive) = match decode_flag(flag) { + Ok(res) => res, + Err(e) => { + log::error!( + "Invalid flag {flag} at index {index_in_data_to_decode}, skipping: {e:?}" + ); + index_in_data_to_decode += 1; + continue; + } + }; + + let match_length = MIN_MATCH_LENGTH + + length_remainder as usize + + (length_coefficient as usize * LENGTH_BLOCK_SIZE); + + if match_length > MAX_MATCH_LENGTH { + log::error!("Match length {match_length} exceeds MAX_MATCH_LENGTH at index {index_in_data_to_decode}"); + index_in_data_to_decode += 1; + continue; + } + + let offset = ((offset_high as i16) << 8) | offset_low as i16; + let offset = if is_positive { offset } else { -offset }; + + if let Err(e) = process_match( + match_length, + offset, + pointer_type, + &parent_data, + &mut pointers, + &mut output, + &mut previous_offset, + ) { + log::error!("Failed to 
process match at index {index_in_data_to_decode}: {e:?}"); + index_in_data_to_decode += 1; + continue; + } + } + + output + } +} + +/// Processes a match command in delta encoding. +/// +/// # Arguments +/// * `length` - Number of bytes to copy (3..1026). +/// * `offset` - Relative offset from the pointer. +/// * `pointer_type` - Which reference to use (TargetLocal/Main/Auxiliary). +/// * `parent_data` - Reference data for Main/Auxiliary pointers. +/// * `pointers` - Current positions of pointers. +/// * `output` - Output buffer to write decoded data. +/// * `previous_offset` - Track previous offset for pointer strategy. +/// +/// # Errors +/// Returns InvalidOffset or InvalidLength if parameters are out of bounds. +fn process_match( + length: usize, + offset: i16, + pointer_type: ReferencePointerType, + parent_data: &[u8], + pointers: &mut MatchPointers, + output: &mut Vec, + previous_offset: &mut Option, +) -> Result<(), DecodeError> { + let source_position = match pointer_type { + ReferencePointerType::TargetLocal => { + if offset > 0 || offset.unsigned_abs() as usize > output.len() { + return Err(DecodeError::Offset); + } + output.len() - offset.unsigned_abs() as usize + } + _ => { + let base_ptr = pointers.get(&pointer_type); + let position = (base_ptr as isize + offset as isize) as usize; + if position > parent_data.len() { + return Err(DecodeError::Offset); + } + position + } + }; + + let end_position = source_position + .checked_add(length) + .ok_or(DecodeError::Length)?; + + match pointer_type { + ReferencePointerType::TargetLocal => { + if end_position > output.len() { + return Err(DecodeError::Length); + } + + let data_to_copy = output[source_position..end_position].to_vec(); + output.extend_from_slice(&data_to_copy); + } + _ => { + if end_position > parent_data.len() { + return Err(DecodeError::Length); + } + + output.extend_from_slice(&parent_data[source_position..end_position]); + } + } + + pointers.smart_update_after_match( + source_position + 
length, + offset, + pointer_type, + *previous_offset, + ); + *previous_offset = Some(offset); + Ok(()) +} + +fn decode_flag(flag: u8) -> Result<(u8, ReferencePointerType, bool), DecodeError> { + match flag { + 1 => Ok((0, ReferencePointerType::TargetLocal, false)), + 2 => Ok((0, ReferencePointerType::Main, true)), + 3 => Ok((0, ReferencePointerType::Main, false)), + 4 => Ok((0, ReferencePointerType::Auxiliary, true)), + 5 => Ok((0, ReferencePointerType::Auxiliary, false)), + 6 => Ok((1, ReferencePointerType::TargetLocal, false)), + 7 => Ok((1, ReferencePointerType::Main, true)), + 8 => Ok((1, ReferencePointerType::Main, false)), + 9 => Ok((1, ReferencePointerType::Auxiliary, true)), + 10 => Ok((1, ReferencePointerType::Auxiliary, false)), + 11 => Ok((2, ReferencePointerType::TargetLocal, false)), + 12 => Ok((2, ReferencePointerType::Main, true)), + 13 => Ok((2, ReferencePointerType::Main, false)), + 14 => Ok((2, ReferencePointerType::Auxiliary, true)), + 15 => Ok((2, ReferencePointerType::Auxiliary, false)), + 16 => Ok((3, ReferencePointerType::TargetLocal, false)), + 17 => Ok((3, ReferencePointerType::Main, true)), + 18 => Ok((3, ReferencePointerType::Main, false)), + 19 => Ok((3, ReferencePointerType::Auxiliary, true)), + 20 => Ok((3, ReferencePointerType::Auxiliary, false)), + _ => Err(DecodeError::Flag), + } +} + +/// Error types for zdelta decoding. 
+#[derive(Debug, Error)] +pub enum DecodeError { + #[error("Invalid flag value")] + Flag, + + #[error("Invalid length value")] + Length, + + #[error("Invalid offset value")] + Offset, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::encoder::zdelta_encoder::ZdeltaEncoder; + use crate::encoder::zdelta_match_pointers::ReferencePointerType; + use crate::encoder::zdelta_match_pointers::ReferencePointerType::{Auxiliary, TargetLocal}; + use bit_vec::BitVec; + use huffman_compress::CodeBuilder; + use std::collections::HashMap; + + #[test] + fn decode_chunk_should_handle_basic_literals() { + let decoder = ZdeltaDecoder::new(false); + let result = decoder.decode_chunk(vec![], &[0x00, b'X', 0x00, b'Y']); + assert_eq!(result, vec![b'X', b'Y']); + } + + #[test] + fn decode_chunk_should_handle_basic_match() { + let decoder = ZdeltaDecoder::new(false); + let parent_data = vec![b'a', b'b', b'c']; + let delta_code = vec![2, 0, 0, 0]; + let result = decoder.decode_chunk(parent_data, &delta_code); + assert_eq!(result, vec![b'a', b'b', b'c']); + } + + #[test] + fn decode_chunk_should_handle_mixed_literals_and_matches() { + let decoder = ZdeltaDecoder::new(false); + let parent_data = vec![b'a', b'b', b'c', b'd']; + let delta_code = vec![0x00, b'X', 2, 1, 0, 0, 0x00, b'Y']; + let result = decoder.decode_chunk(parent_data, &delta_code); + assert_eq!(result, vec![b'X', b'a', b'b', b'c', b'd', b'Y']); + } + + #[test] + fn decode_chunk_should_handle_incomplete_literal() { + let decoder = ZdeltaDecoder::new(false); + let result = decoder.decode_chunk(vec![], &[0x00]); + assert_eq!(result, vec![]); + } + + #[test] + fn decode_chunk_should_handle_incomplete_match() { + let decoder = ZdeltaDecoder::new(false); + let result = decoder.decode_chunk(vec![b'a'], &[1, 0, 0]); + assert_eq!(result, vec![0]); + } + + #[test] + fn decode_chunk_should_handle_invalid_flag() { + let decoder = ZdeltaDecoder::new(false); + let result = decoder.decode_chunk(vec![b'a'], &[21, 0, 0, 0]); + 
assert_eq!(result, vec![]); + } + + #[test] + fn decode_chunk_should_handle_excessive_match_length() { + let decoder = ZdeltaDecoder::new(false); + let result = decoder.decode_chunk(vec![b'a'], &[16, 255, 0, 0]); + assert_eq!(result, vec![]); + } + + #[test] + fn decode_chunk_should_handle_empty_input() { + let decoder = ZdeltaDecoder::new(false); + let result = decoder.decode_chunk(vec![], &[]); + assert_eq!(result, vec![]); + } + + #[test] + fn decode_chunk_should_handle_max_length_match() { + let decoder = ZdeltaDecoder::new(false); + let parent_data = vec![0; MAX_MATCH_LENGTH]; + let delta_code = vec![17, 255, 0, 0]; + let result = decoder.decode_chunk(parent_data, &delta_code); + assert_eq!(result.len(), MAX_MATCH_LENGTH); + } + + #[test] + fn decode_chunk_should_handle_trailing_literals_after_incomplete_match() { + let decoder = ZdeltaDecoder::new(false); + let result = decoder.decode_chunk(vec![b'a'], &[1, 0, 0, 0x00, b'X', 0x00, b'Y']); + assert_eq!(result, vec![b'Y']); + } + + #[test] + fn process_match_should_track_previous_offset_for_pointer_strategy() { + let mut pointers = MatchPointers::new(0, 0, 0); + let mut output = Vec::new(); + let parent_data = vec![b'x'; 5000]; + let mut previous_offset = None; + + process_match( + 100, + 100, + ReferencePointerType::Main, + &parent_data, + &mut pointers, + &mut output, + &mut previous_offset, + ) + .unwrap(); + assert_eq!(pointers.get(&ReferencePointerType::Main), 200); + assert_eq!(pointers.get(&Auxiliary), 0); + + process_match( + 100, + 50, + ReferencePointerType::Main, + &parent_data, + &mut pointers, + &mut output, + &mut previous_offset, + ) + .unwrap(); + assert_eq!(pointers.get(&ReferencePointerType::Main), 350); + assert_eq!(pointers.get(&Auxiliary), 0); + + process_match( + 100, + 2000, + ReferencePointerType::Main, + &parent_data, + &mut pointers, + &mut output, + &mut previous_offset, + ) + .unwrap(); + assert_eq!(pointers.get(&ReferencePointerType::Main), 350); + 
assert_eq!(pointers.get(&Auxiliary), 2450); + } + + #[test] + fn process_match_should_copy_from_target_local() { + let mut pointers = MatchPointers::new(0, 0, 0); + let mut output = vec![b'a', b'b', b'c']; + let parent_data = vec![]; + + process_match( + 3, + -3, + TargetLocal, + &parent_data, + &mut pointers, + &mut output, + &mut None, + ) + .unwrap(); + assert_eq!(output, vec![b'a', b'b', b'c', b'a', b'b', b'c']); + assert_eq!(pointers.get(&TargetLocal), 3); + + process_match( + 3, + -3, + TargetLocal, + &parent_data, + &mut pointers, + &mut output, + &mut None, + ) + .unwrap(); + assert_eq!( + output, + vec![b'a', b'b', b'c', b'a', b'b', b'c', b'a', b'b', b'c'] + ); + assert_eq!(pointers.get(&TargetLocal), 6); + } + + #[test] + fn process_match_should_copy_from_main_reference() { + let mut pointers = MatchPointers::new(0, 2, 0); + let mut output = Vec::new(); + let parent_data = vec![b'a', b'b', b'c', b'd', b'e']; + + process_match( + 2, + 1, + ReferencePointerType::Main, + &parent_data, + &mut pointers, + &mut output, + &mut None, + ) + .unwrap(); + + assert_eq!(output, vec![b'd', b'e']); + assert_eq!(pointers.get(&ReferencePointerType::Main), 5); + } + + #[test] + fn process_match_should_copy_from_auxiliary_reference() { + let mut pointers = MatchPointers::new(0, 0, 1); + let mut output = Vec::new(); + let parent_data = vec![b'a', b'b', b'c', b'd', b'e']; + + process_match( + 2, + -1, + Auxiliary, + &parent_data, + &mut pointers, + &mut output, + &mut None, + ) + .unwrap(); + + assert_eq!(output, vec![b'a', b'b']); + assert_eq!(pointers.get(&Auxiliary), 2); + } + + #[test] + fn process_match_should_return_error_for_invalid_target_local_offset() { + let mut pointers = MatchPointers::default(); + let mut output = vec![b'a', b'b']; + let parent_data = Vec::new(); + + let result = process_match( + 1, + -3, + TargetLocal, + &parent_data, + &mut pointers, + &mut output, + &mut None, + ); + + assert!(matches!(result, Err(DecodeError::Offset))); + } + + #[test] + fn 
process_match_should_return_error_for_positive_target_local_offset() { + let mut pointers = MatchPointers::default(); + let mut output = vec![b'a', b'b']; + let parent_data = Vec::new(); + + let result = process_match( + 1, + 1, + TargetLocal, + &parent_data, + &mut pointers, + &mut output, + &mut None, + ); + + assert!(matches!(result, Err(DecodeError::Offset))); + } + + #[test] + fn process_match_should_return_error_for_invalid_reference_offset() { + let mut pointers = MatchPointers::new(0, 2, 0); + let mut output = Vec::new(); + let parent_data = vec![b'a', b'b', b'c']; + + let result = process_match( + 2, + 2, + ReferencePointerType::Main, + &parent_data, + &mut pointers, + &mut output, + &mut None, + ); + + assert!(matches!(result, Err(DecodeError::Offset))); + } + + #[test] + fn process_match_should_handle_max_length_match() { + let mut pointers = MatchPointers::new(0, 0, 0); + let mut output = Vec::new(); + let parent_data = vec![b'x'; MAX_MATCH_LENGTH + 10]; + + process_match( + MAX_MATCH_LENGTH, + 0, + ReferencePointerType::Main, + &parent_data, + &mut pointers, + &mut output, + &mut None, + ) + .unwrap(); + + assert_eq!(output.len(), MAX_MATCH_LENGTH); + assert_eq!(pointers.get(&ReferencePointerType::Main), MAX_MATCH_LENGTH); + } + + #[test] + fn process_match_should_handle_zero_offset() { + let mut pointers = MatchPointers::new(0, 2, 0); + let mut output = Vec::new(); + let parent_data = vec![b'a', b'b', b'c', b'd', b'e']; + + process_match( + 2, + 0, + ReferencePointerType::Main, + &parent_data, + &mut pointers, + &mut output, + &mut None, + ) + .unwrap(); + + assert_eq!(output, vec![b'c', b'd']); + assert_eq!(pointers.get(&ReferencePointerType::Main), 4); + } + + #[test] + fn process_match_should_handle_consecutive_matches() { + let mut pointers = MatchPointers::new(0, 0, 0); + let mut output = Vec::new(); + let parent_data = vec![b'a', b'b', b'c', b'd', b'e', b'f']; + + process_match( + 2, + 0, + ReferencePointerType::Main, + &parent_data, + &mut 
pointers, + &mut output, + &mut None, + ) + .unwrap(); + + process_match( + 2, + -2, + TargetLocal, + &parent_data, + &mut pointers, + &mut output, + &mut None, + ) + .unwrap(); + + assert_eq!(output, vec![b'a', b'b', b'a', b'b']); + assert_eq!(pointers.get(&TargetLocal), 2); + } + + #[test] + fn huffman_to_raw_should_decode_single_match() { + let decoder = create_test_decoder(); + + let mut buffer = BitVec::new(); + buffer.extend(BitVec::from_bytes(&[2, 7, 0, 100])); + + let encoded = buffer.to_bytes(); + let decoded = decoder.huffman_to_raw(&encoded); + + assert_eq!(decoded, vec![2, 7, 0, 100]); + } + + #[test] + fn huffman_to_raw_should_decode_multiple_matches() { + let decoder = create_test_decoder(); + + let input = vec![2, 7, 0, 100, 10, 41, 4, 0]; + + let mut buffer = BitVec::new(); + buffer.extend(BitVec::from_bytes(&input)); + + let encoded = buffer.to_bytes(); + let decoded = decoder.huffman_to_raw(&encoded); + + assert_eq!(decoded, input); + } + + #[test] + fn huffman_to_raw_should_handle_empty_input() { + let decoder = create_test_decoder(); + let decoded = decoder.huffman_to_raw(&[]); + assert_eq!(decoded, vec![]); + } + + #[test] + fn huffman_to_raw_should_return_raw_data_when_huffman_disabled() { + let decoder = ZdeltaDecoder::new(false); + let data = vec![1, 2, 3, 4]; + let decoded = decoder.huffman_to_raw(&data); + assert_eq!(decoded, data); + } + + #[test] + fn huffman_to_raw_should_handle_incomplete_last_match() { + let decoder = create_test_decoder(); + + let input = vec![2, 7, 0, 100, 10, 41, 4]; + + let mut buffer = BitVec::new(); + buffer.extend(BitVec::from_bytes(&input)); + + let encoded = buffer.to_bytes(); + let decoded = decoder.huffman_to_raw(&encoded); + + assert_eq!(decoded, vec![2, 7, 0, 100]); + } + + #[test] + fn huffman_to_raw_should_decode_max_values() { + let decoder = create_test_decoder(); + + let input = vec![16, 255, 127, 255, 20, 255, 127, 254]; + + let mut buffer = BitVec::new(); + 
buffer.extend(BitVec::from_bytes(&input)); + + let encoded = buffer.to_bytes(); + let decoded = decoder.huffman_to_raw(&encoded); + + assert_eq!(decoded, input); + } + + #[test] + fn huffman_to_raw_should_preserve_byte_order() { + let decoder = create_test_decoder(); + + let input = vec![2, 10, 0x12, 0x34]; + + let mut buffer = BitVec::new(); + buffer.extend(BitVec::from_bytes(&input)); + + let encoded = buffer.to_bytes(); + let decoded = decoder.huffman_to_raw(&encoded); + + assert_eq!(decoded[2], 0x12); + assert_eq!(decoded[3], 0x34); + } + + #[test] + fn huffman_to_raw_should_handle_all_pointer_types() { + let decoder = create_test_decoder(); + + let input = vec![ + 1, 10, 0, 100, // TargetLocal + 2, 20, 1, 200, // Main, positive + 3, 30, 2, 100, // Main, negative + 4, 40, 3, 200, // Auxiliary, positive + 5, 50, 4, 100, // Auxiliary, negative + ]; + + let mut buffer = BitVec::new(); + buffer.extend(BitVec::from_bytes(&input)); + + let encoded = buffer.to_bytes(); + let decoded = decoder.huffman_to_raw(&encoded); + + assert_eq!(decoded, input); + } + + #[test] + fn huffman_to_raw_should_decode_huffman_encoded_data() { + let decoder = ZdeltaDecoder::new(true); + + let test_cases = vec![ + vec![2, 7, 0, 100], // length=10, offset=100 + vec![10, 41, 4, 0], // length=300, offset=-1024 + vec![16, 255, 127, 255], // length=1026, offset=32767 + ]; + + let mut full_bitvec = BitVec::new(); + for case in &test_cases { + let mut buffer = BitVec::new(); + let (huffman_book, _) = zdelta_encoder::create_default_huffman_book_and_tree(); + huffman_book.encode(&mut buffer, &case[0]).unwrap(); + huffman_book.encode(&mut buffer, &case[1]).unwrap(); + huffman_book.encode(&mut buffer, &case[2]).unwrap(); + huffman_book.encode(&mut buffer, &case[3]).unwrap(); + full_bitvec.extend(buffer); + } + + let encoded_data = full_bitvec.to_bytes(); + + let decoded = decoder.huffman_to_raw(&encoded_data); + + let expected_raw: Vec = test_cases.iter().flatten().cloned().collect(); + 
assert_eq!(decoded, expected_raw); + } + + #[test] + fn huffman_to_raw_should_handle_invalid_huffman_data_gracefully() { + let decoder = ZdeltaDecoder::new(true); + + let invalid_data = vec![0xFF, 0xFF, 0xFF]; + let result = decoder.huffman_to_raw(&invalid_data); + + assert_ne!(result, invalid_data); + assert!(result.is_empty()); + } + + #[test] + fn huffman_to_raw_should_decode_single_literal_correctly() { + let decoder = ZdeltaDecoder::new(true); + let encoder = ZdeltaEncoder::new(true); + let mut buffer = BitVec::new(); + + encoder + .huffman_book() + .as_ref() + .unwrap() + .encode(&mut buffer, &LITERAL_FLAG) + .expect("Literal flag must be in codebook"); + encoder + .huffman_book() + .as_ref() + .unwrap() + .encode(&mut buffer, &b'A') + .expect("All literals (0-255) must be in codebook"); + + let encoded = buffer.to_bytes(); + + let decoded = decoder.huffman_to_raw(&encoded); + assert_eq!(decoded, vec![LITERAL_FLAG, b'A']); + } + + #[test] + fn huffman_to_raw_should_handle_mixed_literals_and_matches() { + let decoder = ZdeltaDecoder::new(true); + let encoder = ZdeltaEncoder::new(true); + let mut buffer = BitVec::new(); + + // Literal 'A' + encoder + .huffman_book() + .as_ref() + .unwrap() + .encode(&mut buffer, &LITERAL_FLAG) + .expect("Literal flag must be in codebook"); + encoder + .huffman_book() + .as_ref() + .unwrap() + .encode(&mut buffer, &b'A') + .expect("Literal must be in codebook"); + + // Match + encoder + .huffman_book() + .as_ref() + .unwrap() + .encode(&mut buffer, &2) + .expect("Flag must be in codebook"); + encoder + .huffman_book() + .as_ref() + .unwrap() + .encode(&mut buffer, &10) + .expect("Length remainder must be in codebook"); + encoder + .huffman_book() + .as_ref() + .unwrap() + .encode(&mut buffer, &0) + .expect("Offset high must be in codebook"); + encoder + .huffman_book() + .as_ref() + .unwrap() + .encode(&mut buffer, &100) + .expect("Offset low must be in codebook"); + + // Literal 'B' + encoder + .huffman_book() + .as_ref() + 
.unwrap() + .encode(&mut buffer, &LITERAL_FLAG) + .expect("Literal flag must be in codebook"); + encoder + .huffman_book() + .as_ref() + .unwrap() + .encode(&mut buffer, &b'B') + .expect("Literal must be in codebook"); + + let encoded = buffer.to_bytes(); + let decoded = decoder.huffman_to_raw(&encoded); + + assert_eq!( + decoded, + vec![LITERAL_FLAG, b'A', 2, 10, 0, 100, LITERAL_FLAG, b'B'] + ); + } + + #[test] + fn huffman_to_raw_should_ignore_unknown_markers() { + let decoder = ZdeltaDecoder::new(true); + let encoder = ZdeltaEncoder::new(true); + let mut buffer = BitVec::new(); + + encoder + .huffman_book() + .as_ref() + .unwrap() + .encode(&mut buffer, &21) + .expect("Should encode invalid flag"); + encoder + .huffman_book() + .as_ref() + .unwrap() + .encode(&mut buffer, &65) + .expect("Should encode byte"); + + let encoded = buffer.to_bytes(); + let decoded = decoder.huffman_to_raw(&encoded); + + assert!(decoded.is_empty()); + } + + fn create_test_decoder() -> ZdeltaDecoder { + let mut frequencies = HashMap::new(); + for i in 0..=255 { + frequencies.insert(i, 1); + } + let (_, tree) = CodeBuilder::from_iter(frequencies).finish(); + + ZdeltaDecoder { + huffman_tree: Some(tree), + } + } +} diff --git a/src/encoder.rs b/src/encoder.rs new file mode 100644 index 0000000..d0b0d91 --- /dev/null +++ b/src/encoder.rs @@ -0,0 +1,235 @@ +mod ddelta_encoder; +mod gdelta_encoder; +mod levenshtein_encoder; +mod xdelta_encoder; +pub mod zdelta_comprassion_error; +pub mod zdelta_encoder; +pub mod zdelta_match_pointers; + +use super::chunkfs_sbc::{ClusterPoint, Clusters}; +use crate::decoder::Decoder; +use crate::{ChunkType, SBCHash, SBCKey, SBCMap}; +use chunkfs::{Data, Database, IterableDatabase}; +pub use ddelta_encoder::DdeltaEncoder; +pub use ddelta_encoder::EdeltaOptimizations; +pub use gdelta_encoder::GdeltaEncoder; +pub use levenshtein_encoder::LevenshteinEncoder; +use rayon::prelude::*; +use rayon::ThreadPoolBuilder; +use std::sync::{Arc, Mutex, MutexGuard}; +pub use 
xdelta_encoder::XdeltaEncoder; +pub(crate) use {gdelta_encoder::GEAR, levenshtein_encoder::Action}; + +/// A trait for encoding data clusters using Similarity Based Chunking (SBC). +/// +/// Implementors of this trait provide methods to efficiently encode data chunks +/// by creating delta codes relative to parent chunks in a hierarchy. +pub trait Encoder { + /// Encodes a single cluster of data chunks relative to a parent hash. + /// + /// # Parameters + /// - `target_map`: Mutable reference to the SBC structure tracking chunk relationships + /// - `cluster`: Mutable slice of (hash, data container) tuples to encode + /// - `parent_hash`: Identifier for the parent chunk used as delta reference + /// + /// # Returns + /// A tuple containing: + /// - `usize`: Amount of unprocessed data remaining in cluster + /// - `usize`: Amount of data successfully processed and encoded + fn encode_cluster( + &self, + target_map: Arc>>, + cluster: &mut [ClusterPoint], + parent_hash: Hash, + ) -> (usize, usize); + + /// Batch processes multiple clusters through the encoding pipeline. + /// + /// # Parameters + /// - `clusters`: Mutable HashMap of parent hashes to their associated data clusters + /// - `target_map`: Mutable reference to the SBC structure tracking relationships + /// + /// # Returns + /// A tuple containing: + /// - `usize`: Total unprocessed data across all clusters + /// - `usize`: Total processed data across all clusters + /// + /// # Note + /// Provides default implementation that iterates through all clusters, + /// but can be overridden for optimized batch processing strategies. 
+ fn encode_clusters( + &self, + clusters: &mut Clusters, + target_map: &mut SBCMap, + ) -> (usize, usize) + where + Self: Sync, + { + let pool = ThreadPoolBuilder::new().num_threads(1).build().unwrap(); + + let data_left = Mutex::new(0); + let processed_data = Mutex::new(0); + let target_map_ref = Arc::new(Mutex::new(target_map)); + pool.install(|| { + clusters.par_iter_mut().for_each(|(parent_hash, cluster)| { + let data_analyse = self.encode_cluster( + target_map_ref.clone(), + cluster.as_mut_slice(), + parent_hash.clone(), + ); + + let mut data_left_lock = data_left.lock().unwrap(); + *data_left_lock += data_analyse.0; + + let mut processed_data_lock = processed_data.lock().unwrap(); + *processed_data_lock += data_analyse.1; + }); + }); + ( + data_left.into_inner().unwrap(), + processed_data.into_inner().unwrap(), + ) + } +} + +/// Encodes a sequence of raw bytes as an INSERT instruction in delta encoding format. +/// +/// # Format Specification +/// The INSERT instruction is encoded as: +/// - 3 bytes: Length of the data (lower 23 bits) with MSB set to 1 (flag) +/// - N bytes: Raw data bytes to be inserted +/// +/// # Arguments +/// * `insert_data` - The raw byte sequence to be inserted. +/// Maximum length supported is 2^23-1 bytes. +/// * `delta_code` - Output buffer where the encoded instruction will be appended. +/// Must have enough capacity for 3 + insert_data.len() bytes. +fn encode_insert_instruction(insert_data: Vec, delta_code: &mut Vec) { + let len_bytes = &mut (insert_data.len() as u32).to_ne_bytes()[..3]; + len_bytes[2] |= 1 << 7; + delta_code.extend_from_slice(len_bytes); + delta_code.extend_from_slice(&insert_data); +} + +/// Encodes a COPY instruction. +/// +/// A COPY instruction consists of: +/// - 3 bytes: Length of the data to copy. +/// - 3 bytes: Offset in the source data where to copy from. +/// +/// # Parameters +/// * `equal_part_len` - Length of the data to copy (must be ≤ 2^24-1). 
+/// * `copy_instruction_offset` - Offset in the source data where the matching block begins (must be ≤ 2^24-1). +/// * `delta_code` - Output buffer where the encoded instruction will be appended. +fn encode_copy_instruction( + equal_part_length: usize, + copy_instruction_offset: usize, + delta_code: &mut Vec, +) { + let copy_instruction_len = &equal_part_length.to_ne_bytes()[..3]; + let copy_instruction_offset = ©_instruction_offset.to_ne_bytes()[..3]; + delta_code.extend_from_slice(copy_instruction_len); + delta_code.extend_from_slice(copy_instruction_offset); +} + +fn count_delta_chunks_with_hash( + target_map: &MutexGuard<&mut SBCMap>, + hash: &Hash, +) -> u16 { + let count = target_map + .iterator() + .filter(|(sbc_key, _)| { + sbc_key.hash == *hash + && match sbc_key.chunk_type { + ChunkType::Delta { + parent_hash: _, + number: _, + } => true, + ChunkType::Simple => false, + } + }) + .count(); + count as u16 +} + +fn find_empty_cell( + target_map: &MutexGuard<&mut SBCMap>, + hash: &Hash, +) -> Hash { + let mut left = hash.clone(); + let mut right = hash.next_hash(); + loop { + if target_map.contains(&SBCKey { + hash: left.clone(), + chunk_type: ChunkType::Simple, + }) { + left = left.last_hash(); + } else { + return left; + } + + if target_map.contains(&SBCKey { + hash: right.clone(), + chunk_type: ChunkType::Simple, + }) { + right = right.next_hash(); + } else { + return right; + } + } +} + +fn encode_simple_chunk( + target_map: &mut MutexGuard<&mut SBCMap>, + data: &[u8], + hash: Hash, +) -> (usize, SBCKey) { + let sbc_hash = SBCKey { + hash: find_empty_cell(target_map, &hash), + chunk_type: ChunkType::Simple, + }; + + let _ = target_map.insert(sbc_hash.clone(), data.to_vec()); + + (data.len(), sbc_hash) +} + +struct ParentChunkInCluster { + index: i32, + parent_data: Vec, + data_left: usize, +} + +fn get_parent_data( + target_map: Arc>>, + parent_hash: Hash, + cluster: &mut [ClusterPoint], +) -> ParentChunkInCluster { + let mut target_map_lock = 
target_map.lock().unwrap(); + match target_map_lock.get(&SBCKey { + hash: parent_hash.clone(), + chunk_type: ChunkType::Simple, + }) { + Ok(parent_data) => ParentChunkInCluster { + index: -1, + parent_data, + data_left: 0, + }, + Err(_) => { + let (_, parent_data_container) = &mut cluster[0]; + let parent_data = match parent_data_container.extract() { + Data::Chunk(data) => data.clone(), + Data::TargetChunk(_) => panic!(), + }; + let (data_left, parent_sbc_hash) = + encode_simple_chunk(&mut target_map_lock, parent_data.as_slice(), parent_hash); + + parent_data_container.make_target(vec![parent_sbc_hash]); + ParentChunkInCluster { + index: 0, + parent_data, + data_left, + } + } + } +} diff --git a/src/encoder/ddelta_encoder.rs b/src/encoder/ddelta_encoder.rs new file mode 100644 index 0000000..a2314c7 --- /dev/null +++ b/src/encoder/ddelta_encoder.rs @@ -0,0 +1,1128 @@ +use crate::chunkfs_sbc::ClusterPoint; +use crate::decoder::Decoder; +use crate::encoder::gdelta_encoder::GEAR; +use crate::encoder::{ + count_delta_chunks_with_hash, encode_copy_instruction, encode_insert_instruction, + get_parent_data, Encoder, +}; +use crate::hasher::SBCHash; +use crate::{ChunkType, SBCKey, SBCMap}; +use chunkfs::{Data, Database}; +use fasthash::spooky; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; + +/// One kilobyte. +const KB: usize = 1024; +/// Expected arithmetic mean of all chunks present within a cluster (calculated empirically). +const AVERAGE_CHUNK_SIZE: usize = 8 * KB; +/// Threshold that determines when the Gear hash (fp) points to a chunk boundary. +const CHUNK_THRESHOLD: u64 = AVERAGE_CHUNK_SIZE as u64 / 2; + +/// Use this enum when creating a DdeltaEncoder if you want to use the optimized version of Ddelta (Edelta). +pub enum EdeltaOptimizations { + /// Use if speed is important. + SpeedIsPriority, + /// Use if high compression ratio is important. + CompressionIsPriority, +} + +/// Ddelta compression encoder. 
pub struct DdeltaEncoder {
    // `None` → plain Ddelta; `Some(..)` → Edelta-optimized variant.
    edelta_optimizations: Option<EdeltaOptimizations>,
}

impl Default for DdeltaEncoder {
    /// Creates DdeltaEncoder without Edelta optimizations.
    fn default() -> Self {
        Self::new()
    }
}

impl Encoder for DdeltaEncoder {
    /// Encodes a cluster of data chunks using Ddelta compression against a parent chunk.
    ///
    /// # Arguments
    /// * `target_map` - Thread-safe reference to the chunk storage map (Arc).
    /// * `cluster` - Mutable slice of ClusterPoints to process.
    /// * `parent_hash` - Hash of the suggested parent chunk for delta reference.
    ///
    /// # Returns
    /// A tuple containing:
    /// 1. `usize` - Total bytes of data that couldn't be delta-encoded (left as-is).
    /// 2. `usize` - Total bytes of processed delta-encoded data.
    fn encode_cluster(
        &self,
        target_map: Arc>>,
        cluster: &mut [ClusterPoint],
        parent_hash: Hash,
    ) -> (usize, usize) {
        let mut processed_data = 0;
        let parent_chunk = get_parent_data(target_map.clone(), parent_hash.clone(), cluster);
        let mut data_left = parent_chunk.data_left;
        let parent_data = parent_chunk.parent_data;
        let source_chunks = gear_chunking(&parent_data);
        let mut source_chunks_indices = build_chunks_indices(&source_chunks);

        for (chunk_id, (hash, data_container)) in cluster.iter_mut().enumerate() {
            // Skip the point that was just promoted/stored as the parent.
            if parent_chunk.index > -1 && chunk_id == parent_chunk.index as usize {
                continue;
            }
            // BUG FIX(review): the original initialized `target_hash` to
            // `SBCKey::default()` and called `make_target` unconditionally, so
            // a point that was already a target chunk had its key clobbered
            // with a default one. Only re-target points we actually encode.
            if let Data::Chunk(data) = data_container.extract() {
                let (left_in_delta_chunk, processed_in_delta_chunk, sbc_hash) = self
                    .encode_delta_chunk(
                        target_map.clone(),
                        data,
                        hash.clone(),
                        parent_data.as_slice(),
                        &mut source_chunks_indices,
                        parent_hash.clone(),
                    );
                data_left += left_in_delta_chunk;
                processed_data += processed_in_delta_chunk;
                data_container.make_target(vec![sbc_hash]);
            }
        }
        (data_left, processed_data)
    }
}

impl DdeltaEncoder {
    /// Use the EdeltaOptimizations enum when creating a DdeltaEncoder if you
    /// want to use the optimized version of Ddelta (Edelta).
    /// Or use `new` for the plain variant.
    pub fn new() -> DdeltaEncoder {
        DdeltaEncoder {
            edelta_optimizations: None,
        }
    }

    /// Creates an Edelta-optimized encoder with the given priority.
    pub fn new_with_edelta_optimizations(
        edelta_optimizations: EdeltaOptimizations,
    ) -> DdeltaEncoder {
        DdeltaEncoder {
            edelta_optimizations: Some(edelta_optimizations),
        }
    }

    /// Encodes a single data chunk using delta compression against a reference.
    ///
    /// # Arguments
    /// * `target_map` - Shared map for storing compressed chunks.
    /// * `target_data` - The data to be compressed.
    /// * `target_hash` - Hash identifier for the target data.
    /// * `source_data` - Reference data to compare against.
    /// * `source_chunks_indices` - Key is the chunk hash, value is its first position in the source data.
    /// * `source_hash` - Hash identifier for the parent/reference data.
    ///
    /// # Returns
    /// 1. Number of uncompressed bytes.
    /// 2. Total bytes processed.
    /// 3. Storage key for the compressed delta.
+ fn encode_delta_chunk( + &self, + target_map: Arc>>, + target_data: &[u8], + target_hash: Hash, + source_data: &[u8], + source_chunks_indices: &mut HashMap, + source_hash: Hash, + ) -> (usize, usize, SBCKey) { + let mut delta_code: Vec = Vec::new(); + let target_chunks = gear_chunking(target_data); + + for mut target_chunk_position in 0..target_chunks.len() { + let target_chunk = target_chunks[target_chunk_position]; + match self.edelta_optimizations { + Some(EdeltaOptimizations::SpeedIsPriority) => process_target_chunk_with_edelta( + source_data, + target_data, + source_chunks_indices, + &target_chunks, + &mut target_chunk_position, + &mut delta_code, + EdeltaOptimizations::SpeedIsPriority, + ), + Some(EdeltaOptimizations::CompressionIsPriority) => { + process_target_chunk_with_edelta( + source_data, + target_data, + source_chunks_indices, + &target_chunks, + &mut target_chunk_position, + &mut delta_code, + EdeltaOptimizations::CompressionIsPriority, + ); + } + None => process_target_chunk_with_ddelta( + source_data, + source_chunks_indices, + target_chunk, + &mut delta_code, + ), + } + + if target_chunk_position >= target_chunks.len() { + break; + } + } + + let (processed_data, sbc_hash) = + store_delta_chunk(target_map, target_hash, source_hash, delta_code); + (0, processed_data, sbc_hash) + } +} + +fn process_target_chunk_with_edelta( + source_data: &[u8], + target_data: &[u8], + source_chunks_indices: &mut HashMap, + target_chunks: &[&[u8]], + target_chunk_position: &mut usize, + delta_code: &mut Vec, + edelta_optimizations: EdeltaOptimizations, +) { + if *target_chunk_position >= target_chunks.len() { + return; + } + + let mut target_chunk = target_chunks[*target_chunk_position]; + match edelta_optimizations { + EdeltaOptimizations::SpeedIsPriority => { + if let Some(( + start_match_position_in_source_data, + number_of_processed_chunks, + match_length, + length_of_unprocessed_residue, + )) = find_match_compression_is_priority( + source_data, + 
source_chunks_indices, + *target_chunk_position, + target_chunks, + ) { + encode_copy_instruction( + match_length, + start_match_position_in_source_data, + delta_code, + ); + *target_chunk_position += number_of_processed_chunks; + if length_of_unprocessed_residue == 0 { + return; + } + + target_chunk = target_chunks[*target_chunk_position - 1]; + process_target_chunk_with_ddelta( + source_data, + source_chunks_indices, + &target_chunk[target_chunk.len() - length_of_unprocessed_residue..], + delta_code, + ); + } else { + encode_insert_instruction(target_chunk.to_vec(), delta_code); + *target_chunk_position += 1; + }; + } + EdeltaOptimizations::CompressionIsPriority => { + if let Some(( + start_match_position_in_source_data, + number_of_processed_chunks, + match_length, + length_of_unprocessed_residue, + )) = find_match_compression_is_priority( + source_data, + source_chunks_indices, + *target_chunk_position, + target_chunks, + ) { + encode_copy_instruction( + match_length, + start_match_position_in_source_data, + delta_code, + ); + let mut start_match_in_target_data: usize = 0; + for current_target_chunk in target_chunks.iter().take(*target_chunk_position) { + start_match_in_target_data += current_target_chunk.len(); + } + let chunk_hash = spooky::hash64( + &target_data + [start_match_in_target_data..start_match_in_target_data + match_length], + ); + + source_chunks_indices + .entry(chunk_hash) + .or_insert(start_match_position_in_source_data); + *target_chunk_position += number_of_processed_chunks; + if length_of_unprocessed_residue == 0 { + return; + } + + target_chunk = target_chunks[*target_chunk_position - 1]; + process_target_chunk_with_ddelta( + source_data, + source_chunks_indices, + &target_chunk[target_chunk.len() - length_of_unprocessed_residue..], + delta_code, + ); + } else { + encode_insert_instruction(target_chunk.to_vec(), delta_code); + *target_chunk_position += 1; + }; + } + } +} + +/// Encodes a part in the target data without Edelta 
optimizations. +fn process_target_chunk_with_ddelta( + source_data: &[u8], + source_chunks_indices: &HashMap, + target_chunk: &[u8], + delta_code: &mut Vec, +) { + match find_match_ddelta(source_data, source_chunks_indices, target_chunk) { + Some(start_of_match_in_source_data) => { + encode_copy_instruction( + target_chunk.len(), + start_of_match_in_source_data, + delta_code, + ); + } + None => { + encode_insert_instruction(target_chunk.to_vec(), delta_code); + } + } +} + +/// Stores a delta-encoded chunk in the shared chunk map. +/// +/// # Arguments +/// * `target_map` - Thread-safe reference to the chunk storage map (Arc). +/// * `target_hash` - Content hash of the original chunk data. +/// * `source_hash` - Hash of the parent chunk this delta is based on. +/// * `delta_code` - Raw delta-encoded data to store. +/// * `zstd_flag` - Whether to apply zstd compression to the delta data. +/// +/// # Returns +/// A tuple containing: +/// 1. `usize` - Final size of the stored data (after optional compression). +/// 2. `SBCKey` - Key under which the chunk was stored. +fn store_delta_chunk( + target_map: Arc>>, + hash: Hash, + parent_hash: Hash, + delta_code: Vec, +) -> (usize, SBCKey) { + let mut target_map_lock = target_map.lock().unwrap(); + let number_delta_chunk = count_delta_chunks_with_hash(&target_map_lock, &hash); + let sbc_hash = SBCKey { + hash, + chunk_type: ChunkType::Delta { + parent_hash, + number: number_delta_chunk, + }, + }; + + let processed_data = delta_code.len(); + let _ = target_map_lock.insert(sbc_hash.clone(), delta_code); + + (processed_data, sbc_hash) +} + +/// Finds the longest matching byte sequence between source data and target chunks using delta compression. +/// +/// This function implements Scheme 1 of the Edelta algorithm, which extends matches across chunk boundaries +/// while maintaining the original chunk indexing for the base data. 
+/// +/// # Arguments +/// * `source_data` - The complete base data as a contiguous byte slice +/// * `source_chunks_indices` - Precomputed hash map of chunk hashes to their positions in `source_data` +/// * `target_chunks` - Target data split into chunks (slice of byte slices) +/// * `target_chunk_position` - Starting chunk index in `target_chunks` to begin matching +/// +/// # Returns +/// `Option<(usize, usize, usize, usize)>` where: +/// +/// * `Some(( +/// start_match_position_in_source_data, +/// number_of_processed_chunks, +/// match_length, +/// length_of_unprocessed_residue +/// ))` - Start position in `source_data`, number of the processed chunks, length of the longest match and +/// the number of bytes in the last chunk that remained unprocessed. +/// * `None` - If no match found or invalid input position +fn find_match_compression_is_priority( + source_data: &[u8], + source_chunks_indices: &HashMap, + target_chunk_position: usize, + target_chunks: &[&[u8]], +) -> Option<(usize, usize, usize, usize)> { + if target_chunk_position > target_chunks.len() { + return None; + } + + let start_of_match_in_source_data = find_match_ddelta( + source_data, + source_chunks_indices, + target_chunks[target_chunk_position], + )?; + let mut number_of_processed_chunks = 1; + let mut source_byte_index = + start_of_match_in_source_data + target_chunks[target_chunk_position].len(); + + let mut match_length = target_chunks[target_chunk_position].len(); + let mut target_chunk_position = target_chunk_position + 1; + while target_chunk_position < target_chunks.len() { + let mut target_chunk = target_chunks[target_chunk_position]; + + let mut target_byte_index = 0usize; + while source_data[source_byte_index] == target_chunk[target_byte_index] { + match_length += 1; + + source_byte_index += 1; + target_byte_index += 1; + target_byte_index %= target_chunk.len(); + + if source_byte_index >= source_data.len() { + number_of_processed_chunks += 1; + let length_of_unprocessed_residue = + 
(target_chunk.len() - target_byte_index) % target_chunk.len(); + return Some(( + start_of_match_in_source_data, + number_of_processed_chunks, + match_length, + length_of_unprocessed_residue, + )); + } + + if target_byte_index == 0 { + target_chunk_position += 1; + if target_chunk_position >= target_chunks.len() { + number_of_processed_chunks += 1; + return Some(( + start_of_match_in_source_data, + number_of_processed_chunks, + match_length, + 0, + )); + } + + target_chunk = target_chunks[target_chunk_position]; + break; + } + } + + number_of_processed_chunks += 1; + if source_data[source_byte_index] != target_chunk[target_byte_index] { + let length_of_unprocessed_residue = + (target_chunk.len() - target_byte_index) % target_chunk.len(); + return Some(( + start_of_match_in_source_data, + number_of_processed_chunks, + match_length, + length_of_unprocessed_residue, + )); + } + + if target_byte_index != 0 { + target_chunk_position += 1; + } + } + + Some(( + start_of_match_in_source_data, + number_of_processed_chunks, + match_length, + 0, + )) +} + +/// Finds a matching chunk in source data for the given target chunk. 
+/// +/// # Arguments +/// * `source_data` - The original/reference data slice to search in +/// * `source_chunks_indices` - Precomputed hash map of chunk hashes to their positions in source_data +/// * `target_data` - The chunk of data to find in the source +/// +/// # Returns +/// * `Some(usize)` - The starting position of the matching chunk in source_data if found +/// * `None` - If no matching chunk was found +fn find_match_ddelta( + source_data: &[u8], + source_chunks_indices: &HashMap, + target_chunk: &[u8], +) -> Option { + let target_hash = spooky::hash64(target_chunk); + let &source_position = source_chunks_indices.get(&target_hash)?; + + if source_position + target_chunk.len() > source_data.len() { + return None; + } + + let source_slice = &source_data[source_position..source_position + target_chunk.len()]; + if source_slice != target_chunk { + return None; + } + + Some(source_position) +} + +/// Creates an index of chunks for quick matching. +/// +/// # Arguments +/// * `source_chunks` - vector of chunks from the base data block. +/// +/// # Returns +/// Hash table, where key is the chunk hash, value is its first position in the source data. +fn build_chunks_indices(source_chunks: &Vec<&[u8]>) -> HashMap { + let mut chunks_indices: HashMap = HashMap::new(); + let mut current_index: usize = 0; + for chunk in source_chunks { + let chunk_hash = spooky::hash64(chunk); + chunks_indices.entry(chunk_hash).or_insert(current_index); + current_index += chunk.len(); + } + + chunks_indices +} + +/// Splits input data into chunks using Gear-based Content-Defined Chunking (CDC) algorithm. +/// +/// # Parameters +/// * `data` - Input byte slice to be chunked. +/// +/// # Returns +/// Vector of byte slices (chunks) referencing the original data. 
fn gear_chunking(data: &[u8]) -> Vec<&[u8]> {
    let mut source_chunks: Vec<&[u8]> = Vec::new();
    let mut current_window_hash: u64 = 0;
    let mut start_current_chunk = 0;

    // Low-bit mask sized to the (power-of-two) average chunk size.
    let mask = (1 << AVERAGE_CHUNK_SIZE.next_power_of_two().trailing_zeros()) - 1;
    for (data_index, &byte) in data.iter().enumerate() {
        current_window_hash = (current_window_hash << 1).wrapping_add(GEAR[byte as usize]);

        // BUG FIX(review): `data_index > start_current_chunk` prevents pushing
        // a zero-length chunk when the threshold fires at the very first byte;
        // empty chunks panic downstream (`% target_chunk.len()`).
        if (current_window_hash & mask) == CHUNK_THRESHOLD && data_index > start_current_chunk {
            source_chunks.push(&data[start_current_chunk..data_index]);
            start_current_chunk = data_index;
        }
    }

    // Emit the trailing partial chunk, if any.
    if start_current_chunk < data.len() {
        source_chunks.push(&data[start_current_chunk..]);
    }

    source_chunks
}

#[cfg(test)]
mod test {
    // NOTE(review): some generic arguments in this module (e.g. on SBCMap /
    // SBCKey) were stripped by the extraction tooling and are kept verbatim.
    use super::*;
    use crate::decoder;
    use crate::encoder::ddelta_encoder::EdeltaOptimizations::{
        CompressionIsPriority, SpeedIsPriority,
    };
    use crate::encoder::encode_simple_chunk;
    use crate::hasher::AronovichHash;
    use rand::prelude::StdRng;
    use rand::{Rng, SeedableRng};

    #[test]
    fn process_target_chunk_with_edelta_should_process_full_match_with_compression_priority() {
        let source_data = b"prefix_match_suffix";
        let source_chunks: Vec<&[u8]> = vec![b"prefix_", b"match_", b"suffix"];
        let target_chunks: Vec<&[u8]> = vec![b"match_"];

        let mut source_indices = build_chunks_indices(&source_chunks);
        let mut position = 0;
        let mut delta_code = Vec::new();

        process_target_chunk_with_edelta(
            source_data,
            b"match_",
            &mut source_indices,
            &target_chunks,
            &mut position,
            &mut delta_code,
            CompressionIsPriority,
        );

        assert_eq!(position, 1);
        assert!(!delta_code.is_empty());
    }

    #[test]
    fn process_target_chunk_with_edelta_should_insert_when_no_match_found() {
        let source_data = b"source_data";
        let source_chunks: Vec<&[u8]> = vec![b"source", b"_data"];
        let target_chunks: Vec<&[u8]> = vec![b"no_match"];

        let mut source_indices = build_chunks_indices(&source_chunks);
        let mut position = 0;
        let mut delta_code = Vec::new();

        process_target_chunk_with_edelta(
            source_data,
            b"no_match",
            &mut source_indices,
            &target_chunks,
            &mut position,
            &mut delta_code,
            CompressionIsPriority,
        );

        assert_eq!(position, 1);
        assert!(!delta_code.is_empty());
    }

    #[test]
    fn process_target_chunk_with_edelta_should_handle_partial_match_with_residue() {
        let source_data = b"data_part1_part2";
        let source_chunks: Vec<&[u8]> = vec![b"data_", b"part1_", b"part2"];
        let target_chunks: Vec<&[u8]> = vec![b"part1_", b"par"];

        let mut source_indices = build_chunks_indices(&source_chunks);
        let mut position = 0;
        let mut delta_code = Vec::new();

        process_target_chunk_with_edelta(
            source_data,
            b"part1_par",
            &mut source_indices,
            &target_chunks,
            &mut position,
            &mut delta_code,
            CompressionIsPriority,
        );

        assert_eq!(position, 2);
        assert!(delta_code.len() > 1);
    }

    #[test]
    fn process_target_chunk_with_edelta_should_skip_processing_when_position_out_of_bounds() {
        let source_data = b"data";
        let source_chunks: Vec<&[u8]> = vec![b"data"];
        let target_chunks: Vec<&[u8]> = vec![b"data"];

        let mut source_indices = build_chunks_indices(&source_chunks);
        let mut position = 1;
        let mut delta_code = Vec::new();

        process_target_chunk_with_edelta(
            source_data,
            b"data",
            &mut source_indices,
            &target_chunks,
            &mut position,
            &mut delta_code,
            CompressionIsPriority,
        );

        assert_eq!(position, 1);
        assert!(delta_code.is_empty());
    }

    #[test]
    fn process_target_chunk_with_edelta_should_process_multiple_chunks_in_extended_match() {
        let source_data = b"chunk1_chunk2_chunk3";
        let source_chunks: Vec<&[u8]> = vec![b"chunk1_", b"chunk2_", b"chunk3"];
        let target_chunks: Vec<&[u8]> = vec![b"chunk1_", b"chunk2_"];

        let mut source_indices = build_chunks_indices(&source_chunks);
        let mut position = 0;
        let mut delta_code = Vec::new();

        process_target_chunk_with_edelta(
            source_data,
            b"chunk1_chunk2_",
            &mut source_indices,
            &target_chunks,
            &mut position,
            &mut delta_code,
            CompressionIsPriority,
        );

        assert_eq!(position, 2);
        assert!(!delta_code.is_empty());
    }

    #[test]
    fn process_target_chunk_with_edelta_should_handle_empty_target_chunk() {
        let source_data = b"data";
        let source_chunks: Vec<&[u8]> = vec![b"data"];
        let target_chunks: Vec<&[u8]> = vec![b""];

        let mut source_indices = build_chunks_indices(&source_chunks);
        let mut position = 0;
        let mut delta_code = Vec::new();

        process_target_chunk_with_edelta(
            source_data,
            b"",
            &mut source_indices,
            &target_chunks,
            &mut position,
            &mut delta_code,
            CompressionIsPriority,
        );

        assert_eq!(position, 1);
    }

    #[test]
    fn process_target_chunk_with_edelta_should_process_two_chunks() {
        let source_data = b"abcdefghijklmnopqrstuvwxyz".to_vec();
        let target_chunks: Vec<&[u8]> = vec![b"cdefgh", b"ijklmn"];

        let source_chunks: Vec<&[u8]> = vec![b"ab", b"cdefgh", b"ijklmnop", b"qrstuvwxyz"];
        let mut source_chunks_indices = build_chunks_indices(&source_chunks);

        let mut target_chunk_position = 0;
        let mut delta_code = Vec::new();

        process_target_chunk_with_edelta(
            &source_data,
            b"cdefghijklmn",
            &mut source_chunks_indices,
            &target_chunks,
            &mut target_chunk_position,
            &mut delta_code,
            CompressionIsPriority,
        );

        assert!(!delta_code.is_empty());
        assert_eq!(target_chunk_position, 2);
    }

    #[test]
    fn find_match_compression_is_priority_should_handle_partial_match_at_chunk_boundary() {
        let source_data = b"prefix_data_match_suffix";
        let source_chunks: Vec<&[u8]> = vec![b"prefix_", b"data_", b"match_", b"suffix"];
        let target_chunks: Vec<&[u8]> = vec![b"data_", b"matc"];

        let source_indices = build_chunks_indices(&source_chunks);

        assert_eq!(
            find_match_compression_is_priority(source_data, &source_indices, 0, &target_chunks),
            Some((7, 2, 9, 0))
        );
    }

    #[test]
    fn find_match_compression_is_priority_should_return_none_when_no_initial_chunk_match() {
        let source_data = b"source_data";
        let source_chunks: Vec<&[u8]> = vec![b"sour", b"ce_d", b"ata"];
        let target_chunks: Vec<&[u8]> = vec![b"no_match", b"data"];

        let source_indices = build_chunks_indices(&source_chunks);

        assert_eq!(
            find_match_compression_is_priority(source_data, &source_indices, 0, &target_chunks),
            None
        );
    }

    #[test]
    fn find_match_compression_is_priority_should_handle_source_exhaustion_during_extended_match() {
        let source_data = b"short_source";
        let source_chunks: Vec<&[u8]> = vec![b"short_", b"source"];
        let target_chunks: Vec<&[u8]> = vec![b"short_", b"source", b"extra"];

        let source_indices = build_chunks_indices(&source_chunks);

        assert_eq!(
            find_match_compression_is_priority(source_data, &source_indices, 0, &target_chunks),
            Some((0, 2, 12, 0))
        );
    }

    #[test]
    fn find_match_compression_is_priority_should_handle_mismatch_in_middle_of_extended_match() {
        let source_data = b"match_part1_part2_part3";
        let source_chunks: Vec<&[u8]> = vec![b"match_", b"part1_", b"part2_", b"part3"];
        let target_chunks: Vec<&[u8]> = vec![b"match_", b"part1_", b"XXXXX_", b"part3"];

        let source_indices = build_chunks_indices(&source_chunks);

        assert_eq!(
            find_match_compression_is_priority(source_data, &source_indices, 0, &target_chunks),
            Some((0, 2, 12, 0))
        );
    }

    #[test]
    fn find_match_compression_is_priority_should_handle_single_byte_chunks() {
        let source_data = b"abcdef";
        let source_chunks: Vec<&[u8]> = vec![b"a", b"b", b"c", b"d", b"e", b"f"];
        let target_chunks: Vec<&[u8]> = vec![b"c", b"d", b"e"];

        let source_indices = build_chunks_indices(&source_chunks);

        assert_eq!(
            find_match_compression_is_priority(source_data, &source_indices, 0, &target_chunks),
            Some((2, 3, 3, 0))
        );
    }

    #[test]
    fn find_match_compression_is_priority_should_handle_variable_size_chunks() {
        let source_data = b"abc_defgh_ijklmn";
        let source_chunks: Vec<&[u8]> = vec![b"abc_", b"defgh_", b"ijklmn"];
        let target_chunks: Vec<&[u8]> = vec![b"abc_", b"defgh_", b"ijk"];

        let source_indices = build_chunks_indices(&source_chunks);

        assert_eq!(
            find_match_compression_is_priority(source_data, &source_indices, 0, &target_chunks),
            Some((0, 3, 13, 0))
        );
    }

    #[test]
    fn find_match_compression_is_priority_should_return_correct_value_for_exact_match() {
        let source_data = b"__PATTERN1__PATTERN2__";
        let target_chunks: Vec<&[u8]> = vec![b"_PATTERN2__"];
        let source_chunks = vec![
            &source_data[0..source_data.len() / 2],
            &source_data[source_data.len() / 2..],
        ];
        let chunk_indices = build_chunks_indices(&source_chunks);

        assert_eq!(
            find_match_compression_is_priority(source_data, &chunk_indices, 0, &target_chunks),
            Some((11, 1, 11, 0))
        )
    }

    #[test]
    fn find_match_compression_is_priority_should_handle_one_chunk() {
        let source_data = b"test1test2test";
        let source_chunks: Vec<&[u8]> = vec![b"test", b"1test", b"2test"];
        let target_chunks: Vec<&[u8]> = vec![b"test", b"1test", b"#test"];
        let source_chunks_indices = build_chunks_indices(&source_chunks);
        assert_eq!(
            find_match_compression_is_priority(
                source_data,
                &source_chunks_indices,
                0,
                &target_chunks,
            ),
            Some((0, 2, 9, 0))
        )
    }

    #[test]
    fn find_match_compression_is_priority_should_handle_two_chunks() {
        let source_data = b"test1test2test";
        let source_chunks: Vec<&[u8]> = vec![b"test", b"1test", b"2test"];
        let target_chunks: Vec<&[u8]> = vec![b"test", b"1test", b"2te#t"];
        let source_chunks_indices = build_chunks_indices(&source_chunks);
        assert_eq!(
            find_match_compression_is_priority(
                source_data,
                &source_chunks_indices,
                1,
                &target_chunks,
            ),
            Some((4, 2, 8, 2))
        )
    }

    #[test]
    fn find_match_should_return_none_for_empty_source_or_target() {
        let empty_data = &[];
        let empty_indices = HashMap::new();
        assert_eq!(
            find_match_ddelta(empty_data, &empty_indices, b"test"),
            None,
            "Empty source should return None"
        );

        let non_empty_data = b"valid_data";
        let chunks = gear_chunking(non_empty_data);
        let indices = build_chunks_indices(&chunks);
        assert_eq!(
            find_match_ddelta(non_empty_data, &indices, empty_data),
            None,
            "Empty target should return None"
        );
    }

    #[test]
    fn find_match_should_return_none_for_non_matching_data() {
        let source_data = vec![0u8; AVERAGE_CHUNK_SIZE * 2];
        let target_data = vec![1u8; AVERAGE_CHUNK_SIZE];

        let source_chunks = gear_chunking(&source_data);
        let source_indices = build_chunks_indices(&source_chunks);
        assert_eq!(
            find_match_ddelta(&source_data, &source_indices, &target_data),
            None,
            "Non-matching data should return None"
        );
    }

    #[test]
    fn find_match_should_return_position_for_exact_match() {
        let data = b"__PATTERN1__PATTERN2__";
        let pattern = b"__PATTERN1_";
        let chunks = vec![&data[0..data.len() / 2], &data[data.len() / 2..]];

        let chunk_indices = build_chunks_indices(&chunks);

        assert_eq!(
            find_match_ddelta(data, &chunk_indices, pattern),
            Some(0),
            "Should find pattern at known position"
        );
    }

    #[test]
    fn build_chunks_indices_should_map_chunks_to_correct_positions() {
        let chunks: Vec<&[u8]> = vec![&[1u8; AVERAGE_CHUNK_SIZE], &[2u8; AVERAGE_CHUNK_SIZE]];

        let indices = build_chunks_indices(&chunks);
        assert_eq!(
            indices.get(&spooky::hash64(chunks[0])),
            Some(&0),
            "First chunk should be at position 0"
        );
        assert_eq!(
            indices.get(&spooky::hash64(chunks[1])),
            Some(&AVERAGE_CHUNK_SIZE),
            "Second chunk should be at position AVERAGE_CHUNK_SIZE"
        );
    }

    #[test]
    fn build_chunks_indices_should_handle_duplicate_hashes_correctly() {
        let chunks: Vec<&[u8]> = vec![&[1u8; AVERAGE_CHUNK_SIZE], &[1u8; AVERAGE_CHUNK_SIZE]];

        let indices = build_chunks_indices(&chunks);
        let hash = spooky::hash64(chunks[0]);
        assert_eq!(
            Some(&0),
            indices.get(&hash),
            "Only first position should be stored for duplicates"
        );
        assert_eq!(
            indices.len(),
            1,
            "HashMap should contain only one entry for duplicate chunks"
        );
    }

    #[test]
    fn gear_chunking_should_handle_empty_data() {
        let data = &[];
        assert_eq!(gear_chunking(data).len(), 0);
    }

    #[test]
    fn gear_chunking_should_handle_data_smaller_than_chunk() {
        let data = b"abc";
        let chunks = gear_chunking(data);
        assert_eq!(chunks, vec![b"abc".to_vec()]);
    }

    #[test]
    fn gear_chunking_should_return_chunk_for_exact_chunk_boundary() {
        let data = b"abcdefgh";
        let chunks = gear_chunking(data);
        assert_eq!(chunks, vec![b"abcdefgh".to_vec()]);
    }

    #[test]
    fn gear_chunking_should_split_data_into_multiple_chunks() {
        let mut rng = rand::thread_rng();
        let mut data = vec![0u8; AVERAGE_CHUNK_SIZE * 1000];
        rng.fill(&mut data[..]);

        let chunks = gear_chunking(&data);
        assert!(
            chunks.len() > 1,
            "Data should be split into multiple chunks"
        );
    }

    #[test]
    fn test_restore_similarity_chunk_1_byte_diff() {
        let mut data: Vec<u8> = generate_test_data();
        let data2 = data.clone();
        if data[15] < 255 {
            data[15] = 255;
        } else {
            data[15] = 0;
        }

        let (sbc_map, sbc_key) =
            create_map_and_key(data.as_slice(), data2.as_slice(), SpeedIsPriority);

        assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2);
    }

    #[test]
    fn test_restore_similarity_chunk_2_neighbor_byte_diff() {
        let mut data: Vec<u8> = generate_test_data();
        let data2 = data.clone();
        if data[15] < 255 {
            data[15] = 255;
        } else {
            data[15] = 0;
        }
        if data[16] < 255 {
            data[16] = 255;
        } else {
            data[16] = 0;
        }

        let (sbc_map, sbc_key) =
            create_map_and_key(data.as_slice(), data2.as_slice(), SpeedIsPriority);

        assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2);
    }

    #[test]
    fn test_restore_similarity_chunk_2_byte_diff() {
        let mut data: Vec<u8> = generate_test_data();
        let data2 = data.clone();
        if data[15] < 255 {
            data[15] = 255;
        } else {
            data[15] = 0;
        }
        if data[106] < 255 {
            data[106] = 255;
        } else {
            data[106] = 0;
        }

        let (sbc_map, sbc_key) =
            create_map_and_key(data.as_slice(), data2.as_slice(), CompressionIsPriority);

        assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2);
    }

    #[test]
    fn test_restore_similarity_chunk_with_offset_left() {
        let data: Vec<u8> = generate_test_data();
        let data2 = data[15..].to_vec();

        let (sbc_map, sbc_key) =
            create_map_and_key(data.as_slice(), data2.as_slice(), SpeedIsPriority);

        assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2);
    }

    #[test]
    fn test_restore_similarity_chunk_with_offset_right() {
        let data: Vec<u8> = generate_test_data();
        let data2 = data[..8000].to_vec();

        let (sbc_map, sbc_key) =
            create_map_and_key(data.as_slice(), data2.as_slice(), CompressionIsPriority);

        assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2);
    }

    #[test]
    fn test_restore_similarity_chunk_with_offset() {
        let data: Vec<u8> = generate_test_data();
        let mut data2 = data[15..8000].to_vec();
        data2[0] /= 3;
        data2[7000] /= 3;

        let (sbc_map, sbc_key) =
            create_map_and_key(data.as_slice(), data2.as_slice(), CompressionIsPriority);

        assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2);
    }

    #[test]
    fn test_restore_similarity_chunk_with_cyclic_shift_right() {
        let data: Vec<u8> = generate_test_data();
        let mut data2 = data.clone();
        data2.extend(&data[8000..]);

        let (sbc_map, sbc_key) =
            create_map_and_key(data.as_slice(), data2.as_slice(), SpeedIsPriority);

        assert_ne!(data, []);
        assert_eq!(
            sbc_key.chunk_type,
            ChunkType::Delta {
                parent_hash: AronovichHash::new_with_u32(0),
                number: 0
            }
        );
        assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2);
    }

    #[test]
    fn test_restore_similarity_chunk_with_cyclic_shift_left() {
        let data: Vec<u8> = generate_test_data_deterministic(42);
        let mut data2 = data[..192].to_vec();
        data2.extend(&data);

        let (sbc_map, sbc_key) =
            create_map_and_key(data.as_slice(), data2.as_slice(), SpeedIsPriority);

        assert_ne!(data, []);
        assert_eq!(
            sbc_key.chunk_type,
            ChunkType::Delta {
                parent_hash: AronovichHash::new_with_u32(0),
                number: 0
            }
        );
        assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2);
    }

    // Random 8 KiB test buffer.
    fn generate_test_data() -> Vec<u8> {
        const TEST_DATA_SIZE: usize = 8192;
        (0..TEST_DATA_SIZE).map(|_| rand::random::<u8>()).collect()
    }

    // Seeded 8 KiB test buffer for reproducible cases.
    fn generate_test_data_deterministic(seed: u64) -> Vec<u8> {
        const TEST_DATA_SIZE: usize = 8192;
        let mut rng = StdRng::seed_from_u64(seed);
        (0..TEST_DATA_SIZE).map(|_| rng.gen()).collect()
    }

    // Stores `data` as the parent chunk and delta-encodes `data2` against it,
    // returning the populated map and the delta chunk's key.
    fn create_map_and_key<'a>(
        data: &'a [u8],
        data2: &'a [u8],
        edelta_optimizations: EdeltaOptimizations,
    ) -> (
        SBCMap,
        SBCKey,
    ) {
        let source_chunks = gear_chunking(data);
        let mut word_hash_offsets = build_chunks_indices(&source_chunks);
        let mut binding = SBCMap::new(decoder::GdeltaDecoder::default());
        let sbc_map = Arc::new(Mutex::new(&mut binding));

        let (_, sbc_key) = encode_simple_chunk(
            &mut sbc_map.lock().unwrap(),
            data,
            AronovichHash::new_with_u32(0),
        );
        let ddelta_encoder = DdeltaEncoder::new_with_edelta_optimizations(edelta_optimizations);
        let (_, _, sbc_key_2) = ddelta_encoder.encode_delta_chunk(
            sbc_map.clone(),
            data2,
            AronovichHash::new_with_u32(3),
            data,
            &mut word_hash_offsets,
            sbc_key.hash.clone(),
        );
        (binding, sbc_key_2)
    }
}
diff --git a/src/encoder/gdelta_encoder.rs b/src/encoder/gdelta_encoder.rs
new file mode 100644
index 0000000..3e31639
--- /dev/null
+++ b/src/encoder/gdelta_encoder.rs
@@ -0,0 +1,238 @@
use crate::chunkfs_sbc::ClusterPoint;
use crate::decoder::Decoder;
use crate::encoder::{count_delta_chunks_with_hash, get_parent_data, Encoder};
use crate::{ChunkType, SBCHash, SBCKey, SBCMap};
use chunkfs::Data;
use chunkfs::Database;
use std::cmp::min;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use zstd::stream;

/// Gdelta compression encoder; optionally zstd-compresses the delta code.
pub struct GdeltaEncoder {
    zstd_flag: bool,
}

impl Default for GdeltaEncoder {
    fn default() -> Self {
Self::new(false) + } +} + +impl GdeltaEncoder { + pub fn new(zstd_flag: bool) -> Self { + GdeltaEncoder { zstd_flag } + } + + fn encode_delta_chunk( + &self, + target_map: Arc>>, + chunk_data: &[u8], + hash: Hash, + parent_data: &[u8], + word_hash_offsets: &HashMap, + parent_hash: Hash, + ) -> (usize, usize, SBCKey) { + let mut delta_code = Vec::new(); + + let mut anchor: usize = 0; + let word_size: usize = 16; + let move_bts: usize = 64 / word_size; + let mask_bts: usize = (parent_data.len() as f64).log2() as usize; + let mut fp = 0u64; + + for j in 0..(word_size - 1) { + fp = (fp << move_bts).wrapping_add(GEAR[chunk_data[j] as usize]); + } + let mut j = 0; + while j < (chunk_data.len() - word_size + 1) { + fp = (fp << move_bts).wrapping_add(GEAR[chunk_data[j + word_size - 1] as usize]); + let word_hash: u64 = fp >> (64 - mask_bts); + + if let Some(&offset) = word_hash_offsets.get(&word_hash) { + let mut equal_part_len: usize = 0; + for k in 0..min(parent_data.len() - offset, chunk_data.len() - j) { + if parent_data[offset + k] != chunk_data[j + k] { + break; + } + equal_part_len += 1; + } + + if equal_part_len >= word_size { + //Insert instruction + let insert_data_len: usize = j - anchor; + if insert_data_len > 0 { + let insert_data = &chunk_data[anchor..(anchor + insert_data_len)]; + let insert_instruction = &mut insert_data_len.to_ne_bytes()[..3]; + insert_instruction[2] += 1 << 7; + delta_code.extend_from_slice(insert_instruction); + delta_code.extend_from_slice(insert_data); + } + + // Copy instruction + let copy_instruction_len = &equal_part_len.to_ne_bytes()[..3]; + let copy_instruction_offset = &offset.to_ne_bytes()[..3]; + delta_code.extend_from_slice(copy_instruction_len); + delta_code.extend_from_slice(copy_instruction_offset); + + anchor = j + equal_part_len; + j = anchor - 1; // Update j to skip the matched part + if j < chunk_data.len() - word_size { + for k in anchor..(anchor + word_size - 1) { + fp = (fp << 
move_bts).wrapping_add(GEAR[chunk_data[k] as usize]); + } + } + } + } + + if j >= chunk_data.len() - word_size { + let insert_data_len: usize = chunk_data.len() - anchor; + let insert_data = &chunk_data[anchor..(anchor + insert_data_len)]; + let insert_instruction = &mut insert_data_len.to_ne_bytes()[..3]; + insert_instruction[2] += 1 << 7; + delta_code.extend_from_slice(insert_instruction); + delta_code.extend_from_slice(insert_data); + } + j += 1 + } + + let mut target_map_lock = target_map.lock().unwrap(); + let number_delta_chunk = count_delta_chunks_with_hash(&target_map_lock, &hash); + let sbc_hash = SBCKey { + hash, + chunk_type: ChunkType::Delta { + parent_hash, + number: number_delta_chunk, + }, + }; + if self.zstd_flag { + delta_code = stream::encode_all(delta_code.as_slice(), 0).unwrap(); + } + let processed_data = delta_code.len(); + let _ = target_map_lock.insert(sbc_hash.clone(), delta_code); + (0, processed_data, sbc_hash) + } +} + +impl Encoder for GdeltaEncoder { + fn encode_cluster( + &self, + target_map: Arc>>, + cluster: &mut [ClusterPoint], + parent_hash: Hash, + ) -> (usize, usize) { + let mut processed_data = 0; + let parent_chunk = get_parent_data(target_map.clone(), parent_hash.clone(), cluster); + let mut data_left = parent_chunk.data_left; + let parent_data = parent_chunk.parent_data; + let word_size: usize = 16; + let move_bts: usize = 64 / word_size; + let mut word_hash_offsets: HashMap = HashMap::new(); + let mask_bts: usize = (parent_data.len() as f64).log2() as usize; + let mut fp: u64 = 0; + + for i in 0..(word_size - 1) { + fp = (fp << move_bts).wrapping_add(GEAR[parent_data[i] as usize]); + } + + for i in 0..(parent_data.len() - word_size + 1) { + fp = (fp << move_bts).wrapping_add(GEAR[parent_data[i + word_size - 1] as usize]); + let word_hash: u64 = fp >> (64 - mask_bts); + word_hash_offsets.insert(word_hash, i); + } + + for (chunk_id, (hash, data_container)) in cluster.iter_mut().enumerate() { + if parent_chunk.index > -1 && 
chunk_id == parent_chunk.index as usize { + continue; + } + let mut target_hash = SBCKey::default(); + match data_container.extract() { + Data::Chunk(data) => { + let (left, processed, sbc_hash) = self.encode_delta_chunk( + target_map.clone(), + data, + hash.clone(), + parent_data.as_slice(), + &word_hash_offsets, + parent_hash.clone(), + ); + data_left += left; + processed_data += processed; + target_hash = sbc_hash; + } + Data::TargetChunk(_) => {} + } + data_container.make_target(vec![target_hash]); + } + (data_left, processed_data) + } +} + +// Gear table taken from https://github.com/nlfiedler/fastcdc-rs +#[rustfmt::skip] +pub(crate) const GEAR: [u64; 256] = [ + 0x3b5d3c7d207e37dc, 0x784d68ba91123086, 0xcd52880f882e7298, 0xeacf8e4e19fdcca7, + 0xc31f385dfbd1632b, 0x1d5f27001e25abe6, 0x83130bde3c9ad991, 0xc4b225676e9b7649, + 0xaa329b29e08eb499, 0xb67fcbd21e577d58, 0x0027baaada2acf6b, 0xe3ef2d5ac73c2226, + 0x0890f24d6ed312b7, 0xa809e036851d7c7e, 0xf0a6fe5e0013d81b, 0x1d026304452cec14, + 0x03864632648e248f, 0xcdaacf3dcd92b9b4, 0xf5e012e63c187856, 0x8862f9d3821c00b6, + 0xa82f7338750f6f8a, 0x1e583dc6c1cb0b6f, 0x7a3145b69743a7f1, 0xabb20fee404807eb, + 0xb14b3cfe07b83a5d, 0xb9dc27898adb9a0f, 0x3703f5e91baa62be, 0xcf0bb866815f7d98, + 0x3d9867c41ea9dcd3, 0x1be1fa65442bf22c, 0x14300da4c55631d9, 0xe698e9cbc6545c99, + 0x4763107ec64e92a5, 0xc65821fc65696a24, 0x76196c064822f0b7, 0x485be841f3525e01, + 0xf652bc9c85974ff5, 0xcad8352face9e3e9, 0x2a6ed1dceb35e98e, 0xc6f483badc11680f, + 0x3cfd8c17e9cf12f1, 0x89b83c5e2ea56471, 0xae665cfd24e392a9, 0xec33c4e504cb8915, + 0x3fb9b15fc9fe7451, 0xd7fd1fd1945f2195, 0x31ade0853443efd8, 0x255efc9863e1e2d2, + 0x10eab6008d5642cf, 0x46f04863257ac804, 0xa52dc42a789a27d3, 0xdaaadf9ce77af565, + 0x6b479cd53d87febb, 0x6309e2d3f93db72f, 0xc5738ffbaa1ff9d6, 0x6bd57f3f25af7968, + 0x67605486d90d0a4a, 0xe14d0b9663bfbdae, 0xb7bbd8d816eb0414, 0xdef8a4f16b35a116, + 0xe7932d85aaaffed6, 0x08161cbae90cfd48, 0x855507beb294f08b, 0x91234ea6ffd399b2, + 
0xad70cf4b2435f302, 0xd289a97565bc2d27, 0x8e558437ffca99de, 0x96d2704b7115c040, + 0x0889bbcdfc660e41, 0x5e0d4e67dc92128d, 0x72a9f8917063ed97, 0x438b69d409e016e3, + 0xdf4fed8a5d8a4397, 0x00f41dcf41d403f7, 0x4814eb038e52603f, 0x9dafbacc58e2d651, + 0xfe2f458e4be170af, 0x4457ec414df6a940, 0x06e62f1451123314, 0xbd1014d173ba92cc, + 0xdef318e25ed57760, 0x9fea0de9dfca8525, 0x459de1e76c20624b, 0xaeec189617e2d666, + 0x126a2c06ab5a83cb, 0xb1321532360f6132, 0x65421503dbb40123, 0x2d67c287ea089ab3, + 0x6c93bff5a56bd6b6, 0x4ffb2036cab6d98d, 0xce7b785b1be7ad4f, 0xedb42ef6189fd163, + 0xdc905288703988f6, 0x365f9c1d2c691884, 0xc640583680d99bfe, 0x3cd4624c07593ec6, + 0x7f1ea8d85d7c5805, 0x014842d480b57149, 0x0b649bcb5a828688, 0xbcd5708ed79b18f0, + 0xe987c862fbd2f2f0, 0x982731671f0cd82c, 0xbaf13e8b16d8c063, 0x8ea3109cbd951bba, + 0xd141045bfb385cad, 0x2acbc1a0af1f7d30, 0xe6444d89df03bfdf, 0xa18cc771b8188ff9, + 0x9834429db01c39bb, 0x214add07fe086a1f, 0x8f07c19b1f6b3ff9, 0x56a297b1bf4ffe55, + 0x94d558e493c54fc7, 0x40bfc24c764552cb, 0x931a706f8a8520cb, 0x32229d322935bd52, + 0x2560d0f5dc4fefaf, 0x9dbcc48355969bb6, 0x0fd81c3985c0b56a, 0xe03817e1560f2bda, + 0xc1bb4f81d892b2d5, 0xb0c4864f4e28d2d7, 0x3ecc49f9d9d6c263, 0x51307e99b52ba65e, + 0x8af2b688da84a752, 0xf5d72523b91b20b6, 0x6d95ff1ff4634806, 0x562f21555458339a, + 0xc0ce47f889336346, 0x487823e5089b40d8, 0xe4727c7ebc6d9592, 0x5a8f7277e94970ba, + 0xfca2f406b1c8bb50, 0x5b1f8a95f1791070, 0xd304af9fc9028605, 0x5440ab7fc930e748, + 0x312d25fbca2ab5a1, 0x10f4a4b234a4d575, 0x90301d55047e7473, 0x3b6372886c61591e, + 0x293402b77c444e06, 0x451f34a4d3e97dd7, 0x3158d814d81bc57b, 0x034942425b9bda69, + 0xe2032ff9e532d9bb, 0x62ae066b8b2179e5, 0x9545e10c2f8d71d8, 0x7ff7483eb2d23fc0, + 0x00945fcebdc98d86, 0x8764bbbe99b26ca2, 0x1b1ec62284c0bfc3, 0x58e0fcc4f0aa362b, + 0x5f4abefa878d458d, 0xfd74ac2f9607c519, 0xa4e3fb37df8cbfa9, 0xbf697e43cac574e5, + 0x86f14a3f68f4cd53, 0x24a23d076f1ce522, 0xe725cd8048868cc8, 0xbf3c729eb2464362, + 0xd8f6cd57b3cc1ed8, 
0x6329e52425541577, 0x62aa688ad5ae1ac0, 0x0a242566269bf845, + 0x168b1a4753aca74b, 0xf789afefff2e7e3c, 0x6c3362093b6fccdb, 0x4ce8f50bd28c09b2, + 0x006a2db95ae8aa93, 0x975b0d623c3d1a8c, 0x18605d3935338c5b, 0x5bb6f6136cad3c71, + 0x0f53a20701f8d8a6, 0xab8c5ad2e7e93c67, 0x40b5ac5127acaa29, 0x8c7bf63c2075895f, + 0x78bd9f7e014a805c, 0xb2c9e9f4f9c8c032, 0xefd6049827eb91f3, 0x2be459f482c16fbd, + 0xd92ce0c5745aaa8c, 0x0aaa8fb298d965b9, 0x2b37f92c6c803b15, 0x8c54a5e94e0f0e78, + 0x95f9b6e90c0a3032, 0xe7939faa436c7874, 0xd16bfe8f6a8a40c9, 0x44982b86263fd2fa, + 0xe285fb39f984e583, 0x779a8df72d7619d3, 0xf2d79a8de8d5dd1e, 0xd1037354d66684e2, + 0x004c82a4e668a8e5, 0x31d40a7668b044e6, 0xd70578538bd02c11, 0xdb45431078c5f482, + 0x977121bb7f6a51ad, 0x73d5ccbd34eff8dd, 0xe437a07d356e17cd, 0x47b2782043c95627, + 0x9fb251413e41d49a, 0xccd70b60652513d3, 0x1c95b31e8a1b49b2, 0xcae73dfd1bcb4c1b, + 0x34d98331b1f5b70f, 0x784e39f22338d92f, 0x18613d4a064df420, 0xf1d8dae25f0bcebe, + 0x33f77c15ae855efc, 0x3c88b3b912eb109c, 0x956a2ec96bafeea5, 0x1aa005b5e0ad0e87, + 0x5500d70527c4bb8e, 0xe36c57196421cc44, 0x13c4d286cc36ee39, 0x5654a23d818b2a81, + 0x77b1dc13d161abdc, 0x734f44de5f8d5eb5, 0x60717e174a6c89a2, 0xd47d9649266a211e, + 0x5b13a4322bb69e90, 0xf7669609f8b5fc3c, 0x21e6ac55bedcdac9, 0x9b56b62b61166dea, + 0xf48f66b939797e9c, 0x35f332f9c0e6ae9a, 0xcc733f6a9a878db0, 0x3da161e41cc108c2, + 0xb7d74ae535914d51, 0x4d493b0b11d36469, 0xce264d1dfba9741a, 0xa9d1f2dc7436dc06, + 0x70738016604c2a27, 0x231d36e96e93f3d5, 0x7666881197838d19, 0x4a2a83090aaad40c, + 0xf1e761591668b35d, 0x7363236497f730a7, 0x301080e37379dd4d, 0x502dea2971827042, + 0xc2c5eb858f32625f, 0x786afb9edfafbdff, 0xdaee0d868490b2a4, 0x617366b3268609f6, + 0xae0e35a0fe46173e, 0xd1a07de93e824f11, 0x079b8b115ea4cca8, 0x93a99274558faebb, + 0xfb1e6e22e08a03b3, 0xea635fdba3698dd0, 0xcf53659328503a5c, 0xcde3b31e6fd5d780, + 0x8e3e4221d3614413, 0xef14d0d86bf1a22c, 0xe1d830d3f16c5ddb, 0xaabd2b2a451504e1 +]; diff --git a/src/encoder/levenshtein_encoder.rs 
b/src/encoder/levenshtein_encoder.rs new file mode 100644 index 0000000..6b97bf5 --- /dev/null +++ b/src/encoder/levenshtein_encoder.rs @@ -0,0 +1,415 @@ +use crate::chunkfs_sbc::ClusterPoint; +use crate::decoder::Decoder; +use crate::encoder::{count_delta_chunks_with_hash, encode_simple_chunk, get_parent_data, Encoder}; +use crate::{ChunkType, SBCHash, SBCKey, SBCMap}; +use chunkfs::{Data, Database}; +use std::cmp::min; +use std::sync::{Arc, Mutex}; + +/// an enumeration indicating the action of converting a byte +pub(crate) enum Action { + Del, + Add, + Rep, +} + +/// An encoder using the Levenshtein editorial prescription method +pub struct LevenshteinEncoder { + zstd_flag: bool, +} + +impl Default for LevenshteinEncoder { + fn default() -> Self { + Self::new(false) + } +} + +impl LevenshteinEncoder { + pub fn new(zstd_flag: bool) -> Self { + LevenshteinEncoder { zstd_flag } + } + + /// Method of calculating the delta code using Levenshtein's editorial prescription and writing it to the repository + fn encode_delta_chunk( + &self, + target_map: Arc>>, + data: &[u8], + hash: Hash, + parent_data: &[u8], + parent_hash: Hash, + ) -> (usize, usize, SBCKey) { + let mut delta_chunk = Vec::new(); + + match encode(data, parent_data) { + None => { + let (data_left, sbc_hash) = + encode_simple_chunk(&mut target_map.clone().lock().unwrap(), data, hash); + (data_left, 0, sbc_hash) + } + Some(delta_code) => { + for delta_action in delta_code { + for byte in delta_action.to_be_bytes() { + delta_chunk.push(byte); + } + } + + if self.zstd_flag { + delta_chunk = zstd::encode_all(delta_chunk.as_slice(), 0).unwrap(); + } + + let processed_data = delta_chunk.len(); + + let mut target_map_lock = target_map.lock().unwrap(); + + let number_delta_chunk = count_delta_chunks_with_hash(&target_map_lock, &hash); + let sbc_hash = SBCKey { + hash, + chunk_type: ChunkType::Delta { + parent_hash, + number: number_delta_chunk, + }, + }; + let _ = target_map_lock.insert(sbc_hash.clone(), 
delta_chunk); + (0, processed_data, sbc_hash) + } + } + } +} + +impl Encoder for LevenshteinEncoder { + fn encode_cluster( + &self, + target_map: Arc>>, + cluster: &mut [ClusterPoint], + parent_hash: Hash, + ) -> (usize, usize) { + let mut processed_data = 0; + let parent_chunk = get_parent_data(target_map.clone(), parent_hash.clone(), cluster); + let mut data_left = parent_chunk.data_left; + for (chunk_id, (hash, data_container)) in cluster.iter_mut().enumerate() { + if parent_chunk.index > -1 && chunk_id == parent_chunk.index as usize { + continue; + } + let mut target_hash = SBCKey::default(); + match data_container.extract() { + Data::Chunk(data) => { + if data.len().abs_diff(parent_chunk.parent_data.len()) > 4000 { + let (left, sbc_hash) = encode_simple_chunk( + &mut target_map.clone().lock().unwrap(), + data, + hash.clone(), + ); + data_left += left; + target_hash = sbc_hash; + } else { + let (left, processed, sbc_hash) = self.encode_delta_chunk( + target_map.clone(), + data, + hash.clone(), + parent_chunk.parent_data.as_slice(), + parent_hash.clone(), + ); + data_left += left; + processed_data += processed; + target_hash = sbc_hash; + } + } + Data::TargetChunk(_) => {} + } + data_container.make_target(vec![target_hash]); + } + (data_left, processed_data) + } +} + +/// A method for optimizing the construction of the Levenshtein editorial prescription matrix by +/// chopping off identical parts at the end and beginning of chunks +fn find_id_non_eq_byte(data_chunk: &[u8], data_chunk_parent: &[u8]) -> (usize, usize) { + let mut id_non_eq_byte_start = 0; + while data_chunk[id_non_eq_byte_start] == data_chunk_parent[id_non_eq_byte_start] { + id_non_eq_byte_start += 1; + if id_non_eq_byte_start == min(data_chunk_parent.len(), data_chunk.len()) { + break; + } + } + let mut id_non_eq_byte_end = 0; + if !((data_chunk.len() <= id_non_eq_byte_start) + | (data_chunk_parent.len() <= id_non_eq_byte_start)) + { + while data_chunk[data_chunk.len() - id_non_eq_byte_end - 1] + 
== data_chunk_parent[data_chunk_parent.len() - id_non_eq_byte_end - 1] + { + id_non_eq_byte_end += 1; + if min(data_chunk.len(), data_chunk_parent.len()) - id_non_eq_byte_end + == id_non_eq_byte_start + { + break; + } + } + } + (id_non_eq_byte_start, id_non_eq_byte_end) +} + +/// A method that calculates the delta-code according to the matrix of editorial requirements +fn encode(data_chunk: &[u8], data_chunk_parent: &[u8]) -> Option> { + let max_len_delta_code = data_chunk.len() as u32; + let mut delta_code = Vec::new(); + let (id_non_eq_byte_start, id_non_eq_byte_end) = + find_id_non_eq_byte(data_chunk, data_chunk_parent); + + let data_chunk = + data_chunk[id_non_eq_byte_start..data_chunk.len() - id_non_eq_byte_end].to_vec(); + let data_chunk_parent = data_chunk_parent + [id_non_eq_byte_start..data_chunk_parent.len() - id_non_eq_byte_end] + .to_vec(); + + let matrix = levenshtein_matrix(data_chunk.as_slice(), data_chunk_parent.as_slice()); + + if matrix[matrix.len() - 1][matrix[0].len() - 1] * 4 + 4 > max_len_delta_code { + return None; + } + let mut x = matrix[0].len() - 1; + let mut y = matrix.len() - 1; + while x > 0 || y > 0 { + if x > 0 + && y > 0 + && (data_chunk_parent[y - 1] != data_chunk[x - 1]) + && (matrix[y - 1][x - 1] < matrix[y][x]) + { + delta_code.push(encode_delta_action( + Action::Rep, + id_non_eq_byte_start + y - 1, + data_chunk[x - 1], + )); + x -= 1; + y -= 1; + } else if y > 0 && matrix[y - 1][x] < matrix[y][x] { + delta_code.push(encode_delta_action( + Action::Del, + id_non_eq_byte_start + y - 1, + 0, + )); + y -= 1; + } else if x > 0 && matrix[y][x - 1] < matrix[y][x] { + delta_code.push(encode_delta_action( + Action::Add, + id_non_eq_byte_start + y, + data_chunk[x - 1], + )); + x -= 1; + } else { + x -= 1; + y -= 1; + } + } + Some(delta_code) +} + +#[allow(dead_code)] +pub(crate) fn levenshtein_distance(data_chunk: &[u8], data_chunk_parent: &[u8]) -> u32 { + let mut id_eq_byte = 0; + while data_chunk[id_eq_byte] == 
data_chunk_parent[id_eq_byte] { + if id_eq_byte == min(data_chunk_parent.len(), data_chunk.len()) - 1 { + break; + } + id_eq_byte += 1; + } + let levenshtein_matrix = + levenshtein_matrix(&data_chunk[id_eq_byte..], &data_chunk_parent[id_eq_byte..]); + levenshtein_matrix[data_chunk_parent.len()][data_chunk.len()] +} + +/// Create Levenshtein matrix for chunks +fn levenshtein_matrix(data_chunk: &[u8], data_chunk_parent: &[u8]) -> Vec<Vec<u32>> { + let mut levenshtein_matrix = + vec![vec![0u32; data_chunk.len() + 1]; data_chunk_parent.len() + 1]; + levenshtein_matrix[0] = (0..data_chunk.len() as u32 + 1).collect(); + for y in 1..data_chunk_parent.len() + 1 { + levenshtein_matrix[y][0] = y as u32; + for x in 1..data_chunk.len() + 1 { + let add = levenshtein_matrix[y - 1][x] + 1; + let del = levenshtein_matrix[y][x - 1] + 1; + let mut replace = levenshtein_matrix[y - 1][x - 1]; + if data_chunk_parent[y - 1] != data_chunk[x - 1] { + replace += 1; + } + levenshtein_matrix[y][x] = min(min(del, add), replace); + } + } + levenshtein_matrix +} + +/// A function that packs a delta action, its index, and a byte value into a u32 for writing to storage +fn encode_delta_action(action: Action, index: usize, byte_value: u8) -> u32 { + let mut code = 0u32; + match action { + Action::Del => { + code += 1 << 31; + } + Action::Add => { + code += 1 << 30; + } + Action::Rep => {} + } + code += byte_value as u32 * (1 << 22); + if index >= (1 << 22) { + panic!() + } + code += index as u32; + code +} + +#[cfg(test)] +mod test { + use super::*; + use crate::decoder; + use crate::hasher::AronovichHash; + + #[test] + fn test_restore_similarity_chunk_1_byte_diff() { + let mut data: Vec<u8> = (0..8192).map(|_| rand::random::<u8>()).collect(); + let data2 = data.clone(); + if data[15] < 255 { + data[15] = 255; + } else { + data[15] = 0; + } + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + + #[test] + fn 
test_restore_similarity_chunk_2_neighbor_byte_diff() { + let mut data: Vec = (0..8192).map(|_| rand::random::()).collect(); + let data2 = data.clone(); + if data[15] < 255 { + data[15] = 255; + } else { + data[15] = 0; + } + if data[16] < 255 { + data[16] = 255; + } else { + data[16] = 0; + } + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + + #[test] + fn test_restore_similarity_chunk_2_byte_diff() { + let mut data: Vec = (0..8192).map(|_| rand::random::()).collect(); + let data2 = data.clone(); + if data[15] < 255 { + data[15] = 255; + } else { + data[15] = 0; + } + if data[106] < 255 { + data[106] = 255; + } else { + data[106] = 0; + } + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + + #[test] + fn test_restore_similarity_chunk_with_offset_left() { + let data: Vec = (0..8192).map(|_| rand::random::()).collect(); + let data2 = data[15..].to_vec(); + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + + #[test] + fn test_restore_similarity_chunk_with_offset_right() { + let data: Vec = (0..8192).map(|_| rand::random::()).collect(); + let data2 = data[..8000].to_vec(); + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + + #[test] + fn test_restore_similarity_chunk_with_offset() { + let data: Vec = (0..8192).map(|_| rand::random::()).collect(); + let mut data2 = data[15..8000].to_vec(); + data2[0] /= 3; + data2[7000] /= 3; + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + + #[test] + fn test_restore_similarity_chunk_with_cyclic_shift_right() { + let data: Vec = (0..8192).map(|_| 
rand::random::()).collect(); + let mut data2 = data.clone(); + data2.extend(&data[8000..]); + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_ne!(data, []); + assert_eq!( + sbc_key.chunk_type, + ChunkType::Delta { + parent_hash: AronovichHash::new_with_u32(0), + number: 0 + } + ); + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + #[test] + fn test_restore_similarity_chunk_with_cyclic_shift_left() { + let data: Vec = (0..8192).map(|_| rand::random::()).collect(); + let mut data2 = data[..192].to_vec(); + data2.extend(&data); + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_ne!(data, []); + assert_eq!( + sbc_key.chunk_type, + ChunkType::Delta { + parent_hash: AronovichHash::new_with_u32(0), + number: 0 + } + ); + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + fn create_map_and_key<'a>( + data: &'a [u8], + data2: &'a [u8], + ) -> ( + SBCMap, + SBCKey, + ) { + let mut binding = SBCMap::new(decoder::LevenshteinDecoder::default()); + let sbc_map = Arc::new(Mutex::new(&mut binding)); + + let (_, sbc_key) = encode_simple_chunk( + &mut sbc_map.lock().unwrap(), + data, + AronovichHash::new_with_u32(0), + ); + let (_, _, sbc_key_2) = LevenshteinEncoder::default().encode_delta_chunk( + sbc_map.clone(), + data2, + AronovichHash::new_with_u32(3), + data, + sbc_key.hash.clone(), + ); + (binding, sbc_key_2) + } +} diff --git a/src/encoder/xdelta_encoder.rs b/src/encoder/xdelta_encoder.rs new file mode 100644 index 0000000..1615bfd --- /dev/null +++ b/src/encoder/xdelta_encoder.rs @@ -0,0 +1,745 @@ +use crate::chunkfs_sbc::ClusterPoint; +use crate::decoder::Decoder; +use crate::encoder::{ + count_delta_chunks_with_hash, encode_copy_instruction, encode_insert_instruction, + get_parent_data, Encoder, +}; +use crate::{ChunkType, SBCHash, SBCKey, SBCMap}; +use chunkfs::Data; +use chunkfs::Database; +use std::cmp::min; +use std::collections::HashMap; +use std::sync::{Arc, 
Mutex}; +use zstd::stream; + +const BLOCK_SIZE: usize = 16; +const ADLER_MOD: u32 = 65521; + +pub struct XdeltaEncoder { + zstd_flag: bool, +} + +impl Default for XdeltaEncoder { + fn default() -> Self { + Self::new(false) + } +} + +impl XdeltaEncoder { + /// Creates a new XdeltaEncoder with specified compression settings. + /// + /// # Arguments + /// * `zstd_flag` - Whether to apply zstd compression to delta-encoded data: + /// - `true`: Apply zstd compression (level 0). + /// - `false`: Store raw delta instructions. + pub fn new(zstd_flag: bool) -> Self { + XdeltaEncoder { zstd_flag } + } + + /// Encodes a single chunk as delta against a parent chunk using xdelta algorithm. + /// + /// # Type Parameters + /// - `D`: Decoder implementation for chunk retrieval + /// - `Hash`: Hash type implementing SBCHash + /// + /// # Arguments + /// * `target_map` - Thread-safe reference to chunk storage. + /// * `chunk_data` - Raw data to encode. + /// * `hash` - Content hash of the chunk data. + /// * `parent_data` - Reference parent chunk data. + /// * `word_hash_offsets` - Precomputed block positions from parent. + /// * `parent_hash` - Hash of the parent chunk. + /// + /// # Returns + /// Tuple containing: + /// 1. `usize` - Always 0 (represents unused data) + /// 2. `usize` - Size of stored delta data (after optional compression) + /// 3. 
`SBCKey` - Key where delta was stored + fn encode_delta_chunk( + &self, + target_map: Arc>>, + chunk_data: &[u8], + hash: Hash, + parent_data: &[u8], + word_hash_offsets: &HashMap>, + parent_hash: Hash, + ) -> (usize, usize, SBCKey) { + let mut delta_code = Vec::new(); + + let mut i = 0; + while i + BLOCK_SIZE <= chunk_data.len() { + let adler_hash_word = adler32(&chunk_data[i..i + BLOCK_SIZE]); + + if !word_hash_offsets.contains_key(&adler_hash_word) { + encode_insert_sequence( + chunk_data, + &mut i, + word_hash_offsets, + &mut delta_code, + adler_hash_word, + ); + } else { + encode_copy_sequence( + parent_data, + chunk_data, + &mut i, + &mut delta_code, + adler_hash_word, + word_hash_offsets, + ) + } + } + if i < chunk_data.len() { + let remaining_data = chunk_data[i..].to_vec(); + encode_insert_instruction(remaining_data, &mut delta_code); + } + + let (processed_data, sbc_hash) = prepare_and_store_delta_chunk( + target_map, + hash, + parent_hash, + delta_code, + self.zstd_flag, + ); + (0, processed_data, sbc_hash) + } +} + +impl Encoder for XdeltaEncoder { + /// Encodes a cluster of data chunks using Xdelta compression against a parent chunk. + /// + /// # Arguments + /// * `target_map` - Thread-safe reference to the chunk storage map (Arc). + /// * `cluster` - Mutable slice of ClusterPoints to process. + /// * `parent_hash` - Hash of the suggested parent chunk for delta reference. + /// + /// # Returns + /// A tuple containing: + /// 1. `usize` - Total bytes of data that couldn't be delta-encoded (left as-is). + /// 2. `usize` - Total bytes of processed delta-encoded data. 
+ fn encode_cluster( + &self, + target_map: Arc>>, + cluster: &mut [ClusterPoint], + parent_hash: Hash, + ) -> (usize, usize) { + let mut processed_data = 0; + let parent_chunk = get_parent_data(target_map.clone(), parent_hash.clone(), cluster); + let mut data_left = parent_chunk.data_left; + let parent_data = parent_chunk.parent_data; + let word_hash_offsets = create_block_hashmap(parent_data.as_slice()); + + for (chunk_id, (hash, data_container)) in cluster.iter_mut().enumerate() { + if parent_chunk.index > -1 && chunk_id == parent_chunk.index as usize { + continue; + } + let mut target_hash = SBCKey::default(); + match data_container.extract() { + Data::Chunk(data) => { + let (left, processed, sbc_hash) = self.encode_delta_chunk( + target_map.clone(), + data, + hash.clone(), + parent_data.as_slice(), + &word_hash_offsets, + parent_hash.clone(), + ); + data_left += left; + processed_data += processed; + target_hash = sbc_hash; + } + Data::TargetChunk(_) => {} + } + data_container.make_target(vec![target_hash]); + } + (data_left, processed_data) + } +} + +/// Prepares and stores a delta-encoded chunk in the shared chunk map. +/// +/// # Arguments +/// * `target_map` - Thread-safe reference to the chunk storage map (Arc). +/// * `hash` - Content hash of the original chunk data. +/// * `parent_hash` - Hash of the parent chunk this delta is based on. +/// * `delta_code` - Raw delta-encoded data to store. +/// * `zstd_flag` - Whether to apply zstd compression to the delta data. +/// +/// # Returns +/// A tuple containing: +/// 1. `usize` - Final size of the stored data (after optional compression). +/// 2. `SBCKey` - Key under which the chunk was stored. 
+fn prepare_and_store_delta_chunk( + target_map: Arc>>, + hash: Hash, + parent_hash: Hash, + delta_code: Vec, + zstd_flag: bool, +) -> (usize, SBCKey) { + let mut target_map_lock = target_map.lock().unwrap(); + let number_delta_chunk = count_delta_chunks_with_hash(&target_map_lock, &hash); + let sbc_hash = SBCKey { + hash, + chunk_type: ChunkType::Delta { + parent_hash, + number: number_delta_chunk, + }, + }; + + let delta_code = if zstd_flag { + stream::encode_all(delta_code.as_slice(), 0).unwrap() + } else { + delta_code + }; + + let processed_data = delta_code.len(); + let _ = target_map_lock.insert(sbc_hash.clone(), delta_code); + + (processed_data, sbc_hash) +} + +/// Encodes a matching sequence as a COPY instruction. +/// +/// # Arguments +/// * `parent_data` - Reference data containing the matching block. +/// * `chunk_data` - Current data being processed. +/// * `i` - Current position in `chunk_data` (updated after execution). +/// * `delta_code` - Output buffer for delta instructions. +/// * `initial_hash` - Adler-32 hash of the first block at position `i` in `chunk_data`. +/// * `word_hash_offsets` - A hash table mapping Adler-32 hashes of blocks in the parent data to their offsets. Used to detect when a match starts at the current position. 
+fn encode_copy_sequence( + parent_data: &[u8], + chunk_data: &[u8], + i: &mut usize, + delta_code: &mut Vec<u8>, + initial_hash: u32, + word_hash_offsets: &HashMap<u32, Vec<usize>>, +) { + if *i >= chunk_data.len() || !word_hash_offsets.contains_key(&initial_hash) { + return; + } + + let offsets = match word_hash_offsets.get(&initial_hash) { + Some(v) => v, + None => return, + }; + + let mut best_len = 0; + let mut best_offset = 0; + + for &offset in offsets { + let max_len = min(parent_data.len() - offset, chunk_data.len() - *i); + let mut equal_part_len = 0; + + while equal_part_len < max_len + && parent_data[offset + equal_part_len] == chunk_data[*i + equal_part_len] + { + equal_part_len += 1; + } + + if equal_part_len > best_len { + best_len = equal_part_len; + best_offset = offset; + } + } + + if best_len > 0 { + encode_copy_instruction(best_len, best_offset, delta_code); + *i += best_len; + } else { + let end = min(*i + BLOCK_SIZE, chunk_data.len()); + let insert_data = chunk_data[*i..end].to_vec(); + encode_insert_instruction(insert_data, delta_code); + *i = end; + } +} + +/// Encodes a non-matching sequence as an INSERT instruction. +/// +/// # Arguments +/// * `chunk_data` - Current data being processed. +/// * `i` - Current position in `chunk_data` (updated after execution). +/// * `word_hash_offsets` - A hash table mapping Adler-32 hashes of blocks in the parent data to their offsets. Used to detect when a match starts at the current position. +/// * `delta_code` - Output buffer for delta instructions. +/// * `initial_hash` - Adler-32 hash of the first block at position `i` in `chunk_data`. 
+fn encode_insert_sequence( + chunk_data: &[u8], + i: &mut usize, + word_hash_offsets: &HashMap<u32, Vec<usize>>, + delta_code: &mut Vec<u8>, + initial_hash: u32, +) { + if *i >= chunk_data.len() { + return; + } + + let mut current_hash = initial_hash; + let mut insert_data = Vec::new(); + + while !word_hash_offsets.contains_key(&current_hash) { + insert_data.push(chunk_data[*i]); + *i += 1; + + if *i + BLOCK_SIZE <= chunk_data.len() { + current_hash = adler32(&chunk_data[*i..*i + BLOCK_SIZE]); + } else { + let right_border = min(*i + BLOCK_SIZE, chunk_data.len()); + insert_data.extend_from_slice(&chunk_data[*i..right_border]); + *i = chunk_data.len(); + break; + } + } + + if !insert_data.is_empty() { + encode_insert_instruction(insert_data, delta_code); + } +} + +/// Computes the Adler-32 checksum for a given byte slice. +fn adler32(data: &[u8]) -> u32 { + let mut a: u32 = 1; + let mut b: u32 = 0; + + for &byte in data { + a = (a + byte as u32) % ADLER_MOD; + b = (b + a) % ADLER_MOD; + } + + (b << 16) | a +} + +/// Creates a hash map that maps each block's hash to all of its occurrence positions in the source data. +/// +/// # Arguments +/// * `source_data` - The reference data to be indexed. +/// +/// # Returns +/// HashMap where: +/// - Key: Adler32 hash of a block. +/// - Value: All starting positions of that block in source_data. 
+fn create_block_hashmap(source_data: &[u8]) -> HashMap> { + let mut i = 0; + let mut block_position_map = HashMap::new(); + + while i + BLOCK_SIZE <= source_data.len() { + let block_hash = adler32(&source_data[i..i + BLOCK_SIZE]); + block_position_map + .entry(block_hash) + .or_insert_with(Vec::new) + .push(i); + i += 1; + } + + block_position_map +} + +#[cfg(test)] +mod test { + use super::*; + use crate::decoder; + use crate::encoder::encode_simple_chunk; + use crate::hasher::AronovichHash; + use rand::prelude::StdRng; + use rand::{Rng, SeedableRng}; + + const TEST_DATA_SIZE: usize = 8192; + + #[test] + fn create_block_hashmap_should_return_empty_map_for_data_shorter_than_16_bytes() { + let short_data = [0u8; 15]; + let result = create_block_hashmap(&short_data); + assert!(result.is_empty()); + } + + #[test] + fn create_block_hashmap_should_create_empty_map_for_empty_data() { + let empty_data = []; + let result = create_block_hashmap(&empty_data); + assert!(result.is_empty()); + } + + #[test] + fn create_block_hashmap_should_store_first_position_for_duplicate_blocks() { + let data = b"abcdabcdabcdabcdabcdabcdabcdabcd"; + let result = create_block_hashmap(data); + assert_eq!( + result.get(&adler32(b"abcdabcdabcdabcd")), + Some(&vec![0, 4, 8, 12, 16]) + ); + assert_eq!(result.len(), 3); + } + + #[test] + fn encode_insert_sequence_should_insert_full_chunk_when_no_hash_matches() { + let chunk_data = vec![10; 20]; + let word_hash_offsets = HashMap::new(); + let mut delta_code = Vec::new(); + let mut i = 0; + + let adler_hash = adler32(&chunk_data[i..i + BLOCK_SIZE]); + encode_insert_sequence( + &chunk_data, + &mut i, + &word_hash_offsets, + &mut delta_code, + adler_hash, + ); + + assert_eq!(i, chunk_data.len()); + + let header = &delta_code[..3]; + assert_eq!(header, &[20, 0, 0x80]); + + let data = &delta_code[3..]; + assert_eq!(data, chunk_data.as_slice()); + } + + #[test] + fn encode_insert_sequence_should_insert_partial_tail_when_less_than_block_size() { + let 
chunk_data = vec![10; 5]; + let word_hash_offsets = HashMap::new(); + let mut delta_code = Vec::new(); + let mut i = 0; + + let adler_hash = 0; + encode_insert_sequence( + &chunk_data, + &mut i, + &word_hash_offsets, + &mut delta_code, + adler_hash, + ); + assert_eq!(i, chunk_data.len()); + + let header = &delta_code[..3]; + assert_eq!(header, &[5, 0, 0x80]); + + let data = &delta_code[3..]; + assert_eq!(data, chunk_data.as_slice()); + } + + #[test] + fn encode_insert_sequence_should_not_insert_if_match_found_at_start() { + let chunk_data = vec![10; 16]; + let mut word_hash_offsets = HashMap::new(); + let hash = adler32(&chunk_data); + word_hash_offsets.insert(hash, vec![0]); + + let mut delta_code = Vec::new(); + let mut i = 0; + + encode_insert_sequence( + &chunk_data, + &mut i, + &word_hash_offsets, + &mut delta_code, + hash, + ); + + assert!(delta_code.is_empty()); + assert_eq!(i, 0); + } + + #[test] + fn encode_insert_sequence_should_insert_only_part_of_data_if_match_found_later() { + let mut chunk_data = vec![10; 32]; + chunk_data[..4].copy_from_slice(&[1, 2, 3, 4]); + + let hash_second_block = adler32(&chunk_data[16..32]); + + let mut word_hash_offsets = HashMap::new(); + word_hash_offsets.insert(hash_second_block, vec![16]); + + let mut delta_code = Vec::new(); + let mut i = 0; + + let initial_hash = adler32(&chunk_data[i..i + BLOCK_SIZE]); + encode_insert_sequence( + &chunk_data, + &mut i, + &word_hash_offsets, + &mut delta_code, + initial_hash, + ); + + let mut expected = vec![4, 0, 0x80]; + expected.extend_from_slice(&[1, 2, 3, 4]); + + assert_eq!(delta_code, expected); + assert_eq!(i, 4); + } + + #[test] + fn encode_copy_sequence_should_encode_full_copy_when_blocks_match() { + let parent_data = vec![10; 32]; + let chunk_data = vec![10; 32]; + + let word_hash_offsets = create_block_hashmap(&parent_data); + + let mut i = 0; + let mut delta_code = Vec::new(); + let initial_hash = adler32(&chunk_data[i..i + BLOCK_SIZE]); + + encode_copy_sequence( + 
&parent_data, + &chunk_data, + &mut i, + &mut delta_code, + initial_hash, + &word_hash_offsets, + ); + + let expected = vec![32, 0, 0, 0, 0, 0]; + assert_eq!(delta_code, expected); + assert_eq!(i, 32); + } + + #[test] + fn encode_copy_sequence_should_handle_non_aligned_matches() { + let parent = b"abcdefghijklmnopqrstuvwxyzABCDEF".to_vec(); + let chunk = b"ijklmnopqrstuvwxyzABCDEFGHIJKL".to_vec(); + let word_hash_offsets = create_block_hashmap(&parent); + + let mut i = 0; + let mut delta = vec![]; + let hash = adler32(&chunk[i..i + BLOCK_SIZE]); + + encode_copy_sequence( + &parent, + &chunk, + &mut i, + &mut delta, + hash, + &word_hash_offsets, + ); + + assert_eq!(i, 24); + assert_eq!(delta[..3], 24u32.to_ne_bytes()[..3]); + assert_eq!(delta[3..6], 8u32.to_ne_bytes()[..3]); + } + + #[test] + fn encode_copy_sequence_should_limit_match_by_parent_data_size() { + let parent = vec![0u8; 16]; + let chunk = vec![0u8; 32]; + let word_hash_offsets = create_block_hashmap(&parent); + let hash = adler32(&parent[..BLOCK_SIZE]); + + let mut i = 0; + let mut delta = vec![]; + + encode_copy_sequence( + &parent, + &chunk, + &mut i, + &mut delta, + hash, + &word_hash_offsets, + ); + + assert_eq!(i, BLOCK_SIZE); + } + + #[test] + fn encode_copy_sequence_should_do_nothing_when_hash_not_found() { + let parent = vec![0u8; 16]; + let chunk = vec![0u8; 16]; + let word_hash_offsets = create_block_hashmap(&parent); + + let mut i = 0; + let mut delta = vec![]; + let invalid_hash = adler32(b"invalid_block____"); + + encode_copy_sequence( + &parent, + &chunk, + &mut i, + &mut delta, + invalid_hash, + &word_hash_offsets, + ); + + assert!(delta.is_empty()); + assert_eq!(i, 0); + } + + #[test] + fn encode_copy_sequence_should_do_nothing_when_position_out_of_bounds() { + let parent = vec![0u8; 16]; + let chunk = vec![0u8; 16]; + let word_hash_offsets = create_block_hashmap(&parent); + + let mut i = chunk.len(); + let mut delta = vec![]; + let hash = adler32(&[0; BLOCK_SIZE]); + + 
encode_copy_sequence( + &parent, + &chunk, + &mut i, + &mut delta, + hash, + &word_hash_offsets, + ); + + assert!(delta.is_empty()); + assert_eq!(i, chunk.len()); + } + + #[test] + fn test_restore_similarity_chunk_1_byte_diff() { + let mut data: Vec = generate_test_data_deterministic(13); + let data2 = data.clone(); + if data[15] < 255 { + data[15] = 255; + } else { + data[15] = 0; + } + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + + #[test] + fn test_restore_similarity_chunk_2_neighbor_byte_diff() { + let mut data: Vec = generate_test_data_deterministic(56); + let data2 = data.clone(); + if data[15] < 255 { + data[15] = 255; + } else { + data[15] = 0; + } + if data[16] < 255 { + data[16] = 255; + } else { + data[16] = 0; + } + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + + #[test] + fn test_restore_similarity_chunk_2_byte_diff() { + let mut data: Vec = generate_test_data_deterministic(35); + let data2 = data.clone(); + if data[15] < 255 { + data[15] = 255; + } else { + data[15] = 0; + } + if data[106] < 255 { + data[106] = 255; + } else { + data[106] = 0; + } + + let (sbc_map, sbc_key) = create_map_and_key(&data, &data2); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + + #[test] + fn test_restore_similarity_chunk_with_offset_left() { + let data: Vec = generate_test_data_deterministic(41); + let data2 = data[15..].to_vec(); + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + + #[test] + fn test_restore_similarity_chunk_with_offset_right() { + let data: Vec = generate_test_data_deterministic(65); + let data2 = data[..8000].to_vec(); + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); 
+ } + + #[test] + fn test_restore_similarity_chunk_with_offset() { + let data: Vec = generate_test_data_deterministic(45); + let mut data2 = data[15..8000].to_vec(); + data2[0] /= 3; + data2[7000] /= 3; + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + + #[test] + fn test_restore_similarity_chunk_with_cyclic_shift_right() { + let data: Vec = generate_test_data_deterministic(44); + let mut data2 = data.clone(); + data2.extend(&data[8000..]); + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_ne!(data, []); + assert_eq!( + sbc_key.chunk_type, + ChunkType::Delta { + parent_hash: AronovichHash::new_with_u32(0), + number: 0 + } + ); + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + + #[test] + fn test_restore_similarity_chunk_with_cyclic_shift_left() { + let data: Vec = generate_test_data_deterministic(42); + let mut data2 = data[..192].to_vec(); + data2.extend(&data); + + let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice()); + + assert_ne!(data, []); + assert_eq!( + sbc_key.chunk_type, + ChunkType::Delta { + parent_hash: AronovichHash::new_with_u32(0), + number: 0 + } + ); + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2); + } + + fn generate_test_data_deterministic(seed: u64) -> Vec { + let mut rng = StdRng::seed_from_u64(seed); + (0..TEST_DATA_SIZE).map(|_| rng.gen()).collect() + } + + fn create_map_and_key<'a>( + data: &'a [u8], + data2: &'a [u8], + ) -> ( + SBCMap, + SBCKey, + ) { + let word_hash_offsets = create_block_hashmap(data); + let mut binding = SBCMap::new(decoder::GdeltaDecoder::default()); + let sbc_map = Arc::new(Mutex::new(&mut binding)); + + let (_, sbc_key) = encode_simple_chunk( + &mut sbc_map.lock().unwrap(), + data, + AronovichHash::new_with_u32(0), + ); + let (_, _, sbc_key_2) = XdeltaEncoder::default().encode_delta_chunk( + sbc_map.clone(), + data2, + 
AronovichHash::new_with_u32(3), + data, + &word_hash_offsets, + sbc_key.hash.clone(), + ); + (binding, sbc_key_2) + } +} diff --git a/src/encoder/zdelta_comprassion_error.rs b/src/encoder/zdelta_comprassion_error.rs new file mode 100644 index 0000000..9b6aff0 --- /dev/null +++ b/src/encoder/zdelta_comprassion_error.rs @@ -0,0 +1,52 @@ +use thiserror::Error; + +/// Error types related to zdelta encoding operations. +#[derive(Debug, Error)] +pub enum ZdeltaCompressionError { + #[error("Match encoding error: {0}")] + MatchEncoding(#[from] MatchEncodingError), + + #[error("Data conversion error: {0}")] + DataConversion(#[from] DataConversionError), + + #[error("IO/Storage error: {0}")] + Storage(#[from] StorageError), +} + +/// Errors related to data format and conversion. +/// +/// These errors occur when input data doesn't meet requirements for processing. +#[derive(Debug, Error)] +pub enum DataConversionError { + #[error("Chunk too small: got {actual_size} bytes, need at least {required_size}")] + ChunkTooSmall { + /// Actual size of provided data chunk. + actual_size: usize, + /// Minimum required size for processing + required_size: usize, + }, +} + +/// Errors related to storage operations and IO. +/// +/// These errors occur during interaction with storage systems and locks. +#[derive(Debug, Error)] +pub enum StorageError { + #[error("Lock acquisition failed: {0}")] + LockFailed(String), + + #[error("Data insertion failed: {0}")] + InsertionFailed(String), +} + +/// Errors related to match encoding operations. +/// +/// These errors occur during the encoding of matches between target and reference data. 
+#[derive(Debug, Error, PartialEq)] +pub enum MatchEncodingError { + #[error("Invalid match length {0} (allowed {1}-{2})")] + InvalidLength(usize, usize, usize), + + #[error("Invalid parameter combination")] + InvalidParameterCombination, +} diff --git a/src/encoder/zdelta_encoder.rs b/src/encoder/zdelta_encoder.rs new file mode 100644 index 0000000..3bab384 --- /dev/null +++ b/src/encoder/zdelta_encoder.rs @@ -0,0 +1,1311 @@ +use crate::chunkfs_sbc::ClusterPoint; +use crate::decoder::Decoder; +use crate::encoder::zdelta_comprassion_error::{ + DataConversionError, MatchEncodingError, StorageError, +}; +use crate::encoder::zdelta_match_pointers::{MatchPointers, ReferencePointerType}; +use crate::encoder::{count_delta_chunks_with_hash, get_parent_data, Encoder}; +use crate::hasher::SBCHash; +use crate::{ChunkType, SBCKey, SBCMap}; +use bit_vec::BitVec; +use chunkfs::{Data, Database}; +use huffman_compress::{Book, CodeBuilder, Tree}; +use std::cmp::min; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; + +const LARGE_OFFSET_PENALTY_THRESHOLD: i32 = 4096; +const MIN_MATCH_LENGTH: usize = 3; +const MAX_MATCH_LENGTH: usize = 1026; +const LENGTH_BLOCK_SIZE: usize = 256; +const HASH_TABLE_SIZE: usize = 65536; +const MAX_HASH_CHAIN_LENGTH: usize = 1024; +const LITERAL_FLAG: u8 = 0x00; + +/// A 3-byte sequence used for finding matches. +type Triplet = [u8; 3]; + +/// Zdelta compression encoder. +/// +/// Implements delta compression between target and reference data using: +/// - LZ77-style matching with reference pointers. +/// - Optional Huffman encoding of the delta. +pub struct ZdeltaEncoder { + huffman_book: Option>, +} + +impl Default for ZdeltaEncoder { + fn default() -> Self { + Self::new(true) + } +} + +impl Encoder for ZdeltaEncoder { + /// Encodes a cluster of data chunks using Zdelta compression against a parent chunk. + /// + /// # Arguments + /// * `target_map` - Thread-safe reference to the chunk storage map (Arc). 
+ /// * `cluster` - Mutable slice of ClusterPoints to process. + /// * `parent_hash` - Hash of the suggested parent chunk for delta reference. + /// + /// # Returns + /// A tuple containing: + /// 1. `usize` - Total bytes of data that couldn't be delta-encoded (left as-is). + /// 2. `usize` - Total bytes of processed delta-encoded data. + fn encode_cluster( + &self, + target_map: Arc>>, + cluster: &mut [ClusterPoint], + parent_hash: Hash, + ) -> (usize, usize) { + let parent_info = get_parent_data(target_map.clone(), parent_hash.clone(), cluster); + let mut data_left = parent_info.data_left; + let mut total_processed_bytes = 0; + let parent_data = parent_info.parent_data; + let parent_triplet_lookup_table = match build_triplet_lookup_table(&parent_data) { + Ok(triplet_lookup_table) => triplet_lookup_table, + Err(_) => { + panic!("Chunk is too small (Chunk size should be at least three bytes)") + } + }; + + for (chunk_id, (hash, data_container)) in cluster.iter_mut().enumerate() { + if parent_info.index > -1 && chunk_id == parent_info.index as usize { + continue; + } + let mut target_hash = SBCKey::default(); + match data_container.extract() { + Data::Chunk(data) => { + let (left, processed, sbc_hash) = self.encode_delta_chunk( + target_map.clone(), + data, + hash.clone(), + parent_data.as_slice(), + &parent_triplet_lookup_table, + parent_hash.clone(), + ); + data_left += left; + total_processed_bytes += processed; + target_hash = sbc_hash; + } + Data::TargetChunk(_) => {} + } + data_container.make_target(vec![target_hash]); + } + (data_left, total_processed_bytes) + } +} + +impl ZdeltaEncoder { + /// Creates a new ZdeltaEncoder. + /// + /// # Arguments + /// * `use_huffman_encoding` - Whether to use Huffman encoding for the delta. 
+ pub fn new(use_huffman_encoding: bool) -> Self { + if use_huffman_encoding { + let (huffman_book, _) = create_default_huffman_book_and_tree(); + Self { + huffman_book: Some(huffman_book), + } + } else { + Self { huffman_book: None } + } + } + + pub fn huffman_book(&self) -> Option<&Book> { + self.huffman_book.as_ref() + } + + /// Encodes a single data chunk using delta compression against a reference. + /// + /// # Arguments + /// * `target_map` - Shared map for storing compressed chunks. + /// * `target_data` - The data to be compressed. + /// * `target_hash` - Hash identifier for the target data. + /// * `parent_data` - Reference data to compare against. + /// * `parent_triplet_lookup_table` - Precomputed positions of triplets in reference data. + /// * `parent_hash` - Hash identifier for the parent/reference data. + /// + /// # Returns + /// 1. Number of uncompressed bytes. + /// 2. Total bytes processed. + /// 3. Storage key for the compressed delta. + /// + /// # Errors + /// Returns `ZdeltaCompressionError` if: + /// - Huffman encoding fails when enabled. + /// - Match parameters are invalid. + /// - Storage operations fail. 
+ fn encode_delta_chunk( + &self, + target_map: Arc>>, + target_data: &[u8], + target_hash: Hash, + parent_data: &[u8], + parent_triplet_lookup_table: &HashMap>, + parent_hash: Hash, + ) -> (usize, usize, SBCKey) { + let mut delta_code: Vec = Vec::new(); + let mut uncompressed_data = 0; + let mut pointers = MatchPointers::new(0, 0, 0); + let mut previous_match_offset: Option = None; + let mut bit_vec_delta_code = BitVec::new(); + + let mut position_in_target_data: usize = 0; + while position_in_target_data + MIN_MATCH_LENGTH <= target_data.len() { + let mut triplet = [0u8; 3]; + triplet.copy_from_slice( + &target_data[position_in_target_data..position_in_target_data + 3], + ); + let hash = compute_triplet_hash(&triplet); + + if let Some(parent_positions) = parent_triplet_lookup_table.get(&hash) { + if let Some((match_length, offset, pointer_type)) = select_best_match( + target_data, + parent_data, + position_in_target_data, + parent_positions, + &pointers, + ) { + if match_length < MIN_MATCH_LENGTH { + self.encode_literal( + target_data[position_in_target_data], + &mut delta_code, + &mut bit_vec_delta_code, + &mut uncompressed_data, + ); + position_in_target_data += 1; + continue; + } + if let Some(book) = self.huffman_book() { + match encode_match_huffman( + match_length, + offset, + &pointer_type, + book, + target_data.len() - position_in_target_data, + ) { + Ok(encoded) => { + bit_vec_delta_code.extend(&encoded); + } + Err(_) => { + log::warn!( + "Invalid match length \ + (allowed: {MIN_MATCH_LENGTH}-{MAX_MATCH_LENGTH}), \ + falling back to literal encoding" + ); + + for &byte in &target_data[position_in_target_data + ..position_in_target_data + match_length] + { + self.encode_literal( + byte, + &mut delta_code, + &mut bit_vec_delta_code, + &mut uncompressed_data, + ); + } + } + } + } else { + match encode_match_raw( + match_length, + offset, + &pointer_type, + target_data.len() - position_in_target_data, + ) { + Ok(encoded) => 
delta_code.extend_from_slice(&encoded), + Err(e) => { + match e { + MatchEncodingError::InvalidLength(..) => { + log::warn!( + "Invalid match length \ + (allowed: {MIN_MATCH_LENGTH}-{MAX_MATCH_LENGTH}), \ + falling back to literal encoding" + ); + } + MatchEncodingError::InvalidParameterCombination => { + log::error!( + "Invalid parameter combination \ + (length: {match_length}, offset: {offset}, pointer: {pointer_type:?})"); + } + } + for &byte in &target_data[position_in_target_data + ..position_in_target_data + match_length] + { + delta_code.push(byte); + uncompressed_data += 1; + } + } + } + } + + let reference_match_end = match pointer_type { + ReferencePointerType::TargetLocal => position_in_target_data + match_length, + _ => { + let base_ptr = pointers.get(&pointer_type); + (base_ptr as isize + offset as isize + match_length as isize) as usize + } + }; + pointers.smart_update_after_match( + reference_match_end, + offset, + pointer_type, + previous_match_offset, + ); + previous_match_offset = Some(offset); + position_in_target_data += match_length; + continue; + } + } + + self.encode_literal( + target_data[position_in_target_data], + &mut delta_code, + &mut bit_vec_delta_code, + &mut uncompressed_data, + ); + position_in_target_data += 1; + } + + while position_in_target_data < target_data.len() { + self.encode_literal( + target_data[position_in_target_data], + &mut delta_code, + &mut bit_vec_delta_code, + &mut uncompressed_data, + ); + position_in_target_data += 1; + } + if self.huffman_book().is_some() { + delta_code.extend_from_slice(&bit_vec_delta_code.to_bytes()); + } + + let sbc_key = match store_delta_chunk(target_map, target_hash, parent_hash, delta_code) { + Ok(key) => key, + Err(StorageError::LockFailed(e)) => { + panic!("Critical storage lock failure: {e}"); + } + Err(StorageError::InsertionFailed(e)) => { + panic!("Non-critical insertion failure: {e}"); + } + }; + + (uncompressed_data, target_data.len(), sbc_key) + } + + /// Encodes a single 
literal byte using configured encoding. + /// + /// # Arguments + /// * `byte` - The byte to encode. + /// * `huffman_book` - Huffman code book (when Huffman encoding is enabled). + /// * `delta_code` - Output buffer for encoded data. + /// * `bit_vec_delta_code` - Used as delta_code when huffman is enabled. + /// * `uncompressed_data` - Counter for tracking uncompressed bytes. + /// + /// # Errors + /// Returns `MatchEncodingError` if: + /// - Huffman encoding is enabled but book is not available. + /// - Huffman encoding fails. + fn encode_literal( + &self, + byte: u8, + delta_code: &mut Vec, + bit_vec_delta_code: &mut BitVec, + uncompressed_data: &mut usize, + ) { + if let Some(book) = self.huffman_book() { + let encoded = encode_literal_huffman(byte, book); + bit_vec_delta_code.extend(&encoded); + } else { + delta_code.push(0x00); + delta_code.push(byte); + } + *uncompressed_data += 1; + } +} + +/// Stores a compressed delta chunk in the target map. +fn store_delta_chunk( + target_map: Arc>>, + target_hash: Hash, + parent_hash: Hash, + delta_code: Vec, +) -> Result, StorageError> { + let mut target_map_lock = target_map + .lock() + .map_err(|e| StorageError::LockFailed(format!("Failed to acquire lock: {e}")))?; + let number_delta_chunk = count_delta_chunks_with_hash(&target_map_lock, &target_hash); + let sbc_hash = SBCKey { + hash: target_hash, + chunk_type: ChunkType::Delta { + parent_hash, + number: number_delta_chunk, + }, + }; + + target_map_lock + .insert(sbc_hash.clone(), delta_code) + .map_err(|e| StorageError::InsertionFailed(format!("Failed to insert delta chunk: {e}")))?; + + Ok(sbc_hash) +} + +/// Encodes a match using Huffman coding. +/// +/// # Arguments +/// * `match_length` - Length of match (3-1026 bytes). +/// * `offset` - Signed offset from reference pointer (-32768..32767). +/// * `pointer_type` - Which reference pointer was used. +/// * `book` - Huffman code book for encoding. 
+/// * `data_length` - The total length of the data to ensure the match fits. +/// +/// # Returns encoded bytes representing the match or error if: +/// - Match length is out of valid range. +/// - Huffman encoding fails. +/// +/// # Encoding Format +/// The match is encoded as: +/// 1. Flag byte (combines length coefficient, pointer type and direction). +/// 2. Length remainder. +/// 3. Offset bytes (big-endian). +fn encode_match_huffman( + match_length: usize, + offset: i16, + pointer_type: &ReferencePointerType, + book: &Book, + data_length: usize, +) -> Result { + let effective_length = min(match_length, data_length); + + if !(MIN_MATCH_LENGTH..=MAX_MATCH_LENGTH).contains(&effective_length) { + return Err(MatchEncodingError::InvalidLength( + effective_length, + MIN_MATCH_LENGTH, + MAX_MATCH_LENGTH, + )); + } + + let (length_remainder, length_coefficient) = + calculate_length_components(effective_length, data_length); + let is_positive_offset = offset >= 0; + + let flag = encode_match_flag(length_coefficient, pointer_type, is_positive_offset)?; + + let offset_abs = offset.unsigned_abs(); + let [offset_high, offset_low] = offset_abs.to_be_bytes(); + + use bit_vec::BitVec; + let mut buffer = BitVec::new(); + + book.encode(&mut buffer, &flag) + .expect("Flag codes (1-20) must be in codebook"); + book.encode(&mut buffer, &length_remainder) + .expect("Length remainders (0-255) must be in codebook"); + book.encode(&mut buffer, &offset_high) + .expect("Offset bytes (0-255) must be in codebook"); + book.encode(&mut buffer, &offset_low) + .expect("Offset bytes (0-255) must be in codebook"); + + Ok(buffer) +} + +/// Creates default Huffman coding book and tree optimized for zdelta. +/// +/// The book contains codes for: +/// - 20 flag values. +/// - 256 literal bytes. +/// - 256 length remainders. +/// - 256 offset bytes. +/// +/// Frequencies are weighted to favor: +/// - Smaller flag values. +/// - ASCII literals. +/// - Smaller lengths and offsets. 
pub fn create_default_huffman_book_and_tree() -> (Book, Tree) {
    let mut frequencies = HashMap::new();

    frequencies.insert(LITERAL_FLAG, 100);

    // NOTE(review): all four sections below insert into the SAME u8-keyed map,
    // so each later loop overwrites the earlier ones for overlapping keys.
    // After the final loop the effective frequency table is simply
    // {k: 20 for k < 128, k: 5 for k >= 128}; the flag/literal/length weights
    // written earlier never take effect. If distinct weights per symbol class
    // are intended, the classes need disjoint symbol spaces (or separate
    // books) — confirm against ZdeltaDecoder before changing, since encoder
    // and decoder must build identical codebooks.

    // Frequencies for flags (1-20)
    for i in 1..=20 {
        frequencies.insert(i as u8, 100);
    }

    // Frequencies for literals (0-255)
    for i in 0..=255 {
        frequencies.insert(i as u8, if i < 128 { 50 } else { 10 });
    }

    // Frequencies for length residues (0-255)
    for i in 0..=255 {
        frequencies.insert(i as u8, if i < 128 { 30 } else { 5 });
    }

    // Frequencies for offsets (0-255)
    for i in 0..=255 {
        frequencies.insert(i as u8, if i < 128 { 20 } else { 5 });
    }

    CodeBuilder::from_iter(frequencies).finish()
}

/// Encodes a literal byte using Huffman coding.
///
/// A literal is emitted as the `LITERAL_FLAG` symbol followed by the byte
/// itself, both Huffman-coded through `book`.
///
/// # Arguments
/// * `literal` - The byte value to encode.
/// * `book` - Huffman code book for encoding.
///
/// # Returns
/// Encoded BitVec (encoding cannot fail: all 256 byte values and the literal
/// flag are present in the default codebook).
fn encode_literal_huffman(literal: u8, book: &Book) -> BitVec {
    use bit_vec::BitVec;
    let mut buffer = BitVec::new();

    book.encode(&mut buffer, &LITERAL_FLAG)
        .expect("Literal flag must be in codebook");
    book.encode(&mut buffer, &literal)
        .expect("All literals (0-255) must be in codebook");

    buffer
}

/// Encodes a match using raw byte representation (without Huffman coding).
///
/// # Arguments
/// * `match_length` - Length of the match (3-1026 bytes).
/// * `offset` - Signed offset from reference pointer.
/// * `pointer_type` - Which reference pointer was used.
/// * `data_length` - The total length of the data to ensure the match fits.
///
/// # Encoding Format
/// 1. Flag byte.
/// 2. Length remainder byte.
/// 3. Offset high byte.
/// 4. Offset low byte.
+fn encode_match_raw( + match_length: usize, + offset: i16, + pointer_type: &ReferencePointerType, + data_length: usize, +) -> Result, MatchEncodingError> { + let effective_length = min(match_length, data_length); + + if !(MIN_MATCH_LENGTH..=MAX_MATCH_LENGTH).contains(&effective_length) { + return Err(MatchEncodingError::InvalidLength( + effective_length, + MIN_MATCH_LENGTH, + MAX_MATCH_LENGTH, + )); + } + + let (length_remainder, length_coefficient) = + calculate_length_components(effective_length, data_length); + let is_positive_offset = offset >= 0; + + let flag = encode_match_flag(length_coefficient, pointer_type, is_positive_offset)?; + + let offset_abs = offset.unsigned_abs(); + let [offset_high, offset_low] = offset_abs.to_be_bytes(); + + Ok(vec![flag, length_remainder, offset_high, offset_low]) +} + +/// Calculates length components for match encoding. +/// +/// Splits match length into: +/// - Remainder (0-255). +/// - Coefficient (0-3). +/// +/// # Returns +/// Tuple of (remainder, coefficient). +fn calculate_length_components(match_length: usize, max_length: usize) -> (u8, u8) { + let effective_length = + min(match_length, max_length).clamp(MIN_MATCH_LENGTH, MAX_MATCH_LENGTH) - MIN_MATCH_LENGTH; + + let length_coefficient = (effective_length / LENGTH_BLOCK_SIZE) as u8; + let length_remainder = (effective_length % LENGTH_BLOCK_SIZE) as u8; + + (length_remainder, length_coefficient) +} + +/// Encodes match flag combining length coefficient, pointer type and direction. +/// +/// # Arguments +/// * `length_coefficient` - Length coefficient (0-3). +/// * `pointer_type` - Which pointer was used. +/// * `is_positive_offset` - Whether offset is positive. +/// +/// # Returns +/// Encoded flag byte or error for invalid combination. +/// +/// # Flag Encoding +/// Each unique combination maps to a value 1-20: +/// - First 5 values: coefficient 0. +/// - Next 5: coefficient 1. +/// - Next 5: coefficient 2. +/// - Last 5: coefficient 3. 
+fn encode_match_flag( + length_coefficient: u8, + pointer_type: &ReferencePointerType, + is_positive_offset: bool, +) -> Result { + match (length_coefficient, pointer_type, is_positive_offset) { + (0, ReferencePointerType::TargetLocal, _) => Ok(1), + (0, ReferencePointerType::Main, true) => Ok(2), + (0, ReferencePointerType::Main, false) => Ok(3), + (0, ReferencePointerType::Auxiliary, true) => Ok(4), + (0, ReferencePointerType::Auxiliary, false) => Ok(5), + (1, ReferencePointerType::TargetLocal, _) => Ok(6), + (1, ReferencePointerType::Main, true) => Ok(7), + (1, ReferencePointerType::Main, false) => Ok(8), + (1, ReferencePointerType::Auxiliary, true) => Ok(9), + (1, ReferencePointerType::Auxiliary, false) => Ok(10), + (2, ReferencePointerType::TargetLocal, _) => Ok(11), + (2, ReferencePointerType::Main, true) => Ok(12), + (2, ReferencePointerType::Main, false) => Ok(13), + (2, ReferencePointerType::Auxiliary, true) => Ok(14), + (2, ReferencePointerType::Auxiliary, false) => Ok(15), + (3, ReferencePointerType::TargetLocal, _) => Ok(16), + (3, ReferencePointerType::Main, true) => Ok(17), + (3, ReferencePointerType::Main, false) => Ok(18), + (3, ReferencePointerType::Auxiliary, true) => Ok(19), + (3, ReferencePointerType::Auxiliary, false) => Ok(20), + _ => Err(MatchEncodingError::InvalidParameterCombination), + } +} + +/// Selects the best match from possible candidate positions. +/// +/// Uses scoring system that considers both match length and offset: +/// - Longer matches score higher. +/// - Smaller offsets score higher. +/// - Large offsets (>4096) get length penalty. +/// +/// # Arguments +/// * `target_data` - Data being compressed. +/// * `parent_data` - Reference data. +/// * `current_position` - Position in target data. +/// * `parent_positions` - Candidate match positions in reference. +/// * `pointers` - Current pointer positions. +/// +/// # Returns +/// Best match (length, offset, pointer_type) or None if no good matches. 
+fn select_best_match( + target_data: &[u8], + parent_data: &[u8], + current_position: usize, + parent_positions: &[usize], + pointers: &MatchPointers, +) -> Option<(usize, i16, ReferencePointerType)> { + const SCORE_LENGTH_SHIFT: usize = 16; + const MAX_SCORE_OFFSET: usize = 0xFFFF; + + let mut best_match = None; + let mut best_score = 0; + + for &parent_position in parent_positions { + if parent_position >= parent_data.len() { + continue; + } + + if let Some(length) = + find_max_match_length(target_data, parent_data, current_position, parent_position) + { + let (offset, pointer_type) = pointers.calculate_offset(parent_position); + + let safe_length = if pointer_type == ReferencePointerType::TargetLocal { + length + } else { + min(length, parent_data.len() - parent_position) + }; + + let adjusted_length = if offset.abs() > LARGE_OFFSET_PENALTY_THRESHOLD as i16 { + length.saturating_sub(1) + } else { + length + }; + + let score = (adjusted_length << SCORE_LENGTH_SHIFT) + | (!offset.abs() as usize & MAX_SCORE_OFFSET); + + if score > best_score && safe_length >= MIN_MATCH_LENGTH { + best_score = score; + best_match = Some((safe_length, offset, pointer_type)); + } + } + } + + best_match +} + +/// Finds the longest match between target and reference data at given positions. +/// +/// # Arguments +/// * `target_data` - Data being compressed. +/// * `parent_data` - Reference data. +/// * `start_position_in_target` - Start position in target data. +/// * `start_position_in_parent` - Start position in reference data. +/// +/// # Returns +/// Length of longest match (at least MIN_MATCH_LENGTH) or None if: +/// - Positions are out of bounds. +/// - Initial triplet doesn't match. +/// - No match of minimum length found. 
+fn find_max_match_length( + target_data: &[u8], + parent_data: &[u8], + start_position_in_target: usize, + start_position_in_parent: usize, +) -> Option { + if start_position_in_target + MIN_MATCH_LENGTH > target_data.len() + || start_position_in_parent + MIN_MATCH_LENGTH > parent_data.len() + || target_data[start_position_in_target..start_position_in_target + MIN_MATCH_LENGTH] + != parent_data[start_position_in_parent..start_position_in_parent + MIN_MATCH_LENGTH] + { + return None; + } + + let max_possible_match_length = min( + parent_data.len() - start_position_in_parent, + target_data.len() - start_position_in_target, + ) + .min(MAX_MATCH_LENGTH); + + let mut match_length = MIN_MATCH_LENGTH; + while match_length < max_possible_match_length + && target_data[start_position_in_target + match_length] + == parent_data[start_position_in_parent + match_length] + { + match_length += 1; + } + Some(match_length) +} + +/// Computes hash value for a 3-byte sequence. +fn compute_triplet_hash(triplet: &Triplet) -> u32 { + ((triplet[0] as u32) << 16) | ((triplet[1] as u32) << 8) | triplet[2] as u32 +} + +/// Builds lookup table mapping triplets to their positions in data. 
+/// +/// # Returns +/// Hash map of triplet hashes to positions or error if data too small +fn build_triplet_lookup_table( + chunk: &[u8], +) -> Result>, DataConversionError> { + if chunk.len() < MIN_MATCH_LENGTH { + return Err(DataConversionError::ChunkTooSmall { + actual_size: chunk.len(), + required_size: MIN_MATCH_LENGTH, + }); + } + + let mut lookup_table: HashMap> = HashMap::with_capacity(HASH_TABLE_SIZE); + + for (current_position, triplet) in chunk.windows(MIN_MATCH_LENGTH).enumerate() { + let triplet_array = [triplet[0], triplet[1], triplet[2]]; + let hash = compute_triplet_hash(&triplet_array); + + let entry = lookup_table.entry(hash).or_default(); + if entry.len() < MAX_HASH_CHAIN_LENGTH { + entry.push(current_position); + } + } + + Ok(lookup_table) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::decoder::ZdeltaDecoder; + use crate::encoder::encode_simple_chunk; + use crate::hasher::AronovichHash; + use bit_vec::BitVec; + use huffman_compress::Book; + use std::sync::{Arc, Mutex}; + + const TEST_DATA_SIZE: usize = 9008 + 100; + + #[test] + fn test_encode_decode_identical_data() { + let data: Vec = (0..TEST_DATA_SIZE).map(|i| (i % 256) as u8).collect(); + let (sbc_map, sbc_key) = create_map_and_key(&data, &data); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), data); + } + + #[test] + fn test_encode_decode_single_byte_diff() { + let reference_data: Vec = (0..TEST_DATA_SIZE).map(|i| (i % 256) as u8).collect(); + let mut target_data = reference_data.clone(); + target_data[15] = target_data[15].wrapping_add(1); + + let (sbc_map, sbc_key) = create_map_and_key(&reference_data, &target_data); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), target_data); + } + + #[test] + fn test_encode_decode_multiple_byte_diffs() { + let reference_data: Vec = (0..TEST_DATA_SIZE).map(|i| (i % 256) as u8).collect(); + let mut target_data = reference_data.clone(); + target_data[15] = target_data[15].wrapping_add(1); + target_data[1000] = 
target_data[1000].wrapping_add(1); + target_data[5000] = target_data[5000].wrapping_add(1); + + let (sbc_map, sbc_key) = create_map_and_key(&reference_data, &target_data); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), target_data); + } + + #[test] + fn test_encode_decode_with_left_offset() { + let reference_data: Vec = (0..TEST_DATA_SIZE).map(|i| (i % 256) as u8).collect(); + let target_data = reference_data[100..].to_vec(); + + let (sbc_map, sbc_key) = create_map_and_key(&reference_data, &target_data); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), target_data); + } + + #[test] + fn test_encode_decode_with_right_offset() { + let reference_data: Vec = (0..TEST_DATA_SIZE).map(|i| (i % 256) as u8).collect(); + let target_data = reference_data[..TEST_DATA_SIZE - 100].to_vec(); + + let (sbc_map, sbc_key) = create_map_and_key(&reference_data, &target_data); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), target_data); + } + + #[test] + fn test_encode_decode_with_middle_slice() { + let reference_data: Vec = (0..TEST_DATA_SIZE).map(|i| (i % 256) as u8).collect(); + let mut target_data = reference_data[100..TEST_DATA_SIZE - 100].to_vec(); + target_data[50] = target_data[50].wrapping_add(1); + target_data[150] = target_data[150].wrapping_add(1); + + let (sbc_map, sbc_key) = create_map_and_key(&reference_data, &target_data); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), target_data); + } + + #[test] + fn test_encode_decode_cyclic_shift_right() { + let reference_data: Vec = (0..TEST_DATA_SIZE).map(|i| (i % 256) as u8).collect(); + let mut target_data = reference_data[500..].to_vec(); + target_data.extend(&reference_data[..500]); + + let (sbc_map, sbc_key) = create_map_and_key(&reference_data, &target_data); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), target_data); + } + + #[test] + fn test_encode_decode_cyclic_shift_left() { + let reference_data: Vec = (0..TEST_DATA_SIZE).map(|i| (i % 256) as u8).collect(); + let mut target_data = reference_data[TEST_DATA_SIZE - 
500..].to_vec(); + target_data.extend(&reference_data[..TEST_DATA_SIZE - 500]); + + let (sbc_map, sbc_key) = create_map_and_key(&reference_data, &target_data); + + assert_eq!(sbc_map.get(&sbc_key).unwrap(), target_data); + } + + #[test] + fn test_encode_decode_random_data_with_small_changes() { + let reference_data: Vec = (0..TEST_DATA_SIZE).map(|_| rand::random::()).collect(); + let mut target_data = reference_data.clone(); + + for i in (0..TEST_DATA_SIZE).step_by(100) { + target_data[i] = target_data[i].wrapping_add(1); + } + + let (sbc_map, sbc_key) = create_map_and_key(&reference_data, &target_data); + + assert_eq!(target_data, sbc_map.get(&sbc_key).unwrap()); + } + + #[test] + fn test_encode_decode_small_data() { + let reference_data: Vec = vec![1, 2, 3, 4, 5]; + let target_data = vec![1, 2, 3, 4, 6]; + + let (sbc_map, sbc_key) = create_map_and_key(&reference_data, &target_data); + + assert_eq!(target_data, sbc_map.get(&sbc_key).unwrap()); + } + + fn create_map_and_key( + reference_data: &[u8], + target_data: &[u8], + ) -> (SBCMap, SBCKey) { + let mut binding = SBCMap::new(ZdeltaDecoder::new(true)); + let sbc_map = Arc::new(Mutex::new(&mut binding)); + + let (_, sbc_key) = encode_simple_chunk( + &mut sbc_map.lock().unwrap(), + reference_data, + AronovichHash::new_with_u32(0), + ); + + let encoder = ZdeltaEncoder::new(true); + let (_, _, sbc_key_2) = encoder.encode_delta_chunk( + sbc_map.clone(), + target_data, + AronovichHash::new_with_u32(3), + reference_data, + &build_triplet_lookup_table(reference_data).unwrap(), + sbc_key.hash.clone(), + ); + + (binding, sbc_key_2) + } + + #[test] + fn encode_match_huffman_should_encode_valid_match_correctly() { + let book = create_test_huffman_book(); + + let test_cases = vec![ + (3, 100, ReferencePointerType::TargetLocal, false), + (258, 32767, ReferencePointerType::Main, true), + (1026, 100, ReferencePointerType::Auxiliary, false), + (128, 4096, ReferencePointerType::Main, false), + ]; + + for (length, offset, 
pointer_type, _) in test_cases { + let result = encode_match_huffman(length, offset as i16, &pointer_type, &book, length); + + assert!( + result.is_ok(), + "Failed to encode length {length}, offset {offset}" + ); + let encoded = result.unwrap(); + assert!(!encoded.is_empty(), "Encoded data should not be empty"); + } + } + + #[test] + fn encode_match_huffman_should_return_error_for_invalid_length() { + let book = create_test_huffman_book(); + + let test_cases = vec![ + (2, 100, ReferencePointerType::Main, true), + (1027, 100, ReferencePointerType::Main, true), + (0, 100, ReferencePointerType::Main, true), + ]; + + for (length, offset, pointer_type, _) in test_cases { + let result = encode_match_huffman(length, offset as i16, &pointer_type, &book, length); + + assert!(result.is_err()); + assert_eq!( + result.unwrap_err(), + MatchEncodingError::InvalidLength(length, MIN_MATCH_LENGTH, MAX_MATCH_LENGTH) + ); + } + } + + #[test] + fn encode_match_huffman_should_produce_different_output_for_different_inputs() { + let book = create_test_huffman_book(); + + let case1 = encode_match_huffman(10, 100, &ReferencePointerType::Main, &book, 10).unwrap(); + + let case2 = encode_match_huffman(10, 101, &ReferencePointerType::Main, &book, 10).unwrap(); + + let case3 = + encode_match_huffman(11, 100, &ReferencePointerType::Auxiliary, &book, 11).unwrap(); + + assert_ne!(case1, case2); + assert_ne!(case1, case3); + assert_ne!(case2, case3); + } + + #[test] + fn encode_match_huffman_should_handle_edge_cases_correctly() { + let book = create_test_huffman_book(); + + let max_offset = + encode_match_huffman(10, 32767, &ReferencePointerType::Main, &book, 10).unwrap(); + + let min_offset = + encode_match_huffman(10, 0, &ReferencePointerType::Main, &book, 10).unwrap(); + + assert!(!max_offset.is_empty()); + assert!(!min_offset.is_empty()); + assert_ne!(max_offset, min_offset); + } + + #[test] + fn create_default_huffman_book_and_tree_should_return_valid_book_for_all_supported_symbols() { + let 
(book, _) = create_default_huffman_book_and_tree(); + + assert!(!encode_to_bits(&book, 1).is_empty()); // Flag + assert!(!encode_to_bits(&book, 65).is_empty()); // Literal + assert!(!encode_to_bits(&book, 200).is_empty()); // Non-ASCII + assert!(!encode_to_bits(&book, 30).is_empty()); // Length remainder + assert!(!encode_to_bits(&book, 150).is_empty()); // Offset + } + + #[test] + fn create_default_huffman_book_and_tree_should_assign_shorter_codes_to_more_frequent_symbols() { + let (book, _) = create_default_huffman_book_and_tree(); + + let flag_code_len = encode_to_bits(&book, 1).len(); + let common_literal_len = encode_to_bits(&book, 65).len(); + let rare_literal_len = encode_to_bits(&book, 200).len(); + + assert!(flag_code_len < rare_literal_len); + assert!(common_literal_len < rare_literal_len); + } + + #[test] + fn create_default_huffman_book_and_tree_should_assign_shorter_codes_to_ascii_vs_non_ascii_literals( + ) { + let (book, _) = create_default_huffman_book_and_tree(); + + let ascii_len = encode_to_bits(&book, 65).len(); + let non_ascii_len = encode_to_bits(&book, 200).len(); + + assert!(ascii_len <= non_ascii_len); + } + + #[test] + fn create_default_huffman_book_and_tree_should_support_all_possible_byte_values() { + let (book, _) = create_default_huffman_book_and_tree(); + + for i in 0..=255u8 { + assert!( + !encode_to_bits(&book, i).is_empty(), + "Failed to encode byte {i}" + ); + } + } + + #[test] + fn create_default_huffman_book_and_tree_should_produce_different_codes_for_different_inputs() { + let (book, _) = create_default_huffman_book_and_tree(); + + let code1 = encode_to_bits(&book, 1); + let code2 = encode_to_bits(&book, 2); + let code65 = encode_to_bits(&book, 65); + let code200 = encode_to_bits(&book, 200); + + assert_ne!(code1, code2); + assert_ne!(code1, code65); + assert_ne!(code1, code200); + assert_ne!(code65, code200); + } + + #[test] + fn encode_match_raw_should_return_correct_encoding_for_basic_match() { + let result = 
encode_match_raw(10, 100, &ReferencePointerType::Main, 10); + assert_eq!(result, Ok(vec![2, 7, 0, 100])); + } + + #[test] + fn encode_match_raw_should_handle_negative_offset_correctly() { + let result = encode_match_raw(300, -1024, &ReferencePointerType::Auxiliary, 300); + assert_eq!(result, Ok(vec![10, 41, 4, 0])); + } + + #[test] + fn encode_match_raw_should_encode_max_values_correctly() { + let result = encode_match_raw(1026, -32766, &ReferencePointerType::TargetLocal, 1026); + assert_eq!(result, Ok(vec![16, 255, 127, 254])); + } + + #[test] + fn encode_match_raw_should_reject_length_below_minimum() { + let result = encode_match_raw(2, 100, &ReferencePointerType::Main, 2); + assert_eq!(result, Err(MatchEncodingError::InvalidLength(2, 3, 1026))); + } + + #[test] + fn encode_match_raw_should_reject_length_above_maximum() { + let result = encode_match_raw(2000, 100, &ReferencePointerType::Main, 2000); + assert_eq!( + result, + Err(MatchEncodingError::InvalidLength(2000, 3, 1026)) + ); + } + + #[test] + fn encode_match_flag_should_return_correct_flag_for_target_local() { + assert_eq!( + encode_match_flag(0, &ReferencePointerType::TargetLocal, true), + Ok(1) + ); + assert_eq!( + encode_match_flag(1, &ReferencePointerType::TargetLocal, false), + Ok(6) + ); + assert_eq!( + encode_match_flag(2, &ReferencePointerType::TargetLocal, true), + Ok(11) + ); + assert_eq!( + encode_match_flag(3, &ReferencePointerType::TargetLocal, false), + Ok(16) + ); + } + + #[test] + fn encode_match_flag_should_return_correct_flag_for_main_pointer() { + assert_eq!( + encode_match_flag(0, &ReferencePointerType::Main, true), + Ok(2) + ); + assert_eq!( + encode_match_flag(1, &ReferencePointerType::Main, true), + Ok(7) + ); + assert_eq!( + encode_match_flag(2, &ReferencePointerType::Main, false), + Ok(13) + ); + assert_eq!( + encode_match_flag(3, &ReferencePointerType::Main, false), + Ok(18) + ); + } + + #[test] + fn encode_match_flag_should_return_correct_flag_for_auxiliary_pointer() { + 
assert_eq!( + encode_match_flag(0, &ReferencePointerType::Auxiliary, true), + Ok(4) + ); + assert_eq!( + encode_match_flag(1, &ReferencePointerType::Auxiliary, true), + Ok(9) + ); + assert_eq!( + encode_match_flag(2, &ReferencePointerType::Auxiliary, false), + Ok(15) + ); + assert_eq!( + encode_match_flag(3, &ReferencePointerType::Auxiliary, false), + Ok(20) + ); + } + + #[test] + fn encode_match_flag_should_return_error_for_invalid_combination() { + assert_eq!( + encode_match_flag(4, &ReferencePointerType::Main, true), + Err(MatchEncodingError::InvalidParameterCombination) + ); + } + + #[test] + fn calculate_length_components_should_calculate_correctly_for_min_length() { + assert_eq!( + calculate_length_components(MIN_MATCH_LENGTH, MIN_MATCH_LENGTH), + (0, 0) + ); + assert_eq!(calculate_length_components(MIN_MATCH_LENGTH, 10), (0, 0)); + } + + #[test] + fn calculate_length_components_should_calculate_correctly_for_mid_range() { + assert_eq!(calculate_length_components(259, 259), (0, 1)); + assert_eq!(calculate_length_components(514, 514), (255, 1)); + assert_eq!(calculate_length_components(514, 300), (41, 1)); + } + + #[test] + fn calculate_length_components_should_calculate_correctly_for_max_length() { + assert_eq!(calculate_length_components(1024, 1024), (253, 3)); + assert_eq!(calculate_length_components(1026, 1024), (253, 3)); + assert_eq!( + calculate_length_components(MAX_MATCH_LENGTH, MAX_MATCH_LENGTH), + (255, 3) + ); + } + + #[test] + fn calculate_length_components_should_cap_at_max_length() { + assert_eq!(calculate_length_components(2000, 2000), (255, 3)); + assert_eq!(calculate_length_components(2000, 500), (241, 1)); + } + + #[test] + fn select_best_match_should_find_best_match_with_small_offset() { + let target = b"abcdefghijklmnopqrstuvwxyz".to_vec(); + let parent = b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJK".to_vec(); + let pointers = MatchPointers::new(0, 10, 20); + let parent_positions = vec![10]; + + let result = select_best_match(&target, 
&parent, 0, &parent_positions, &pointers); + + assert_eq!(result, Some((26, 0, ReferencePointerType::Main))); + } + + #[test] + fn select_best_match_should_apply_penalty_for_large_offset() { + let target = b"0123456789abcdefghijklmnopqrstuvwxyz".to_vec(); + let parent = b"012345678#012345678#".repeat(500).to_vec(); + let pointers = MatchPointers::new(0, 0, 10_000); + let parent_positions = vec![0, 10_000 - 10]; + + let result = select_best_match(&target, &parent, 0, &parent_positions, &pointers); + + assert_eq!(result, Some((9, 0, ReferencePointerType::Main))); + } + + #[test] + fn select_best_match_should_prefer_closer_match_when_lengths_equal() { + let target = b"abcdef".to_vec(); + let parent = b"xxabcdefyyabcdefzz".to_vec(); + let pointers = MatchPointers::new(0, 2, 10); + let parent_positions = vec![2, 10]; + + let result = select_best_match(&target, &parent, 0, &parent_positions, &pointers); + + assert_eq!(result, Some((6, 0, ReferencePointerType::Main))); + } + + #[test] + fn select_best_match_should_prefer_longer_match_over_closer() { + let target = b"abcdefgh".to_vec(); + let parent = b"abcdwxyzabcdefghijkl".to_vec(); + let pointers = MatchPointers::new(0, 0, 8); + let parent_positions = vec![0, 8]; + + let result = select_best_match(&target, &parent, 0, &parent_positions, &pointers); + + assert_eq!(result, Some((8, 0, ReferencePointerType::Auxiliary))); + } + + #[test] + fn select_best_match_should_use_target_local_for_matches_before_target_ptr() { + let target = b"abcdef".to_vec(); + let parent = b"abcdef".to_vec(); + let pointers = MatchPointers::new(10, 0, 0); + let parent_positions = vec![0]; + + let result = select_best_match(&target, &parent, 0, &parent_positions, &pointers); + + assert_eq!(result, Some((6, -10, ReferencePointerType::TargetLocal))); + } + + #[test] + fn select_best_match_should_return_none_when_no_matches_found() { + let target = b"abcdef".to_vec(); + let parent = b"ghijkl".to_vec(); + let pointers = MatchPointers::default(); + let 
parent_positions = vec![0]; + + let result = select_best_match(&target, &parent, 0, &parent_positions, &pointers); + + assert_eq!(result, None); + } + + #[test] + fn select_best_match_should_handle_min_length_match() { + let target = b"abc".to_vec(); + let parent = b"xyzabc123".to_vec(); + let pointers = MatchPointers::new(0, 3, 0); + let parent_positions = vec![3]; + + let result = select_best_match(&target, &parent, 0, &parent_positions, &pointers); + + assert_eq!(result, Some((3, 0, ReferencePointerType::Main))); + } + + #[test] + fn find_max_match_length_should_return_full_match_length_when_sequences_are_identical() { + let (parent_data, target_data) = create_test_data_for_find_max_match_length(); + let result = find_max_match_length(target_data, parent_data, 0, 0); + assert_eq!(result, Some(6)); + } + + #[test] + fn find_max_match_length_should_return_min_match_length_when_only_triplet_matches() { + let (parent_data, target_data) = create_test_data_for_find_max_match_length(); + let result = find_max_match_length(target_data, parent_data, 3, 3); + assert_eq!(result, Some(3)); + } + + #[test] + fn find_max_match_length_should_return_none_when_triplet_does_not_match() { + let (parent_data, target_data) = create_test_data_for_find_max_match_length(); + let result = find_max_match_length(target_data, parent_data, 9, 9); + assert_eq!(result, None); + } + + #[test] + fn find_max_match_length_should_respect_max_length_limit() { + let long_data = vec![b'X'; 2000]; + let result = find_max_match_length(&long_data, &long_data, 0, 0); + assert_eq!(result, Some(MAX_MATCH_LENGTH)); + } + + #[test] + fn find_max_match_length_should_handle_edge_cases_safely() { + assert_eq!(find_max_match_length(b"", b"", 0, 0), None); + assert_eq!(find_max_match_length(b"a", b"a", 0, 0), None); // Меньше MIN_MATCH_LENGTH + } + + #[test] + fn find_max_match_length_should_detect_hash_collisions_correctly() { + let parent = b"abc"; // Хэш может совпадать с "abd" + let target = b"abd"; + 
assert_eq!(find_max_match_length(target, parent, 0, 0), None); + } + + #[test] + fn build_triplet_lookup_table_should_handles_duplicate_triplets_correctly() { + let data = b"abcabcabc"; + let table = build_triplet_lookup_table(data).unwrap(); + + assert_eq!(table.len(), 3); + + assert_eq!( + table.get(&compute_triplet_hash(b"abc")), + Some(&vec![0, 3, 6]) + ); + assert_eq!(table.get(&compute_triplet_hash(b"bca")), Some(&vec![1, 4])); + assert_eq!(table.get(&compute_triplet_hash(b"cab")), Some(&vec![2, 5])); + } + + #[test] + fn compute_triplet_hash_should_return_correct_hash_for_normal_triplet() { + let data: Triplet = [1, 2, 3]; + assert_eq!(compute_triplet_hash(&data), 0x010203); + } + + #[test] + fn compute_triplet_hash_should_return_correct_hash_for_edge_case_values() { + assert_eq!(compute_triplet_hash(&[0, 0, 0]), 0x000000); + assert_eq!(compute_triplet_hash(&[255, 255, 255]), 0xFFFFFF); + } + + fn create_test_data_for_find_max_match_length<'a>() -> (&'a [u8], &'a [u8]) { + let target_data = b"abc123xyzabc"; + let parent_data = b"abc123def456"; + (target_data, parent_data) + } + + fn encode_to_bits(book: &Book, symbol: u8) -> BitVec { + let mut buffer = BitVec::new(); + book.encode(&mut buffer, &symbol) + .expect("Encoding failed in test"); + buffer + } + + fn create_test_huffman_book() -> Book { + let mut frequencies = HashMap::new(); + for i in 1..=20 { + frequencies.insert(i, 1); + } + for i in 0..=255 { + frequencies.insert(i, 1); + } + CodeBuilder::from_iter(frequencies).finish().0 + } +} diff --git a/src/encoder/zdelta_match_pointers.rs b/src/encoder/zdelta_match_pointers.rs new file mode 100644 index 0000000..095b209 --- /dev/null +++ b/src/encoder/zdelta_match_pointers.rs @@ -0,0 +1,309 @@ +const SMALL_OFFSET_THRESHOLD: i16 = 256; + +/// Types of reference pointers used in delta compression. +#[derive(Debug, PartialEq)] +pub enum ReferencePointerType { + /// Main reference pointer (primary pointer into reference data). 
+ Main, + /// Auxiliary reference pointer (secondary pointer into reference data). + Auxiliary, + /// Pointer into the target data (local matches). + TargetLocal, +} + +/// Maintains pointers used for finding matches in delta compression. +/// +/// Contains: +/// - A pointer into the target data (for local matches). +/// - Two pointers into reference data (main and auxiliary). +pub struct MatchPointers { + target_ptr: usize, + main_ref_ptr: usize, + auxiliary_ref_ptr: usize, +} + +impl MatchPointers { + /// Creates new MatchPointers with specified initial positions. + pub fn new(target_ptr: usize, main_ref_ptr: usize, auxiliary_ref_ptr: usize) -> Self { + MatchPointers { + target_ptr, + main_ref_ptr, + auxiliary_ref_ptr, + } + } + + pub fn get(&self, pointer: &ReferencePointerType) -> usize { + match pointer { + ReferencePointerType::Main => self.main_ref_ptr, + ReferencePointerType::Auxiliary => self.auxiliary_ref_ptr, + ReferencePointerType::TargetLocal => self.target_ptr, + } + } + + /// Calculates the offset from the nearest pointer to the given position. + /// + /// Returns: + /// - The calculated offset (signed). + /// - The pointer type that was used (which pointer was closest). + pub fn calculate_offset(&self, parent_position: usize) -> (i16, ReferencePointerType) { + if parent_position < self.target_ptr { + let offset = parent_position as i16 - self.target_ptr as i16; + return (offset, ReferencePointerType::TargetLocal); + } + + let offset_main = parent_position as i16 - self.main_ref_ptr as i16; + let offset_auxiliary = parent_position as i16 - self.auxiliary_ref_ptr as i16; + + if offset_main.abs() <= offset_auxiliary.abs() { + (offset_main, ReferencePointerType::Main) + } else { + (offset_auxiliary, ReferencePointerType::Auxiliary) + } + } + + /// Updates the pointers after a match has been found. + /// + /// According to zdelta's strategy: + /// - For small offsets (< SMALL_OFFSET_THRESHOLD), moves the pointer that was used. 
+ /// - For large offsets, moves the other pointer. + /// - Target pointer is always moved to match end position. + pub fn update_after_match( + &mut self, + match_end_position: usize, + offset: i16, + pointer_type: ReferencePointerType, + ) { + match pointer_type { + ReferencePointerType::TargetLocal => self.target_ptr = match_end_position, + ReferencePointerType::Main => { + if offset.abs() < SMALL_OFFSET_THRESHOLD { + self.main_ref_ptr = match_end_position; + } else { + self.auxiliary_ref_ptr = match_end_position; + } + } + ReferencePointerType::Auxiliary => { + if offset.abs() < SMALL_OFFSET_THRESHOLD { + self.auxiliary_ref_ptr = match_end_position; + } else { + self.main_ref_ptr = match_end_position; + } + } + } + } + + pub fn smart_update_after_match( + &mut self, + match_end_position: usize, + offset: i16, + pointer_type: ReferencePointerType, + previous_match_offset: Option, + ) { + match pointer_type { + ReferencePointerType::TargetLocal => { + self.target_ptr = match_end_position; + } + _ => { + if let Some(previous_offset) = previous_match_offset { + if previous_offset.abs() < SMALL_OFFSET_THRESHOLD + && offset.abs() < SMALL_OFFSET_THRESHOLD + { + match pointer_type { + ReferencePointerType::Main => self.main_ref_ptr = match_end_position, + ReferencePointerType::Auxiliary => { + self.auxiliary_ref_ptr = match_end_position + } + _ => {} + } + } else { + match pointer_type { + ReferencePointerType::Main => { + self.auxiliary_ref_ptr = match_end_position + } + ReferencePointerType::Auxiliary => { + self.main_ref_ptr = match_end_position + } + _ => {} + } + } + } else { + match pointer_type { + ReferencePointerType::Main => self.main_ref_ptr = match_end_position, + ReferencePointerType::Auxiliary => { + self.auxiliary_ref_ptr = match_end_position + } + _ => {} + } + } + } + } + } +} + +impl Default for MatchPointers { + fn default() -> Self { + MatchPointers::new(0, 0, 0) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn 
smart_update_after_match_should_update_target_ptr_for_target_local_matches() { + let mut pointers = MatchPointers::new(100, 200, 300); + pointers.smart_update_after_match(150, -50, ReferencePointerType::TargetLocal, None); + assert_eq!(pointers.target_ptr, 150); + assert_eq!(pointers.main_ref_ptr, 200); + assert_eq!(pointers.auxiliary_ref_ptr, 300); + } + + #[test] + fn smart_update_after_match_should_update_main_ptr_for_consecutive_small_offsets() { + let mut pointers = MatchPointers::new(100, 200, 300); + pointers.smart_update_after_match(250, 50, ReferencePointerType::Main, Some(30)); + assert_eq!(pointers.main_ref_ptr, 250); + assert_eq!(pointers.auxiliary_ref_ptr, 300); + } + + #[test] + fn smart_update_after_match_should_update_auxiliary_ptr_for_consecutive_small_offsets() { + let mut pointers = MatchPointers::new(100, 200, 300); + pointers.smart_update_after_match(350, 50, ReferencePointerType::Auxiliary, Some(40)); + assert_eq!(pointers.auxiliary_ref_ptr, 350); + assert_eq!(pointers.main_ref_ptr, 200); + } + + #[test] + fn smart_update_after_match_should_update_auxiliary_ptr_for_large_offset_after_small_offset() { + let mut pointers = MatchPointers::new(100, 200, 300); + pointers.smart_update_after_match(500, 300, ReferencePointerType::Main, Some(50)); + assert_eq!(pointers.auxiliary_ref_ptr, 500); + assert_eq!(pointers.main_ref_ptr, 200); + } + + #[test] + fn smart_update_after_match_should_update_main_ptr_for_large_offset_after_small_offset() { + let mut pointers = MatchPointers::new(100, 200, 300); + pointers.smart_update_after_match(600, 300, ReferencePointerType::Auxiliary, Some(60)); + assert_eq!(pointers.main_ref_ptr, 600); + assert_eq!(pointers.auxiliary_ref_ptr, 300); + } + + #[test] + fn smart_update_after_match_should_update_used_pointer_when_no_previous_offset() { + let mut pointers = MatchPointers::new(100, 200, 300); + pointers.smart_update_after_match(400, 200, ReferencePointerType::Main, None); + assert_eq!(pointers.main_ref_ptr, 400); + 
pointers.smart_update_after_match(500, 200, ReferencePointerType::Auxiliary, None); + assert_eq!(pointers.auxiliary_ref_ptr, 500); + } + + #[test] + fn smart_update_after_match_should_handle_edge_case_positions_correctly() { + let mut pointers = MatchPointers::new(100, 200, 300); + pointers.smart_update_after_match(usize::MAX, 32767, ReferencePointerType::Main, Some(256)); + assert_eq!(pointers.auxiliary_ref_ptr, usize::MAX); + } + + #[test] + fn smart_update_after_match_should_handle_negative_offsets_correctly() { + let mut pointers = MatchPointers::new(1000, 2000, 3000); + pointers.smart_update_after_match(2500, -500, ReferencePointerType::Main, Some(-200)); + assert_eq!(pointers.auxiliary_ref_ptr, 2500); + } + + #[test] + fn smart_update_after_match_should_maintain_pointer_integrity_for_zero_offsets() { + let mut pointers = MatchPointers::new(100, 200, 200); + pointers.smart_update_after_match(300, 0, ReferencePointerType::Main, Some(0)); + assert!(pointers.main_ref_ptr == 300 || pointers.auxiliary_ref_ptr == 300); + } + + #[test] + fn update_after_match_should_update_target_ptr_for_target_local_matches() { + let mut pointers = MatchPointers::new(100, 200, 300); + pointers.update_after_match(150, -50, ReferencePointerType::TargetLocal); + assert_eq!(pointers.target_ptr, 150); + } + + #[test] + fn update_after_match_should_update_main_ptr_for_small_offset() { + let mut pointers = MatchPointers::new(100, 200, 300); + pointers.update_after_match(250, SMALL_OFFSET_THRESHOLD - 1, ReferencePointerType::Main); + assert_eq!(pointers.main_ref_ptr, 250); + } + + #[test] + fn update_after_match_should_update_auxiliary_ptr_for_large_offset() { + let mut pointers = MatchPointers::new(100, 200, 300); + pointers.update_after_match(500, SMALL_OFFSET_THRESHOLD, ReferencePointerType::Main); + assert_eq!(pointers.auxiliary_ref_ptr, 500); + } + + #[test] + fn calculate_offset_should_return_target_local_with_negative_offset_when_position_before_target_ptr( + ) { + let pointers = 
MatchPointers::new(100, 200, 300); + let (offset, pointer_type) = pointers.calculate_offset(50); + assert_eq!(offset, -50); + assert!(matches!(pointer_type, ReferencePointerType::TargetLocal)); + } + + #[test] + fn calculate_offset_should_use_main_ref_ptr_when_its_offset_is_smaller() { + let pointers = MatchPointers::new(100, 200, 300); + let (offset, pointer_type) = pointers.calculate_offset(210); + assert_eq!(offset, 10); + assert!(matches!(pointer_type, ReferencePointerType::Main)); + } + + #[test] + fn calculate_offset_should_use_auxiliary_ref_ptr_when_its_offset_is_smaller() { + let pointers = MatchPointers::new(100, 200, 300); + let (offset, pointer_type) = pointers.calculate_offset(310); + assert_eq!(offset, 10); + assert!(matches!(pointer_type, ReferencePointerType::Auxiliary)); + } + + #[test] + fn calculate_offset_should_prefer_main_ref_ptr_when_offsets_are_equal() { + let pointers = MatchPointers::new(100, 200, 300); + let (offset, pointer_type) = pointers.calculate_offset(250); + assert_eq!(offset, 50); + assert!(matches!(pointer_type, ReferencePointerType::Main)); + } + + #[test] + fn calculate_offset_should_handle_position_at_target_ptr_edge_case() { + let pointers = MatchPointers::new(100, 200, 300); + let (offset, pointer_type) = pointers.calculate_offset(100); + assert_eq!(offset, -100); + assert!(matches!(pointer_type, ReferencePointerType::Main)); + } + + #[test] + fn calculate_offset_should_handle_position_at_main_ref_ptr_edge_case() { + let pointers = MatchPointers::new(100, 200, 300); + let (offset, pointer_type) = pointers.calculate_offset(200); + assert_eq!(offset, 0); + assert!(matches!(pointer_type, ReferencePointerType::Main)); + } + + #[test] + fn calculate_offset_should_handle_position_at_auxiliary_ref_ptr_edge_case() { + let pointers = MatchPointers::new(100, 200, 300); + let (offset, pointer_type) = pointers.calculate_offset(300); + assert_eq!(offset, 0); + assert!(matches!(pointer_type, ReferencePointerType::Auxiliary)); + } + + 
#[test] + fn calculate_offset_should_handle_large_offsets_correctly() { + let pointers = MatchPointers::new(1000, 2000, 3000); + let (offset, pointer_type) = pointers.calculate_offset(2500); + assert_eq!(offset, 500); + assert!(matches!(pointer_type, ReferencePointerType::Main)); + } +} diff --git a/src/hasher.rs b/src/hasher.rs new file mode 100644 index 0000000..f144441 --- /dev/null +++ b/src/hasher.rs @@ -0,0 +1,35 @@ +pub use aronovich_hash::{AronovichHash, AronovichHasher}; +pub use odess_hasher::{OdessHash, OdessHasher}; +use std::hash; + +mod aronovich_hash; +mod odess_hasher; + +/// Defines core hash functionality for Similarity-Based Chunking (SBC). +pub trait SBCHash: hash::Hash + Clone + Eq + PartialEq + Default + Send + Sync { + /// Creates a new hash instance from a 32-bit unsigned integer key. + fn new_with_u32(key: u32) -> Self; + + /// Generates the successor hash in the similarity hash sequence. + /// Used when exploring adjacent hashes in clustering operations. + fn next_hash(&self) -> Self; + + /// Generates the predecessor hash in the similarity hash sequence. + /// Used when exploring adjacent hashes in clustering operations. + fn last_hash(&self) -> Self; + + /// Extracts a 32-bit key for graph clustering algorithms. + fn get_key_for_graph_clusterer(&self) -> u32; +} + +/// A hasher that produces `SBCHash`-compatible digests from raw data. +/// +/// # Type Parameters +/// * `Hash` - The output hash type implementing `SBCHash` +pub trait SBCHasher { + /// The concrete hash type produced by this hasher + type Hash: SBCHash; + + /// Computes the similarity hash for a data chunk. 
+ fn calculate_hash(&self, chunk_data: &[u8]) -> Self::Hash; +} diff --git a/src/hasher/aronovich_hash.rs b/src/hasher/aronovich_hash.rs new file mode 100644 index 0000000..a60b7c4 --- /dev/null +++ b/src/hasher/aronovich_hash.rs @@ -0,0 +1,316 @@ +use crate::hasher::SBCHasher; +use crate::SBCHash; +use std::collections::HashMap; +use std::hash::Hash; +use std::ops::Range; + +const BLOCKS_IN_C_SPECTRUM_COUNT: usize = 8; +const MIN_SPACE_VALUE: u32 = 1; +const BITS_IN_F_SPECTRUM_BLOCKS_COUNT: u32 = 3; +const BLOCKS_IN_F_SPECTRUM_COUNT: usize = 16; +const SHIFT_FOR_PAIR: u8 = 3; +const BLOCKS_FOR_P_SPECTRUM_INDEXES: Range = 5..9; +const MIN_FREQUENCY_FOR_BYTE: u32 = 50; + +#[derive(Debug)] +pub struct AronovichHash { + hash: u32, +} + +impl Hash for AronovichHash { + fn hash(&self, state: &mut H) { + self.hash.hash(state) + } +} + +impl Clone for AronovichHash { + fn clone(&self) -> Self { + AronovichHash::new_with_u32(self.hash) + } +} + +impl Eq for AronovichHash {} + +impl PartialEq for AronovichHash { + fn eq(&self, other: &Self) -> bool { + self.hash == other.hash + } +} + +impl Default for AronovichHash { + fn default() -> Self { + Self::new_with_u32(u32::default()) + } +} + +impl SBCHash for AronovichHash { + fn new_with_u32(hash: u32) -> Self { + AronovichHash { hash } + } + fn next_hash(&self) -> Self { + AronovichHash { + hash: self.hash.saturating_add(1), + } + } + + fn last_hash(&self) -> Self { + AronovichHash { + hash: self.hash.saturating_sub(1), + } + } + + fn get_key_for_graph_clusterer(&self) -> u32 { + self.hash + } +} +pub struct AronovichHasher; + +impl SBCHasher for AronovichHasher { + type Hash = AronovichHash; + + fn calculate_hash(&self, chunk_data: &[u8]) -> AronovichHash { + let mut byte_value_byte_frequency = HashMap::new(); + let mut pair_value_pair_frequency = HashMap::new(); + let mut last_byte = chunk_data[0]; + byte_value_byte_frequency.insert(last_byte, 1u32); + for byte in &chunk_data[1..] 
{
            // Count single-byte frequencies (C/F spectra)...
            let byte_count = byte_value_byte_frequency.entry(*byte).or_insert(0);
            *byte_count += 1;

            // ...and adjacent-pair frequencies (P spectrum) in one pass.
            let pair_count = pair_value_pair_frequency
                .entry((last_byte, *byte))
                .or_insert(0u32);
            *pair_count += 1;
            last_byte = *byte;
        }

        // The final similarity hash combines the byte-spectrum hash with the
        // pair-spectrum hash.
        let c_f_hash = processing_of_c_f_spectrum(byte_value_byte_frequency);
        let p_hash = processing_of_p_spectrum(pair_value_pair_frequency);
        AronovichHash::new_with_u32(c_f_hash ^ p_hash)
    }
}

/// Derives the C-spectrum component of the hash: splits the
/// frequency-sorted byte spectrum into up to `BLOCKS_IN_C_SPECTRUM_COUNT`
/// blocks at the largest frequency "spaces" and XOR-folds each block's byte
/// values into a block-specific bit range.
///
/// `c_f_spectrum` must be sorted by descending frequency (ties by byte value),
/// as produced by `processing_of_c_f_spectrum`.
fn processing_of_c_spectrum(c_f_spectrum: &[(&u8, &u32)]) -> u32 {
    // Find candidate split points: positions where the frequency drops
    // noticeably and the following byte is still frequent enough to matter.
    // saturating_sub keeps an empty spectrum from underflowing the range.
    let mut spaces_in_c_spectrum = Vec::new();
    for byte_index in 0..c_f_spectrum.len().saturating_sub(1) {
        let frequency_delta =
            (c_f_spectrum[byte_index].1 - c_f_spectrum[byte_index + 1].1) * (byte_index + 1) as u32;
        if frequency_delta >= MIN_SPACE_VALUE
            && *c_f_spectrum[byte_index + 1].1 >= MIN_FREQUENCY_FOR_BYTE
        {
            spaces_in_c_spectrum.push((byte_index, frequency_delta));
        }
    }
    // Largest spaces first; ties broken by the earlier index for determinism.
    spaces_in_c_spectrum.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));

    // Keep only the strongest split points, then restore positional order
    // so blocks can be walked left to right.
    let mut spaces_in_c_spectrum_indexes: Vec<usize> = spaces_in_c_spectrum
        .iter()
        .take(BLOCKS_IN_C_SPECTRUM_COUNT)
        .map(|space| space.0)
        .collect();
    spaces_in_c_spectrum_indexes.sort();

    let mut hash: u32 = 0;
    let mut start_block = 0;
    for (block_number, id_end_block) in spaces_in_c_spectrum_indexes.iter().enumerate() {
        let end_block = *id_end_block;
        // XOR-fold the byte values of this block...
        let block = &c_f_spectrum[start_block..=end_block];
        let mut block_hash = 0;
        for byte_frequency in block {
            block_hash ^= *byte_frequency.0 as u32;
        }
        // ...and place the fold in a bit range determined by the block rank,
        // so earlier (more frequent) blocks occupy higher bits.
        block_hash <<= (BLOCKS_IN_C_SPECTRUM_COUNT - block_number) * 3;
        hash ^= block_hash;
        start_block = end_block + 1;
    }
    hash
}

/// Returns the index of the most significant set bit of `block`
/// (i.e. floor(log2)), or 0 when `block` is 0.
fn find_first_significant_bit(block: u32) -> u32 {
    if block == 0 {
        0
    } else {
        31 - block.leading_zeros()
    }
}

/// Derives the F-spectrum component of the hash from the normalized
/// frequencies of the most frequent bytes.
fn processing_of_f_spectrum(c_f_spectrum: &[(&u8, &u32)]) -> u32 {
    let mut
hash: u32 = 0;
    // Per-block left shifts: neighbouring blocks share overlapping bit
    // ranges, so a small rank change between similar chunks only perturbs
    // the hash slightly.
    let shifts = [0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6];

    for block_index in 0..std::cmp::min(c_f_spectrum.len(), BLOCKS_IN_F_SPECTRUM_COUNT) {
        let mut block_hash = *c_f_spectrum[block_index].1;
        // Pre-shift so even frequency 1 has more than
        // BITS_IN_F_SPECTRUM_BLOCKS_COUNT significant bits.
        block_hash <<= BITS_IN_F_SPECTRUM_BLOCKS_COUNT;
        let significant_bit = find_first_significant_bit(block_hash);
        // Keep the BITS_IN_F_SPECTRUM_BLOCKS_COUNT bits just below the top
        // set bit — a magnitude-normalized slice of the frequency.
        block_hash >>= significant_bit - BITS_IN_F_SPECTRUM_BLOCKS_COUNT;
        block_hash %= 1 << BITS_IN_F_SPECTRUM_BLOCKS_COUNT;

        block_hash <<= shifts[block_index];
        hash ^= block_hash;
    }

    hash
}

/// Mixes a byte pair into a value of at most 12 bits.
///
/// Each byte is rotated by SHIFT_FOR_PAIR bit positions (first byte left,
/// second byte right — equivalent to the original modulo/divide arithmetic),
/// then the rotated bytes are combined with a 4-bit stagger.
fn processing_of_pair(pair: &(u8, u8)) -> u32 {
    let byte1 = pair.0.rotate_left(SHIFT_FOR_PAIR as u32);
    let byte2 = pair.1.rotate_right(SHIFT_FOR_PAIR as u32);
    ((byte1 as u32) << 4) ^ (byte2 as u32)
}

/// Derives the P-spectrum component of the hash from the most frequent
/// adjacent byte pairs.
///
/// Pairs are ranked by descending frequency (ties broken by pair value for
/// determinism); only the pairs at ranks BLOCKS_FOR_P_SPECTRUM_INDEXES are
/// folded into the hash, placed in the high bits via the `<< 20` shift.
fn processing_of_p_spectrum(pair_value_pair_frequency: HashMap<(u8, u8), u32>) -> u32 {
    let mut p_spectrum: Vec<(&(u8, u8), &u32)> = pair_value_pair_frequency.iter().collect();
    p_spectrum.sort_by(|a, b| {
        b.1.cmp(a.1)
            .then_with(|| a.0 .0.cmp(&b.0 .0))
            .then_with(|| a.0 .1.cmp(&b.0 .1))
    });
    let mut hash: u32 = 0;
    for block_index in BLOCKS_FOR_P_SPECTRUM_INDEXES {
        if block_index >= p_spectrum.len() {
            break;
        }
        hash ^= processing_of_pair(p_spectrum[block_index].0) << 20;
    }

    hash
}

/// Sorts the byte spectrum by descending frequency (ties by byte value)
/// and XOR-combines the C- and F-spectrum hashes derived from it.
fn processing_of_c_f_spectrum(byte_value_byte_frequency: HashMap<u8, u32>) -> u32 {
    let mut c_f_spectrum: Vec<(&u8, &u32)> = byte_value_byte_frequency.iter().collect();
    c_f_spectrum.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
    let c_hash = processing_of_c_spectrum(c_f_spectrum.as_slice());
    let f_hash = processing_of_f_spectrum(c_f_spectrum.as_slice());
    c_hash ^ f_hash
}

#[cfg(test)]
mod test {
    use super::*;
    #[test]
    fn test_processing_of_pair() {
        let a = 175u8;
        let b = 113u8;
        let processed_pair = processing_of_pair(&(a, b));
let name = &format!("{processed_pair:b}"); + assert_eq!(name, "11111111110") + } + + #[test] + fn test_processing_of_p_spectrum_with_one_pair() { + let mut p_spectrum = HashMap::new(); + for i in 0..6 { + p_spectrum.insert((175u8 + i as u8, 113u8), i); + } + let processed_p_spectrum = processing_of_p_spectrum(p_spectrum); + let name = &format!("{processed_p_spectrum:b}"); + assert_eq!(name, "1111111111000000000000000000000") + } + + #[test] + fn test_processing_of_p_spectrum_with_two_eq_pairs() { + let mut p_spectrum = HashMap::new(); + for _ in 0..7 { + p_spectrum.insert((175u8, 113u8), 0u32); + } + let processed_p_spectrum = processing_of_p_spectrum(p_spectrum); + assert_eq!(processed_p_spectrum, 0) + } + + #[test] + fn test_processing_of_p_spectrum() { + let mut p_spectrum = HashMap::new(); + for i in 0..6 { + p_spectrum.insert((175u8 + i as u8, 113u8), i); + } + p_spectrum.insert((7u8, 7u8), 0u32); + let processed_p_spectrum = processing_of_p_spectrum(p_spectrum); + let name = &format!("{processed_p_spectrum:b}"); + assert_eq!(name, "1001001111000000000000000000000") + } + + pub fn return_p_spectrum_hash(data: &[u8]) -> u32 { + let mut pair_value_pair_frequency = HashMap::new(); + let mut last_byte = data[0]; + for byte in &data[1..] 
{ + let pair_count = pair_value_pair_frequency + .entry((last_byte, *byte)) + .or_insert(0u32); + *pair_count += 1; + last_byte = *byte; + } + processing_of_p_spectrum(pair_value_pair_frequency) + } + + #[test] + fn test_pairs_vec_for_eq_chunks() { + let chunk: Vec<u8> = (0..300).map(|_| rand::random::<u8>()).collect(); + let pairs_vec_1 = return_p_spectrum_hash(chunk.as_slice()); + let pairs_vec_2 = return_p_spectrum_hash(chunk.as_slice()); + assert_eq!(pairs_vec_1, pairs_vec_2) + } + + fn return_c_f_spectrum_hash(data: &[u8]) -> u32 { + let mut byte_value_byte_frequency = HashMap::new(); + for byte in data { + let byte_count = byte_value_byte_frequency.entry(*byte).or_insert(0); + *byte_count += 1; + } + processing_of_c_f_spectrum(byte_value_byte_frequency) + } + + #[test] + fn test_c_f_spectrum_for_eq_chunks() { + let chunk: Vec<u8> = (0..8192).map(|_| rand::random::<u8>()).collect(); + let c_f_hash_1 = return_c_f_spectrum_hash(chunk.as_slice()); + let c_f_hash_2 = return_c_f_spectrum_hash(chunk.as_slice()); + assert_eq!(c_f_hash_1, c_f_hash_2) + } + + #[test] + fn test_c_f_hash_for_different_1_byte() { + let chunk: Vec<u8> = (0..8192).map(|_| rand::random::<u8>()).collect(); + let mut similarity_chunk = chunk.clone(); + if similarity_chunk[15] == 255 { + similarity_chunk[15] = 0; + } else { + similarity_chunk[15] = 255; + } + let c_f_hash_1 = return_c_f_spectrum_hash(chunk.as_slice()); + let c_f_hash_2 = return_c_f_spectrum_hash(similarity_chunk.as_slice()); + assert_eq!(c_f_hash_1, c_f_hash_2); + assert!(u32::abs_diff(c_f_hash_1, c_f_hash_2) <= 32) + } + + #[test] + fn test_hash_for_eq_chunks() { + let chunk: Vec<u8> = (0..8192).map(|_| rand::random::<u8>()).collect(); + let hash = AronovichHasher.calculate_hash(chunk.as_slice()); + let eq_hash = AronovichHasher.calculate_hash(chunk.as_slice()); + assert_eq!(hash, eq_hash) + } +} diff --git a/src/hasher/broders_method.rs b/src/hasher/broders_method.rs new file mode 100644 index 0000000..e037dc6 --- /dev/null +++ 
b/src/hasher/broders_method.rs @@ -0,0 +1,52 @@ +use std::collections::HashSet; + +const WORD_LEN: usize = 8; +const COUNT_WORDS: usize = 5; +const RABIN_HASH_X: u32 = 43; +const RABIN_HASH_Q: u32 = (1 << 31) - 1; + +fn set_for_chunk(data: &[u8]) -> HashSet<u32> { + let block_size = WORD_LEN * COUNT_WORDS; + let mut set_blocks = HashSet::new(); + let mut rabin_hash = rabin_hash_simple(&data[0..std::cmp::min(block_size, data.len())]); + + for index_word in (0..data.len()).step_by(WORD_LEN) { + set_blocks.insert(rabin_hash); + if index_word + block_size > data.len() { + break; + } + rabin_hash = rabin_hash_next( + rabin_hash, + hash_word(&data[index_word..index_word + WORD_LEN]), + hash_word( + &data[index_word + block_size + ..std::cmp::min(index_word + block_size + WORD_LEN, data.len())], + ), + ); + } + set_blocks +} + +fn rabin_hash_simple(data: &[u8]) -> u32 { + let mut rabin_hash = 0; + for i in (0..data.len()).step_by(WORD_LEN) { + rabin_hash += hash_word(&data[i..i + WORD_LEN]) + * RABIN_HASH_X.pow((COUNT_WORDS - i / WORD_LEN) as u32) + % RABIN_HASH_Q; + } + rabin_hash +} + +fn hash_word(word: &[u8]) -> u32 { + let mut hash_word = 0; + for byte in word { + hash_word += *byte as u32; + } + hash_word +} + +fn rabin_hash_next(past_hash: u32, hash_start_word: u32, hash_next_word: u32) -> u32 { + ((past_hash - hash_start_word * RABIN_HASH_X.pow(COUNT_WORDS as u32 - 1)) * RABIN_HASH_X + + hash_next_word) + % RABIN_HASH_Q +} diff --git a/src/hasher/odess_hasher.rs b/src/hasher/odess_hasher.rs new file mode 100644 index 0000000..7a365fb --- /dev/null +++ b/src/hasher/odess_hasher.rs @@ -0,0 +1,123 @@ +use crate::encoder::GEAR; +use crate::hasher::{SBCHash, SBCHasher}; +use std::hash::Hash; +#[derive(Default)] +pub struct OdessHash { + hash: [u64; 3], +} + +impl Hash for OdessHash { + fn hash<H: std::hash::Hasher>(&self, state: &mut H) { + self.hash.hash(state) + } +} + +impl Clone for OdessHash { + fn clone(&self) -> Self { + OdessHash { hash: self.hash } + } +} + +impl Eq for OdessHash {} + 
+impl PartialEq for OdessHash { + fn eq(&self, other: &Self) -> bool { + self.hash == other.hash + } +} + +impl SBCHash for OdessHash { + fn new_with_u32(_: u32) -> Self { + todo!() + } + + fn next_hash(&self) -> Self { + let mut odess_hash = self.clone(); + if odess_hash.hash[0] < u64::MAX { + odess_hash.hash[0] += 1; + } else if odess_hash.hash[1] < u64::MAX { + odess_hash.hash[0] = 0; + odess_hash.hash[1] += 1; + } else if odess_hash.hash[2] < u64::MAX { + odess_hash.hash[0] = 0; + odess_hash.hash[1] = 0; + odess_hash.hash[2] += 1; + } else { + odess_hash.hash = [u64::MAX; 3] + } + odess_hash + } + + fn last_hash(&self) -> Self { + let mut odess_hash = self.clone(); + if odess_hash.hash[0] > 0 { + odess_hash.hash[0] -= 1; + } else if odess_hash.hash[1] > 0 { + odess_hash.hash[0] = u64::MAX; + odess_hash.hash[1] -= 1; + } else if odess_hash.hash[2] > 0 { + odess_hash.hash[0] = u64::MAX; + odess_hash.hash[1] = u64::MAX; + odess_hash.hash[2] -= 1; + } else { + odess_hash.hash = [0u64; 3] + } + odess_hash + } + + fn get_key_for_graph_clusterer(&self) -> u32 { + todo!() + } +} + +/// Реализация метода Odess для вычисления признаков чанка +pub struct OdessHasher { + sampling_rate: u64, + linear_coeffs: [u64; 3], +} + +impl SBCHasher for OdessHasher { + type Hash = OdessHash; + fn calculate_hash(&self, chunk: &[u8]) -> OdessHash { + let mut features = [u64::MAX; 3]; + let mask = self.sampling_rate - 1; + let mut fp = 0u64; + + for &byte in chunk { + // Gear rolling hash: FP = (FP << 1) + Gear[byte] + fp = (fp << 1).wrapping_add(GEAR[byte as usize]); + + // Content-defined sampling + if fp & mask == 0 { + for (i, feature) in features.iter_mut().enumerate() { + let transform = self.linear_coeffs[i] + .wrapping_mul(fp) + .wrapping_add(byte as u64) + % (1u64 << 32); + if *feature >= transform { + *feature = transform; + } + } + } + } + OdessHash { hash: features } + } +} + +impl Default for OdessHasher { + fn default() -> Self { + Self::new(7) + } +} + +impl OdessHasher { 
+ pub fn new(sampling_ratio: u32) -> Self { + // Инициализация коэффициентов для линейных преобразований + let linear_coeffs = [0x3f9c9a5d4e8a3b2a, 0x7d4f1b2c3a6e5d8c, 0x1a2b3c4d5e6f7a8b]; + + OdessHasher { + sampling_rate: 1u64 << sampling_ratio, + linear_coeffs, + } + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..a6336ed --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,105 @@ +use crate::decoder::Decoder; +pub use chunkfs_sbc::SBCScrubber; +use hasher::SBCHash; +use std::collections::HashMap; + +mod chunkfs_sbc; +pub mod clusterer; +pub mod decoder; +pub mod encoder; +pub mod hasher; + +/// Represents the type of a chunk stored in the filesystem. +/// +/// # There are two variants: +/// - `Simple`: The chunk is stored in its entirety (raw data). +/// - `Delta`: The chunk is stored as a delta-encoded difference relative to a parent chunk. +/// +/// # Type Parameters +/// +/// * `Hash` - A type that implements the `SBCHash` trait, representing the hash of the parent chunk. +#[derive(Hash, PartialEq, Eq, Clone, Default, Debug)] +enum ChunkType { + /// The chunk is stored as a delta relative to a parent chunk. + Delta { + /// The hash of the parent chunk. + parent_hash: Hash, + /// The delta chunk's sequence number. + number: u16, + }, + /// The chunk is stored in full (non-delta). + #[default] + Simple, +} + +/// A key identifying a chunk stored in the filesystem. +/// +/// This structure uniquely represents a chunk by combining its content hash and its storage type. +/// +/// # Type Parameters +/// +/// * `H` - A hash type implementing the `SBCHash` trait, used to identify the chunk content. +/// +/// # Fields +/// +/// * `hash` - The hash of the chunk's content. +/// * `chunk_type` - The type of the chunk, indicating whether it is stored as a full chunk or as a delta. +#[derive(Hash, PartialEq, Eq, Clone, Default)] +pub struct SBCKey { + /// The hash identifying the chunk content. + hash: H, + + /// The type of the chunk (simple or delta). 
+ chunk_type: ChunkType, +} + +/// A storage map for chunks in the filesystem. +/// +/// `SBCMap` manages a collection of chunks identified by their keys (`SBCKey`), +/// storing the raw chunk data as byte vectors. It also holds a decoder instance +/// used for decoding chunk data when needed. +/// +/// # Type Parameters +/// +/// * `D` - The decoder type implementing the `Decoder` trait, responsible for decoding chunk bytes. +/// * `H` - The hash type implementing the `SBCHash` trait, used to identify chunks. +/// +/// # Fields +/// +/// * `sbc_hashmap` - A `HashMap` mapping each chunk's key to its raw byte data. +/// * `decoder` - An instance of the decoder used to interpret chunk data. +/// +/// # Example +/// +/// ``` +/// use sbc_algorithm::decoder::LevenshteinDecoder; +/// use sbc_algorithm::hasher::AronovichHash; +/// use sbc_algorithm::SBCMap; +/// +/// let mut map: SBCMap = SBCMap::new(LevenshteinDecoder::default()); +/// ``` +pub struct SBCMap { + /// Internal storage mapping chunk keys to their raw byte content. + sbc_hashmap: HashMap, Vec>, + + /// Decoder instance used to decode chunk data. + decoder: D, +} + +impl SBCMap { + /// Creates a new, empty `SBCMap` with the given decoder. + /// + /// # Arguments + /// + /// * `_decoder` - An instance of a decoder implementing the `Decoder` trait. + /// + /// # Returns + /// + /// A new `SBCMap` ready to store chunks and decode them on demand. 
+ pub fn new(decoder: D) -> Self { + SBCMap { + sbc_hashmap: HashMap::new(), + decoder, + } + } +} diff --git a/tests/sbc_tests.rs b/tests/sbc_tests.rs new file mode 100644 index 0000000..bc8b39d --- /dev/null +++ b/tests/sbc_tests.rs @@ -0,0 +1,70 @@ +#[cfg(test)] +mod test { + extern crate chunkfs; + extern crate sbc_algorithm; + use chunkfs::chunkers::SuperChunker; + use chunkfs::hashers::Sha256Hasher; + use chunkfs::FileSystem; + use sbc_algorithm::decoder::{GdeltaDecoder, LevenshteinDecoder}; + use sbc_algorithm::encoder::{GdeltaEncoder, LevenshteinEncoder}; + use sbc_algorithm::{clusterer, hasher, SBCMap, SBCScrubber}; + use std::collections::HashMap; + + #[test] + fn test_data_recovery_levenshtein() { + let mut fs = FileSystem::new_with_scrubber( + HashMap::default(), + SBCMap::new(LevenshteinDecoder::default()), + Box::new(SBCScrubber::new( + hasher::AronovichHasher, + clusterer::GraphClusterer::default(), + LevenshteinEncoder::default(), + )), + Sha256Hasher::default(), + ); + let mut handle = fs + .create_file("file".to_string(), SuperChunker::default()) + .unwrap(); + let data = generate_data(3); + fs.write_to_file(&mut handle, &data).unwrap(); + fs.close_file(handle).unwrap(); + + let _res = fs.scrub().unwrap(); + + let handle = fs.open_file("file", SuperChunker::default()).unwrap(); + let read = fs.read_file_complete(&handle).unwrap(); + assert_eq!(read, data); + } + + #[test] + fn test_data_recovery_gdelta() { + let mut fs = FileSystem::new_with_scrubber( + HashMap::default(), + SBCMap::new(GdeltaDecoder::default()), + Box::new(SBCScrubber::new( + hasher::AronovichHasher, + clusterer::GraphClusterer::default(), + GdeltaEncoder::default(), + )), + Sha256Hasher::default(), + ); + let mut handle = fs + .create_file("file".to_string(), SuperChunker::default()) + .unwrap(); + let data = generate_data(8); + fs.write_to_file(&mut handle, &data).unwrap(); + fs.close_file(handle).unwrap(); + + let _res = fs.scrub().unwrap(); + + let handle = 
fs.open_file("file", SuperChunker::default()).unwrap(); + let read = fs.read_file_complete(&handle).unwrap(); + assert_eq!(read, data); + } + const MB: usize = 1024 * 1024; + + fn generate_data(mb_size: usize) -> Vec<u8> { + let bytes = mb_size * MB; + (0..bytes).map(|_| rand::random::<u8>()).collect() + } +}