diff --git a/src/encoder/ddelta_encoder.rs b/src/encoder/ddelta_encoder.rs
index 0a677e3..a2314c7 100644
--- a/src/encoder/ddelta_encoder.rs
+++ b/src/encoder/ddelta_encoder.rs
@@ -71,16 +71,17 @@ impl Encoder for DdeltaEncoder {
             let mut target_hash = SBCKey::default();
             match data_container.extract() {
                 Data::Chunk(data) => {
-                    let (left, processed, sbc_hash) = self.encode_delta_chunk(
-                        target_map.clone(),
-                        data,
-                        hash.clone(),
-                        parent_data.as_slice(),
-                        &mut source_chunks_indices,
-                        parent_hash.clone(),
-                    );
-                    data_left += left;
-                    processed_data += processed;
+                    let (left_in_delta_chunk, processed_in_delta_chunk, sbc_hash) = self
+                        .encode_delta_chunk(
+                            target_map.clone(),
+                            data,
+                            hash.clone(),
+                            parent_data.as_slice(),
+                            &mut source_chunks_indices,
+                            parent_hash.clone(),
+                        );
+                    data_left += left_in_delta_chunk;
+                    processed_data += processed_in_delta_chunk;
                     target_hash = sbc_hash;
                 }
                 Data::TargetChunk(_) => {}
@@ -497,13 +498,14 @@ fn build_chunks_indices(source_chunks: &Vec<&[u8]>) -> HashMap<u64, usize> {
 fn gear_chunking(data: &[u8]) -> Vec<&[u8]> {
     let mut source_chunks: Vec<&[u8]> = Vec::new();
     let mut current_window_hash: u64 = 0;
-    let mut start_current_chunk: usize = 0;
+    let mut start_current_chunk = 0;
     let mask = (1 << AVERAGE_CHUNK_SIZE.next_power_of_two().trailing_zeros()) - 1;
-    let mut data_index: usize = 0;
+    let mut data_index = 0;
 
     while data_index < data.len() {
         current_window_hash =
             (current_window_hash << 1).wrapping_add(GEAR[data[data_index] as usize]);
+
         if (current_window_hash & mask) == CHUNK_THRESHOLD {
             source_chunks.push(&data[start_current_chunk..data_index]);
             start_current_chunk = data_index;
@@ -528,7 +530,8 @@ mod test {
     };
     use crate::encoder::encode_simple_chunk;
     use crate::hasher::AronovichHash;
-    use rand::Rng;
+    use rand::prelude::StdRng;
+    use rand::{Rng, SeedableRng};
 
     #[test]
     fn process_target_chunk_with_edelta_should_process_full_match_with_compression_priority() {
@@ -793,7 +796,7 @@ mod test {
         let chunk_indices = build_chunks_indices(&source_chunks);
 
         assert_eq!(
-            find_match_compression_is_priority(source_data, &chunk_indices, 0, &target_chunks,),
+            find_match_compression_is_priority(source_data, &chunk_indices, 0, &target_chunks),
             Some((11, 1, 11, 0))
         )
     }
@@ -1064,7 +1067,7 @@ mod test {
 
     #[test]
     fn test_restore_similarity_chunk_with_cyclic_shift_left() {
-        let data: Vec<u8> = generate_test_data();
+        let data: Vec<u8> = generate_test_data_deterministic(42);
         let mut data2 = data[..192].to_vec();
         data2.extend(&data);
 
@@ -1087,6 +1090,12 @@ mod test {
         (0..TEST_DATA_SIZE).map(|_| rand::random::<u8>()).collect()
     }
 
+    fn generate_test_data_deterministic(seed: u64) -> Vec<u8> {
+        const TEST_DATA_SIZE: usize = 8192;
+        let mut rng = StdRng::seed_from_u64(seed);
+        (0..TEST_DATA_SIZE).map(|_| rng.gen()).collect()
+    }
+
     fn create_map_and_key<'a>(
         data: &'a [u8],
         data2: &'a [u8],
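Note on the gear_chunking hunk: the loop shifts a rolling hash left by one bit per input byte and mixes in a per-byte table entry, so each boundary decision depends only on a trailing window of the data (a byte's influence shifts out of the 64-bit register after 64 steps), which is what makes the chunking robust to insertions. A minimal self-contained sketch, assuming a toy table and a zero threshold; the crate's real GEAR table, AVERAGE_CHUNK_SIZE, and CHUNK_THRESHOLD values are not visible in this diff:

```rust
/// Toy stand-in for the crate's 256-entry GEAR table (assumption: the real
/// table is precomputed random u64s; any fixed pseudo-random map works here).
fn gear(byte: u8) -> u64 {
    (byte as u64 ^ 0xA5).wrapping_mul(0x9E37_79B9_7F4A_7C15)
}

/// Gear content-defined chunking, mirroring the loop in gear_chunking:
/// shift the rolling hash, mix in the next byte's table entry, and declare
/// a boundary whenever the masked hash hits the threshold.
fn chunk_boundaries(data: &[u8], average_chunk_size: usize) -> Vec<usize> {
    // Same mask construction as the patch: next_power_of_two(avg) - 1.
    let mask = (average_chunk_size.next_power_of_two() as u64) - 1;
    let mut hash: u64 = 0;
    let mut boundaries = Vec::new();
    for (index, &byte) in data.iter().enumerate() {
        hash = (hash << 1).wrapping_add(gear(byte));
        // The crate compares against CHUNK_THRESHOLD; 0 is a common choice.
        if hash & mask == 0 {
            boundaries.push(index);
        }
    }
    boundaries
}
```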
diff --git a/src/encoder/xdelta_encoder.rs b/src/encoder/xdelta_encoder.rs
index 1f7fd86..1615bfd 100644
--- a/src/encoder/xdelta_encoder.rs
+++ b/src/encoder/xdelta_encoder.rs
@@ -61,7 +61,7 @@ impl XdeltaEncoder {
         chunk_data: &[u8],
         hash: Hash,
         parent_data: &[u8],
-        word_hash_offsets: &HashMap<u32, usize>,
+        word_hash_offsets: &HashMap<u32, Vec<usize>>,
         parent_hash: Hash,
     ) -> (usize, usize, SBCKey) {
         let mut delta_code = Vec::new();
@@ -213,25 +213,44 @@ fn encode_copy_sequence(
     i: &mut usize,
     delta_code: &mut Vec<u8>,
     initial_hash: u32,
-    word_hash_offsets: &HashMap<u32, usize>,
+    word_hash_offsets: &HashMap<u32, Vec<usize>>,
 ) {
     if *i >= chunk_data.len() || !word_hash_offsets.contains_key(&initial_hash) {
         return;
     }
 
-    let mut equal_part_len = 0;
-    let offset = *word_hash_offsets.get(&initial_hash).unwrap();
-    let max_len = min(parent_data.len() - offset, chunk_data.len() - *i);
+    let offsets = match word_hash_offsets.get(&initial_hash) {
+        Some(v) => v,
+        None => return,
+    };
+
+    let mut best_len = 0;
+    let mut best_offset = 0;
+
+    for &offset in offsets {
+        let max_len = min(parent_data.len() - offset, chunk_data.len() - *i);
+        let mut equal_part_len = 0;
 
-    while equal_part_len < max_len
-        && parent_data[offset + equal_part_len] == chunk_data[*i + equal_part_len]
-    {
-        equal_part_len += 1;
+        while equal_part_len < max_len
+            && parent_data[offset + equal_part_len] == chunk_data[*i + equal_part_len]
+        {
+            equal_part_len += 1;
+        }
+
+        if equal_part_len > best_len {
+            best_len = equal_part_len;
+            best_offset = offset;
+        }
     }
 
-    if equal_part_len > 0 {
-        encode_copy_instruction(equal_part_len, offset, delta_code);
-        *i += equal_part_len;
+    if best_len > 0 {
+        encode_copy_instruction(best_len, best_offset, delta_code);
+        *i += best_len;
+    } else {
+        let end = min(*i + BLOCK_SIZE, chunk_data.len());
+        let insert_data = chunk_data[*i..end].to_vec();
+        encode_insert_instruction(insert_data, delta_code);
+        *i = end;
     }
 }
 
@@ -246,11 +265,11 @@
 fn encode_insert_sequence(
     chunk_data: &[u8],
     i: &mut usize,
-    word_hash_offsets: &HashMap<u32, usize>,
+    word_hash_offsets: &HashMap<u32, Vec<usize>>,
     delta_code: &mut Vec<u8>,
     initial_hash: u32,
 ) {
-    if *i >= chunk_data.len() || word_hash_offsets.contains_key(&initial_hash) {
+    if *i >= chunk_data.len() {
         return;
     }
@@ -298,13 +317,16 @@ fn adler32(data: &[u8]) -> u32 {
 /// HashMap where:
 /// - Key: Adler32 hash of a block.
-/// - Value: First starting position of that block in source_data.
-fn create_block_hashmap(source_data: &[u8]) -> HashMap<u32, usize> {
+/// - Value: All starting positions of that block in source_data.
+fn create_block_hashmap(source_data: &[u8]) -> HashMap<u32, Vec<usize>> {
     let mut i = 0;
     let mut block_position_map = HashMap::new();
 
     while i + BLOCK_SIZE <= source_data.len() {
         let block_hash = adler32(&source_data[i..i + BLOCK_SIZE]);
-        block_position_map.entry(block_hash).or_insert(i);
+        block_position_map
+            .entry(block_hash)
+            .or_insert_with(Vec::new)
+            .push(i);
 
         i += 1;
     }
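The behavioral core of this file's change, condensed into a standalone sketch: the block index now keeps every offset per Adler-32 hash, and the copy encoder scans all candidates for the longest byte-for-byte match instead of trusting a single stored offset. With the old guards, a hash hit whose bytes did not actually match (a collision; apparently what the unchanged `result.len() == 3` assertion below documents, since that input contains four distinct 16-byte blocks) could advance neither the copy path nor the insert path; the new else branch now falls back to a literal insert. Names and the BLOCK_SIZE value here are illustrative, not the crate's:

```rust
use std::collections::HashMap;

const BLOCK_SIZE: usize = 16; // assumption: the crate's real value is not shown

/// Index *all* positions of each block hash, as the new create_block_hashmap does.
fn index_blocks(source: &[u8], hash: impl Fn(&[u8]) -> u32) -> HashMap<u32, Vec<usize>> {
    let mut map: HashMap<u32, Vec<usize>> = HashMap::new();
    let mut i = 0;
    while i + BLOCK_SIZE <= source.len() {
        map.entry(hash(&source[i..i + BLOCK_SIZE])).or_default().push(i);
        i += 1;
    }
    map
}

/// Pick the candidate offset with the longest byte-for-byte extension,
/// mirroring the loop this patch adds to encode_copy_sequence.
/// Offsets are assumed to come from index_blocks, so offset < source.len().
/// Returns (match_length, source_offset).
fn best_match(source: &[u8], target: &[u8], pos: usize, offsets: &[usize]) -> (usize, usize) {
    let mut best = (0, 0);
    for &offset in offsets {
        let max_len = (source.len() - offset).min(target.len() - pos);
        let len = (0..max_len)
            .take_while(|&k| source[offset + k] == target[pos + k])
            .count();
        if len > best.0 {
            best = (len, offset);
        }
    }
    best
}
```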
@@ -317,6 +339,8 @@ mod test {
     use crate::decoder;
     use crate::encoder::encode_simple_chunk;
     use crate::hasher::AronovichHash;
+    use rand::prelude::StdRng;
+    use rand::{Rng, SeedableRng};
 
     const TEST_DATA_SIZE: usize = 8192;
@@ -338,7 +362,10 @@ mod test {
     #[test]
-    fn create_block_hashmap_should_store_first_position_for_duplicate_blocks() {
+    fn create_block_hashmap_should_store_all_positions_for_duplicate_blocks() {
         let data = b"abcdabcdabcdabcdabcdabcdabcdabcd";
         let result = create_block_hashmap(data);
-        assert_eq!(result.get(&adler32(b"abcdabcdabcdabcd")), Some(&0));
+        assert_eq!(
+            result.get(&adler32(b"abcdabcdabcdabcd")),
+            Some(&vec![0, 4, 8, 12, 16])
+        );
         assert_eq!(result.len(), 3);
     }
@@ -396,7 +423,7 @@ mod test {
         let chunk_data = vec![10; 16];
         let mut word_hash_offsets = HashMap::new();
         let hash = adler32(&chunk_data);
-        word_hash_offsets.insert(hash, 0);
+        word_hash_offsets.insert(hash, vec![0]);
 
         let mut delta_code = Vec::new();
         let mut i = 0;
@@ -421,7 +448,7 @@ mod test {
         let hash_second_block = adler32(&chunk_data[16..32]);
 
         let mut word_hash_offsets = HashMap::new();
-        word_hash_offsets.insert(hash_second_block, 16);
+        word_hash_offsets.insert(hash_second_block, vec![16]);
 
         let mut delta_code = Vec::new();
         let mut i = 0;
@@ -561,7 +588,7 @@ mod test {
 
     #[test]
     fn test_restore_similarity_chunk_1_byte_diff() {
-        let mut data: Vec<u8> = generate_test_data();
+        let mut data: Vec<u8> = generate_test_data_deterministic(13);
        let data2 = data.clone();
         if data[15] < 255 {
             data[15] = 255;
@@ -576,7 +603,7 @@ mod test {
 
     #[test]
     fn test_restore_similarity_chunk_2_neighbor_byte_diff() {
-        let mut data: Vec<u8> = generate_test_data();
+        let mut data: Vec<u8> = generate_test_data_deterministic(56);
         let data2 = data.clone();
         if data[15] < 255 {
             data[15] = 255;
@@ -596,7 +623,7 @@ mod test {
 
     #[test]
     fn test_restore_similarity_chunk_2_byte_diff() {
-        let mut data: Vec<u8> = generate_test_data();
+        let mut data: Vec<u8> = generate_test_data_deterministic(35);
         let data2 = data.clone();
         if data[15] < 255 {
             data[15] = 255;
@@ -609,14 +636,14 @@ mod test {
             data[106] = 0;
         }
 
-        let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice());
+        let (sbc_map, sbc_key) = create_map_and_key(&data, &data2);
 
         assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2);
     }
 
     #[test]
     fn test_restore_similarity_chunk_with_offset_left() {
-        let data: Vec<u8> = generate_test_data();
+        let data: Vec<u8> = generate_test_data_deterministic(41);
         let data2 = data[15..].to_vec();
 
         let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice());
@@ -626,7 +653,7 @@ mod test {
 
     #[test]
     fn test_restore_similarity_chunk_with_offset_right() {
-        let data: Vec<u8> = generate_test_data();
+        let data: Vec<u8> = generate_test_data_deterministic(65);
         let data2 = data[..8000].to_vec();
 
         let (sbc_map, sbc_key) = create_map_and_key(data.as_slice(), data2.as_slice());
@@ -636,7 +663,7 @@ mod test {
 
     #[test]
     fn test_restore_similarity_chunk_with_offset() {
-        let data: Vec<u8> = generate_test_data();
+        let data: Vec<u8> = generate_test_data_deterministic(45);
         let mut data2 = data[15..8000].to_vec();
         data2[0] /= 3;
         data2[7000] /= 3;
@@ -648,7 +675,7 @@ mod test {
 
     #[test]
     fn test_restore_similarity_chunk_with_cyclic_shift_right() {
-        let data: Vec<u8> = generate_test_data();
+        let data: Vec<u8> = generate_test_data_deterministic(44);
         let mut data2 = data.clone();
         data2.extend(&data[8000..]);
 
@@ -667,7 +694,7 @@ mod test {
 
     #[test]
     fn test_restore_similarity_chunk_with_cyclic_shift_left() {
-        let data: Vec<u8> = generate_test_data();
+        let data: Vec<u8> = generate_test_data_deterministic(42);
         let mut data2 = data[..192].to_vec();
         data2.extend(&data);
 
@@ -684,8 +711,9 @@ mod test {
         assert_eq!(sbc_map.get(&sbc_key).unwrap(), data2);
     }
 
-    fn generate_test_data() -> Vec<u8> {
-        (0..TEST_DATA_SIZE).map(|_| rand::random::<u8>()).collect()
+    fn generate_test_data_deterministic(seed: u64) -> Vec<u8> {
+        let mut rng = StdRng::seed_from_u64(seed);
+        (0..TEST_DATA_SIZE).map(|_| rng.gen()).collect()
    }
 
     fn create_map_and_key<'a>(
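A note on the test changes in both files: swapping rand::random for a seeded StdRng makes every failure reproducible from the seed written in the test body. A minimal sketch of the pattern (rand 0.8 API; the seed values in the patch are arbitrary):

```rust
use rand::prelude::StdRng;
use rand::{Rng, SeedableRng};

/// Same shape as generate_test_data_deterministic in the patch: one fixed
/// seed per test yields the same byte stream on every run (and, for a fixed
/// rand version, across machines; StdRng is not stable across rand releases).
fn deterministic_bytes(seed: u64, len: usize) -> Vec<u8> {
    let mut rng = StdRng::seed_from_u64(seed);
    (0..len).map(|_| rng.gen()).collect()
}

fn main() {
    // Reproducible: identical seeds give identical data.
    assert_eq!(deterministic_bytes(42, 16), deterministic_bytes(42, 16));
    // Distinct seeds give independent streams (with overwhelming probability).
    assert_ne!(deterministic_bytes(42, 16), deterministic_bytes(43, 16));
}
```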