Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
141 commits
Select commit Hold shift + click to select a range
0502623
clustering and delta coding
maxscherbakov Apr 15, 2024
34e1f4b
Delete code_dir/target directory
maxscherbakov Apr 15, 2024
e5df667
Delete code_dir/Cargo.lock
maxscherbakov Apr 15, 2024
7b4e0f4
delete folders with modules
maxscherbakov Apr 15, 2024
81560a4
add modules
maxscherbakov Apr 15, 2024
cb34836
delete new_branch
maxscherbakov Apr 15, 2024
5694e08
test fastCDC
maxscherbakov Apr 15, 2024
fe49f75
fix from review
maxscherbakov Apr 15, 2024
792a52c
fix from review
maxscherbakov Apr 15, 2024
c87ba63
remove ChunkWithDeltaCode
maxscherbakov Apr 18, 2024
f185382
encode data from a file
maxscherbakov Apr 18, 2024
12f6c2f
decode data from a file
maxscherbakov Apr 18, 2024
6d926fd
decode data from a file
maxscherbakov Apr 18, 2024
7fae8d7
fix from review
maxscherbakov Apr 19, 2024
c82ff6e
add unit tests module
maxscherbakov Apr 20, 2024
2fb4de2
processing of c-spectrum elements in hash-function
maxscherbakov Apr 30, 2024
2035f64
cargo clippy and fmt fix
maxscherbakov May 8, 2024
48e3499
refactor directory
maxscherbakov May 8, 2024
2d0b035
add function for find leader_chunk
maxscherbakov May 10, 2024
0005e3a
refactor code, chunks to enum
maxscherbakov May 19, 2024
b4151e5
Delete .idea directory
maxscherbakov May 19, 2024
85613cd
merge
maxscherbakov May 19, 2024
a3f556a
merge
maxscherbakov May 19, 2024
c24b856
levenshtein_functions param to &[u8]
maxscherbakov May 19, 2024
8682062
add tests
maxscherbakov May 19, 2024
267588b
add trait Map, add hashmap_transitions
maxscherbakov May 19, 2024
c9069e2
replace cdc_hash to index_chunk (conflict correction)
maxscherbakov May 19, 2024
d81e69a
rename parameters
maxscherbakov May 19, 2024
ed881d3
fix index for create_indexes
maxscherbakov May 19, 2024
5c0eb43
fix warning in tests
maxscherbakov May 19, 2024
4b9344e
move tests
maxscherbakov May 19, 2024
29d05c9
cargo fmt
maxscherbakov May 19, 2024
db3bf10
add code for DeltaAction
maxscherbakov May 20, 2024
6171469
in Cargo.toml move fastcdc to dev-dependencies
maxscherbakov May 21, 2024
9575d89
add trait Database for SBCMap
maxscherbakov May 24, 2024
3276fe7
add scrubber prototype
maxscherbakov May 24, 2024
de87bbe
fix scrubber prototype
maxscherbakov May 24, 2024
dfbccea
completed the trait of the database
maxscherbakov May 25, 2024
11043fe
completed scrubber
maxscherbakov May 25, 2024
103171a
add tests for trait Database, fix remove method in Database
maxscherbakov May 25, 2024
09630ed
delete transitions_hashmap from sbc_map, complete scrubber, connect g…
maxscherbakov May 25, 2024
35441a7
transferring graph to scrubber
maxscherbakov May 26, 2024
12347ea
transferring delta_chunk to scrubber
maxscherbakov May 27, 2024
e7ec703
add enum to hash, return delta_chunks to SBCMap, refactored Scrubber,…
maxscherbakov Jun 5, 2024
a1dabc0
fix get for delta_chunk
maxscherbakov Jun 5, 2024
b193cd9
add tests for scrubber, fix get in database for delta_chunk
maxscherbakov Jun 6, 2024
24948f5
add new() for SBCScrubber
maxscherbakov Jun 6, 2024
4923d9e
fix create_edges for graph
maxscherbakov Jun 6, 2024
d561521
changed the structure of the scrubber
maxscherbakov Jul 16, 2024
3a89560
pub SBCScrubber::new()
maxscherbakov Jul 16, 2024
8e1defa
scrubbing over multiple chunks
maxscherbakov Jul 18, 2024
09ee3b8
deleted rank of vertex of graph and add default for SBCMap, SBCScrubber
maxscherbakov Jul 20, 2024
1c0bcee
add README.md
maxscherbakov Aug 1, 2024
13d473e
Update README.md
maxscherbakov Aug 1, 2024
1374b00
Merge pull request #6 from maxscherbakov/branch-for-integration
maxscherbakov Aug 1, 2024
5340533
add fn set_for_chunk, rename file and variables
maxscherbakov Sep 30, 2024
f53a6b8
add CI
maxscherbakov Sep 30, 2024
de41364
add rabin_hash
maxscherbakov Oct 1, 2024
f055010
fix types for rabin_hash
maxscherbakov Oct 11, 2024
5b10c06
Merge branch 'refs/heads/branch-for-integration'
maxscherbakov Oct 11, 2024
d127add
refactor code
maxscherbakov Oct 11, 2024
8933b8e
fix runner dependencies
maxscherbakov Oct 11, 2024
b002315
fix path for sbc_algorithm in runner
maxscherbakov Nov 11, 2024
3e93a64
make changes to the scrubber
maxscherbakov Nov 11, 2024
024d790
change MAX_WEIGHT_EDGE
maxscherbakov Nov 11, 2024
e8be541
fix encode for delta chunk
maxscherbakov Nov 11, 2024
d29e5bb
fix test_data_recovery
maxscherbakov Nov 11, 2024
046b0d2
fix main.rs
maxscherbakov Nov 11, 2024
a044b1c
fix find_parent_chunk_in_cluster
maxscherbakov Nov 12, 2024
fed38d9
replace FileSystem::new with new_with_scrubber
maxscherbakov Nov 12, 2024
99e162c
fix tests
maxscherbakov Nov 25, 2024
6f8782e
fix sbc_hash
maxscherbakov Nov 25, 2024
c325a9b
fix for the chunkfs update
maxscherbakov Nov 25, 2024
979bad9
fix hash function, add tests
maxscherbakov Feb 4, 2025
d5c25bb
fix features, fmt
maxscherbakov Feb 4, 2025
1331f97
add dataset
maxscherbakov Feb 4, 2025
d43caf6
update README.md
maxscherbakov Feb 4, 2025
2cfeaa6
add gdelta method and refactor
maxscherbakov Feb 19, 2025
5e80f89
update README.md
maxscherbakov Mar 1, 2025
3c15561
make hash calculation parallel
maxscherbakov Mar 1, 2025
0635b85
fix from review
maxscherbakov Mar 19, 2025
63d5660
make parallelization of the delta code calculation
maxscherbakov Mar 23, 2025
e85acae
fix CI
maxscherbakov Mar 25, 2025
00b3818
refactor code
maxscherbakov Mar 30, 2025
02226c2
make SBCHash a trait
maxscherbakov Mar 31, 2025
b08b39e
fix CI and parallelism
maxscherbakov Mar 31, 2025
3c00c89
update README.md
maxscherbakov Mar 31, 2025
cd57edd
add xdelta method
maxscherbakov Apr 30, 2025
b0bf7da
add odess hasher and eq clusterer
maxscherbakov May 17, 2025
27f27a5
fix fmt
maxscherbakov May 17, 2025
848e10e
fix CI
maxscherbakov Jul 9, 2025
e3721dc
update README.md
maxscherbakov Jul 9, 2025
2e309aa
fix cargo clippy only for CI
maxscherbakov Jul 9, 2025
2039fdd
refactor: добавить комментарии к adler32 и для безопасности сделать в…
ArtemNikit1n Jul 14, 2025
fd2b1e2
refactor: Вынести логику добавления insert инструкции в отдельную фун…
ArtemNikit1n Jul 15, 2025
b3bd181
test: Добавить тестов на encode_insert_sequence и поправить реализацию
ArtemNikit1n Jul 15, 2025
ef4d068
test: Вынести Copy инструкцию в функцию, починить create_block_hashma…
ArtemNikit1n Jul 16, 2025
32b6df6
refactor: Вынести инструкции копирования и вставки в отедбные функции…
ArtemNikit1n Jul 16, 2025
c138dc7
style: Добавить документацию
ArtemNikit1n Jul 16, 2025
a51e527
style: Убрать предупреждение о дублировании кода в тестах
ArtemNikit1n Jul 16, 2025
d215166
Merge pull request #9 from ArtemNikit1n/xdelta-refactor
maxscherbakov Jul 17, 2025
1ef7e39
fix cargo fmt
maxscherbakov Jul 17, 2025
ab671e8
feat: Add a function for calculating the hash and tests for it
ArtemNikit1n Jul 18, 2025
406dff5
feat: Add types and function build_triplet_lookup_table (with tests)
ArtemNikit1n Jul 18, 2025
447cfe2
feat: Add encode_cluster
ArtemNikit1n Jul 18, 2025
fed73e9
feat: Add update_after_match (with tests) and default
ArtemNikit1n Jul 20, 2025
7e531f4
feat: Add a best match search function and tests to it
ArtemNikit1n Jul 20, 2025
94894bb
chore: Add thiserror, huffman-compress, bit-vec
ArtemNikit1n Jul 21, 2025
10ac5d4
feat: Add encode
ArtemNikit1n Jul 21, 2025
dc02f46
feat: Add error hierarchy
ArtemNikit1n Jul 21, 2025
36185c8
feat: Add pointer update after match and fix error handling
ArtemNikit1n Jul 21, 2025
25b32f6
style: Add documentation for the remaining functions
ArtemNikit1n Jul 22, 2025
90c1ad3
feat: Add error handling
ArtemNikit1n Jul 22, 2025
8cff8cc
feat: Add collision limit and smart pointer handling
ArtemNikit1n Jul 22, 2025
0d3cbe2
feat: Add huffman_to_raw and test
ArtemNikit1n Jul 23, 2025
a71350e
feat: Add decoder and fix bug with updating pointers (huffman doesn't…
ArtemNikit1n Jul 24, 2025
847ff30
fix: Fix huffman encoding
ArtemNikit1n Jul 24, 2025
c60828b
style: Fix clippy warnings
ArtemNikit1n Jul 24, 2025
6c5a810
chore: Remove auxiliary test
ArtemNikit1n Jul 24, 2025
0766150
feat: Add gear_chunking and tests
ArtemNikit1n Jul 25, 2025
436d685
style: Remove unnecessary conditions, types and add names for indexes
ArtemNikit1n Jul 25, 2025
846d8a3
feat: Add encode_cluster and build_chunks_indices (find_match not fin…
ArtemNikit1n Jul 25, 2025
cc02128
feat: Add encode_delta_chunk with tests
ArtemNikit1n Jul 28, 2025
2f1181c
style: fix clippy warnings
ArtemNikit1n Jul 28, 2025
fa812f2
feat: Add CompressionIsPriority optimization
ArtemNikit1n Jul 29, 2025
3382597
feat: Add SpeedIsPriority optimization
ArtemNikit1n Jul 30, 2025
d6ac243
fix: Make Edelta a public interface
ArtemNikit1n Jul 30, 2025
a2d822a
Merge pull request #10 from ArtemNikit1n/zdelta
maxscherbakov Jul 31, 2025
24ee82e
fix cargo fmt
maxscherbakov Jul 31, 2025
f151c03
Merge branch 'main' into Edelta
maxscherbakov Jul 31, 2025
3fb1152
Merge pull request #12 from ArtemNikit1n/Edelta
maxscherbakov Jul 31, 2025
9e7a3b2
fix cargo fmt
maxscherbakov Jul 31, 2025
598b37e
feat: Add an unstable implementation of cluster metrics for the Eq cl…
ArtemNik1tin Dec 10, 2025
b088189
test: Add tests for scrub_measurements
ArtemNik1tin Dec 10, 2025
e4f02b3
feat: Add ClusteringMeasurements for GraphClusterer
ArtemNik1tin Dec 11, 2025
91c066d
ci: fix Formatting
ArtemNik1tin Dec 11, 2025
37287e5
Merge pull request #13 from ArtemNikit1n/cluster-benchmark
maxscherbakov Dec 11, 2025
f0c661b
test: fix data generation
ArtemNik1tin Dec 16, 2025
1a72005
style: fix Formatting
ArtemNik1tin Dec 16, 2025
784a388
fix: Account for Adler32 hash collisions
ArtemNik1tin Dec 16, 2025
9d20e46
Merge pull request #14 from maxscherbakov/ddelta
ArtemNikit1n Dec 16, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# CI workflow: checks formatting and lints, then builds, tests and runs the workspace.
name: Build and test

# Triggered on every push.
on: [ push ]

env:
# Force coloured cargo output in the job logs.
CARGO_TERM_COLOR: always

jobs:
build:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
# Fails if any file is not formatted according to rustfmt.
- name: Formatting
run: cargo fmt -- --check
# `-D warnings` turns every clippy warning into an error.
- name: Clippy
run: cargo clippy --all-targets --tests -- -D warnings
- name: Build
run: cargo build --all-features --verbose
- name: Run tests
run: cargo test --verbose
# Runs the `runner` workspace member in release mode.
- name: Run binary
run: cargo run -p runner --verbose --release
21 changes: 21 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[workspace]
members = ["runner"]

[package]
name = "sbc_algorithm"
version = "0.1.0"
edition = "2021"

[dependencies]
chunkfs = { git = "https://github.com/Piletskii-Oleg/chunkfs.git" }
rayon = "1.10"
zstd = "0.13"
thiserror = "2.0.12"
huffman-compress = "0.6.1"
bit-vec = "0.6.3"
log = "0.4.27"
fasthash = "0.4.0"

[dev-dependencies]
chunkfs = { git = "https://github.com/Piletskii-Oleg/chunkfs.git", features = ["chunkers", "hashers"]}
rand = "0.8.5"
59 changes: 59 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
## Similarity Based Chunking Scrubber
SBC Scrubber is a scrubber that can be used to implement different SBC algorithms with ChunkFS.

SBC Scrubber is currently under active development; breaking changes can happen at any time.

## Usage

Add the following dependency to your `Cargo.toml`:

```toml
[dependencies]
chunkfs = { version = "0.1", features = ["chunkers", "hashers"] }
sbc_algorithm = { git = "https://github.com/maxscherbakov/sbc_algorithm.git" }
```

## Example

```rust
extern crate chunkfs;
extern crate sbc_algorithm;

use chunkfs::chunkers::{SizeParams, SuperChunker};
use chunkfs::hashers::Sha256Hasher;
use chunkfs::FileSystem;
use sbc_algorithm::{clusterer, decoder, encoder, hasher};
use sbc_algorithm::{SBCMap, SBCScrubber};
use std::collections::HashMap;
use std::io;

/// Writes 1 MiB of identical bytes into a `FileSystem`, reads it back, scrubs the
/// storage and prints the dedup ratios measured before and after scrubbing.
///
/// # Errors
///
/// Returns the `io::Error` of the first file-system or scrub operation that fails.
///
/// # Panics
///
/// Panics if the number of bytes read back differs from the number written.
fn main() -> io::Result<()> {
    // 1 MiB buffer in which every byte is 10.
    let data = vec![10; 1024 * 1024];
    // NOTE(review): presumably min / avg / max chunk sizes in bytes — confirm against chunkfs.
    let chunk_size = SizeParams::new(2 * 1024, 8 * 1024, 16 * 1024);
    let mut fs = FileSystem::new_with_scrubber(
        HashMap::default(),
        SBCMap::new(decoder::GdeltaDecoder::new(false)),
        Box::new(SBCScrubber::new(
            hasher::AronovichHasher,
            clusterer::GraphClusterer::default(),
            encoder::GdeltaEncoder::new(false),
        )),
        Sha256Hasher::default(),
    );

    let mut handle = fs.create_file("file".to_string(), SuperChunker::new(chunk_size))?;
    fs.write_to_file(&mut handle, &data)?;
    fs.close_file(handle)?;

    let read_handle = fs.open_file_readonly("file")?;
    let read = fs.read_file_complete(&read_handle)?;

    // Take the CDC ratio before scrubbing so it can be compared with the ratio afterwards.
    let cdc_dedup_ratio = fs.cdc_dedup_ratio();
    // Propagate a scrub failure like every other fallible call here instead of panicking.
    let res = fs.scrub()?;
    let sbc_dedup_ratio = fs.total_dedup_ratio();

    println!("CDC dedup ratio: {}", cdc_dedup_ratio);
    println!("SBC dedup ratio: {}", sbc_dedup_ratio);
    println!("ScrubMeasure: {:?}", res);
    assert_eq!(read.len(), data.len());
    Ok(())
}
```
9 changes: 9 additions & 0 deletions runner/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[package]
name = "runner"
version = "0.1.0"
edition = "2021"


[dependencies]
sbc_algorithm = { path = ".." }
chunkfs = { git = "https://github.com/Piletskii-Oleg/chunkfs.git", features = ["chunkers", "hashers"]}
Binary file added runner/files/ferris.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added runner/files/my_data
Binary file not shown.
40 changes: 40 additions & 0 deletions runner/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
extern crate chunkfs;
extern crate sbc_algorithm;

use chunkfs::chunkers::{SizeParams, SuperChunker};
use chunkfs::hashers::Sha256Hasher;
use chunkfs::FileSystem;
use sbc_algorithm::{clusterer, decoder, encoder, hasher};
use sbc_algorithm::{SBCMap, SBCScrubber};
use std::collections::HashMap;
use std::{fs, io};

/// Runs the SBC scrubber on the bundled sample file and prints dedup statistics.
///
/// The sample is written into a `FileSystem` through a `SuperChunker`, read back, and
/// then scrubbed using the Aronovich hasher, the graph clusterer and the Gdelta encoder.
/// The dedup ratio is printed as measured before and after scrubbing, together with the
/// value returned by the scrub.
///
/// # Errors
///
/// Returns the `io::Error` raised while reading the sample file, or by the first
/// file-system or scrub operation that fails.
///
/// # Panics
///
/// Panics if the number of bytes read back differs from the number written.
fn main() -> io::Result<()> {
    // Relative path: resolved against the process working directory.
    let data = fs::read("runner/files/my_data")?;
    // NOTE(review): presumably min / avg / max chunk sizes in bytes — confirm against chunkfs.
    let chunk_size = SizeParams::new(2 * 1024, 8 * 1024, 16 * 1024);
    // From here on `fs` names the file system and shadows the `std::fs` module.
    let mut fs = FileSystem::new_with_scrubber(
        HashMap::default(),
        SBCMap::new(decoder::GdeltaDecoder::new(false)),
        Box::new(SBCScrubber::new(
            hasher::AronovichHasher,
            clusterer::GraphClusterer::default(),
            encoder::GdeltaEncoder::new(false),
        )),
        Sha256Hasher::default(),
    );

    let mut handle = fs.create_file("file".to_string(), SuperChunker::new(chunk_size))?;
    fs.write_to_file(&mut handle, &data)?;
    fs.close_file(handle)?;

    let read_handle = fs.open_file_readonly("file")?;
    let read = fs.read_file_complete(&read_handle)?;

    // Take the CDC ratio before scrubbing so it can be compared with the ratio afterwards.
    let cdc_dedup_ratio = fs.cdc_dedup_ratio();
    // Propagate a scrub failure like every other fallible call here instead of panicking.
    let res = fs.scrub()?;
    let sbc_dedup_ratio = fs.total_dedup_ratio();

    println!("CDC dedup ratio: {}", cdc_dedup_ratio);
    println!("SBC dedup ratio: {}", sbc_dedup_ratio);
    println!("ScrubMeasure: {:?}", res);
    assert_eq!(read.len(), data.len());
    Ok(())
}
Loading