diff --git a/Cargo.lock b/Cargo.lock index 6c8ae86..bf67ece 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -142,6 +142,15 @@ version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "bstr" version = "1.9.1" @@ -314,6 +323,15 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +[[package]] +name = "cpufeatures" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +dependencies = [ + "libc", +] + [[package]] name = "crossbeam-channel" version = "0.5.13" @@ -379,6 +397,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "datapod" version = "0.1.0" @@ -392,6 +420,7 @@ dependencies = [ "rayon", "semver", "serde", + "sha2", "thiserror", "toml", ] @@ -400,6 +429,16 @@ dependencies = [ name = "dataset" version = "0.1.0" +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "dyn-clone" version = "1.0.17" @@ -470,6 +509,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673" +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -1490,6 +1539,17 @@ dependencies = [ "serde", ] +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "simdutf8" version = "0.1.4" @@ -1700,6 +1760,12 @@ dependencies = [ "winnow", ] +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + [[package]] name = "unicode-ident" version = "1.0.12" diff --git a/crates/datapod/Cargo.toml b/crates/datapod/Cargo.toml index 95d3b5e..8fe4be7 100644 --- a/crates/datapod/Cargo.toml +++ b/crates/datapod/Cargo.toml @@ -15,6 +15,7 @@ polars.workspace = true rayon.workspace = true semver.workspace = true serde.workspace = true +sha2 = "0.10.8" thiserror.workspace = true toml.workspace = true diff --git a/crates/datapod/src/commands/index.rs b/crates/datapod/src/commands/index.rs index 004d043..ae6846f 100644 --- a/crates/datapod/src/commands/index.rs +++ b/crates/datapod/src/commands/index.rs @@ -43,6 +43,7 @@ struct Row { idn: String, path: PathBuf, len: u64, + hash: String, } impl TryFrom<&PathBuf> for Row { @@ -54,6 +55,7 @@ impl TryFrom<&PathBuf> for Row { idn: doc.idn(), path: path.into(), len: doc.len(), + hash: doc.hash(), }) } } @@ -89,12 +91,14 @@ pub(crate) fn execute(args: Index) -> DatapodResult<()> { let mut remote: Vec<&str> = vec![]; let mut path: Vec = vec![]; let mut len: Vec = vec![]; + let mut hash: Vec = vec![]; for row in rows.into_iter() { idn.push(row.idn); remote.push(&config.metadata.name); path.push(relpath(&row.path, base_dir)); len.push(row.len); + hash.push(row.hash[0..12].to_string()); } let mut df = DataFrame::new(vec![ @@ -102,6 +106,7 @@ pub(crate) fn execute(args: Index) -> DatapodResult<()> { Series::new("remote", remote), Series::new("path", path), Series::new("len", len), + Series::new("hash", hash), ])?; match args.output { diff --git a/crates/datapod/src/document.rs b/crates/datapod/src/document.rs index 1d0718d..9786f76 100644 --- a/crates/datapod/src/document.rs +++ b/crates/datapod/src/document.rs @@ -1,8 +1,10 @@ +use std::fmt::Write; use std::fs::File; use std::io::Read; use std::path::{Path, PathBuf}; use bstr::BString; +use sha2::{Digest, Sha256}; use crate::error::DatapodResult; @@ -33,15 +35,20 @@ impl Document { } /// Returns the length of the document in bytes. - /// - /// ```rust - /// use document_stats::Document; - /// - /// let doc = Document::new("a ∉ ℕ"); - /// assert_eq!(doc.len(), 9); - /// ``` #[inline] pub(crate) fn len(&self) -> u64 { self.buf.len() as u64 } + + /// Returns the SHA256 digest of the document. + pub(crate) fn hash(&self) -> String { + let mut hasher = Sha256::new(); + hasher.update(&self.buf); + + let hash = hasher.finalize(); + hash.iter().fold(String::new(), |mut out, b| { + let _ = write!(out, "{b:02x}"); + out + }) + } }