Skip to content

Commit

Permalink
datapod: add hash column to index (#12)
Browse files Browse the repository at this point in the history
Signed-off-by: Nico Wagner <n.wagner@dnb.de>
  • Loading branch information
nwagner84 authored Jul 10, 2024
1 parent 1eb3e47 commit 4d40127
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 7 deletions.
66 changes: 66 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/datapod/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ polars.workspace = true
rayon.workspace = true
semver.workspace = true
serde.workspace = true
sha2 = "0.10.8"
thiserror.workspace = true
toml.workspace = true

Expand Down
5 changes: 5 additions & 0 deletions crates/datapod/src/commands/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ struct Row {
idn: String,
path: PathBuf,
len: u64,
hash: String,
}

impl TryFrom<&PathBuf> for Row {
Expand All @@ -54,6 +55,7 @@ impl TryFrom<&PathBuf> for Row {
idn: doc.idn(),
path: path.into(),
len: doc.len(),
hash: doc.hash(),
})
}
}
Expand Down Expand Up @@ -89,19 +91,22 @@ pub(crate) fn execute(args: Index) -> DatapodResult<()> {
let mut remote: Vec<&str> = vec![];
let mut path: Vec<String> = vec![];
let mut len: Vec<u64> = vec![];
let mut hash: Vec<String> = vec![];

for row in rows.into_iter() {
idn.push(row.idn);
remote.push(&config.metadata.name);
path.push(relpath(&row.path, base_dir));
len.push(row.len);
hash.push(row.hash[0..12].to_string());
}

let mut df = DataFrame::new(vec![
Series::new("idn", idn),
Series::new("remote", remote),
Series::new("path", path),
Series::new("len", len),
Series::new("hash", hash),
])?;

match args.output {
Expand Down
21 changes: 14 additions & 7 deletions crates/datapod/src/document.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
use std::fmt::Write;
use std::fs::File;
use std::io::Read;
use std::path::{Path, PathBuf};

use bstr::BString;
use sha2::{Digest, Sha256};

use crate::error::DatapodResult;

Expand Down Expand Up @@ -33,15 +35,20 @@ impl Document {
}

/// Returns the length of the document in bytes.
///
/// The value is the byte length of the underlying buffer, not the number
/// of characters. For example, the text `a ∉ ℕ` has a length of 9: `∉` and
/// `ℕ` each occupy three bytes in UTF-8, while `a` and the two spaces take
/// one byte each.
#[inline]
pub(crate) fn len(&self) -> u64 {
    // Widen the buffer's `usize` length to the `u64` used by the index.
    self.buf.len() as u64
}

/// Computes the SHA-256 digest of the document's buffer.
///
/// The digest is returned as a lowercase hexadecimal string, with each
/// digest byte rendered as exactly two hex digits.
pub(crate) fn hash(&self) -> String {
    // One-shot hashing: equivalent to `new` + `update` + `finalize`.
    let digest = Sha256::digest(&self.buf);

    let mut hex = String::new();
    for byte in digest.iter() {
        // Formatting into a `String` cannot fail, so the result is ignored.
        let _ = write!(hex, "{byte:02x}");
    }

    hex
}
}

0 comments on commit 4d40127

Please sign in to comment.