From 0b08486541f1ce322e65e91fec7cd31f4507703d Mon Sep 17 00:00:00 2001 From: boxdot Date: Fri, 22 Dec 2023 11:32:35 +0100 Subject: [PATCH] feat: store attachments under a shorter path (#259) * Attachments are now stored under `/gurk/files/year-month-day/`. It is easier to open a shorter path from terminal. Also they are nicely grouped under the `files` folder. * If an attachment has a file name, it is used when saving it, otherwise the first 8 chars of its hex digest are used. * All mime types based on the content type field are now supported. In particular, extensions are resolved for all content types. Also upgrade presage which now verifies the digest of downloaded attachments. --- Cargo.lock | 16 ++- Cargo.toml | 51 +++++----- src/signal/attachment.rs | 204 +++++++++++++++++++++++++++++++++++++++ src/signal/impl.rs | 40 ++------ src/signal/mod.rs | 1 + 5 files changed, 253 insertions(+), 59 deletions(-) create mode 100644 src/signal/attachment.rs diff --git a/Cargo.lock b/Cargo.lock index f38102e..71697cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1755,6 +1755,8 @@ dependencies = [ "dirs", "emojis", "futures-channel", + "hex", + "hex-literal", "hostname", "image", "insta", @@ -1779,6 +1781,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "sha2", "sqlx", "tempfile", "textwrap", @@ -1897,6 +1900,12 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hex-literal" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" + [[package]] name = "hkdf" version = "0.12.4" @@ -3141,7 +3150,7 @@ checksum = "94e851c7654eed9e68d7d27164c454961a616cf8c203d500607ef22c737b51bb" [[package]] name = "presage" version = "0.6.0-dev" -source = 
"git+https://github.com/whisperfish/presage?rev=2d9a02b#2d9a02b9f5534e2d79811be62ed70a64c75215c8" +source = "git+https://github.com/whisperfish/presage?rev=6d7003f#6d7003fed1fa0397d42ac257ed49573d01ca6e93" dependencies = [ "base64 0.21.5", "futures", @@ -3152,6 +3161,7 @@ dependencies = [ "rand", "serde", "serde_json", + "sha2", "thiserror", "tokio", "url", @@ -3160,7 +3170,7 @@ dependencies = [ [[package]] name = "presage-store-cipher" version = "0.1.0" -source = "git+https://github.com/whisperfish/presage?rev=2d9a02b#2d9a02b9f5534e2d79811be62ed70a64c75215c8" +source = "git+https://github.com/whisperfish/presage?rev=6d7003f#6d7003fed1fa0397d42ac257ed49573d01ca6e93" dependencies = [ "blake3", "chacha20poly1305", @@ -3177,7 +3187,7 @@ dependencies = [ [[package]] name = "presage-store-sled" version = "0.6.0-dev" -source = "git+https://github.com/whisperfish/presage?rev=2d9a02b#2d9a02b9f5534e2d79811be62ed70a64c75215c8" +source = "git+https://github.com/whisperfish/presage?rev=6d7003f#6d7003fed1fa0397d42ac257ed49573d01ca6e93" dependencies = [ "async-trait", "base64 0.12.3", diff --git a/Cargo.toml b/Cargo.toml index 44cf05f..3e66432 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,58 +28,61 @@ debug = true dev = ["prost", "base64"] [dependencies] -presage = { git = "https://github.com/whisperfish/presage", rev = "2d9a02b" } -presage-store-sled = { git = "https://github.com/whisperfish/presage", rev = "2d9a02b" } +presage = { git = "https://github.com/whisperfish/presage", rev = "6d7003f" } +presage-store-sled = { git = "https://github.com/whisperfish/presage", rev = "6d7003f" } +# dev feature dependencies +prost = { version = "0.10.4", optional = true } +base64 = { version = "0.13.1", optional = true } + +aho-corasick = "0.7.19" anyhow = "1.0.66" +arboard = { version = "3.2.0", features = ["wayland-data-control"] } async-trait = "0.1.58" chrono = { version = "0.4.22", default-features = false, features = ["serde"] } +clap = { version = "4.0.19", features = ["derive"] } 
crossterm = { version = "0.27.0", features = ["event-stream"] } derivative = "2.2.0" dirs = "4.0.0" +emojis = "0.6.1" +futures-channel = "0.3.25" +hex = "0.4.3" hostname = "0.3.1" image = { version = "0.24.6", default-features = false, features = ["png"] } itertools = "0.10.5" log-panics = "2.1.0" mime_guess = "2.0.4" notify-rust = "4.5.10" +once_cell = "1.18.0" opener = "0.5.0" phonenumber = "0.3.1" +postcard = { version = "1.0.4", features = ["alloc"] } +qr2term = { git = "https://github.com/boxdot/qr2term-rs", rev = "ed8ae7f" } +ratatui = "0.23.0" +rayon = "1.7.0" +regex = "1.9.5" regex-automata = "0.1.10" scopeguard = "1.1.0" serde = { version = "1.0.147", features = ["derive"] } serde_json = "1.0.87" +sha2 = "0.10.8" +sqlx = { version = "0.7.2", features = ["sqlite", "runtime-tokio-rustls", "uuid", "chrono"] } textwrap = "0.16.0" +thiserror = "1.0.40" +thread_local = "1.1.7" tokio = { version = "1.21.2", default-features = false, features = ["rt-multi-thread", "macros", "net", "time"] } tokio-stream = "0.1.11" toml = "0.5.9" -unicode-width = "0.1.10" -uuid = { version = "1.2", features = ["v4"] } -whoami = "1.2.3" tracing = "0.1.37" tracing-appender = "0.2.2" tracing-subscriber = "0.3.16" -futures-channel = "0.3.25" -qr2term = { git = "https://github.com/boxdot/qr2term-rs", rev = "ed8ae7f" } -clap = { version = "4.0.19", features = ["derive"] } -aho-corasick = "0.7.19" -sqlx = { version = "0.7.2", features = ["sqlite", "runtime-tokio-rustls", "uuid", "chrono"] } -thiserror = "1.0.40" -postcard = { version = "1.0.4", features = ["alloc"] } -emojis = "0.6.1" - -# dev feature dependencies -prost = { version = "0.10.4", optional = true } -base64 = { version = "0.13.1", optional = true } -arboard = { version = "3.2.0", features = ["wayland-data-control"] } -rayon = "1.7.0" -thread_local = "1.1.7" -ratatui = "0.23.0" -regex = "1.9.5" -once_cell = "1.18.0" +unicode-width = "0.1.10" +uuid = { version = "1.2", features = ["v4"] } +whoami = "1.2.3" [dev-dependencies] 
criterion = { version = "0.5", features = ["async_tokio", "html_reports"] } +hex-literal = "0.4.1" insta = { version = "1.21.0", features = ["json"] } quickcheck = "1.0.3" quickcheck_macros = "1.0.0" @@ -89,7 +92,7 @@ tempfile = "3.3.0" name = "app" harness = false -# [patch."https://github.com/whisperfish/presage.git"] +[patch."https://github.com/whisperfish/presage.git"] # presage = { path = "../presage/presage" } # presage-store-sled = { path = "../presage/presage-store-sled" } # diff --git a/src/signal/attachment.rs b/src/signal/attachment.rs new file mode 100644 index 0000000..0139bc3 --- /dev/null +++ b/src/signal/attachment.rs @@ -0,0 +1,204 @@ +use std::path::{Path, PathBuf}; + +use anyhow::Context; +use chrono::Local; +use mime_guess::mime::{APPLICATION_OCTET_STREAM, IMAGE_JPEG}; +use mime_guess::{get_mime_extensions, Mime}; +use presage::proto::AttachmentPointer; +use tracing::info; + +use crate::signal::Attachment; +use crate::util::utc_timestamp_msec_to_local; + +const DIGEST_BYTES_LEN: usize = 4; + +pub(super) fn save( + data_dir: impl AsRef, + pointer: AttachmentPointer, + data: Vec, +) -> anyhow::Result { + let base_dir = data_dir.as_ref().join("files"); + + let digest = pointer + .digest + .as_deref() + .context("dropping attachment without digest")?; + let digest_hex = hex::encode(digest); + + let mime: Mime = pointer + .content_type() + .parse() + .unwrap_or(APPLICATION_OCTET_STREAM); + + let name = derive_name(&pointer, digest, &mime); + + let date = pointer + .upload_timestamp + .map(utc_timestamp_msec_to_local) + .unwrap_or_else(Local::now) + .date_naive(); + let filedir = base_dir.join(date.to_string()); + let filepath = conflict_free_filename(&filedir, name); + + std::fs::create_dir_all(&filedir) + .with_context(|| format!("failed to create dir: {}", filedir.display()))?; + std::fs::write(&filepath, data) + .with_context(|| format!("failed to save attachment at: {}", filepath.display()))?; + + info!(dest =% filepath.display(), "saved 
attachment"); + + Ok(Attachment { + id: digest_hex, + content_type: mime.to_string(), + filename: filepath, + size: pointer.size.unwrap_or_default(), + }) +} + +fn conflict_free_filename(filedir: &Path, name: String) -> PathBuf { + let mut filepath = filedir.join(&name); + + // resolve conflicts + let mut idx = 0; + while filepath.exists() { + let name_path = Path::new(&name); + match name_path.file_stem().zip(name_path.extension()) { + Some((stem, extension)) => { + idx += 1; + let stem = stem.to_string_lossy(); + let extension = extension.to_string_lossy(); + filepath = filedir.join(format!("{stem}.{idx}.{extension}")); + } + None => { + idx += 1; + filepath = filedir.join(format!("{name}.{idx}")); + } + } + } + filepath +} + +fn derive_name(pointer: &AttachmentPointer, digest: &[u8], mime: &Mime) -> String { + pointer.file_name.clone().unwrap_or_else(|| { + let mut name = hex::encode(&digest[..DIGEST_BYTES_LEN]); + let extension = if mime == &IMAGE_JPEG { + // special case due to: + Some("jpeg") + } else if mime == &APPLICATION_OCTET_STREAM { + None + } else { + get_mime_extensions(mime).and_then(|extensions| extensions.first().copied()) + }; + if let Some(extension) = extension { + name.push('.'); + name.push_str(extension); + }; + name + }) +} + +#[cfg(test)] +mod tests { + use hex_literal::hex; + + use super::*; + + fn attachment_pointer( + content_type: &str, + digest: &[u8], + file_name: Option<&str>, + upload_timestamp: u64, + ) -> AttachmentPointer { + AttachmentPointer { + content_type: Some(content_type.into()), + digest: Some(digest.into()), + file_name: file_name.map(|s| s.to_owned()), + upload_timestamp: Some(upload_timestamp), + key: None, + size: Some(42), + thumbnail: None, + incremental_digest: None, + incremental_mac_chunk_size: None, + flags: None, + width: None, + height: None, + caption: None, + blur_hash: None, + cdn_number: None, + attachment_identifier: None, + } + } + + #[test] + fn test_save() { + let tempdir = 
tempfile::tempdir().unwrap(); + + let digest = hex!("d51e9a355d4351ae5fbf2846d18bb384471555aa0ea6ee9075eb63f99ecddf77"); + let upload_timestamp = 1703160458 * 1000; + + let attachment = save( + tempdir.path(), + attachment_pointer("image/jpeg", &digest, Some("image.jpeg"), upload_timestamp), + vec![42], + ) + .unwrap(); + + assert_eq!(attachment.id, hex::encode(digest)); + assert_eq!(attachment.content_type, "image/jpeg"); + assert_eq!(attachment.size, 42); + assert_eq!( + attachment.filename, + tempdir.path().join("files/2023-12-21/image.jpeg") + ); + + assert_eq!(std::fs::read(attachment.filename).unwrap(), &[42]); + + // duplicate + let attachment = save( + tempdir.path(), + attachment_pointer("image/jpeg", &digest, Some("image.jpeg"), upload_timestamp), + vec![42], + ) + .unwrap(); + assert_eq!( + attachment.filename, + tempdir.path().join("files/2023-12-21/image.1.jpeg") + ); + + // without name + let attachment = save( + tempdir.path(), + attachment_pointer("image/jpeg", &digest, None, upload_timestamp), + vec![42], + ) + .unwrap(); + assert_eq!( + attachment.filename, + tempdir.path().join("files/2023-12-21/d51e9a35.jpeg") + ); + + // without name and mime octet-stream + let attachment = save( + tempdir.path(), + attachment_pointer("application/octet-stream", &digest, None, upload_timestamp), + vec![42], + ) + .unwrap(); + assert_eq!( + attachment.filename, + tempdir.path().join("files/2023-12-21/d51e9a35") + ); + + // without name and mime pdf + let attachment = save( + tempdir.path(), + attachment_pointer("application/pdf", &digest, None, upload_timestamp), + vec![42], + ) + .unwrap(); + assert_eq!( + attachment.filename, + tempdir.path().join("files/2023-12-21/d51e9a35.pdf") + ); + } +} diff --git a/src/signal/impl.rs b/src/signal/impl.rs index ea73b1f..02fa8ae 100644 --- a/src/signal/impl.rs +++ b/src/signal/impl.rs @@ -2,9 +2,8 @@ use std::pin::Pin; -use anyhow::{anyhow, Context}; +use anyhow::Context; use async_trait::async_trait; -use chrono::Utc; use 
presage::libsignal_service::content::{Content, ContentBody}; use presage::libsignal_service::models::Contact; use presage::libsignal_service::prelude::{Group, ProfileKey}; @@ -16,14 +15,16 @@ use presage::store::ContentsStore; use presage_store_sled::SledStore; use tokio::sync::oneshot; use tokio_stream::Stream; -use tracing::{error, warn}; +use tracing::error; use uuid::Uuid; use crate::data::{Channel, ChannelId, GroupData, Message}; use crate::receipt::Receipt; use crate::util::utc_now_timestamp_msec; -use super::{Attachment, GroupMasterKeyBytes, ProfileKeyBytes, ResolvedGroup, SignalManager}; +use super::{ + attachment, Attachment, GroupMasterKeyBytes, ProfileKeyBytes, ResolvedGroup, SignalManager, +}; pub(super) struct PresageManager { manager: presage::Manager, @@ -80,36 +81,11 @@ impl SignalManager for PresageManager { &mut self, attachment_pointer: AttachmentPointer, ) -> anyhow::Result { + let attachment_data = self.manager.get_attachment(&attachment_pointer).await?; let data_dir = dirs::data_dir() - .ok_or_else(|| anyhow!("could not find data directory"))? + .context("could not find data directory")? 
.join("gurk"); - let attachment_data = self.manager.get_attachment(&attachment_pointer).await?; - - let date = Utc::now().to_rfc3339(); - let filename = match attachment_pointer.content_type.as_deref() { - Some("image/jpeg") => format!("signal-{date}.jpg"), - Some("image/gif") => format!("signal-{date}.gif"), - Some("image/png") => format!("signal-{date}.png"), - Some(mimetype) => { - warn!("unsupported attachment mimetype: {}", mimetype); - format!("signal-{date}") - } - None => { - format!("signal-{date}") - } - }; - - let filepath = data_dir.join(filename); - std::fs::write(&filepath, attachment_data)?; - - Ok(Attachment { - id: date, - content_type: attachment_pointer - .content_type - .unwrap_or_else(|| "application/octet-stream".to_owned()), - filename: filepath, - size: attachment_pointer.size.unwrap_or_default(), - }) + attachment::save(data_dir, attachment_pointer, attachment_data) } fn send_receipt(&self, sender_uuid: Uuid, timestamps: Vec, receipt: Receipt) { diff --git a/src/signal/mod.rs b/src/signal/mod.rs index c752480..da63715 100644 --- a/src/signal/mod.rs +++ b/src/signal/mod.rs @@ -1,3 +1,4 @@ +mod attachment; mod r#impl; mod manager; pub mod test;