From 9593e91f2fb38b9af0255faf575e63b9a3ece4b6 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 2 Feb 2024 09:36:40 -0500 Subject: [PATCH 01/17] =?UTF-8?q?dandi::Client=20=E2=86=92=20DandiClient?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dandi/mod.rs | 12 ++++++------ src/dav/mod.rs | 16 ++++++++-------- src/main.rs | 6 +++--- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/dandi/mod.rs b/src/dandi/mod.rs index d77b48f..fbc875e 100644 --- a/src/dandi/mod.rs +++ b/src/dandi/mod.rs @@ -21,17 +21,17 @@ use tokio::sync::Mutex; use url::Url; #[derive(Clone, Debug)] -pub(crate) struct Client { +pub(crate) struct DandiClient { inner: reqwest::Client, api_url: Url, s3clients: Arc>>>, } -impl Client { +impl DandiClient { pub(crate) fn new(api_url: Url) -> Result { let inner = ClientBuilder::new().user_agent(USER_AGENT).build()?; let s3clients = Arc::new(Mutex::new(LruCache::new(S3CLIENT_CACHE_SIZE))); - Ok(Client { + Ok(DandiClient { inner, api_url, s3clients, @@ -162,12 +162,12 @@ impl Client { #[derive(Clone, Debug)] pub(crate) struct DandisetEndpoint<'a> { - client: &'a Client, + client: &'a DandiClient, dandiset_id: DandisetId, } impl<'a> DandisetEndpoint<'a> { - fn new(client: &'a Client, dandiset_id: DandisetId) -> Self { + fn new(client: &'a DandiClient, dandiset_id: DandisetId) -> Self { Self { client, dandiset_id, @@ -200,7 +200,7 @@ impl<'a> DandisetEndpoint<'a> { #[derive(Clone, Debug)] pub(crate) struct VersionEndpoint<'a> { - client: &'a Client, + client: &'a DandiClient, dandiset_id: DandisetId, version_id: VersionId, } diff --git a/src/dav/mod.rs b/src/dav/mod.rs index 02712ac..39f11b3 100644 --- a/src/dav/mod.rs +++ b/src/dav/mod.rs @@ -29,15 +29,15 @@ const WEBDAV_RESPONSE_HEADERS: [(&str, &str); 2] = [ ]; pub(crate) struct DandiDav { - client: Client, + dandi: DandiClient, templater: Templater, title: String, } impl DandiDav { - pub(crate) fn new(client: Client, templater: Templater, title: String) -> DandiDav { + pub(crate) fn new(dandi: DandiClient, templater: Templater, title: String) -> DandiDav { DandiDav { - client, + dandi, templater, title, } @@ -146,7 +146,7 @@ impl DandiDav { dandiset_id: &DandisetId, version: &VersionSpec, ) -> Result, DavError> { - let d = self.client.dandiset(dandiset_id.clone()); + let d = self.dandi.dandiset(dandiset_id.clone()); match version { VersionSpec::Draft => Ok(d.version(VersionId::Draft)), VersionSpec::Published(v) => Ok(d.version(VersionId::Published(v.clone()))), @@ -189,7 +189,7 @@ impl DandiDav { DavPath::Root => Ok(DavResource::root()), DavPath::DandisetIndex => Ok(DavResource::Collection(DavCollection::dandiset_index())), DavPath::Dandiset { dandiset_id } => { - let ds = self.client.dandiset(dandiset_id.clone()).get().await?; + let ds = self.dandi.dandiset(dandiset_id.clone()).get().await?; Ok(DavResource::Collection(ds.into())) } DavPath::DandisetReleases { dandiset_id } => { @@ -237,7 +237,7 @@ impl DandiDav { DavPath::DandisetIndex => { let col = DavCollection::dandiset_index(); let mut children = Vec::new(); - let stream = self.client.get_all_dandisets(); + let stream = self.dandi.get_all_dandisets(); tokio::pin!(stream); while let Some(ds) = stream.try_next().await? { children.push(DavResource::Collection(ds.into())); @@ -245,7 +245,7 @@ impl DandiDav { Ok(DavResourceWithChildren::Collection { col, children }) } DavPath::Dandiset { dandiset_id } => { - let ds = self.client.dandiset(dandiset_id.clone()).get().await?; + let ds = self.dandi.dandiset(dandiset_id.clone()).get().await?; let draft = DavResource::Collection(DavCollection::dandiset_version( ds.draft_version.clone(), version_path(dandiset_id, &VersionSpec::Draft), @@ -271,7 +271,7 @@ impl DandiDav { // have any published releases? let col = DavCollection::dandiset_releases(dandiset_id); let mut children = Vec::new(); - let endpoint = self.client.dandiset(dandiset_id.clone()); + let endpoint = self.dandi.dandiset(dandiset_id.clone()); let stream = endpoint.get_all_versions(); tokio::pin!(stream); while let Some(v) = stream.try_next().await? { diff --git a/src/main.rs b/src/main.rs index bbc56a3..09c7cc2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,7 +4,7 @@ mod dav; mod paths; mod s3; use crate::consts::{CSS_CONTENT_TYPE, DEFAULT_API_URL, SERVER_VALUE}; -use crate::dandi::Client; +use crate::dandi::DandiClient; use crate::dav::{DandiDav, Templater}; use anyhow::Context; use axum::{ @@ -57,9 +57,9 @@ async fn main() -> anyhow::Result<()> { tracing_subscriber::fmt() .with_max_level(LevelFilter::TRACE) .init(); - let client = Client::new(args.api_url)?; + let dandi = DandiClient::new(args.api_url)?; let templater = Templater::load()?; - let dav = Arc::new(DandiDav::new(client, templater, args.title)); + let dav = Arc::new(DandiDav::new(dandi, templater, args.title)); let app = Router::new() .route( "/.static/styles.css", From 26aa12e93eb8d5270d9e8c4a99d2fd649a659eb6 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 2 Feb 2024 10:00:58 -0500 Subject: [PATCH 02/17] Replace lru with moka --- Cargo.lock | 285 ++++++++++++++++++++++++++++++++++++++--------- Cargo.toml | 4 +- src/consts.rs | 15 +-- src/dandi/mod.rs | 46 ++++---- 4 files changed, 256 insertions(+), 94 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cd8dfa3..425c84a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,18 +17,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "ahash" -version = "0.8.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" -dependencies = [ - "cfg-if", - "once_cell", - "version_check", - "zerocopy", -] - [[package]] name = "aho-corasick" version = "1.1.2" @@ -38,12 +26,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "allocator-api2" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" - [[package]] name = "anstyle" version = "1.0.5" @@ -62,6 +44,15 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" +[[package]] +name = "async-lock" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +dependencies = [ + "event-listener", +] + [[package]] name = "async-stream" version = "0.3.5" @@ -603,6 +594,12 @@ version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +[[package]] +name = "bytecount" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" + [[package]] name = "bytes" version = "1.5.0" @@ -619,6 +616,37 @@ dependencies = [ "either", ] +[[package]] +name = "camino" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo-platform" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ceed8ef69d8518a5dda55c07425450b58a4e1946f4951eab6d7191ee86c2443d" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo_metadata" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4acbb09d9ee8e23699b9634375c72795d095bf268439da88562cf9b501f181fa" +dependencies = [ + "camino", + "cargo-platform", + "semver", + "serde", + "serde_json", +] + [[package]] name = "cc" version = "1.0.83" @@ -723,6 +751,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "176dc175b78f56c0f321911d9c8eb2b77a78a4860b9c19db83835fea1a46649b" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-deque" version = "0.8.5" @@ -798,7 +835,7 @@ dependencies = [ "enum_dispatch", "futures-util", "indoc", - "lru", + "moka", "percent-encoding", "pretty_assertions", "reqwest", @@ -941,6 +978,21 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "error-chain" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc" +dependencies = [ + "version_check", +] + +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "fastrand" version = "2.0.1" @@ -1135,10 +1187,6 @@ name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" -dependencies = [ - "ahash", - "allocator-api2", -] [[package]] name = "heck" @@ -1392,19 +1440,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] -name = "log" -version = "0.4.20" +name = "lock_api" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +dependencies = [ + "autocfg", + "scopeguard", +] [[package]] -name = "lru" -version = "0.12.2" +name = "log" +version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db2c024b41519440580066ba82aab04092b333e09066a5eb86c7c4890df31f22" -dependencies = [ - "hashbrown", -] +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "matchit" @@ -1454,6 +1503,30 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "moka" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1911e88d5831f748a4097a43862d129e3c6fca831eecac9b8db6d01d93c9de2" +dependencies = [ + "async-lock", + "async-trait", + "crossbeam-channel", + "crossbeam-epoch", + "crossbeam-utils", + "futures-util", + "once_cell", + "parking_lot", + "quanta", + "rustc_version", + "skeptic", + "smallvec", + "tagptr", + "thiserror", + "triomphe", + "uuid", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -1543,6 +1616,29 @@ dependencies = [ "sha2", ] +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.48.5", +] + [[package]] name = "percent-encoding" version = "2.3.1" @@ -1661,6 +1757,32 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pulldown-cmark" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57206b407293d2bcd3af849ce869d52068623f19e1b5ff8e8778e3309439682b" +dependencies = [ + "bitflags 2.4.2", + "memchr", + "unicase", +] + +[[package]] +name = "quanta" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ca0b7bac0b97248c40bb77288fc52029cf1459c0461ea1b05ee32ccf011de2c" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + [[package]] name = "quote" version = "1.0.35" @@ -1679,6 +1801,24 @@ dependencies = [ "getrandom", ] +[[package]] +name = "raw-cpuid" +version = "11.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d86a7c4638d42c44551f4791a20e687dbb4c3de1f33c43dd71e355cd429def1" +dependencies = [ + "bitflags 2.4.2", +] + +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "regex" version = "1.10.3" @@ -1914,6 +2054,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "sct" version = "0.7.1" @@ -1966,6 +2112,9 @@ name = "semver" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97ed7a9823b74f99c7742f5336af7be5ecd3eeafcb1507d1fa93347b1d589b0" +dependencies = [ + "serde", +] [[package]] name = "serde" @@ -2074,6 +2223,21 @@ dependencies = [ "rand_core", ] +[[package]] +name = "skeptic" +version = "0.13.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16d23b015676c90a0f01c197bfdc786c20342c73a0afdda9025adb0bc42940a8" +dependencies = [ + "bytecount", + "cargo_metadata", + "error-chain", + "glob", + "pulldown-cmark", + "tempfile", + "walkdir", +] + [[package]] name = "slab" version = "0.4.9" @@ -2193,6 +2357,25 @@ dependencies = [ "libc", ] +[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + +[[package]] +name = "tempfile" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys 0.52.0", +] + [[package]] name = "tera" version = "1.19.1" @@ -2451,6 +2634,12 @@ dependencies = [ "tracing-log", ] +[[package]] +name = "triomphe" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "859eb650cfee7434994602c3a68b25d77ad9e68c8a6cd491616ef86661382eb3" + [[package]] name = "try-lock" version = "0.2.5" @@ -2519,6 +2708,15 @@ dependencies = [ "unic-common", ] +[[package]] +name = "unicase" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89" +dependencies = [ + "version_check", +] + [[package]] name = "unicode-bidi" version = "0.3.15" @@ -2575,6 +2773,9 @@ name = "uuid" version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f00cc9702ca12d3c81455259621e676d0f7251cec66a21e98fe2e9a37db93b2a" +dependencies = [ + "getrandom", +] [[package]] name = "valuable" @@ -2886,26 +3087,6 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" -[[package]] -name = "zerocopy" -version = "0.7.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.7.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.48", -] - [[package]] name = "zeroize" version = "1.7.0" diff --git a/Cargo.toml b/Cargo.toml index 8b2eae8..e8dc5c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,7 +28,7 @@ derive_more = { version = "0.99.17", default-features = false, features = ["as_r enum_dispatch = "0.3.12" futures-util = "0.3.30" indoc = "2.0.4" -lru = "0.12.2" +moka = { version = "0.12.5", features = ["future"] } percent-encoding = "2.3.1" reqwest = { version = "0.11.24", default-features = false, features = ["json", "rustls-tls-native-roots"] } serde = { version = "1.0.196", features = ["derive"] } @@ -38,7 +38,7 @@ smartstring = "1.0.1" tera = { version = "1.19.1", default-features = false } thiserror = "1.0.56" time = { version = "0.3.34", features = ["formatting", "macros", "parsing", "serde"] } -tokio = { version = "1.36.0", features = ["macros", "net", "rt-multi-thread", "sync"] } +tokio = { version = "1.36.0", features = ["macros", "net", "rt-multi-thread"] } tower = "0.4.13" tower-http = { version = "0.5.1", features = ["set-header", "trace"] } tracing-subscriber = "0.3.18" diff --git a/src/consts.rs b/src/consts.rs index 8d267c1..d5d7675 100644 --- a/src/consts.rs +++ b/src/consts.rs @@ -1,4 +1,3 @@ -use std::num::NonZeroUsize; use time::{format_description::FormatItem, macros::format_description}; /// The value of the "User-Agent" header sent in requests to the Dandi Archive @@ -21,19 +20,7 @@ pub(crate) static DEFAULT_API_URL: &str = "https://api.dandiarchive.org/api"; // Case sensitive: pub(crate) static ZARR_EXTENSIONS: [&str; 2] = [".zarr", ".ngff"]; -const S3CLIENT_CACHE_SIZE_RAW: usize = 8; - -#[allow(unsafe_code)] -#[allow(clippy::assertions_on_constants)] -// -pub(crate) const S3CLIENT_CACHE_SIZE: NonZeroUsize = { - assert!( - S3CLIENT_CACHE_SIZE_RAW != 0, - "cache size should not be zero" - ); - // SAFETY: Cache size is not zero - unsafe { NonZeroUsize::new_unchecked(S3CLIENT_CACHE_SIZE_RAW) } -}; +pub(crate) const S3CLIENT_CACHE_SIZE: u64 = 8; pub(crate) static HTML_CONTENT_TYPE: &str = "text/html; charset=utf-8"; diff --git a/src/dandi/mod.rs b/src/dandi/mod.rs index fbc875e..81ccb2c 100644 --- a/src/dandi/mod.rs +++ b/src/dandi/mod.rs @@ -11,26 +11,29 @@ use crate::s3::{ }; use async_stream::try_stream; use futures_util::{Stream, TryStreamExt}; -use lru::LruCache; +use moka::future::{Cache, CacheBuilder}; use reqwest::{ClientBuilder, StatusCode}; use serde::de::DeserializeOwned; use smartstring::alias::CompactString; use std::sync::Arc; use thiserror::Error; -use tokio::sync::Mutex; use url::Url; #[derive(Clone, Debug)] pub(crate) struct DandiClient { inner: reqwest::Client, api_url: Url, - s3clients: Arc>>>, + s3clients: Arc>>, } impl DandiClient { pub(crate) fn new(api_url: Url) -> Result { let inner = ClientBuilder::new().user_agent(USER_AGENT).build()?; - let s3clients = Arc::new(Mutex::new(LruCache::new(S3CLIENT_CACHE_SIZE))); + let s3clients = Arc::new( + CacheBuilder::new(S3CLIENT_CACHE_SIZE) + .name("s3clients") + .build(), + ); Ok(DandiClient { inner, api_url, @@ -108,27 +111,18 @@ impl DandiClient { let prefix = key .parse::() .map_err(|source| ZarrToS3Error::BadS3Key { key, source })?; - let client = { - let mut cache = self.s3clients.lock().await; - if let Some(client) = cache.get(&bucket_spec) { - client.clone() - } else { - match bucket_spec.clone().into_s3client().await { - Ok(client) => { - let client = Arc::new(client); - cache.put(bucket_spec, client.clone()); - client - } - Err(source) => { - return Err(ZarrToS3Error::LocateBucket { - bucket: bucket_spec.bucket, - source, - }) - } - } - } - }; - Ok(client.with_prefix(prefix)) + // Box large future: + match Box::pin(self.s3clients.try_get_with_by_ref(&bucket_spec, async { + bucket_spec.clone().into_s3client().await.map(Arc::new) + })) + .await + { + Ok(client) => Ok(client.with_prefix(prefix)), + Err(source) => Err(ZarrToS3Error::LocateBucket { + bucket: bucket_spec.bucket, + source, + }), + } } async fn get_s3client_for_zarr( @@ -473,7 +467,7 @@ pub(crate) enum ZarrToS3Error { #[error("failed to determine region for S3 bucket {bucket:?}")] LocateBucket { bucket: CompactString, - source: GetBucketRegionError, + source: Arc, }, } From bfaf02ed02eb47b1636e8b867f64fe7b461e3b41 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 2 Feb 2024 10:59:36 -0500 Subject: [PATCH 03/17] Skeleton for Zarr manifest support --- Cargo.lock | 10 +++++++ Cargo.toml | 1 + src/dandi/mod.rs | 11 +++----- src/httputil.rs | 14 +++++++++ src/main.rs | 2 ++ src/zarrman/manifest.rs | 53 ++++++++++++++++++++++++++++++++++ src/zarrman/mod.rs | 61 ++++++++++++++++++++++++++++++++++++++++ src/zarrman/resources.rs | 56 ++++++++++++++++++++++++++++++++++++ 8 files changed, 201 insertions(+), 7 deletions(-) create mode 100644 src/httputil.rs create mode 100644 src/zarrman/manifest.rs create mode 100644 src/zarrman/mod.rs create mode 100644 src/zarrman/resources.rs diff --git a/Cargo.lock b/Cargo.lock index 425c84a..a3321ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -835,6 +835,7 @@ dependencies = [ "enum_dispatch", "futures-util", "indoc", + "itertools", "moka", "percent-encoding", "pretty_assertions", @@ -1406,6 +1407,15 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.10" diff --git a/Cargo.toml b/Cargo.toml index e8dc5c1..1c79427 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ derive_more = { version = "0.99.17", default-features = false, features = ["as_r enum_dispatch = "0.3.12" futures-util = "0.3.30" indoc = "2.0.4" +itertools = "0.12.1" moka = { version = "0.12.5", features = ["future"] } percent-encoding = "2.3.1" reqwest = { version = "0.11.24", default-features = false, features = ["json", "rustls-tls-native-roots"] } diff --git a/src/dandi/mod.rs b/src/dandi/mod.rs index 81ccb2c..ceb51b7 100644 --- a/src/dandi/mod.rs +++ b/src/dandi/mod.rs @@ -4,7 +4,8 @@ mod version_id; pub(crate) use self::dandiset_id::*; pub(crate) use self::types::*; pub(crate) use self::version_id::*; -use crate::consts::{S3CLIENT_CACHE_SIZE, USER_AGENT}; +use crate::consts::S3CLIENT_CACHE_SIZE; +use crate::httputil::{new_client, BuildClientError}; use crate::paths::{ParsePureDirPathError, PureDirPath, PurePath}; use crate::s3::{ BucketSpec, GetBucketRegionError, PrefixedS3Client, S3Client, S3Error, S3Location, @@ -12,7 +13,7 @@ use crate::s3::{ use async_stream::try_stream; use futures_util::{Stream, TryStreamExt}; use moka::future::{Cache, CacheBuilder}; -use reqwest::{ClientBuilder, StatusCode}; +use reqwest::StatusCode; use serde::de::DeserializeOwned; use smartstring::alias::CompactString; use std::sync::Arc; @@ -28,7 +29,7 @@ pub(crate) struct DandiClient { impl DandiClient { pub(crate) fn new(api_url: Url) -> Result { - let inner = ClientBuilder::new().user_agent(USER_AGENT).build()?; + let inner = new_client()?; let s3clients = Arc::new( CacheBuilder::new(S3CLIENT_CACHE_SIZE) .name("s3clients") @@ -425,10 +426,6 @@ impl<'a> VersionEndpoint<'a> { } } -#[derive(Debug, Error)] -#[error("failed to initialize Dandi API client")] -pub(crate) struct BuildClientError(#[from] reqwest::Error); - #[derive(Debug, Error)] pub(crate) enum DandiError { #[error("failed to make request to {url}")] diff --git a/src/httputil.rs b/src/httputil.rs new file mode 100644 index 0000000..c83e18c --- /dev/null +++ b/src/httputil.rs @@ -0,0 +1,14 @@ +use crate::consts::USER_AGENT; +use reqwest::ClientBuilder; +use thiserror::Error; + +pub(crate) fn new_client() -> Result { + ClientBuilder::new() + .user_agent(USER_AGENT) + .build() + .map_err(Into::into) +} + +#[derive(Debug, Error)] +#[error("failed to initialize HTTP client")] +pub(crate) struct BuildClientError(#[from] reqwest::Error); diff --git a/src/main.rs b/src/main.rs index 09c7cc2..02805d4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,8 +1,10 @@ mod consts; mod dandi; mod dav; +mod httputil; mod paths; mod s3; +mod zarrman; use crate::consts::{CSS_CONTENT_TYPE, DEFAULT_API_URL, SERVER_VALUE}; use crate::dandi::DandiClient; use crate::dav::{DandiDav, Templater}; diff --git a/src/zarrman/manifest.rs b/src/zarrman/manifest.rs new file mode 100644 index 0000000..07b72c7 --- /dev/null +++ b/src/zarrman/manifest.rs @@ -0,0 +1,53 @@ +use crate::paths::PurePath; +use itertools::{Itertools, Position}; +use serde::Deserialize; +use std::collections::BTreeMap; +use time::OffsetDateTime; + +// TODO: Add a deserialization test +#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +pub(crate) struct Manifest { + entries: ManifestFolder, +} + +impl Manifest { + pub(super) fn get(&self, path: &PurePath) -> Option> { + let mut folder = &self.entries; + for (pos, p) in path.split('/').with_position() { + match folder.get(p)? { + FolderEntry::Folder(f) => folder = f, + FolderEntry::Entry(e) if matches!(pos, Position::Last | Position::Only) => { + return Some(EntryRef::Entry(e)) + } + FolderEntry::Entry(_) => return None, + } + } + Some(EntryRef::Folder(folder)) + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) enum EntryRef<'a> { + Folder(&'a ManifestFolder), + Entry(&'a ManifestEntry), +} + +pub(super) type ManifestFolder = BTreeMap; + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +#[serde(untagged)] +pub(crate) enum FolderEntry { + Folder(ManifestFolder), + Entry(ManifestEntry), +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +pub(crate) struct ManifestEntry { + // Keep these fields in this order so that deserialization will work + // properly! + version_id: String, + size: i64, + #[serde(with = "time::serde::rfc3339")] + modified: OffsetDateTime, + etag: String, +} diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs new file mode 100644 index 0000000..4675b4d --- /dev/null +++ b/src/zarrman/mod.rs @@ -0,0 +1,61 @@ +mod manifest; +mod resources; +pub(crate) use self::resources::*; +use crate::httputil::{new_client, BuildClientError}; +use crate::paths::PurePath; +use moka::future::{Cache, CacheBuilder}; +use std::sync::Arc; +use thiserror::Error; + +static MANIFEST_ROOT_URL: &str = + "https://datasets.datalad.org/dandi/zarr-manifests/zarr-manifests-v2-sorted/"; + +static S3_DOWNLOAD_PREFIX: &str = "https://dandiarchive.s3.amazonaws.com/zarr/"; + +const MANIFEST_CACHE_SIZE: u64 = 16; + +#[derive(Clone, Debug)] +pub(crate) struct ZarrManClient { + inner: reqwest::Client, + //TODO: manifests: Arc>>, + manifests: Arc>>, +} + +impl ZarrManClient { + pub(crate) fn new() -> Result { + let inner = new_client()?; + let manifests = Arc::new( + CacheBuilder::new(MANIFEST_CACHE_SIZE) + .name("zarr-manifests") + .build(), + ); + Ok(ZarrManClient { inner, manifests }) + } + + #[allow(clippy::unused_async)] + pub(crate) async fn get_top_level_dirs(&self) -> Result, ZarrManError> { + todo!() + } + + #[allow(clippy::unused_async)] + #[allow(unused_variables)] + pub(crate) async fn get_resource( + &self, + path: &PurePath, + ) -> Result { + todo!() + } + + #[allow(clippy::unused_async)] + #[allow(unused_variables)] + pub(crate) async fn get_resource_with_children( + &self, + path: &PurePath, + ) -> Result { + todo!() + } +} + +#[derive(Debug, Error)] +#[error("TODO")] +pub(crate) struct ZarrManError; // TODO diff --git a/src/zarrman/resources.rs b/src/zarrman/resources.rs new file mode 100644 index 0000000..b50a103 --- /dev/null +++ b/src/zarrman/resources.rs @@ -0,0 +1,56 @@ +use crate::paths::{PureDirPath, PurePath}; +use time::OffsetDateTime; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) enum ZarrManResource { + WebFolder(WebFolder), + Manifest(Manifest), + ManFolder(ManifestFolder), + ManEntry(ManifestEntry), +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct WebFolder { + web_path: PureDirPath, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct Manifest { + web_path: PureDirPath, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) enum ZarrManResourceWithChildren { + WebFolder { + folder: WebFolder, + children: Vec, + }, + Manifest { + folder: Manifest, + children: Vec, + }, + ManFolder { + folder: ManifestFolder, + children: Vec, + }, + ManEntry(ManifestEntry), +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct ManifestFolder { + //pub(crate) manifest_path: PurePath, + //pub(crate) path: PureDirPath, + pub(crate) web_path: PureDirPath, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct ManifestEntry { + //pub(crate) manifest_path: PurePath, + //pub(crate) path: PurePath, + pub(crate) web_path: PurePath, + //pub(crate) version_id: String, + pub(crate) size: i64, + pub(crate) modified: OffsetDateTime, + pub(crate) etag: String, + pub(crate) url: url::Url, +} From 4d2d1c8be60add8bcbd98d463a37e3bbc1cbf26d Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 2 Feb 2024 11:01:20 -0500 Subject: [PATCH 04/17] PurePath::components() --- src/paths/purepath.rs | 4 ++++ src/zarrman/manifest.rs | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/paths/purepath.rs b/src/paths/purepath.rs index 3f8b832..85c70a0 100644 --- a/src/paths/purepath.rs +++ b/src/paths/purepath.rs @@ -52,6 +52,10 @@ impl PurePath { pub(crate) fn to_dir_path(&self) -> PureDirPath { PureDirPath(format!("{}/", self.0)) } + + pub(crate) fn components(&self) -> std::str::Split<'_, char> { + self.0.split('/') + } } impl fmt::Debug for PurePath { diff --git a/src/zarrman/manifest.rs b/src/zarrman/manifest.rs index 07b72c7..62c6adb 100644 --- a/src/zarrman/manifest.rs +++ b/src/zarrman/manifest.rs @@ -13,7 +13,7 @@ pub(crate) struct Manifest { impl Manifest { pub(super) fn get(&self, path: &PurePath) -> Option> { let mut folder = &self.entries; - for (pos, p) in path.split('/').with_position() { + for (pos, p) in path.components().with_position() { match folder.get(p)? { FolderEntry::Folder(f) => folder = f, FolderEntry::Entry(e) if matches!(pos, Position::Last | Position::Only) => { From 5b00bb4266fffd9585af646cfeaedb6e8cf601c0 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 2 Feb 2024 11:34:35 -0500 Subject: [PATCH 05/17] Test of deserializing & querying Zarr manifest --- src/zarrman/manifest.rs | 160 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 158 insertions(+), 2 deletions(-) diff --git a/src/zarrman/manifest.rs b/src/zarrman/manifest.rs index 62c6adb..1c7db76 100644 --- a/src/zarrman/manifest.rs +++ b/src/zarrman/manifest.rs @@ -4,7 +4,6 @@ use serde::Deserialize; use std::collections::BTreeMap; use time::OffsetDateTime; -// TODO: Add a deserialization test #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] pub(crate) struct Manifest { entries: ManifestFolder, @@ -46,8 +45,165 @@ pub(crate) struct ManifestEntry { // Keep these fields in this order so that deserialization will work // properly! version_id: String, - size: i64, #[serde(with = "time::serde::rfc3339")] modified: OffsetDateTime, + size: i64, etag: String, } + +#[cfg(test)] +mod tests { + use super::*; + use assert_matches::assert_matches; + use indoc::indoc; + use time::macros::datetime; + + #[test] + fn test_manifest() { + let s = indoc! {r#" + { + "schemaVersion": 2, + "fields": ["versionId","lastModified","size","ETag"], + "statistics": { + "entries": 509, + "depth": 5, + "totalSize": 710206390, + "lastModified": "2022-06-27T23:09:39+00:00", + "zarrChecksum": "6ddc4625befef8d6f9796835648162be-509--710206390" + }, + "entries": { + ".zattrs": ["VwOSu7IVLAQcQHcqOesmlrEDm2sL_Tfs","2022-06-27T23:07:47+00:00",8312,"cb32b88f6488d55818aba94746bcc19a"], + ".zgroup": ["7obAY5BUNOdI1Uch3RoI4oHuGXhW4h0R","2022-06-27T23:07:47+00:00",24,"e20297935e73dd0154104d4ea53040ab"], + ".zmetadata": ["Vfe0W0v4zkydmzyXkUMjm2Xr7.rIvfZQ","2022-06-27T23:07:47+00:00",15191,"4f505878fbb943a9793516cf084e07ad"], + "0": { + ".zarray": ["Ou6TnKwWPmEJrL.0utCWLPxgfr_lA0I1","2022-06-27T23:07:48+00:00",446,"5477ec3da352681e5ba6f6ea550ef740"], + "0": { + "0": { + "13": { + "8": { + "100": ["lqNZ6OQ6lKd2QRW8ekWOiVfdZhiicWsh","2022-06-27T23:09:11+00:00",1793451,"7b5af4c6c28047c83dd86e4814bc0272"], + "101": ["_i9cZBerb4mB9D8IFbPHo8nrefWcbq0p","2022-06-27T23:09:28+00:00",1799564,"50b6cfb69609319da9bf900a21d5f25c"] + } + } + } + } + } + } + } + "#}; + + let manifest = serde_json::from_str::(s).unwrap(); + let zattrs = ManifestEntry { + version_id: "VwOSu7IVLAQcQHcqOesmlrEDm2sL_Tfs".into(), + modified: datetime!(2022-06-27 23:07:47 UTC), + size: 8312, + etag: "cb32b88f6488d55818aba94746bcc19a".into(), + }; + let zarray = ManifestEntry { + version_id: "Ou6TnKwWPmEJrL.0utCWLPxgfr_lA0I1".into(), + modified: datetime!(2022-06-27 23:07:48 UTC), + size: 446, + etag: "5477ec3da352681e5ba6f6ea550ef740".into(), + }; + let entry_100 = ManifestEntry { + version_id: "lqNZ6OQ6lKd2QRW8ekWOiVfdZhiicWsh".into(), + modified: datetime!(2022-06-27 23:09:11 UTC), + size: 1793451, + etag: "7b5af4c6c28047c83dd86e4814bc0272".into(), + }; + + assert_eq!( + manifest, + Manifest { + entries: BTreeMap::from([ + (".zattrs".into(), FolderEntry::Entry(zattrs.clone())), + ( + ".zgroup".into(), + FolderEntry::Entry(ManifestEntry { + version_id: "7obAY5BUNOdI1Uch3RoI4oHuGXhW4h0R".into(), + modified: datetime!(2022-06-27 23:07:47 UTC), + size: 24, + etag: "e20297935e73dd0154104d4ea53040ab".into(), + }) + ), + ( + ".zmetadata".into(), + FolderEntry::Entry(ManifestEntry { + version_id: "Vfe0W0v4zkydmzyXkUMjm2Xr7.rIvfZQ".into(), + modified: datetime!(2022-06-27 23:07:47 UTC), + size: 15191, + etag: "4f505878fbb943a9793516cf084e07ad".into(), + }) + ), + ( + "0".into(), + FolderEntry::Folder(BTreeMap::from([ + (".zarray".into(), FolderEntry::Entry(zarray.clone())), + ( + "0".into(), + FolderEntry::Folder(BTreeMap::from([( + "0".into(), + FolderEntry::Folder(BTreeMap::from([( + "13".into(), + FolderEntry::Folder(BTreeMap::from([( + "8".into(), + FolderEntry::Folder(BTreeMap::from([ + ( + "100".into(), + FolderEntry::Entry(entry_100.clone()) + ), + ( + "101".into(), + FolderEntry::Entry(ManifestEntry { + version_id: + "_i9cZBerb4mB9D8IFbPHo8nrefWcbq0p" + .into(), + modified: datetime!(2022-06-27 23:09:28 UTC), + size: 1799564, + etag: "50b6cfb69609319da9bf900a21d5f25c" + .into(), + }) + ), + ])) + )])) + )])) + )])) + ) + ])) + ) + ]) + } + ); + + assert_eq!( + manifest.get(&".zattrs".parse::().unwrap()), + Some(EntryRef::Entry(&zattrs)) + ); + assert_eq!( + manifest.get(&"not-found".parse::().unwrap()), + None, + ); + assert_eq!( + manifest.get(&".zattrs/0".parse::().unwrap()), + None, + ); + assert_eq!( + manifest.get(&"0/.zarray".parse::().unwrap()), + Some(EntryRef::Entry(&zarray)) + ); + assert_eq!( + manifest.get(&"0/not-found".parse::().unwrap()), + None, + ); + assert_eq!( + manifest.get(&"0/0/0/13/8/100".parse::().unwrap()), + Some(EntryRef::Entry(&entry_100)) + ); + assert_matches!( + manifest.get(&"0/0/0/13/8".parse::().unwrap()), + Some(EntryRef::Folder(folder)) => { + assert_eq!(folder.keys().collect::>(), ["100", "101"]); + } + ); + } +} From 1e54a42e53e7978386e576869c0c674e393d6d8a Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 2 Feb 2024 12:04:55 -0500 Subject: [PATCH 06/17] Integrate zarrman/ into dav/ --- src/dav/mod.rs | 41 ++++++++--- src/dav/path.rs | 147 +++++++++++++++++++++++++-------------- src/dav/types.rs | 115 ++++++++++++++++++++++++++++-- src/main.rs | 9 ++- src/zarrman/mod.rs | 10 ++- src/zarrman/resources.rs | 4 +- 6 files changed, 251 insertions(+), 75 deletions(-) diff --git a/src/dav/mod.rs b/src/dav/mod.rs index 39f11b3..ad22ed1 100644 --- a/src/dav/mod.rs +++ b/src/dav/mod.rs @@ -11,6 +11,7 @@ use self::util::*; use self::xml::*; use crate::consts::{DAV_XML_CONTENT_TYPE, HTML_CONTENT_TYPE}; use crate::dandi::*; +use crate::zarrman::*; use axum::{ body::Body, extract::Request, @@ -29,20 +30,13 @@ const WEBDAV_RESPONSE_HEADERS: [(&str, &str); 2] = [ ]; pub(crate) struct DandiDav { - dandi: DandiClient, - templater: Templater, - title: String, + pub(crate) dandi: DandiClient, + pub(crate) zarrman: ZarrManClient, + pub(crate) templater: Templater, + pub(crate) title: String, } impl DandiDav { - pub(crate) fn new(dandi: DandiClient, templater: Templater, title: String) -> DandiDav { - DandiDav { - dandi, - templater, - title, - } - } - pub(crate) async fn handle_request( &self, req: Request, @@ -225,6 +219,11 @@ impl DandiDav { .await?; Ok(DavResource::from(res).under_version_path(dandiset_id, version)) } + DavPath::ZarrIndex => Ok(DavResource::Collection(DavCollection::zarr_index())), + DavPath::ZarrPath { path } => { + let res = self.zarrman.get_resource(path).await?; + Ok(DavResource::from(res)) + } } } @@ -321,6 +320,21 @@ impl DandiDav { .await?; Ok(DavResourceWithChildren::from(res).under_version_path(dandiset_id, version)) } + DavPath::ZarrIndex => { + let col = DavCollection::zarr_index(); + let children = self + .zarrman + .get_top_level_dirs() + .await? + .into_iter() + .map(DavResource::from) + .collect(); + Ok(DavResourceWithChildren::Collection { col, children }) + } + DavPath::ZarrPath { path } => { + let res = self.zarrman.get_resource_with_children(path).await?; + Ok(DavResourceWithChildren::from(res)) + } } } } @@ -329,6 +343,8 @@ impl DandiDav { pub(crate) enum DavError { #[error("failed to fetch data from Archive")] DandiApi(#[from] DandiError), + #[error("failed to fetch data from Zarr manifests")] + ZarrMan(#[from] ZarrManError), #[error( "latest version was requested for Dandiset {dandiset_id}, but it has not been published" )] @@ -344,6 +360,9 @@ impl DavError { matches!( self, DavError::DandiApi(DandiError::NotFound { .. } | DandiError::ZarrEntryNotFound { .. }) + | DavError::ZarrMan( + ZarrManError::NotFound { .. } | ZarrManError::InvalidPath { .. } + ) | DavError::NoLatestVersion { .. } ) } diff --git a/src/dav/path.rs b/src/dav/path.rs index 568d3e6..90926cf 100644 --- a/src/dav/path.rs +++ b/src/dav/path.rs @@ -25,6 +25,10 @@ pub(super) enum DavPath { version: VersionSpec, path: PurePath, }, + ZarrIndex, + ZarrPath { + path: PurePath, + }, } impl DavPath { @@ -36,67 +40,60 @@ impl DavPath { let Some(p1) = parts.next() else { return Some(DavPath::Root); }; - if !p1.eq_ignore_ascii_case("dandisets") { - return None; - } - let Some(did) = parts.next() else { - return Some(DavPath::DandisetIndex); - }; - let Ok(dandiset_id) = did.parse::() else { - return None; - }; - let Some(p3) = parts.next() else { - return Some(DavPath::Dandiset { dandiset_id }); - }; - let version = if p3.eq_ignore_ascii_case("releases") { - let Some(v) = parts.next() else { - return Some(DavPath::DandisetReleases { dandiset_id }); + if p1.eq_ignore_ascii_case("dandisets") { + let Some(did) = parts.next() else { + return Some(DavPath::DandisetIndex); }; - let Ok(pv) = v.parse::() else { + let Ok(dandiset_id) = did.parse::() else { return None; }; - VersionSpec::Published(pv) - } else if p3.eq_ignore_ascii_case("latest") { - VersionSpec::Latest - } else if p3.eq_ignore_ascii_case("draft") { - VersionSpec::Draft - } else { - return None; - }; - let mut path = String::new(); - for p in parts { - if p == "." { - continue; - } else if p == ".." || FAST_NOT_EXIST.binary_search(&p).is_ok() { - // axum collapses `..` components in requests; the only way a - // `..` could have snuck in is if the component were - // percent-escaped, in which case we're going to reject the - // user's meddling. + let Some(p3) = parts.next() else { + return Some(DavPath::Dandiset { dandiset_id }); + }; + let version = if p3.eq_ignore_ascii_case("releases") { + let Some(v) = parts.next() else { + return Some(DavPath::DandisetReleases { dandiset_id }); + }; + let Ok(pv) = v.parse::() else { + return None; + }; + VersionSpec::Published(pv) + } else if p3.eq_ignore_ascii_case("latest") { + VersionSpec::Latest + } else if p3.eq_ignore_ascii_case("draft") { + VersionSpec::Draft + } else { return None; + }; + let path = join_parts(parts)?; + if path.is_empty() { + Some(DavPath::Version { + dandiset_id, + version, + }) + } else if path == "dandiset.yaml" { + Some(DavPath::DandisetYaml { + dandiset_id, + version, + }) } else { - if !path.is_empty() { - path.push('/'); - } - path.push_str(p); + let path = path.parse::().expect("should be valid path"); + Some(DavPath::DandiResource { + dandiset_id, + version, + path, + }) + } + } else if p1.eq_ignore_ascii_case("zarrs") { + let path = join_parts(parts)?; + if path.is_empty() { + Some(DavPath::ZarrIndex) + } else { + let path = path.parse::().expect("should be valid path"); + Some(DavPath::ZarrPath { path }) } - } - if path.is_empty() { - Some(DavPath::Version { - dandiset_id, - version, - }) - } else if path == "dandiset.yaml" { - Some(DavPath::DandisetYaml { - dandiset_id, - version, - }) } else { - let path = path.parse::().expect("should be valid path"); - Some(DavPath::DandiResource { - dandiset_id, - version, - path, - }) + None } } } @@ -136,6 +133,26 @@ impl<'a> Iterator for SplitComponents<'a> { impl std::iter::FusedIterator for SplitComponents<'_> {} +fn join_parts(parts: SplitComponents<'_>) -> Option { + let mut path = String::new(); + for p in parts { + if p == "." { + continue; + } else if p == ".." || FAST_NOT_EXIST.binary_search(&p).is_ok() { + // axum collapses `..` components in requests; the only way a `..` + // could have snuck in is if the component were percent-escaped, in + // which case we're going to reject the user's meddling. + return None; + } else { + if !path.is_empty() { + path.push('/'); + } + path.push_str(p); + } + } + Some(path) +} + #[cfg(test)] mod tests { use super::*; @@ -307,4 +324,26 @@ mod tests { assert_eq!(path, respath); }); } + + #[rstest] + #[case("/zarrs")] + #[case("/zarrs/")] + #[case("/zarrs//")] + #[case("//zarrs/")] + #[case("/Zarrs")] + #[case("/ZARRS")] + fn test_zarr_index(#[case] path: &str) { + assert_eq!(DavPath::parse_uri_path(path), Some(DavPath::ZarrIndex)); + } + + #[rstest] + #[case("/zarrs/123", "123")] + #[case("/zarrs/123/", "123")] + #[case("/zarrs/123/abc", "123/abc")] + #[case("/ZARRS/123/ABC", "123/ABC")] + fn test_zarr_path(#[case] s: &str, #[case] respath: &str) { + assert_matches!(DavPath::parse_uri_path(s), Some(DavPath::ZarrPath {path}) => { + assert_eq!(path, respath); + }); + } } diff --git a/src/dav/types.rs b/src/dav/types.rs index 2d47bd1..149c768 100644 --- a/src/dav/types.rs +++ b/src/dav/types.rs @@ -4,6 +4,7 @@ use super::VersionSpec; use crate::consts::{DEFAULT_CONTENT_TYPE, YAML_CONTENT_TYPE}; use crate::dandi::*; use crate::paths::{PureDirPath, PurePath}; +use crate::zarrman::*; use enum_dispatch::enum_dispatch; use serde::{ser::Serializer, Serialize}; use time::OffsetDateTime; @@ -79,6 +80,17 @@ impl From for DavResource { } } +impl From for DavResource { + fn from(res: ZarrManResource) -> DavResource { + match res { + ZarrManResource::WebFolder(folder) => DavResource::Collection(folder.into()), + ZarrManResource::Manifest(folder) => DavResource::Collection(folder.into()), + ZarrManResource::ManFolder(folder) => DavResource::Collection(folder.into()), + ZarrManResource::ManEntry(entry) => DavResource::Item(entry.into()), + } + } +} + #[derive(Clone, Debug, Eq, PartialEq)] pub(super) enum DavResourceWithChildren { Collection { @@ -92,7 +104,10 @@ impl DavResourceWithChildren { pub(super) fn root() -> Self { DavResourceWithChildren::Collection { col: DavCollection::root(), - children: vec![DavResource::Collection(DavCollection::dandiset_index())], + children: vec![ + DavResource::Collection(DavCollection::dandiset_index()), + DavResource::Collection(DavCollection::zarr_index()), + ], } } @@ -150,6 +165,31 @@ impl From for DavResourceWithChildren { } } +impl From for DavResourceWithChildren { + fn from(res: ZarrManResourceWithChildren) -> DavResourceWithChildren { + fn map_children(vec: Vec) -> Vec { + vec.into_iter().map(DavResource::from).collect() + } + + use ZarrManResourceWithChildren::*; + match res { + WebFolder { folder, children } => DavResourceWithChildren::Collection { + col: DavCollection::from(folder), + children: map_children(children), + }, + Manifest { folder, children } => DavResourceWithChildren::Collection { + col: DavCollection::from(folder), + children: map_children(children), + }, + ManFolder { folder, children } => DavResourceWithChildren::Collection { + col: DavCollection::from(folder), + children: map_children(children), + }, + ManEntry(entry) => DavResourceWithChildren::Item(entry.into()), + } + } +} + #[derive(Clone, Debug, Eq, PartialEq)] pub(super) struct DavCollection { pub(super) path: Option, // None for root collection @@ -239,6 +279,20 @@ impl DavCollection { kind: ResourceKind::Version, } } + + pub(super) fn zarr_index() -> Self { + DavCollection { + path: Some( + "zarrs/" + .parse::() + .expect(r#""zarrs/" should be a valid dir path"#), + ), + created: None, + modified: None, + size: None, + kind: ResourceKind::ZarrIndex, + } + } } impl HasProperties for DavCollection { @@ -298,7 +352,7 @@ impl From for DavCollection { created: None, modified: None, size: None, - kind: ResourceKind::AssetFolder, + kind: ResourceKind::Directory, } } } @@ -327,6 +381,42 @@ impl From for DavCollection { } } +impl From for DavCollection { + fn from(WebFolder { web_path }: WebFolder) -> DavCollection { + DavCollection { + path: Some(web_path), + created: None, + modified: None, + size: None, + kind: ResourceKind::Directory, + } + } +} + +impl From for DavCollection { + fn from(Manifest { web_path }: Manifest) -> DavCollection { + DavCollection { + path: Some(web_path), + created: None, + modified: None, + size: None, + kind: ResourceKind::Zarr, + } + } +} + +impl From for DavCollection { + fn from(ManifestFolder { web_path }: ManifestFolder) -> DavCollection { + DavCollection { + path: Some(web_path), + created: None, + modified: None, + size: None, + kind: ResourceKind::Directory, + } + } +} + #[derive(Clone, Debug, Eq, PartialEq)] pub(super) struct DavItem { pub(super) path: PurePath, @@ -459,6 +549,21 @@ impl From for DavItem { } } +impl From for DavItem { + fn from(entry: ManifestEntry) -> DavItem { + DavItem { + path: entry.web_path, + created: None, + modified: Some(entry.modified), + content_type: DEFAULT_CONTENT_TYPE.to_owned(), + size: Some(entry.size), + etag: Some(entry.etag), + kind: ResourceKind::ZarrEntry, + content: DavContent::Redirect(entry.url), + } + } +} + #[derive(Clone, Debug, Eq, PartialEq)] pub(super) enum DavContent { Blob(Vec), @@ -477,11 +582,12 @@ pub(super) enum ResourceKind { DandisetReleases, Version, VersionMetadata, - AssetFolder, + Directory, Blob, Zarr, ZarrEntry, ZarrFolder, + ZarrIndex, } impl ResourceKind { @@ -494,11 +600,12 @@ impl ResourceKind { ResourceKind::DandisetReleases => "Published versions", ResourceKind::Version => "Dandiset version", ResourceKind::VersionMetadata => "Version metadata", - ResourceKind::AssetFolder => "Directory", + ResourceKind::Directory => "Directory", ResourceKind::Blob => "Blob asset", ResourceKind::Zarr => "Zarr asset", ResourceKind::ZarrEntry => "Zarr entry", ResourceKind::ZarrFolder => "Directory", + ResourceKind::ZarrIndex => "Zarrs", } } } diff --git a/src/main.rs b/src/main.rs index 02805d4..7f4a45a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,6 +8,7 @@ mod zarrman; use crate::consts::{CSS_CONTENT_TYPE, DEFAULT_API_URL, SERVER_VALUE}; use crate::dandi::DandiClient; use crate::dav::{DandiDav, Templater}; +use crate::zarrman::ZarrManClient; use anyhow::Context; use axum::{ body::Body, @@ -60,8 +61,14 @@ async fn main() -> anyhow::Result<()> { .with_max_level(LevelFilter::TRACE) .init(); let dandi = DandiClient::new(args.api_url)?; + let zarrman = ZarrManClient::new()?; let templater = Templater::load()?; - let dav = Arc::new(DandiDav::new(dandi, templater, args.title)); + let dav = Arc::new(DandiDav { + dandi, + zarrman, + templater, + title: args.title, + }); let app = Router::new() .route( "/.static/styles.css", diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 4675b4d..94ae8ec 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -33,7 +33,7 @@ impl ZarrManClient { } #[allow(clippy::unused_async)] - pub(crate) async fn get_top_level_dirs(&self) -> Result, ZarrManError> { + pub(crate) async fn get_top_level_dirs(&self) -> Result, ZarrManError> { todo!() } @@ -57,5 +57,9 @@ impl ZarrManClient { } #[derive(Debug, Error)] -#[error("TODO")] -pub(crate) struct ZarrManError; // TODO +pub(crate) enum ZarrManError { + #[error("no such resource: {url}")] + NotFound { url: url::Url }, + #[error("invalid path requested: {path:?}")] + InvalidPath { path: PurePath }, +} diff --git a/src/zarrman/resources.rs b/src/zarrman/resources.rs index b50a103..bae8fd3 100644 --- a/src/zarrman/resources.rs +++ b/src/zarrman/resources.rs @@ -11,12 +11,12 @@ pub(crate) enum ZarrManResource { #[derive(Clone, Debug, Eq, PartialEq)] pub(crate) struct WebFolder { - web_path: PureDirPath, + pub(crate) web_path: PureDirPath, } #[derive(Clone, Debug, Eq, PartialEq)] pub(crate) struct Manifest { - web_path: PureDirPath, + pub(crate) web_path: PureDirPath, } #[derive(Clone, Debug, Eq, PartialEq)] From 1d1d90c383792a3ece6c0b06dc95a01f888d4ea8 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 2 Feb 2024 12:06:10 -0500 Subject: [PATCH 07/17] Replace ResourceKind::ZarrFolder with ResourceKind::Directory --- src/dav/types.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/dav/types.rs b/src/dav/types.rs index 149c768..d22107b 100644 --- a/src/dav/types.rs +++ b/src/dav/types.rs @@ -376,7 +376,7 @@ impl From for DavCollection { created: None, modified: None, size: None, - kind: ResourceKind::ZarrFolder, + kind: ResourceKind::Directory, } } } @@ -586,7 +586,6 @@ pub(super) enum ResourceKind { Blob, Zarr, ZarrEntry, - ZarrFolder, ZarrIndex, } @@ -604,7 +603,6 @@ impl ResourceKind { ResourceKind::Blob => "Blob asset", ResourceKind::Zarr => "Zarr asset", ResourceKind::ZarrEntry => "Zarr entry", - ResourceKind::ZarrFolder => "Directory", ResourceKind::ZarrIndex => "Zarrs", } } From 97f34d082f251b847a4e3f26eb466808d6ef88c2 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 2 Feb 2024 12:36:41 -0500 Subject: [PATCH 08/17] More ZarrMan skeleton --- src/dandi/mod.rs | 51 +-------------------- src/httputil.rs | 53 ++++++++++++++++++++++ src/paths/dirpath.rs | 4 ++ src/zarrman/index.rs | 21 +++++++++ src/zarrman/mod.rs | 68 +++++++++++++++++++++++++--- src/zarrman/testdata/016.html | 84 +++++++++++++++++++++++++++++++++++ 6 files changed, 227 insertions(+), 54 deletions(-) create mode 100644 src/zarrman/index.rs create mode 100644 src/zarrman/testdata/016.html diff --git a/src/dandi/mod.rs b/src/dandi/mod.rs index ceb51b7..68fd492 100644 --- a/src/dandi/mod.rs +++ b/src/dandi/mod.rs @@ -5,7 +5,7 @@ pub(crate) use self::dandiset_id::*; pub(crate) use self::types::*; pub(crate) use self::version_id::*; use crate::consts::S3CLIENT_CACHE_SIZE; -use crate::httputil::{new_client, BuildClientError}; +use crate::httputil::{new_client, urljoin_slashed, BuildClientError}; use crate::paths::{ParsePureDirPathError, PureDirPath, PurePath}; use crate::s3::{ BucketSpec, GetBucketRegionError, PrefixedS3Client, S3Client, S3Error, S3Location, @@ -47,7 +47,7 @@ impl DandiClient { I: IntoIterator, I::Item: AsRef, { - urljoin(&self.api_url, segments) + urljoin_slashed(&self.api_url, segments) } async fn get(&self, url: Url) -> Result { @@ -468,21 +468,6 @@ pub(crate) enum ZarrToS3Error { }, } -fn urljoin(url: &Url, segments: I) -> Url -where - I: IntoIterator, - I::Item: AsRef, -{ - let mut url = url.clone(); - url.path_segments_mut() - .expect("API URL should be able to be a base") - .pop_if_empty() - .extend(segments) - // Add an empty segment so that the final URL will end with a slash: - .push(""); - url -} - fn dump_json_as_yaml(data: serde_json::Value) -> String { serde_yaml::to_string(&data).expect("converting JSON to YAML should not fail") } @@ -491,40 +476,8 @@ fn dump_json_as_yaml(data: serde_json::Value) -> String { mod tests { use super::*; use indoc::indoc; - use rstest::rstest; use serde_json::json; - #[rstest] - #[case("https://api.github.com")] - #[case("https://api.github.com/")] - fn test_urljoin_nopath(#[case] base: Url) { - let u = urljoin(&base, ["foo"]); - assert_eq!(u.as_str(), "https://api.github.com/foo/"); - let u = urljoin(&base, ["foo", "bar"]); - assert_eq!(u.as_str(), "https://api.github.com/foo/bar/"); - } - - #[rstest] - #[case("https://api.github.com/foo/bar")] - #[case("https://api.github.com/foo/bar/")] - fn test_urljoin_path(#[case] base: Url) { - let u = urljoin(&base, ["gnusto"]); - assert_eq!(u.as_str(), "https://api.github.com/foo/bar/gnusto/"); - let u = urljoin(&base, ["gnusto", "cleesh"]); - assert_eq!(u.as_str(), "https://api.github.com/foo/bar/gnusto/cleesh/"); - } - - #[rstest] - #[case("foo#bar", "https://api.github.com/base/foo%23bar/")] - #[case("foo%bar", "https://api.github.com/base/foo%25bar/")] - #[case("foo/bar", "https://api.github.com/base/foo%2Fbar/")] - #[case("foo?bar", "https://api.github.com/base/foo%3Fbar/")] - fn test_urljoin_special_chars(#[case] path: &str, #[case] expected: &str) { - let base = Url::parse("https://api.github.com/base").unwrap(); - let u = urljoin(&base, [path]); - assert_eq!(u.as_str(), expected); - } - #[test] fn test_dump_json_as_yaml() { let data = json! ({ diff --git a/src/httputil.rs b/src/httputil.rs index c83e18c..00f22f4 100644 --- a/src/httputil.rs +++ b/src/httputil.rs @@ -1,6 +1,7 @@ use crate::consts::USER_AGENT; use reqwest::ClientBuilder; use thiserror::Error; +use url::Url; pub(crate) fn new_client() -> Result { ClientBuilder::new() @@ -12,3 +13,55 @@ pub(crate) fn new_client() -> Result { #[derive(Debug, Error)] #[error("failed to initialize HTTP client")] pub(crate) struct BuildClientError(#[from] reqwest::Error); + +pub(crate) fn urljoin_slashed(url: &Url, segments: I) -> Url +where + I: IntoIterator, + I::Item: AsRef, +{ + let mut url = url.clone(); + url.path_segments_mut() + .expect("URL should be able to be a base") + .pop_if_empty() + .extend(segments) + // Add an empty segment so that the final URL will end with a slash: + .push(""); + url +} + +#[cfg(test)] +mod tests { + use super::*; + use rstest::rstest; + + #[rstest] + #[case("https://api.github.com")] + #[case("https://api.github.com/")] + fn test_urljoin_slashed_nopath(#[case] base: Url) { + let u = urljoin_slashed(&base, ["foo"]); + assert_eq!(u.as_str(), "https://api.github.com/foo/"); + let u = urljoin_slashed(&base, ["foo", "bar"]); + assert_eq!(u.as_str(), "https://api.github.com/foo/bar/"); + } + + #[rstest] + #[case("https://api.github.com/foo/bar")] + #[case("https://api.github.com/foo/bar/")] + fn test_urljoin_slashed_path(#[case] base: Url) { + let u = urljoin_slashed(&base, ["gnusto"]); + assert_eq!(u.as_str(), "https://api.github.com/foo/bar/gnusto/"); + let u = urljoin_slashed(&base, ["gnusto", "cleesh"]); + assert_eq!(u.as_str(), "https://api.github.com/foo/bar/gnusto/cleesh/"); + } + + #[rstest] + #[case("foo#bar", "https://api.github.com/base/foo%23bar/")] + #[case("foo%bar", "https://api.github.com/base/foo%25bar/")] + #[case("foo/bar", "https://api.github.com/base/foo%2Fbar/")] + #[case("foo?bar", "https://api.github.com/base/foo%3Fbar/")] + fn test_urljoin_slashed_special_chars(#[case] path: &str, #[case] expected: &str) { + let base = Url::parse("https://api.github.com/base").unwrap(); + let u = urljoin_slashed(&base, [path]); + assert_eq!(u.as_str(), expected); + } +} diff --git a/src/paths/dirpath.rs b/src/paths/dirpath.rs index 80577b5..315c1cc 100644 --- a/src/paths/dirpath.rs +++ b/src/paths/dirpath.rs @@ -41,6 +41,10 @@ impl PureDirPath { let s = self.0.strip_prefix(&dirpath.0)?; (!s.is_empty()).then(|| PureDirPath(s.to_owned())) } + + pub(crate) fn components(&self) -> std::str::Split<'_, char> { + self.0.split('/') + } } impl fmt::Debug for PureDirPath { diff --git a/src/zarrman/index.rs b/src/zarrman/index.rs new file mode 100644 index 0000000..38dee1f --- /dev/null +++ b/src/zarrman/index.rs @@ -0,0 +1,21 @@ +use crate::paths::PurePath; +use thiserror::Error; + +pub(super) fn parse_apache_index(_s: &str) -> Result, ParseIndexError> { + todo!() +} + +#[derive(Clone, Debug, Eq, Error, PartialEq)] +#[error("TODO")] +pub(crate) struct ParseIndexError; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_apache_index() { + let s = include_str!("testdata/016.html"); + assert_eq!(parse_apache_index(s).unwrap(), ["5a2", "fc8", "foo bar"]); + } +} diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 94ae8ec..6ee89e5 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -1,11 +1,15 @@ +mod index; mod manifest; mod resources; +use self::index::*; pub(crate) use self::resources::*; -use crate::httputil::{new_client, BuildClientError}; -use crate::paths::PurePath; +use crate::httputil::{new_client, urljoin_slashed, BuildClientError}; +use crate::paths::{PureDirPath, PurePath}; use moka::future::{Cache, CacheBuilder}; +use reqwest::StatusCode; use std::sync::Arc; use thiserror::Error; +use url::Url; static MANIFEST_ROOT_URL: &str = "https://datasets.datalad.org/dandi/zarr-manifests/zarr-manifests-v2-sorted/"; @@ -32,9 +36,55 @@ impl ZarrManClient { Ok(ZarrManClient { inner, manifests }) } - #[allow(clippy::unused_async)] pub(crate) async fn get_top_level_dirs(&self) -> Result, ZarrManError> { - todo!() + let entries = self.get_index_entries(None).await?; + Ok(entries + .into_iter() + .map(|path| { + ZarrManResource::WebFolder(WebFolder { + web_path: path.to_dir_path(), + }) + }) + .collect()) + } + + async fn get_index_entries( + &self, + path: Option<&PureDirPath>, + ) -> Result, ZarrManError> { + let mut url = + Url::parse(MANIFEST_ROOT_URL).expect("MANIFEST_ROOT_URL should be a valid URL"); + if let Some(p) = path { + url = urljoin_slashed(&url, p.components()); + } + let r = self + .inner + .get(url.clone()) + .send() + .await + .map_err(|source| ZarrManError::Send { + url: url.clone(), + source, + })?; + if r.status() == StatusCode::NOT_FOUND { + return Err(ZarrManError::NotFound { url: url.clone() }); + } + let txt = r + .error_for_status() + .map_err(|source| ZarrManError::Status { + url: url.clone(), + source, + })? + .text() + .await + .map_err(|source| ZarrManError::Read { + url: url.clone(), + source, + })?; + parse_apache_index(&txt).map_err(|source| ZarrManError::ParseIndex { + url: url.clone(), + source, + }) } #[allow(clippy::unused_async)] @@ -58,8 +108,16 @@ impl ZarrManClient { #[derive(Debug, Error)] pub(crate) enum ZarrManError { + #[error("failed to make request to {url}")] + Send { url: Url, source: reqwest::Error }, #[error("no such resource: {url}")] - NotFound { url: url::Url }, + NotFound { url: Url }, + #[error("request to {url} returned error")] + Status { url: Url, source: reqwest::Error }, + #[error("failed to read response from {url}")] + Read { url: Url, source: reqwest::Error }, + #[error("failed to parse Apache index at {url}")] + ParseIndex { url: Url, source: ParseIndexError }, #[error("invalid path requested: {path:?}")] InvalidPath { path: PurePath }, } diff --git a/src/zarrman/testdata/016.html b/src/zarrman/testdata/016.html new file mode 100644 index 0000000..8a7d6fa --- /dev/null +++ b/src/zarrman/testdata/016.html @@ -0,0 +1,84 @@ + + + + Index of /dandi/zarr-manifests/zarr-manifests-v2-sorted/016 + + +

Index of /dandi/zarr-manifests/zarr-manifests-v2-sorted/016

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ [ICO] + + Name + + Last modified + + Size + + Description +
+
+
+ [PARENTDIR] + + Parent Directory +   -  
+ [DIR] + + 5a2/ + 2024-02-01 15:30 -  
+ [DIR] + + fc8/ + 2024-02-01 15:30 -  
+ [   ] + + foo bar + 2024-02-01 15:30 1B 
+
+
+
Apache/2.4.57 (Debian) Server at datasets.datalad.org Port 443
+ + From ca9c953f2907b44836c7d58351c8473b1a73dfce Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 2 Feb 2024 12:52:46 -0500 Subject: [PATCH 09/17] Factor out HTTP `GET` request stuff --- src/dandi/mod.rs | 56 ++++++++-------------------------------------- src/dav/mod.rs | 11 ++++----- src/httputil.rs | 54 +++++++++++++++++++++++++++++++++++++++++++- src/zarrman/mod.rs | 38 ++++--------------------------- 4 files changed, 72 insertions(+), 87 deletions(-) diff --git a/src/dandi/mod.rs b/src/dandi/mod.rs index 68fd492..b09cb27 100644 --- a/src/dandi/mod.rs +++ b/src/dandi/mod.rs @@ -5,7 +5,7 @@ pub(crate) use self::dandiset_id::*; pub(crate) use self::types::*; pub(crate) use self::version_id::*; use crate::consts::S3CLIENT_CACHE_SIZE; -use crate::httputil::{new_client, urljoin_slashed, BuildClientError}; +use crate::httputil::{get_json, new_client, urljoin_slashed, BuildClientError, HttpError}; use crate::paths::{ParsePureDirPathError, PureDirPath, PurePath}; use crate::s3::{ BucketSpec, GetBucketRegionError, PrefixedS3Client, S3Client, S3Error, S3Location, @@ -13,7 +13,6 @@ use crate::s3::{ use async_stream::try_stream; use futures_util::{Stream, TryStreamExt}; use moka::future::{Cache, CacheBuilder}; -use reqwest::StatusCode; use serde::de::DeserializeOwned; use smartstring::alias::CompactString; use std::sync::Arc; @@ -51,26 +50,7 @@ impl DandiClient { } async fn get(&self, url: Url) -> Result { - let r = self - .inner - .get(url.clone()) - .send() - .await - .map_err(|source| DandiError::Send { - url: url.clone(), - source, - })?; - if r.status() == StatusCode::NOT_FOUND { - return Err(DandiError::NotFound { url: url.clone() }); - } - r.error_for_status() - .map_err(|source| DandiError::Status { - url: url.clone(), - source, - })? - .json::() - .await - .map_err(move |source| DandiError::Deserialize { url, source }) + get_json(&self.inner, url).await.map_err(Into::into) } fn paginate( @@ -80,19 +60,7 @@ impl DandiClient { try_stream! { let mut url = Some(url); while let Some(u) = url { - let resp = self.inner - .get(u.clone()) - .send() - .await - .map_err(|source| DandiError::Send {url: u.clone(), source})?; - if resp.status() == StatusCode::NOT_FOUND { - Err(DandiError::NotFound {url: u.clone() })?; - } - let page = resp.error_for_status() - .map_err(|source| DandiError::Status {url: u.clone(), source})? - .json::>() - .await - .map_err(move |source| DandiError::Deserialize {url: u, source})?; + let page = get_json::>(&self.inner, u).await?; for r in page.results { yield r; } @@ -259,7 +227,7 @@ impl<'a> VersionEndpoint<'a> { FolderEntry::Folder(subf) => yield DandiResource::Folder(subf), FolderEntry::Asset { id, path } => match self.get_asset_by_id(&id).await { Ok(asset) => yield DandiResource::Asset(asset), - Err(DandiError::NotFound { .. }) => { + Err(DandiError::Http(HttpError::NotFound { .. })) => { Err(DandiError::DisappearingAsset { asset_id: id, path })?; } Err(e) => Err(e)?, @@ -319,7 +287,7 @@ impl<'a> VersionEndpoint<'a> { break; } } - Err(DandiError::NotFound { url }) + Err(DandiError::Http(HttpError::NotFound { url })) } async fn get_resource_with_s3( @@ -352,7 +320,7 @@ impl<'a> VersionEndpoint<'a> { "assets", ]); url.query_pairs_mut().append_pair("path", path.as_ref()); - return Err(DandiError::NotFound { url }); + return Err(DandiError::Http(HttpError::NotFound { url })); } AtAssetPath::Asset(Asset::Zarr(zarr)) => { let s3 = self.client.get_s3client_for_zarr(&zarr).await?; @@ -387,7 +355,7 @@ impl<'a> VersionEndpoint<'a> { FolderEntry::Folder(subf) => DandiResource::Folder(subf), FolderEntry::Asset { id, path } => match self.get_asset_by_id(&id).await { Ok(asset) => DandiResource::Asset(asset), - Err(DandiError::NotFound { .. }) => { + Err(DandiError::Http(HttpError::NotFound { .. })) => { return Err(DandiError::DisappearingAsset { asset_id: id, path }) } Err(e) => return Err(e), @@ -428,10 +396,8 @@ impl<'a> VersionEndpoint<'a> { #[derive(Debug, Error)] pub(crate) enum DandiError { - #[error("failed to make request to {url}")] - Send { url: Url, source: reqwest::Error }, - #[error("no such resource: {url}")] - NotFound { url: Url }, + #[error(transparent)] + Http(#[from] HttpError), #[error("entry {entry_path:?} in Zarr {zarr_path:?} not found")] ZarrEntryNotFound { zarr_path: PurePath, @@ -439,10 +405,6 @@ pub(crate) enum DandiError { }, #[error("folder listing included asset ID={asset_id} at path {path:?}, but request to asset returned 404")] DisappearingAsset { asset_id: String, path: PurePath }, - #[error("request to {url} returned error")] - Status { url: Url, source: reqwest::Error }, - #[error("failed to deserialize response body from {url}")] - Deserialize { url: Url, source: reqwest::Error }, #[error("failed to acquire S3 client for Zarr with asset ID {asset_id}")] ZarrToS3Error { asset_id: String, diff --git a/src/dav/mod.rs b/src/dav/mod.rs index ad22ed1..699d448 100644 --- a/src/dav/mod.rs +++ b/src/dav/mod.rs @@ -11,6 +11,7 @@ use self::util::*; use self::xml::*; use crate::consts::{DAV_XML_CONTENT_TYPE, HTML_CONTENT_TYPE}; use crate::dandi::*; +use crate::httputil::HttpError; use crate::zarrman::*; use axum::{ body::Body, @@ -359,11 +360,11 @@ impl DavError { pub(crate) fn is_404(&self) -> bool { matches!( self, - DavError::DandiApi(DandiError::NotFound { .. } | DandiError::ZarrEntryNotFound { .. }) - | DavError::ZarrMan( - ZarrManError::NotFound { .. } | ZarrManError::InvalidPath { .. } - ) - | DavError::NoLatestVersion { .. } + DavError::DandiApi( + DandiError::Http(HttpError::NotFound { .. }) | DandiError::ZarrEntryNotFound { .. } + ) | DavError::ZarrMan( + ZarrManError::Http(HttpError::NotFound { .. }) | ZarrManError::InvalidPath { .. } + ) | DavError::NoLatestVersion { .. } ) } } diff --git a/src/httputil.rs b/src/httputil.rs index 00f22f4..bc42910 100644 --- a/src/httputil.rs +++ b/src/httputil.rs @@ -1,5 +1,6 @@ use crate::consts::USER_AGENT; -use reqwest::ClientBuilder; +use reqwest::{ClientBuilder, StatusCode}; +use serde::de::DeserializeOwned; use thiserror::Error; use url::Url; @@ -14,6 +15,57 @@ pub(crate) fn new_client() -> Result { #[error("failed to initialize HTTP client")] pub(crate) struct BuildClientError(#[from] reqwest::Error); +async fn get_response(client: &reqwest::Client, url: Url) -> Result { + let r = client + .get(url.clone()) + .send() + .await + .map_err(|source| HttpError::Send { + url: url.clone(), + source, + })?; + if r.status() == StatusCode::NOT_FOUND { + return Err(HttpError::NotFound { url: url.clone() }); + } + r.error_for_status().map_err(|source| HttpError::Status { + url: url.clone(), + source, + }) +} + +pub(crate) async fn get_json( + client: &reqwest::Client, + url: Url, +) -> Result { + get_response(client, url.clone()) + .await? + .json::() + .await + .map_err(move |source| HttpError::Deserialize { url, source }) +} + +pub(crate) async fn get_text(client: &reqwest::Client, url: Url) -> Result { + get_response(client, url.clone()) + .await? + .text() + .await + .map_err(move |source| HttpError::Read { url, source }) +} + +#[derive(Debug, Error)] +pub(crate) enum HttpError { + #[error("failed to make request to {url}")] + Send { url: Url, source: reqwest::Error }, + #[error("no such resource: {url}")] + NotFound { url: Url }, + #[error("request to {url} returned error")] + Status { url: Url, source: reqwest::Error }, + #[error("failed to read response from {url}")] + Read { url: Url, source: reqwest::Error }, + #[error("failed to deserialize response body from {url}")] + Deserialize { url: Url, source: reqwest::Error }, +} + pub(crate) fn urljoin_slashed(url: &Url, segments: I) -> Url where I: IntoIterator, diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 6ee89e5..1bd0619 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -3,10 +3,9 @@ mod manifest; mod resources; use self::index::*; pub(crate) use self::resources::*; -use crate::httputil::{new_client, urljoin_slashed, BuildClientError}; +use crate::httputil::{get_text, new_client, urljoin_slashed, BuildClientError, HttpError}; use crate::paths::{PureDirPath, PurePath}; use moka::future::{Cache, CacheBuilder}; -use reqwest::StatusCode; use std::sync::Arc; use thiserror::Error; use url::Url; @@ -57,30 +56,7 @@ impl ZarrManClient { if let Some(p) = path { url = urljoin_slashed(&url, p.components()); } - let r = self - .inner - .get(url.clone()) - .send() - .await - .map_err(|source| ZarrManError::Send { - url: url.clone(), - source, - })?; - if r.status() == StatusCode::NOT_FOUND { - return Err(ZarrManError::NotFound { url: url.clone() }); - } - let txt = r - .error_for_status() - .map_err(|source| ZarrManError::Status { - url: url.clone(), - source, - })? - .text() - .await - .map_err(|source| ZarrManError::Read { - url: url.clone(), - source, - })?; + let txt = get_text(&self.inner, url.clone()).await?; parse_apache_index(&txt).map_err(|source| ZarrManError::ParseIndex { url: url.clone(), source, @@ -108,14 +84,8 @@ impl ZarrManClient { #[derive(Debug, Error)] pub(crate) enum ZarrManError { - #[error("failed to make request to {url}")] - Send { url: Url, source: reqwest::Error }, - #[error("no such resource: {url}")] - NotFound { url: Url }, - #[error("request to {url} returned error")] - Status { url: Url, source: reqwest::Error }, - #[error("failed to read response from {url}")] - Read { url: Url, source: reqwest::Error }, + #[error(transparent)] + Http(#[from] HttpError), #[error("failed to parse Apache index at {url}")] ParseIndex { url: Url, source: ParseIndexError }, #[error("invalid path requested: {path:?}")] From 42f0fa720dcc32d753b6ca2838eb5972497c2ee0 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 2 Feb 2024 13:05:30 -0500 Subject: [PATCH 10/17] Better info for 404 errors --- src/dandi/mod.rs | 23 +++++++++++++++++++++-- src/dav/mod.rs | 17 +++++++---------- src/zarrman/mod.rs | 9 +++++++++ 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/src/dandi/mod.rs b/src/dandi/mod.rs index b09cb27..492b0ae 100644 --- a/src/dandi/mod.rs +++ b/src/dandi/mod.rs @@ -287,7 +287,7 @@ impl<'a> VersionEndpoint<'a> { break; } } - Err(DandiError::Http(HttpError::NotFound { url })) + Err(DandiError::PathNotFound { path: path.clone() }) } async fn get_resource_with_s3( @@ -320,7 +320,10 @@ impl<'a> VersionEndpoint<'a> { "assets", ]); url.query_pairs_mut().append_pair("path", path.as_ref()); - return Err(DandiError::Http(HttpError::NotFound { url })); + return Err(DandiError::PathUnderBlob { + path: path.clone(), + blob_path: zarr_path, + }); } AtAssetPath::Asset(Asset::Zarr(zarr)) => { let s3 = self.client.get_s3client_for_zarr(&zarr).await?; @@ -398,6 +401,10 @@ impl<'a> VersionEndpoint<'a> { pub(crate) enum DandiError { #[error(transparent)] Http(#[from] HttpError), + #[error("path {path:?} not found in assets")] + PathNotFound { path: PurePath }, + #[error("path {path:?} points nowhere as leading portion {blob_path:?} points to a blob")] + PathUnderBlob { path: PurePath, blob_path: PurePath }, #[error("entry {entry_path:?} in Zarr {zarr_path:?} not found")] ZarrEntryNotFound { zarr_path: PurePath, @@ -414,6 +421,18 @@ pub(crate) enum DandiError { S3(#[from] S3Error), } +impl DandiError { + pub(crate) fn is_404(&self) -> bool { + matches!( + self, + DandiError::Http(HttpError::NotFound { .. }) + | DandiError::PathNotFound { .. } + | DandiError::PathUnderBlob { .. } + | DandiError::ZarrEntryNotFound { .. } + ) + } +} + #[derive(Debug, Error)] pub(crate) enum ZarrToS3Error { #[error("Zarr does not have an S3 download URL")] diff --git a/src/dav/mod.rs b/src/dav/mod.rs index 699d448..3a51357 100644 --- a/src/dav/mod.rs +++ b/src/dav/mod.rs @@ -11,7 +11,6 @@ use self::util::*; use self::xml::*; use crate::consts::{DAV_XML_CONTENT_TYPE, HTML_CONTENT_TYPE}; use crate::dandi::*; -use crate::httputil::HttpError; use crate::zarrman::*; use axum::{ body::Body, @@ -343,7 +342,7 @@ impl DandiDav { #[derive(Debug, Error)] pub(crate) enum DavError { #[error("failed to fetch data from Archive")] - DandiApi(#[from] DandiError), + Dandi(#[from] DandiError), #[error("failed to fetch data from Zarr manifests")] ZarrMan(#[from] ZarrManError), #[error( @@ -358,14 +357,12 @@ pub(crate) enum DavError { impl DavError { pub(crate) fn is_404(&self) -> bool { - matches!( - self, - DavError::DandiApi( - DandiError::Http(HttpError::NotFound { .. }) | DandiError::ZarrEntryNotFound { .. } - ) | DavError::ZarrMan( - ZarrManError::Http(HttpError::NotFound { .. }) | ZarrManError::InvalidPath { .. } - ) | DavError::NoLatestVersion { .. } - ) + match self { + DavError::Dandi(e) => e.is_404(), + DavError::ZarrMan(e) => e.is_404(), + DavError::NoLatestVersion { .. } => true, + _ => false, + } } } diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 1bd0619..0ec39c2 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -91,3 +91,12 @@ pub(crate) enum ZarrManError { #[error("invalid path requested: {path:?}")] InvalidPath { path: PurePath }, } + +impl ZarrManError { + pub(crate) fn is_404(&self) -> bool { + matches!( + self, + ZarrManError::Http(HttpError::NotFound { .. }) | ZarrManError::InvalidPath { .. } + ) + } +} From dca509dfe7037ebefbc5836ab63e78103a30dcde Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 2 Feb 2024 13:42:14 -0500 Subject: [PATCH 11/17] Zarr manifest directory indices are now JSON --- src/httputil.rs | 10 --- src/paths/component.rs | 134 ++++++++++++++++++++++++++++++++++ src/paths/dirpath.rs | 14 +++- src/paths/mod.rs | 2 + src/paths/purepath.rs | 8 +- src/zarrman/index.rs | 21 ------ src/zarrman/mod.rs | 54 ++++++++------ src/zarrman/testdata/016.html | 84 --------------------- 8 files changed, 189 insertions(+), 138 deletions(-) create mode 100644 src/paths/component.rs delete mode 100644 src/zarrman/index.rs delete mode 100644 src/zarrman/testdata/016.html diff --git a/src/httputil.rs b/src/httputil.rs index bc42910..87caa68 100644 --- a/src/httputil.rs +++ b/src/httputil.rs @@ -44,14 +44,6 @@ pub(crate) async fn get_json( .map_err(move |source| HttpError::Deserialize { url, source }) } -pub(crate) async fn get_text(client: &reqwest::Client, url: Url) -> Result { - get_response(client, url.clone()) - .await? - .text() - .await - .map_err(move |source| HttpError::Read { url, source }) -} - #[derive(Debug, Error)] pub(crate) enum HttpError { #[error("failed to make request to {url}")] @@ -60,8 +52,6 @@ pub(crate) enum HttpError { NotFound { url: Url }, #[error("request to {url} returned error")] Status { url: Url, source: reqwest::Error }, - #[error("failed to read response from {url}")] - Read { url: Url, source: reqwest::Error }, #[error("failed to deserialize response body from {url}")] Deserialize { url: Url, source: reqwest::Error }, } diff --git a/src/paths/component.rs b/src/paths/component.rs new file mode 100644 index 0000000..7457a7f --- /dev/null +++ b/src/paths/component.rs @@ -0,0 +1,134 @@ +use derive_more::{AsRef, Deref, Display}; +use serde::{ + de::{Deserializer, Unexpected, Visitor}, + ser::Serializer, + Deserialize, Serialize, +}; +use std::fmt; +use thiserror::Error; + +/// A nonempty path component that does not contain a forward slash or NUL nor +/// equals `.` or `..` +#[derive(AsRef, Clone, Deref, Display, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[as_ref(forward)] +#[deref(forward)] +pub(crate) struct Component(pub(super) String); + +impl fmt::Debug for Component { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?}", self.0) + } +} + +impl PartialEq for Component { + fn eq(&self, other: &str) -> bool { + self.0 == other + } +} + +impl<'a> PartialEq<&'a str> for Component { + fn eq(&self, other: &&'a str) -> bool { + &self.0 == other + } +} + +impl std::str::FromStr for Component { + type Err = ParseComponentError; + + fn from_str(s: &str) -> Result { + if s.is_empty() { + Err(ParseComponentError::Empty) + } else if s.contains('/') { + Err(ParseComponentError::Slash) + } else if s.contains('\0') { + Err(ParseComponentError::Nul) + } else if s == "." || s == ".." { + Err(ParseComponentError::SpecialDir) + } else { + Ok(Component(s.into())) + } + } +} + +#[derive(Clone, Copy, Debug, Eq, Error, PartialEq)] +pub(crate) enum ParseComponentError { + #[error("path components cannot be empty")] + Empty, + #[error("path components cannot contain a forward slash")] + Slash, + #[error("path components cannot contain NUL")] + Nul, + #[error(r#"path components cannot equal "." or "..""#)] + SpecialDir, +} + +impl Serialize for Component { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(self.as_ref()) + } +} + +impl<'de> Deserialize<'de> for Component { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct ComponentVisitor; + + impl Visitor<'_> for ComponentVisitor { + type Value = Component; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("a plain path component") + } + + fn visit_str(self, input: &str) -> Result + where + E: serde::de::Error, + { + input + .parse::() + .map_err(|_| E::invalid_value(Unexpected::Str(input), &self)) + } + } + + deserializer.deserialize_str(ComponentVisitor) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use assert_matches::assert_matches; + use rstest::rstest; + + #[rstest] + #[case("foo")] + #[case("foo.nwb")] + #[case("foo bar")] + fn test_good(#[case] s: &str) { + let r = s.parse::(); + assert_matches!(r, Ok(c) => { + assert_eq!(c, s); + }); + } + + #[rstest] + #[case("")] + #[case(".")] + #[case("..")] + #[case("/")] + #[case("\0")] + #[case("foo/bar.nwb")] + #[case("foo\0bar.nwb")] + #[case("/foo")] + #[case("foo/")] + #[case("/foo/")] + fn test_bad(#[case] s: &str) { + let r = s.parse::(); + assert_matches!(r, Err(_)); + } +} diff --git a/src/paths/dirpath.rs b/src/paths/dirpath.rs index 315c1cc..c56c908 100644 --- a/src/paths/dirpath.rs +++ b/src/paths/dirpath.rs @@ -1,4 +1,4 @@ -use super::PurePath; +use super::{Component, PurePath}; use derive_more::{AsRef, Deref, Display}; use std::fmt; use thiserror::Error; @@ -37,6 +37,10 @@ impl PureDirPath { PureDirPath(format!("{self}{path}")) } + pub(crate) fn join_one_dir(&self, c: &Component) -> PureDirPath { + PureDirPath(format!("{self}{c}/")) + } + pub(crate) fn relative_to(&self, dirpath: &PureDirPath) -> Option { let s = self.0.strip_prefix(&dirpath.0)?; (!s.is_empty()).then(|| PureDirPath(s.to_owned())) @@ -87,6 +91,14 @@ impl std::str::FromStr for PureDirPath { } } +impl From for PureDirPath { + fn from(value: Component) -> PureDirPath { + let mut s = value.0; + s.push('/'); + PureDirPath(s) + } +} + #[derive(Clone, Copy, Debug, Eq, Error, PartialEq)] pub(crate) enum ParsePureDirPathError { #[error("path does not end with a forward slash")] diff --git a/src/paths/mod.rs b/src/paths/mod.rs index 1347ec5..2e5f24d 100644 --- a/src/paths/mod.rs +++ b/src/paths/mod.rs @@ -1,4 +1,6 @@ +mod component; mod dirpath; mod purepath; +pub(crate) use self::component::*; pub(crate) use self::dirpath::*; pub(crate) use self::purepath::*; diff --git a/src/paths/purepath.rs b/src/paths/purepath.rs index 85c70a0..e8b5bf7 100644 --- a/src/paths/purepath.rs +++ b/src/paths/purepath.rs @@ -1,4 +1,4 @@ -use super::PureDirPath; +use super::{Component, PureDirPath}; use crate::consts::ZARR_EXTENSIONS; use derive_more::{AsRef, Deref, Display}; use serde::{ @@ -96,6 +96,12 @@ impl std::str::FromStr for PurePath { } } +impl From for PurePath { + fn from(value: Component) -> PurePath { + PurePath(value.0) + } +} + #[derive(Clone, Copy, Debug, Eq, Error, PartialEq)] pub(crate) enum ParsePurePathError { #[error("paths cannot be empty")] diff --git a/src/zarrman/index.rs b/src/zarrman/index.rs deleted file mode 100644 index 38dee1f..0000000 --- a/src/zarrman/index.rs +++ /dev/null @@ -1,21 +0,0 @@ -use crate::paths::PurePath; -use thiserror::Error; - -pub(super) fn parse_apache_index(_s: &str) -> Result, ParseIndexError> { - todo!() -} - -#[derive(Clone, Debug, Eq, Error, PartialEq)] -#[error("TODO")] -pub(crate) struct ParseIndexError; - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parse_apache_index() { - let s = include_str!("testdata/016.html"); - assert_eq!(parse_apache_index(s).unwrap(), ["5a2", "fc8", "foo bar"]); - } -} diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 0ec39c2..4c18e06 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -1,11 +1,10 @@ -mod index; mod manifest; mod resources; -use self::index::*; pub(crate) use self::resources::*; -use crate::httputil::{get_text, new_client, urljoin_slashed, BuildClientError, HttpError}; -use crate::paths::{PureDirPath, PurePath}; +use crate::httputil::{get_json, new_client, urljoin_slashed, BuildClientError, HttpError}; +use crate::paths::{Component, PureDirPath, PurePath}; use moka::future::{Cache, CacheBuilder}; +use serde::Deserialize; use std::sync::Arc; use thiserror::Error; use url::Url; @@ -36,31 +35,39 @@ impl ZarrManClient { } pub(crate) async fn get_top_level_dirs(&self) -> Result, ZarrManError> { - let entries = self.get_index_entries(None).await?; - Ok(entries - .into_iter() - .map(|path| { - ZarrManResource::WebFolder(WebFolder { - web_path: path.to_dir_path(), - }) - }) - .collect()) + self.get_index_entries(None).await } async fn get_index_entries( &self, path: Option<&PureDirPath>, - ) -> Result, ZarrManError> { + ) -> Result, ZarrManError> { let mut url = Url::parse(MANIFEST_ROOT_URL).expect("MANIFEST_ROOT_URL should be a valid URL"); if let Some(p) = path { url = urljoin_slashed(&url, p.components()); } - let txt = get_text(&self.inner, url.clone()).await?; - parse_apache_index(&txt).map_err(|source| ZarrManError::ParseIndex { - url: url.clone(), - source, - }) + let index = get_json::(&self.inner, url).await?; + let mut entries = + Vec::with_capacity(index.files.len().saturating_add(index.directories.len())); + for f in index.files { + if f.ends_with(".json") && !f.ends_with(".versionid.json") { + let web_path = match path { + Some(p) => p.join_one_dir(&f), + None => PureDirPath::from(f), + }; + entries.push(ZarrManResource::Manifest(Manifest { web_path })); + } + // else: Ignore + } + for d in index.directories { + let web_path = match path { + Some(p) => p.join_one_dir(&d), + None => PureDirPath::from(d), + }; + entries.push(ZarrManResource::WebFolder(WebFolder { web_path })); + } + Ok(entries) } #[allow(clippy::unused_async)] @@ -86,8 +93,6 @@ impl ZarrManClient { pub(crate) enum ZarrManError { #[error(transparent)] Http(#[from] HttpError), - #[error("failed to parse Apache index at {url}")] - ParseIndex { url: Url, source: ParseIndexError }, #[error("invalid path requested: {path:?}")] InvalidPath { path: PurePath }, } @@ -100,3 +105,10 @@ impl ZarrManError { ) } } + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +struct Index { + //path: String, + files: Vec, + directories: Vec, +} diff --git a/src/zarrman/testdata/016.html b/src/zarrman/testdata/016.html deleted file mode 100644 index 8a7d6fa..0000000 --- a/src/zarrman/testdata/016.html +++ /dev/null @@ -1,84 +0,0 @@ - - - - Index of /dandi/zarr-manifests/zarr-manifests-v2-sorted/016 - - -

Index of /dandi/zarr-manifests/zarr-manifests-v2-sorted/016

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- [ICO] - - Name - - Last modified - - Size - - Description -
-
-
- [PARENTDIR] - - Parent Directory -   -  
- [DIR] - - 5a2/ - 2024-02-01 15:30 -  
- [DIR] - - fc8/ - 2024-02-01 15:30 -  
- [   ] - - foo bar - 2024-02-01 15:30 1B 
-
-
-
Apache/2.4.57 (Debian) Server at datasets.datalad.org Port 443
- - From c87fee6da90079a4b301c4d71d6f4b03db6e11de Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Sat, 3 Feb 2024 10:16:18 -0500 Subject: [PATCH 12/17] Assorted ZarrMan stuff --- src/dav/types.rs | 4 +- src/httputil.rs | 109 ++++++++++++++++++++-------- src/paths/component.rs | 7 ++ src/paths/dirpath.rs | 4 ++ src/zarrman/manifest.rs | 8 +-- src/zarrman/mod.rs | 149 +++++++++++++++++++++++++++++++++------ src/zarrman/path.rs | 79 +++++++++++++++++++++ src/zarrman/resources.rs | 60 +++++++++++++--- 8 files changed, 355 insertions(+), 65 deletions(-) create mode 100644 src/zarrman/path.rs diff --git a/src/dav/types.rs b/src/dav/types.rs index d22107b..c4e8de2 100644 --- a/src/dav/types.rs +++ b/src/dav/types.rs @@ -394,9 +394,9 @@ impl From for DavCollection { } impl From for DavCollection { - fn from(Manifest { web_path }: Manifest) -> DavCollection { + fn from(Manifest { path }: Manifest) -> DavCollection { DavCollection { - path: Some(web_path), + path: Some(path.to_web_path()), created: None, modified: None, size: None, diff --git a/src/httputil.rs b/src/httputil.rs index 87caa68..f2c549a 100644 --- a/src/httputil.rs +++ b/src/httputil.rs @@ -56,6 +56,19 @@ pub(crate) enum HttpError { Deserialize { url: Url, source: reqwest::Error }, } +pub(crate) fn urljoin(url: &Url, segments: I) -> Url +where + I: IntoIterator, + I::Item: AsRef, +{ + let mut url = url.clone(); + url.path_segments_mut() + .expect("URL should be able to be a base") + .pop_if_empty() + .extend(segments); + url +} + pub(crate) fn urljoin_slashed(url: &Url, segments: I) -> Url where I: IntoIterator, @@ -74,36 +87,76 @@ where #[cfg(test)] mod tests { use super::*; - use rstest::rstest; - - #[rstest] - #[case("https://api.github.com")] - #[case("https://api.github.com/")] - fn test_urljoin_slashed_nopath(#[case] base: Url) { - let u = urljoin_slashed(&base, ["foo"]); - assert_eq!(u.as_str(), "https://api.github.com/foo/"); - let u = urljoin_slashed(&base, ["foo", "bar"]); - assert_eq!(u.as_str(), "https://api.github.com/foo/bar/"); - } - #[rstest] - #[case("https://api.github.com/foo/bar")] - #[case("https://api.github.com/foo/bar/")] - fn test_urljoin_slashed_path(#[case] base: Url) { - let u = urljoin_slashed(&base, ["gnusto"]); - assert_eq!(u.as_str(), "https://api.github.com/foo/bar/gnusto/"); - let u = urljoin_slashed(&base, ["gnusto", "cleesh"]); - assert_eq!(u.as_str(), "https://api.github.com/foo/bar/gnusto/cleesh/"); + mod urljoin { + use super::*; + use rstest::rstest; + + #[rstest] + #[case("https://api.github.com")] + #[case("https://api.github.com/")] + fn nopath(#[case] base: Url) { + let u = urljoin(&base, ["foo"]); + assert_eq!(u.as_str(), "https://api.github.com/foo"); + let u = urljoin(&base, ["foo", "bar"]); + assert_eq!(u.as_str(), "https://api.github.com/foo/bar"); + } + + #[rstest] + #[case("https://api.github.com/foo/bar")] + #[case("https://api.github.com/foo/bar/")] + fn path(#[case] base: Url) { + let u = urljoin(&base, ["gnusto"]); + assert_eq!(u.as_str(), "https://api.github.com/foo/bar/gnusto"); + let u = urljoin(&base, ["gnusto", "cleesh"]); + assert_eq!(u.as_str(), "https://api.github.com/foo/bar/gnusto/cleesh"); + } + + #[rstest] + #[case("foo#bar", "https://api.github.com/base/foo%23bar")] + #[case("foo%bar", "https://api.github.com/base/foo%25bar")] + #[case("foo/bar", "https://api.github.com/base/foo%2Fbar")] + #[case("foo?bar", "https://api.github.com/base/foo%3Fbar")] + fn special_chars(#[case] path: &str, #[case] expected: &str) { + let base = Url::parse("https://api.github.com/base").unwrap(); + let u = urljoin(&base, [path]); + assert_eq!(u.as_str(), expected); + } } - #[rstest] - #[case("foo#bar", "https://api.github.com/base/foo%23bar/")] - #[case("foo%bar", "https://api.github.com/base/foo%25bar/")] - #[case("foo/bar", "https://api.github.com/base/foo%2Fbar/")] - #[case("foo?bar", "https://api.github.com/base/foo%3Fbar/")] - fn test_urljoin_slashed_special_chars(#[case] path: &str, #[case] expected: &str) { - let base = Url::parse("https://api.github.com/base").unwrap(); - let u = urljoin_slashed(&base, [path]); - assert_eq!(u.as_str(), expected); + mod urljoin_slashed { + use super::*; + use rstest::rstest; + + #[rstest] + #[case("https://api.github.com")] + #[case("https://api.github.com/")] + fn nopath(#[case] base: Url) { + let u = urljoin_slashed(&base, ["foo"]); + assert_eq!(u.as_str(), "https://api.github.com/foo/"); + let u = urljoin_slashed(&base, ["foo", "bar"]); + assert_eq!(u.as_str(), "https://api.github.com/foo/bar/"); + } + + #[rstest] + #[case("https://api.github.com/foo/bar")] + #[case("https://api.github.com/foo/bar/")] + fn path(#[case] base: Url) { + let u = urljoin_slashed(&base, ["gnusto"]); + assert_eq!(u.as_str(), "https://api.github.com/foo/bar/gnusto/"); + let u = urljoin_slashed(&base, ["gnusto", "cleesh"]); + assert_eq!(u.as_str(), "https://api.github.com/foo/bar/gnusto/cleesh/"); + } + + #[rstest] + #[case("foo#bar", "https://api.github.com/base/foo%23bar/")] + #[case("foo%bar", "https://api.github.com/base/foo%25bar/")] + #[case("foo/bar", "https://api.github.com/base/foo%2Fbar/")] + #[case("foo?bar", "https://api.github.com/base/foo%3Fbar/")] + fn special_chars(#[case] path: &str, #[case] expected: &str) { + let base = Url::parse("https://api.github.com/base").unwrap(); + let u = urljoin_slashed(&base, [path]); + assert_eq!(u.as_str(), expected); + } } } diff --git a/src/paths/component.rs b/src/paths/component.rs index 7457a7f..cd0fc5e 100644 --- a/src/paths/component.rs +++ b/src/paths/component.rs @@ -14,6 +14,13 @@ use thiserror::Error; #[deref(forward)] pub(crate) struct Component(pub(super) String); +impl Component { + pub(crate) fn strip_suffix(&self, suffix: &str) -> Option { + let s = self.0.strip_suffix(suffix)?; + (!s.is_empty()).then(|| Component(s.into())) + } +} + impl fmt::Debug for Component { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{:?}", self.0) diff --git a/src/paths/dirpath.rs b/src/paths/dirpath.rs index c56c908..96bd3ed 100644 --- a/src/paths/dirpath.rs +++ b/src/paths/dirpath.rs @@ -24,6 +24,10 @@ impl PureDirPath { .expect("path should be nonempty") } + pub(crate) fn name_component(&self) -> Component { + Component(self.name().to_owned()) + } + pub(crate) fn parent(&self) -> Option { let i = self.0.trim_end_matches('/').rfind('/')?; Some(PureDirPath(self.0[..=i].to_owned())) diff --git a/src/zarrman/manifest.rs b/src/zarrman/manifest.rs index 1c7db76..4e88fb0 100644 --- a/src/zarrman/manifest.rs +++ b/src/zarrman/manifest.rs @@ -44,11 +44,11 @@ pub(crate) enum FolderEntry { pub(crate) struct ManifestEntry { // Keep these fields in this order so that deserialization will work // properly! - version_id: String, + pub(super) version_id: String, #[serde(with = "time::serde::rfc3339")] - modified: OffsetDateTime, - size: i64, - etag: String, + pub(super) modified: OffsetDateTime, + pub(super) size: i64, + pub(super) etag: String, } #[cfg(test)] diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 4c18e06..814bc68 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -1,7 +1,11 @@ mod manifest; +mod path; mod resources; +use self::path::ReqPath; pub(crate) use self::resources::*; -use crate::httputil::{get_json, new_client, urljoin_slashed, BuildClientError, HttpError}; +use crate::httputil::{ + get_json, new_client, urljoin, urljoin_slashed, BuildClientError, HttpError, +}; use crate::paths::{Component, PureDirPath, PurePath}; use moka::future::{Cache, CacheBuilder}; use serde::Deserialize; @@ -19,8 +23,10 @@ const MANIFEST_CACHE_SIZE: u64 = 16; #[derive(Clone, Debug)] pub(crate) struct ZarrManClient { inner: reqwest::Client, - //TODO: manifests: Arc>>, - manifests: Arc>>, + manifests: Arc>>, + manifest_root_url: Url, + s3_download_prefix: Url, + web_path_prefix: PureDirPath, } impl ZarrManClient { @@ -31,7 +37,20 @@ impl ZarrManClient { .name("zarr-manifests") .build(), ); - Ok(ZarrManClient { inner, manifests }) + let manifest_root_url = + Url::parse(MANIFEST_ROOT_URL).expect("MANIFEST_ROOT_URL should be a valid URL"); + let s3_download_prefix = + Url::parse(S3_DOWNLOAD_PREFIX).expect("S3_DOWNLOAD_PREFIX should be a valid URL"); + let web_path_prefix = "zarrs/" + .parse::() + .expect(r#""zarrs/" should be a valid URL"#); + Ok(ZarrManClient { + inner, + manifests, + manifest_root_url, + s3_download_prefix, + web_path_prefix, + }) } pub(crate) async fn get_top_level_dirs(&self) -> Result, ZarrManError> { @@ -42,29 +61,42 @@ impl ZarrManClient { &self, path: Option<&PureDirPath>, ) -> Result, ZarrManError> { - let mut url = - Url::parse(MANIFEST_ROOT_URL).expect("MANIFEST_ROOT_URL should be a valid URL"); - if let Some(p) = path { - url = urljoin_slashed(&url, p.components()); - } + let url = match path { + Some(p) => urljoin_slashed(&self.manifest_root_url, p.components()), + None => self.manifest_root_url.clone(), + }; let index = get_json::(&self.inner, url).await?; let mut entries = Vec::with_capacity(index.files.len().saturating_add(index.directories.len())); - for f in index.files { - if f.ends_with(".json") && !f.ends_with(".versionid.json") { - let web_path = match path { - Some(p) => p.join_one_dir(&f), - None => PureDirPath::from(f), - }; - entries.push(ZarrManResource::Manifest(Manifest { web_path })); + if let Some(path) = path { + if let Some(prefix) = path.parent() { + for f in index.files { + // This calls Component::strip_suffix(), so `checksum` is + // guaranteed to be non-empty. + let Some(checksum) = f.strip_suffix(".json") else { + // Ignore + continue; + }; + if !checksum.contains('.') { + entries.push(ZarrManResource::Manifest(Manifest { + path: ManifestPath { + prefix: prefix.clone(), + zarr_id: path.name_component(), + checksum, + }, + })); + } + // else: Ignore + } } - // else: Ignore } + // else: Ignore + let web_path_prefix = match path { + Some(p) => self.web_path_prefix.join_dir(p), + None => self.web_path_prefix.clone(), + }; for d in index.directories { - let web_path = match path { - Some(p) => p.join_one_dir(&d), - None => PureDirPath::from(d), - }; + let web_path = web_path_prefix.join_one_dir(&d); entries.push(ZarrManResource::WebFolder(WebFolder { web_path })); } Ok(entries) @@ -72,11 +104,53 @@ impl ZarrManClient { #[allow(clippy::unused_async)] #[allow(unused_variables)] + async fn get_zarr_manifest( + &self, + path: &ManifestPath, + ) -> Result { + todo!() + } + pub(crate) async fn get_resource( &self, path: &PurePath, ) -> Result { - todo!() + let Some(rp) = ReqPath::parse_path(path) else { + return Err(ZarrManError::InvalidPath { path: path.clone() }); + }; + match rp { + ReqPath::Dir(p) => { + // Make a request to confirm that directory exists + let _ = self.get_index_entries(Some(&p)).await?; + Ok(ZarrManResource::WebFolder(WebFolder { web_path: p })) + } + ReqPath::Manifest(path) => { + // Make a request to confirm that manifest exists + let _ = self.get_zarr_manifest(&path).await?; + Ok(ZarrManResource::Manifest(Manifest { path })) + } + ReqPath::InManifest { + manifest_path, + entry_path, + } => { + let man = self.get_zarr_manifest(&manifest_path).await?; + match man.get(&entry_path) { + Some(manifest::EntryRef::Folder(_)) => { + let web_path = manifest_path + .to_web_path() + .join_dir(&entry_path.to_dir_path()); + Ok(ZarrManResource::ManFolder(ManifestFolder { web_path })) + } + Some(manifest::EntryRef::Entry(entry)) => Ok(ZarrManResource::ManEntry( + self.convert_manifest_entry(&manifest_path, &entry_path, entry), + )), + None => Err(ZarrManError::ManifestPathNotFound { + manifest_path, + entry_path, + }), + } + } + } } #[allow(clippy::unused_async)] @@ -87,6 +161,28 @@ impl ZarrManClient { ) -> Result { todo!() } + + fn convert_manifest_entry( + &self, + manifest_path: &ManifestPath, + entry_path: &PurePath, + entry: &manifest::ManifestEntry, + ) -> ManifestEntry { + let web_path = manifest_path.to_web_path().join(entry_path); + let mut url = urljoin( + &self.s3_download_prefix, + std::iter::once(manifest_path.zarr_id()).chain(entry_path.components()), + ); + url.query_pairs_mut() + .append_pair("versionId", &entry.version_id); + ManifestEntry { + web_path, + size: entry.size, + modified: entry.modified, + etag: entry.etag.clone(), + url, + } + } } #[derive(Debug, Error)] @@ -95,13 +191,20 @@ pub(crate) enum ZarrManError { Http(#[from] HttpError), #[error("invalid path requested: {path:?}")] InvalidPath { path: PurePath }, + #[error("path {entry_path:?} inside manifest at {manifest_path} does not exist")] + ManifestPathNotFound { + manifest_path: ManifestPath, + entry_path: PurePath, + }, } impl ZarrManError { pub(crate) fn is_404(&self) -> bool { matches!( self, - ZarrManError::Http(HttpError::NotFound { .. }) | ZarrManError::InvalidPath { .. } + ZarrManError::Http(HttpError::NotFound { .. }) + | ZarrManError::InvalidPath { .. } + | ZarrManError::ManifestPathNotFound { .. } ) } } diff --git a/src/zarrman/path.rs b/src/zarrman/path.rs new file mode 100644 index 0000000..017fcc5 --- /dev/null +++ b/src/zarrman/path.rs @@ -0,0 +1,79 @@ +use super::resources::ManifestPath; +use crate::paths::{PureDirPath, PurePath}; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(super) enum ReqPath { + Dir(PureDirPath), + Manifest(ManifestPath), + InManifest { + manifest_path: ManifestPath, + entry_path: PurePath, + }, +} + +impl ReqPath { + pub(super) fn parse_path(_path: &PurePath) -> Option { + todo!() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use assert_matches::assert_matches; + use rstest::rstest; + + #[rstest] + #[case("128", "128/")] + #[case("128/4a1", "128/4a1/")] + #[case( + "128/4a1/1284a14f-fe4f-4dc3-b10d-48e5db8bf18d", + "128/4a1/1284a14f-fe4f-4dc3-b10d-48e5db8bf18d/" + )] + fn test_parse_dir(#[case] inpath: PurePath, #[case] outpath: PureDirPath) { + assert_matches!(ReqPath::parse_path(&inpath), Some(ReqPath::Dir(p)) => { + assert_eq!(p, outpath); + }); + } + + #[test] + fn test_parse_manifest() { + let path = "128/4a1/1284a14f-fe4f-4dc3-b10d-48e5db8bf18d/6ddc4625befef8d6f9796835648162be-509--710206390".parse::().unwrap(); + assert_matches!( + ReqPath::parse_path(&path), + Some(ReqPath::Manifest(ManifestPath {prefix, zarr_id, checksum})) => { + assert_eq!(prefix, "128/4a1/"); + assert_eq!(zarr_id, "1284a14f-fe4f-4dc3-b10d-48e5db8bf18d"); + assert_eq!(checksum, "6ddc4625befef8d6f9796835648162be-509--710206390"); + } + ); + } + + #[rstest] + #[case(".zarray")] + #[case("0")] + #[case("0/0/0")] + fn test_parse_in_manifest(#[case] part2: PurePath) { + let part1 = "128/4a1/1284a14f-fe4f-4dc3-b10d-48e5db8bf18d/6ddc4625befef8d6f9796835648162be-509--710206390".parse::().unwrap(); + let path = part1.to_dir_path().join(&part2); + assert_matches!( + ReqPath::parse_path(&path), + Some(ReqPath::InManifest { + manifest_path: ManifestPath { prefix, zarr_id, checksum }, + entry_path + }) => { + assert_eq!(prefix, "128/4a1/"); + assert_eq!(zarr_id, "1284a14f-fe4f-4dc3-b10d-48e5db8bf18d"); + assert_eq!(checksum, "6ddc4625befef8d6f9796835648162be-509--710206390"); + assert_eq!(entry_path, part2); + } + ); + } + + #[rstest] + #[case("128/4a1/1284a14f-fe4f-4dc3-b10d-48e5db8bf18d/6ddc4625befef8d6f9796835648162be-509--710206390.versionid")] + #[case("128/4a1/1284a14f-fe4f-4dc3-b10d-48e5db8bf18d/6ddc4625befef8d6f9796835648162be-509--710206390.versionid/0")] + fn test_reject_versionid(#[case] path: PurePath) { + assert_eq!(ReqPath::parse_path(&path), None); + } +} diff --git a/src/zarrman/resources.rs b/src/zarrman/resources.rs index bae8fd3..1a24768 100644 --- a/src/zarrman/resources.rs +++ b/src/zarrman/resources.rs @@ -1,5 +1,9 @@ -use crate::paths::{PureDirPath, PurePath}; +use crate::httputil::urljoin; +use crate::paths::{Component, PureDirPath, PurePath}; +use std::borrow::Cow; +use std::fmt; use time::OffsetDateTime; +use url::Url; #[derive(Clone, Debug, Eq, PartialEq)] pub(crate) enum ZarrManResource { @@ -16,7 +20,52 @@ pub(crate) struct WebFolder { #[derive(Clone, Debug, Eq, PartialEq)] pub(crate) struct Manifest { - pub(crate) web_path: PureDirPath, + pub(crate) path: ManifestPath, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct ManifestPath { + pub(super) prefix: PureDirPath, + pub(super) zarr_id: Component, + pub(super) checksum: Component, +} + +impl ManifestPath { + pub(super) fn zarr_id(&self) -> &str { + self.zarr_id.as_ref() + } + + pub(crate) fn to_web_path(&self) -> PureDirPath { + format!("zarrs/{}/{}/{}/", self.prefix, self.zarr_id, self.checksum) + .parse::() + .expect("ManifestPath should have valid web_path") + } + + pub(crate) fn urljoin(&self, url: &Url) -> Url { + urljoin( + url, + self.prefix + .components() + .map(Cow::from) + .chain(std::iter::once(Cow::from(&*self.zarr_id))) + .chain(std::iter::once(Cow::from(format!( + "{}.json", + self.checksum + )))), + ) + } +} + +impl fmt::Display for ManifestPath { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + r#""{}/{}/{}/""#, + self.prefix.escape_debug(), + self.zarr_id.escape_debug(), + self.checksum.escape_debug() + ) + } } #[derive(Clone, Debug, Eq, PartialEq)] @@ -38,19 +87,14 @@ pub(crate) enum ZarrManResourceWithChildren { #[derive(Clone, Debug, Eq, PartialEq)] pub(crate) struct ManifestFolder { - //pub(crate) manifest_path: PurePath, - //pub(crate) path: PureDirPath, pub(crate) web_path: PureDirPath, } #[derive(Clone, Debug, Eq, PartialEq)] pub(crate) struct ManifestEntry { - //pub(crate) manifest_path: PurePath, - //pub(crate) path: PurePath, pub(crate) web_path: PurePath, - //pub(crate) version_id: String, pub(crate) size: i64, pub(crate) modified: OffsetDateTime, pub(crate) etag: String, - pub(crate) url: url::Url, + pub(crate) url: Url, } From b755e9a41c0b8463034b6451b20c122ee0ca8a2f Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 5 Feb 2024 13:10:57 -0500 Subject: [PATCH 13/17] get_zarr_manifest() --- src/dandi/mod.rs | 10 ++++------ src/zarrman/mod.rs | 38 ++++++++++++++++++++++++-------------- src/zarrman/resources.rs | 2 +- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/src/dandi/mod.rs b/src/dandi/mod.rs index 492b0ae..7c8e6a7 100644 --- a/src/dandi/mod.rs +++ b/src/dandi/mod.rs @@ -23,17 +23,15 @@ use url::Url; pub(crate) struct DandiClient { inner: reqwest::Client, api_url: Url, - s3clients: Arc>>, + s3clients: Cache>, } impl DandiClient { pub(crate) fn new(api_url: Url) -> Result { let inner = new_client()?; - let s3clients = Arc::new( - CacheBuilder::new(S3CLIENT_CACHE_SIZE) - .name("s3clients") - .build(), - ); + let s3clients = CacheBuilder::new(S3CLIENT_CACHE_SIZE) + .name("s3clients") + .build(); Ok(DandiClient { inner, api_url, diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 814bc68..34af8ce 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -23,7 +23,7 @@ const MANIFEST_CACHE_SIZE: u64 = 16; #[derive(Clone, Debug)] pub(crate) struct ZarrManClient { inner: reqwest::Client, - manifests: Arc>>, + manifests: Cache>, manifest_root_url: Url, s3_download_prefix: Url, web_path_prefix: PureDirPath, @@ -32,11 +32,9 @@ pub(crate) struct ZarrManClient { impl ZarrManClient { pub(crate) fn new() -> Result { let inner = new_client()?; - let manifests = Arc::new( - CacheBuilder::new(MANIFEST_CACHE_SIZE) - .name("zarr-manifests") - .build(), - ); + let manifests = CacheBuilder::new(MANIFEST_CACHE_SIZE) + .name("zarr-manifests") + .build(); let manifest_root_url = Url::parse(MANIFEST_ROOT_URL).expect("MANIFEST_ROOT_URL should be a valid URL"); let s3_download_prefix = @@ -102,13 +100,18 @@ impl ZarrManClient { Ok(entries) } - #[allow(clippy::unused_async)] - #[allow(unused_variables)] async fn get_zarr_manifest( &self, path: &ManifestPath, - ) -> Result { - todo!() + ) -> Result, ZarrManError> { + self.manifests + .try_get_with_by_ref(path, async move { + get_json::(&self.inner, path.urljoin(&self.manifest_root_url)) + .await + .map(Arc::new) + }) + .await + .map_err(Into::into) } pub(crate) async fn get_resource( @@ -188,7 +191,7 @@ impl ZarrManClient { #[derive(Debug, Error)] pub(crate) enum ZarrManError { #[error(transparent)] - Http(#[from] HttpError), + Http(#[from] Arc), #[error("invalid path requested: {path:?}")] InvalidPath { path: PurePath }, #[error("path {entry_path:?} inside manifest at {manifest_path} does not exist")] @@ -202,13 +205,20 @@ impl ZarrManError { pub(crate) fn is_404(&self) -> bool { matches!( self, - ZarrManError::Http(HttpError::NotFound { .. }) - | ZarrManError::InvalidPath { .. } - | ZarrManError::ManifestPathNotFound { .. } + ZarrManError::Http(e) if matches!(**e, HttpError::NotFound {..}) + ) || matches!( + self, + ZarrManError::InvalidPath { .. } | ZarrManError::ManifestPathNotFound { .. } ) } } +impl From for ZarrManError { + fn from(e: HttpError) -> ZarrManError { + Arc::new(e).into() + } +} + #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] struct Index { //path: String, diff --git a/src/zarrman/resources.rs b/src/zarrman/resources.rs index 1a24768..732eac0 100644 --- a/src/zarrman/resources.rs +++ b/src/zarrman/resources.rs @@ -23,7 +23,7 @@ pub(crate) struct Manifest { pub(crate) path: ManifestPath, } -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug, Eq, Hash, PartialEq)] pub(crate) struct ManifestPath { pub(super) prefix: PureDirPath, pub(super) zarr_id: Component, From 0a377c4f131b8b12b2175dcc19bbc3ba8c31c7a1 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 6 Feb 2024 11:05:04 -0500 Subject: [PATCH 14/17] ZarrManClient::get_resource_with_children() --- src/dav/types.rs | 4 +- src/paths/dirpath.rs | 12 +++-- src/paths/purepath.rs | 14 ++++-- src/zarrman/manifest.rs | 44 +++++++++++-------- src/zarrman/mod.rs | 94 +++++++++++++++++++++++++++++++++++++--- src/zarrman/resources.rs | 2 +- 6 files changed, 134 insertions(+), 36 deletions(-) diff --git a/src/dav/types.rs b/src/dav/types.rs index c4e8de2..4f2bfd6 100644 --- a/src/dav/types.rs +++ b/src/dav/types.rs @@ -201,7 +201,7 @@ pub(super) struct DavCollection { impl DavCollection { pub(super) fn name(&self) -> Option<&str> { - self.path.as_ref().map(PureDirPath::name) + self.path.as_ref().map(PureDirPath::name_str) } pub(super) fn web_link(&self) -> Href { @@ -431,7 +431,7 @@ pub(super) struct DavItem { impl DavItem { pub(super) fn name(&self) -> &str { - self.path.name() + self.path.name_str() } pub(super) fn web_link(&self) -> Href { diff --git a/src/paths/dirpath.rs b/src/paths/dirpath.rs index 96bd3ed..48d5896 100644 --- a/src/paths/dirpath.rs +++ b/src/paths/dirpath.rs @@ -16,7 +16,7 @@ use thiserror::Error; pub(crate) struct PureDirPath(pub(super) String); impl PureDirPath { - pub(crate) fn name(&self) -> &str { + pub(crate) fn name_str(&self) -> &str { self.0 .trim_end_matches('/') .split('/') @@ -24,8 +24,8 @@ impl PureDirPath { .expect("path should be nonempty") } - pub(crate) fn name_component(&self) -> Component { - Component(self.name().to_owned()) + pub(crate) fn name(&self) -> Component { + Component(self.name_str().to_owned()) } pub(crate) fn parent(&self) -> Option { @@ -41,6 +41,10 @@ impl PureDirPath { PureDirPath(format!("{self}{path}")) } + pub(crate) fn join_one(&self, c: &Component) -> PurePath { + PurePath(format!("{self}{c}")) + } + pub(crate) fn join_one_dir(&self, c: &Component) -> PureDirPath { PureDirPath(format!("{self}{c}/")) } @@ -50,7 +54,7 @@ impl PureDirPath { (!s.is_empty()).then(|| PureDirPath(s.to_owned())) } - pub(crate) fn components(&self) -> std::str::Split<'_, char> { + pub(crate) fn component_strs(&self) -> std::str::Split<'_, char> { self.0.split('/') } } diff --git a/src/paths/purepath.rs b/src/paths/purepath.rs index e8b5bf7..f207320 100644 --- a/src/paths/purepath.rs +++ b/src/paths/purepath.rs @@ -22,13 +22,17 @@ use thiserror::Error; pub(crate) struct PurePath(pub(super) String); impl PurePath { - pub(crate) fn name(&self) -> &str { + pub(crate) fn name_str(&self) -> &str { self.0 .split('/') .next_back() .expect("path should be nonempty") } + pub(crate) fn join_one(&self, c: &Component) -> PurePath { + PurePath(format!("{self}/{c}")) + } + pub(crate) fn is_strictly_under(&self, other: &PureDirPath) -> bool { self.0.starts_with(&other.0) } @@ -53,9 +57,13 @@ impl PurePath { PureDirPath(format!("{}/", self.0)) } - pub(crate) fn components(&self) -> std::str::Split<'_, char> { + pub(crate) fn component_strs(&self) -> std::str::Split<'_, char> { self.0.split('/') } + + pub(crate) fn components(&self) -> impl Iterator + '_ { + self.0.split('/').map(|c| Component(c.to_owned())) + } } impl fmt::Debug for PurePath { @@ -200,7 +208,7 @@ mod tests { #[case("foo", "foo")] #[case("foo/bar/baz", "baz")] fn test_name(#[case] p: PurePath, #[case] name: &str) { - assert_eq!(p.name(), name); + assert_eq!(p.name_str(), name); } #[rstest] diff --git a/src/zarrman/manifest.rs b/src/zarrman/manifest.rs index 4e88fb0..64fcfcd 100644 --- a/src/zarrman/manifest.rs +++ b/src/zarrman/manifest.rs @@ -1,19 +1,19 @@ -use crate::paths::PurePath; +use crate::paths::{Component, PurePath}; use itertools::{Itertools, Position}; use serde::Deserialize; use std::collections::BTreeMap; use time::OffsetDateTime; #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] -pub(crate) struct Manifest { - entries: ManifestFolder, +pub(super) struct Manifest { + pub(super) entries: ManifestFolder, } impl Manifest { pub(super) fn get(&self, path: &PurePath) -> Option> { let mut folder = &self.entries; for (pos, p) in path.components().with_position() { - match folder.get(p)? { + match folder.get(&p)? { FolderEntry::Folder(f) => folder = f, FolderEntry::Entry(e) if matches!(pos, Position::Last | Position::Only) => { return Some(EntryRef::Entry(e)) @@ -26,22 +26,22 @@ impl Manifest { } #[derive(Clone, Debug, Eq, PartialEq)] -pub(crate) enum EntryRef<'a> { +pub(super) enum EntryRef<'a> { Folder(&'a ManifestFolder), Entry(&'a ManifestEntry), } -pub(super) type ManifestFolder = BTreeMap; +pub(super) type ManifestFolder = BTreeMap; #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] #[serde(untagged)] -pub(crate) enum FolderEntry { +pub(super) enum FolderEntry { Folder(ManifestFolder), Entry(ManifestEntry), } #[derive(Clone, Debug, Deserialize, Eq, PartialEq)] -pub(crate) struct ManifestEntry { +pub(super) struct ManifestEntry { // Keep these fields in this order so that deserialization will work // properly! pub(super) version_id: String, @@ -116,9 +116,12 @@ mod tests { manifest, Manifest { entries: BTreeMap::from([ - (".zattrs".into(), FolderEntry::Entry(zattrs.clone())), ( - ".zgroup".into(), + ".zattrs".parse().unwrap(), + FolderEntry::Entry(zattrs.clone()) + ), + ( + ".zgroup".parse().unwrap(), FolderEntry::Entry(ManifestEntry { version_id: "7obAY5BUNOdI1Uch3RoI4oHuGXhW4h0R".into(), modified: datetime!(2022-06-27 23:07:47 UTC), @@ -127,7 +130,7 @@ mod tests { }) ), ( - ".zmetadata".into(), + ".zmetadata".parse().unwrap(), FolderEntry::Entry(ManifestEntry { version_id: "Vfe0W0v4zkydmzyXkUMjm2Xr7.rIvfZQ".into(), modified: datetime!(2022-06-27 23:07:47 UTC), @@ -136,24 +139,27 @@ mod tests { }) ), ( - "0".into(), + "0".parse().unwrap(), FolderEntry::Folder(BTreeMap::from([ - (".zarray".into(), FolderEntry::Entry(zarray.clone())), ( - "0".into(), + ".zarray".parse().unwrap(), + FolderEntry::Entry(zarray.clone()) + ), + ( + "0".parse().unwrap(), FolderEntry::Folder(BTreeMap::from([( - "0".into(), + "0".parse().unwrap(), FolderEntry::Folder(BTreeMap::from([( - "13".into(), + "13".parse().unwrap(), FolderEntry::Folder(BTreeMap::from([( - "8".into(), + "8".parse().unwrap(), FolderEntry::Folder(BTreeMap::from([ ( - "100".into(), + "100".parse().unwrap(), FolderEntry::Entry(entry_100.clone()) ), ( - "101".into(), + "101".parse().unwrap(), FolderEntry::Entry(ManifestEntry { version_id: "_i9cZBerb4mB9D8IFbPHo8nrefWcbq0p" diff --git a/src/zarrman/mod.rs b/src/zarrman/mod.rs index 34af8ce..26fef36 100644 --- a/src/zarrman/mod.rs +++ b/src/zarrman/mod.rs @@ -60,7 +60,7 @@ impl ZarrManClient { path: Option<&PureDirPath>, ) -> Result, ZarrManError> { let url = match path { - Some(p) => urljoin_slashed(&self.manifest_root_url, p.components()), + Some(p) => urljoin_slashed(&self.manifest_root_url, p.component_strs()), None => self.manifest_root_url.clone(), }; let index = get_json::(&self.inner, url).await?; @@ -79,7 +79,7 @@ impl ZarrManClient { entries.push(ZarrManResource::Manifest(Manifest { path: ManifestPath { prefix: prefix.clone(), - zarr_id: path.name_component(), + zarr_id: path.name(), checksum, }, })); @@ -125,7 +125,9 @@ impl ZarrManClient { ReqPath::Dir(p) => { // Make a request to confirm that directory exists let _ = self.get_index_entries(Some(&p)).await?; - Ok(ZarrManResource::WebFolder(WebFolder { web_path: p })) + Ok(ZarrManResource::WebFolder(WebFolder { + web_path: self.web_path_prefix.join_dir(&p), + })) } ReqPath::Manifest(path) => { // Make a request to confirm that manifest exists @@ -156,13 +158,57 @@ impl ZarrManClient { } } - #[allow(clippy::unused_async)] - #[allow(unused_variables)] pub(crate) async fn get_resource_with_children( &self, path: &PurePath, ) -> Result { - todo!() + let Some(rp) = ReqPath::parse_path(path) else { + return Err(ZarrManError::InvalidPath { path: path.clone() }); + }; + match rp { + ReqPath::Dir(p) => { + let children = self.get_index_entries(Some(&p)).await?; + let folder = WebFolder { + web_path: self.web_path_prefix.join_dir(&p), + }; + Ok(ZarrManResourceWithChildren::WebFolder { folder, children }) + } + ReqPath::Manifest(path) => { + let man = self.get_zarr_manifest(&path).await?; + let children = self.convert_manifest_folder_children(&path, None, &man.entries); + let folder = Manifest { path }; + Ok(ZarrManResourceWithChildren::Manifest { folder, children }) + } + ReqPath::InManifest { + manifest_path, + entry_path, + } => { + let man = self.get_zarr_manifest(&manifest_path).await?; + match man.get(&entry_path) { + Some(manifest::EntryRef::Folder(folref)) => { + let web_path = manifest_path + .to_web_path() + .join_dir(&entry_path.to_dir_path()); + let children = self.convert_manifest_folder_children( + &manifest_path, + Some(&entry_path), + folref, + ); + let folder = ManifestFolder { web_path }; + Ok(ZarrManResourceWithChildren::ManFolder { folder, children }) + } + Some(manifest::EntryRef::Entry(entry)) => { + Ok(ZarrManResourceWithChildren::ManEntry( + self.convert_manifest_entry(&manifest_path, &entry_path, entry), + )) + } + None => Err(ZarrManError::ManifestPathNotFound { + manifest_path, + entry_path, + }), + } + } + } } fn convert_manifest_entry( @@ -174,7 +220,7 @@ impl ZarrManClient { let web_path = manifest_path.to_web_path().join(entry_path); let mut url = urljoin( &self.s3_download_prefix, - std::iter::once(manifest_path.zarr_id()).chain(entry_path.components()), + std::iter::once(manifest_path.zarr_id()).chain(entry_path.component_strs()), ); url.query_pairs_mut() .append_pair("versionId", &entry.version_id); @@ -186,6 +232,40 @@ impl ZarrManClient { url, } } + + fn convert_manifest_folder_children( + &self, + manifest_path: &ManifestPath, + entry_path: Option<&PurePath>, + folder: &manifest::ManifestFolder, + ) -> Vec { + let mut children = Vec::with_capacity(folder.len()); + let web_path_prefix = match entry_path { + Some(p) => manifest_path.to_web_path().join_dir(&p.to_dir_path()), + None => manifest_path.to_web_path(), + }; + for (name, child) in folder { + match child { + manifest::FolderEntry::Folder(_) => { + children.push(ZarrManResource::ManFolder(ManifestFolder { + web_path: web_path_prefix.join_one_dir(name), + })); + } + manifest::FolderEntry::Entry(entry) => { + let thispath = match entry_path { + Some(p) => p.join_one(name), + None => PurePath::from(name.clone()), + }; + children.push(ZarrManResource::ManEntry(self.convert_manifest_entry( + manifest_path, + &thispath, + entry, + ))); + } + } + } + children + } } #[derive(Debug, Error)] diff --git a/src/zarrman/resources.rs b/src/zarrman/resources.rs index 732eac0..8705336 100644 --- a/src/zarrman/resources.rs +++ b/src/zarrman/resources.rs @@ -45,7 +45,7 @@ impl ManifestPath { urljoin( url, self.prefix - .components() + .component_strs() .map(Cow::from) .chain(std::iter::once(Cow::from(&*self.zarr_id))) .chain(std::iter::once(Cow::from(format!( From 22538bea7878cc193942ea1537bed0d0c1f29f87 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 6 Feb 2024 11:18:19 -0500 Subject: [PATCH 15/17] ReqPath::parse_path() --- src/paths/dirpath.rs | 9 +++++---- src/paths/purepath.rs | 5 +++++ src/zarrman/path.rs | 38 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 46 insertions(+), 6 deletions(-) diff --git a/src/paths/dirpath.rs b/src/paths/dirpath.rs index 48d5896..06b8f22 100644 --- a/src/paths/dirpath.rs +++ b/src/paths/dirpath.rs @@ -41,14 +41,15 @@ impl PureDirPath { PureDirPath(format!("{self}{path}")) } - pub(crate) fn join_one(&self, c: &Component) -> PurePath { - PurePath(format!("{self}{c}")) - } - pub(crate) fn join_one_dir(&self, c: &Component) -> PureDirPath { PureDirPath(format!("{self}{c}/")) } + pub(crate) fn push(&mut self, c: &Component) { + self.0.push_str(c.as_ref()); + self.0.push('/'); + } + pub(crate) fn relative_to(&self, dirpath: &PureDirPath) -> Option { let s = self.0.strip_prefix(&dirpath.0)?; (!s.is_empty()).then(|| PureDirPath(s.to_owned())) diff --git a/src/paths/purepath.rs b/src/paths/purepath.rs index f207320..0c618d9 100644 --- a/src/paths/purepath.rs +++ b/src/paths/purepath.rs @@ -64,6 +64,11 @@ impl PurePath { pub(crate) fn components(&self) -> impl Iterator + '_ { self.0.split('/').map(|c| Component(c.to_owned())) } + + pub(crate) fn push(&mut self, c: &Component) { + self.0.push('/'); + self.0.push_str(c.as_ref()); + } } impl fmt::Debug for PurePath { diff --git a/src/zarrman/path.rs b/src/zarrman/path.rs index 017fcc5..e846152 100644 --- a/src/zarrman/path.rs +++ b/src/zarrman/path.rs @@ -12,8 +12,42 @@ pub(super) enum ReqPath { } impl ReqPath { - pub(super) fn parse_path(_path: &PurePath) -> Option { - todo!() + pub(super) fn parse_path(path: &PurePath) -> Option { + let mut components = path.components(); + let Some(c1) = components.next() else { + unreachable!("path should have at least one component"); + }; + let mut prefix = PureDirPath::from(c1); + let Some(c2) = components.next() else { + return Some(ReqPath::Dir(prefix)); + }; + prefix.push(&c2); + let Some(zarr_id) = components.next() else { + return Some(ReqPath::Dir(prefix)); + }; + let Some(checksum) = components.next() else { + prefix.push(&zarr_id); + return Some(ReqPath::Dir(prefix)); + }; + if checksum.contains('.') { + return None; + } + let manifest_path = ManifestPath { + prefix, + zarr_id, + checksum, + }; + let Some(e1) = components.next() else { + return Some(ReqPath::Manifest(manifest_path)); + }; + let mut entry_path = PurePath::from(e1); + for e in components { + entry_path.push(&e); + } + Some(ReqPath::InManifest { + manifest_path, + entry_path, + }) } } From b3502a19910fddb65476870330f7c452d45c3395 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 6 Feb 2024 11:21:00 -0500 Subject: [PATCH 16/17] Update CHANGELOG --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a09f45a..a60d40a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +v0.2.0 (in development) +----------------------- +- Serve Zarr entries via manifests from + at "`/zarrs/`" + v0.1.0 (2024-02-01) ------------------- Initial release From fc6e4a1fa01857aa0ae973718abc7bfd6079010f Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 6 Feb 2024 11:26:29 -0500 Subject: [PATCH 17/17] Bugfixes --- src/paths/dirpath.rs | 2 +- src/zarrman/resources.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/paths/dirpath.rs b/src/paths/dirpath.rs index 06b8f22..e55e59a 100644 --- a/src/paths/dirpath.rs +++ b/src/paths/dirpath.rs @@ -56,7 +56,7 @@ impl PureDirPath { } pub(crate) fn component_strs(&self) -> std::str::Split<'_, char> { - self.0.split('/') + self.0.trim_end_matches('/').split('/') } } diff --git a/src/zarrman/resources.rs b/src/zarrman/resources.rs index 8705336..69ba423 100644 --- a/src/zarrman/resources.rs +++ b/src/zarrman/resources.rs @@ -36,7 +36,7 @@ impl ManifestPath { } pub(crate) fn to_web_path(&self) -> PureDirPath { - format!("zarrs/{}/{}/{}/", self.prefix, self.zarr_id, self.checksum) + format!("zarrs/{}{}/{}/", self.prefix, self.zarr_id, self.checksum) .parse::() .expect("ManifestPath should have valid web_path") }