From 6aec376aa101bf62ea353e48bd8a47184d81cf7e Mon Sep 17 00:00:00 2001 From: Alessandro Passaro Date: Fri, 27 Sep 2024 18:58:06 +0100 Subject: [PATCH] Refactor inode module Signed-off-by: Alessandro Passaro --- mountpoint-s3/src/fs.rs | 8 +- mountpoint-s3/src/fs/error.rs | 2 +- mountpoint-s3/src/lib.rs | 2 +- mountpoint-s3/src/{inode.rs => superblock.rs} | 764 +----------------- .../src/{inode => superblock}/expiry.rs | 0 mountpoint-s3/src/superblock/inode.rs | 710 ++++++++++++++++ .../{inode => superblock}/negative_cache.rs | 0 .../src/{inode => superblock}/readdir.rs | 3 +- 8 files changed, 748 insertions(+), 741 deletions(-) rename mountpoint-s3/src/{inode.rs => superblock.rs} (76%) rename mountpoint-s3/src/{inode => superblock}/expiry.rs (100%) create mode 100644 mountpoint-s3/src/superblock/inode.rs rename mountpoint-s3/src/{inode => superblock}/negative_cache.rs (100%) rename mountpoint-s3/src/{inode => superblock}/readdir.rs (99%) diff --git a/mountpoint-s3/src/fs.rs b/mountpoint-s3/src/fs.rs index b65762a3c..bc500a94c 100644 --- a/mountpoint-s3/src/fs.rs +++ b/mountpoint-s3/src/fs.rs @@ -17,18 +17,18 @@ use mountpoint_s3_client::types::ETag; use mountpoint_s3_client::ObjectClient; use crate::fs::error_metadata::{ErrorMetadata, MOUNTPOINT_ERROR_LOOKUP_NONEXISTENT}; -use crate::inode::{ - Inode, InodeError, InodeKind, LookedUp, ReadHandle, ReaddirHandle, Superblock, SuperblockConfig, WriteHandle, -}; use crate::logging; use crate::prefetch::{Prefetch, PrefetchResult}; use crate::prefix::Prefix; use crate::s3::S3Personality; +use crate::superblock::{ + Inode, InodeError, InodeKind, LookedUp, ReadHandle, ReaddirHandle, Superblock, SuperblockConfig, WriteHandle, +}; use crate::sync::atomic::{AtomicI64, AtomicU64, Ordering}; use crate::sync::{Arc, AsyncMutex, AsyncRwLock}; use crate::upload::{UploadRequest, Uploader}; -pub use crate::inode::InodeNo; +pub use crate::superblock::InodeNo; #[macro_use] mod error; diff --git a/mountpoint-s3/src/fs/error.rs b/mountpoint-s3/src/fs/error.rs index 477d09974..0f1208f88 100644 --- a/mountpoint-s3/src/fs/error.rs +++ b/mountpoint-s3/src/fs/error.rs @@ -4,8 +4,8 @@ use mountpoint_s3_client::error::{GetObjectError, ObjectClientError}; use tracing::Level; use crate::fs::error_metadata::ErrorMetadata; -use crate::inode::InodeError; use crate::prefetch::PrefetchReadError; +use crate::superblock::InodeError; use crate::upload::UploadWriteError; /// Generate an error that includes a conversion to a libc errno for use in replies to FUSE. diff --git a/mountpoint-s3/src/lib.rs b/mountpoint-s3/src/lib.rs index 084cd3cc9..2bd553d4f 100644 --- a/mountpoint-s3/src/lib.rs +++ b/mountpoint-s3/src/lib.rs @@ -5,13 +5,13 @@ pub mod cli; pub mod data_cache; pub mod fs; pub mod fuse; -mod inode; pub mod logging; pub mod metrics; mod object; pub mod prefetch; pub mod prefix; pub mod s3; +mod superblock; mod sync; mod upload; diff --git a/mountpoint-s3/src/inode.rs b/mountpoint-s3/src/superblock.rs similarity index 76% rename from mountpoint-s3/src/inode.rs rename to mountpoint-s3/src/superblock.rs index 8be944604..03fde557e 100644 --- a/mountpoint-s3/src/inode.rs +++ b/mountpoint-s3/src/superblock.rs @@ -23,62 +23,41 @@ use std::collections::{HashMap, HashSet}; use std::ffi::{OsStr, OsString}; -use std::fmt::{Debug, Display}; -use std::os::unix::prelude::OsStrExt; -use std::sync::atomic::AtomicBool; -use std::time::{Duration, SystemTime}; +use std::fmt::Debug; +use std::time::Duration; use anyhow::anyhow; -use fuser::FileType; use futures::{select_biased, FutureExt}; use mountpoint_s3_client::error::{HeadObjectError, ObjectClientError}; use mountpoint_s3_client::error_metadata::ProvideErrorMetadata; -use mountpoint_s3_client::types::{HeadObjectResult, RestoreStatus}; +use mountpoint_s3_client::types::HeadObjectResult; use mountpoint_s3_client::ObjectClient; -use mountpoint_s3_crt::checksums::crc32c::{self, Crc32c}; use thiserror::Error; use time::OffsetDateTime; use tracing::{debug, error, trace, warn}; use crate::fs::error_metadata::{ErrorMetadata, MOUNTPOINT_ERROR_CLIENT}; -use crate::fs::CacheConfig; +use crate::fs::{CacheConfig, FUSE_ROOT_INODE}; use crate::logging; use crate::prefix::Prefix; use crate::s3::S3Personality; use crate::sync::atomic::{AtomicU64, Ordering}; -use crate::sync::RwLockReadGuard; -use crate::sync::RwLockWriteGuard; use crate::sync::{Arc, RwLock}; mod expiry; use expiry::Expiry; +mod inode; +use inode::{valid_inode_name, InodeErrorInfo, InodeKindData, InodeStat, InodeState, WriteStatus}; + +pub use inode::{Inode, InodeKind, InodeNo}; + mod negative_cache; use negative_cache::NegativeCache; mod readdir; pub use readdir::ReaddirHandle; -pub type InodeNo = u64; - -pub const ROOT_INODE_NO: InodeNo = 1; - -// 200 years seems long enough -const NEVER_EXPIRE_TTL: Duration = Duration::from_secs(200 * 365 * 24 * 60 * 60); - -pub fn valid_inode_name>(name: T) -> bool { - let name = name.as_ref(); - // Names cannot be empty - !name.is_empty() && - // "." and ".." are reserved names (presented by the filesystem layer) - name != "." && - name != ".." && - // The delimiter / can never appear in a name - !name.as_bytes().contains(&b'/') && - // NUL is invalid in POSIX names - !name.as_bytes().contains(&b'\0') -} - /// Superblock is the root object of the file system #[derive(Debug)] pub struct Superblock { @@ -106,26 +85,10 @@ impl Superblock { /// Create a new Superblock that targets the given bucket/prefix pub fn new(bucket: &str, prefix: &Prefix, config: SuperblockConfig) -> Self { let mount_time = OffsetDateTime::now_utc(); - - let root = Inode::new( - ROOT_INODE_NO, - ROOT_INODE_NO, - String::new(), - prefix.to_string(), - InodeKind::Directory, - InodeState { - // The root inode never expires because there's no remote to consult for its - // metadata, and it always exists. - stat: InodeStat::for_directory(mount_time, NEVER_EXPIRE_TTL), - write_status: WriteStatus::Remote, - kind_data: InodeKindData::default_for(InodeKind::Directory), - lookup_count: 1, - reader_count: 0, - }, - ); + let root = Inode::new_root(prefix.to_string(), mount_time); let mut inodes = InodeMap::default(); - inodes.insert(ROOT_INODE_NO, root); + inodes.insert(root.ino(), root); let negative_cache = NegativeCache::new(config.cache_config.negative_cache_size, config.cache_config.file_ttl); @@ -177,7 +140,7 @@ impl Superblock { return; } }; - let mut parent_state = parent.inner.sync.write().unwrap(); + let mut parent_state = parent.get_mut_inode_state_no_check(); let InodeKindData::Directory { children, writing_children, @@ -438,13 +401,7 @@ impl Superblock { } }; - let state = InodeState { - stat: stat.clone(), - kind_data: InodeKindData::default_for(kind), - write_status: WriteStatus::LocalUnopened, - lookup_count: 0, - reader_count: 0, - }; + let state = InodeState::new(&stat, kind, WriteStatus::LocalUnopened); let inode = self .inner .create_inode_locked(&parent_inode, &mut parent_state, name, kind, state, true)?; @@ -970,13 +927,7 @@ impl SuperblockInner { } } (Some(remote), None) => { - let state = InodeState { - stat: remote.stat.clone(), - kind_data: InodeKindData::default_for(remote.kind), - write_status: WriteStatus::Remote, - lookup_count: 0, - reader_count: 0, - }; + let state = InodeState::new(&remote.stat, remote.kind, WriteStatus::Remote); self.create_inode_locked(&parent, &mut parent_state, name, remote.kind, state, false) .map(|inode| LookedUp { inode, @@ -1039,13 +990,7 @@ impl SuperblockInner { ino=?existing_inode.ino(), "inode needs to be recreated", ); - let state = InodeState { - stat: remote.stat.clone(), - kind_data: InodeKindData::default_for(remote.kind), - write_status: WriteStatus::Remote, - lookup_count: 0, - reader_count: 0, - }; + let state = InodeState::new(&remote.stat, remote.kind, WriteStatus::Remote); let new_inode = self.create_inode_locked(&parent, &mut parent_state, name, remote.kind, state, false)?; Ok(LookedUp { @@ -1148,7 +1093,7 @@ impl WriteHandle { } pub fn inc_file_size(&self, len: usize) { - let mut state = self.inode.inner.sync.write().unwrap(); + let mut state = self.inode.get_mut_inode_state_no_check(); state.stat.size += len; } @@ -1164,7 +1109,8 @@ impl WriteHandle { assert!(visited.insert(ancestor_ino), "cycle detected in inode ancestors"); let ancestor = self.inner.get(ancestor_ino)?; ancestors.push(ancestor.clone()); - if ancestor.ino() == ROOT_INODE_NO || ancestor.get_inode_state()?.write_status == WriteStatus::Remote { + if ancestor.ino() == FUSE_ROOT_INODE || ancestor.get_inode_state()?.write_status == WriteStatus::Remote + { break; } ancestor_ino = ancestor.parent(); @@ -1229,342 +1175,6 @@ impl ReadHandle { } } -#[derive(Debug, Clone)] -pub struct Inode { - inner: Arc, -} - -#[derive(Debug)] -struct InodeInner { - // Immutable inode state -- any changes to these requires a new inode - ino: InodeNo, - parent: InodeNo, - name: String, - // TODO deduplicate keys by string interning or something -- many keys will have common prefixes - full_key: String, - kind: InodeKind, - checksum: Crc32c, - - /// Mutable inode state. This lock should also be held to serialize operations on an inode (like - /// creating a new child). - /// - /// When taking a lock across multiple [Inode]s, - /// we must always acquire the locks in the following order to avoid deadlock: - /// - /// - Any ancestors in descending order, if they need to be locked. - /// - Otherwise, ascending order by [InodeNo]. - /// This reflects similar behavior in the Kernel's VFS named 'inode pointer order', - /// described in https://www.kernel.org/doc/html/next/filesystems/directory-locking.html - sync: RwLock, -} - -impl Inode { - pub fn ino(&self) -> InodeNo { - self.inner.ino - } - - pub fn parent(&self) -> InodeNo { - self.inner.parent - } - - pub fn name(&self) -> &str { - &self.inner.name - } - - pub fn kind(&self) -> InodeKind { - self.inner.kind - } - - pub fn full_key(&self) -> &str { - &self.inner.full_key - } - - /// Increment lookup count for [Inode] by 1, returning the new value. - /// This should be called whenever we pass a `fuse_reply_entry` or `fuse_reply_create` struct to the FUSE driver. - /// - /// Locks [InodeState] for writing. - fn inc_lookup_count(&self) -> u64 { - let mut state = self.inner.sync.write().unwrap(); - let lookup_count = &mut state.lookup_count; - *lookup_count += 1; - trace!( - ino = self.ino(), - new_lookup_count = lookup_count, - "incremented lookup count", - ); - *lookup_count - } - - /// Decrement lookup count by `n` for [Inode], returning the new value. - /// - /// Locks [InodeState] for writing. - fn dec_lookup_count(&self, n: u64) -> u64 { - let mut state = self.inner.sync.write().unwrap(); - let lookup_count = &mut state.lookup_count; - debug_assert!(n <= *lookup_count, "lookup count cannot go negative"); - *lookup_count = lookup_count.saturating_sub(n); - trace!( - ino = self.ino(), - new_lookup_count = lookup_count, - "decremented lookup count", - ); - *lookup_count - } - - pub fn is_remote(&self) -> Result { - let state = self.get_inode_state()?; - Ok(state.write_status == WriteStatus::Remote) - } - - /// return Inode State with read lock after checking whether the directory inode is deleted or not. - fn get_inode_state(&self) -> Result, InodeError> { - let inode_state = self.inner.sync.read().unwrap(); - match &inode_state.kind_data { - InodeKindData::Directory { deleted, .. } if *deleted => Err(InodeError::InodeDoesNotExist(self.ino())), - _ => Ok(inode_state), - } - } - - /// return Inode State with write lock after checking whether the directory inode is deleted or not. - fn get_mut_inode_state(&self) -> Result, InodeError> { - let inode_state = self.inner.sync.write().unwrap(); - match &inode_state.kind_data { - InodeKindData::Directory { deleted, .. } if *deleted => Err(InodeError::InodeDoesNotExist(self.ino())), - _ => Ok(inode_state), - } - } - - fn new(ino: InodeNo, parent: InodeNo, name: String, full_key: String, kind: InodeKind, state: InodeState) -> Self { - let checksum = Self::compute_checksum(ino, &full_key); - let sync = RwLock::new(state); - let inner = InodeInner { - ino, - parent, - name, - full_key, - kind, - checksum, - sync, - }; - Self { inner: inner.into() } - } - - /// Verify [Inode] has the expected inode number and the inode content is valid for its checksum. - fn verify_inode(&self, expected_ino: InodeNo) -> Result<(), InodeError> { - let computed = Self::compute_checksum(self.ino(), self.full_key()); - if computed == self.inner.checksum && self.ino() == expected_ino { - Ok(()) - } else { - Err(InodeError::CorruptedMetadata(self.err())) - } - } - - /// Verify [Inode] has the expected inode number, expected parent inode number, - /// and the inode's content is valid for its checksum. - fn verify_child(&self, expected_parent: InodeNo, expected_name: &str) -> Result<(), InodeError> { - let computed = Self::compute_checksum(self.ino(), self.full_key()); - if computed == self.inner.checksum && self.parent() == expected_parent && self.name() == expected_name { - Ok(()) - } else { - Err(InodeError::CorruptedMetadata(self.err())) - } - } - - fn compute_checksum(ino: InodeNo, full_key: &str) -> Crc32c { - let mut hasher = crc32c::Hasher::new(); - hasher.update(ino.to_be_bytes().as_ref()); - hasher.update(full_key.as_bytes()); - hasher.finalize() - } - - /// Produce a description of this Inode for use in errors - pub fn err(&self) -> InodeErrorInfo { - InodeErrorInfo(self.clone()) - } -} - -/// A wrapper that prints useful customer-facing error messages for inodes by including the object -/// key rather than just the inode number. -pub struct InodeErrorInfo(Inode); - -impl Display for InodeErrorInfo { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{} (full key {:?})", self.0.ino(), self.0.full_key()) - } -} - -impl Debug for InodeErrorInfo { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.0.fmt(f) - } -} - -#[derive(Debug)] -struct InodeState { - stat: InodeStat, - write_status: WriteStatus, - kind_data: InodeKindData, - /// Number of references the kernel is holding to the [Inode]. - /// A number of FS operations increment this, while the kernel calls [`Inode::forget(ino, n)`] to decrement. - lookup_count: u64, - /// Number of active prefetching streams on the [Inode]. - reader_count: u64, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum InodeKind { - File, - Directory, -} - -impl InodeKind { - fn as_str(&self) -> &'static str { - match self { - InodeKind::File => "file", - InodeKind::Directory => "directory", - } - } -} - -impl From for FileType { - fn from(kind: InodeKind) -> Self { - match kind { - InodeKind::File => FileType::RegularFile, - InodeKind::Directory => FileType::Directory, - } - } -} - -#[derive(Debug)] -enum InodeKindData { - File {}, - Directory { - /// Mapping from child names to previously seen [Inode]s. - /// - /// The existence of a child or lack thereof does not imply the object does not exist, - /// nor that it currently exists in S3 in that state. - children: HashMap, - - /// A set of inode numbers that have been opened for write but not completed yet. - /// This should be a subset of the [children](Self::Directory::children) field. - writing_children: HashSet, - - /// True if this directory has been deleted (`rmdir`) from its parent - deleted: bool, - }, -} - -impl InodeKindData { - fn default_for(kind: InodeKind) -> Self { - match kind { - InodeKind::File => Self::File {}, - InodeKind::Directory => Self::Directory { - children: Default::default(), - writing_children: Default::default(), - deleted: false, - }, - } - } -} - -#[derive(Debug, Clone)] -pub struct InodeStat { - /// Time this stat becomes invalid and needs to be refreshed - expiry: Expiry, - - /// Size in bytes - pub size: usize, - - /// Time of last file content modification - pub mtime: OffsetDateTime, - /// Time of last file metadata (or content) change - pub ctime: OffsetDateTime, - /// Time of last access - pub atime: OffsetDateTime, - /// Etag for the file (object) - pub etag: Option, - /// Inodes corresponding to S3 objects with GLACIER or DEEP_ARCHIVE storage classes - /// are only readable after restoration. For objects with other storage classes - /// this field should be always `true`. - pub is_readable: bool, -} - -/// Inode write status (local vs remote) -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum WriteStatus { - /// Local inode created but not yet opened - LocalUnopened, - /// Local inode already opened - LocalOpen, - /// Remote inode - Remote, -} - -impl InodeStat { - fn is_valid(&self) -> bool { - !self.expiry.is_expired() - } - - /// Objects in flexible retrieval storage classes can't be accessed via GetObject unless they are - /// restored, and so we override their permissions to 000 and reject reads to them. We also warn - /// the first time we see an object like this, because FUSE enforces the 000 permissions on our - /// behalf so we might not see an attempted `open` call. - fn is_readable(storage_class: Option, restore_status: Option) -> bool { - static HAS_SENT_WARNING: AtomicBool = AtomicBool::new(false); - match storage_class.as_deref() { - Some("GLACIER") | Some("DEEP_ARCHIVE") => { - let restored = - matches!(restore_status, Some(RestoreStatus::Restored { expiry }) if expiry > SystemTime::now()); - if !restored && !HAS_SENT_WARNING.swap(true, Ordering::SeqCst) { - tracing::warn!( - "objects in the GLACIER and DEEP_ARCHIVE storage classes are only accessible if restored" - ); - } - restored - } - _ => true, - } - } - - /// Initialize an [InodeStat] for a file, given some metadata. - fn for_file( - size: usize, - datetime: OffsetDateTime, - etag: Option, - storage_class: Option, - restore_status: Option, - validity: Duration, - ) -> InodeStat { - let is_readable = Self::is_readable(storage_class, restore_status); - InodeStat { - expiry: Expiry::from_now(validity), - size, - atime: datetime, - ctime: datetime, - mtime: datetime, - etag, - is_readable, - } - } - - /// Initialize an [InodeStat] for a directory, given some metadata. - fn for_directory(datetime: OffsetDateTime, validity: Duration) -> InodeStat { - InodeStat { - expiry: Expiry::from_now(validity), - size: 0, - atime: datetime, - ctime: datetime, - mtime: datetime, - etag: None, - is_readable: true, - } - } - - fn update_validity(&mut self, validity: Duration) { - self.expiry = Expiry::from_now(validity); - } -} - /// A wrapper around a `HashMap`` that just takes care of metrics when inodes are /// added or removed. #[derive(Debug, Default)] @@ -1948,125 +1558,6 @@ mod tests { } } - #[tokio::test] - async fn test_forget() { - let superblock = Superblock::new("test_bucket", &Default::default(), Default::default()); - let ino = 42; - let inode_name = "made-up-inode"; - let inode = Inode::new( - ino, - ROOT_INODE_NO, - inode_name.to_owned(), - inode_name.to_owned(), - InodeKind::File, - InodeState { - write_status: WriteStatus::Remote, - stat: InodeStat::for_file(0, OffsetDateTime::now_utc(), None, None, None, Default::default()), - kind_data: InodeKindData::File {}, - lookup_count: 5, - reader_count: 0, - }, - ); - superblock.inner.inodes.write().unwrap().insert(ino, inode.clone()); - - superblock.forget(ino, 3); - let lookup_count = { - let inode_state = inode.inner.sync.read().unwrap(); - inode_state.lookup_count - }; - assert_eq!(lookup_count, 2, "lookup should have been reduced"); - assert!( - superblock.inner.get(ino).is_ok(), - "inode should be present in superblock" - ); - - superblock.forget(ino, 2); - let lookup_count = { - let inode_state = inode.inner.sync.read().unwrap(); - inode_state.lookup_count - }; - assert_eq!(lookup_count, 0, "lookup should have been reduced"); - assert!( - superblock.inner.inodes.read().unwrap().get(&ino).is_none(), - "inode should not be present in superblock" - ); - - // Make sure we didn't leak the inode anywhere else - assert_eq!(Arc::strong_count(&inode.inner), 1); - } - - #[tokio::test] - async fn test_forget_can_remove_inodes() { - let client_config = MockClientConfig { - bucket: "test_bucket".to_string(), - part_size: 1024 * 1024, - ..Default::default() - }; - let client = Arc::new(MockClient::new(client_config)); - - let name = "foo"; - client.add_object(name, b"foo".into()); - - let superblock = Superblock::new("test_bucket", &Default::default(), Default::default()); - - let lookup = superblock.lookup(&client, ROOT_INODE_NO, name.as_ref()).await.unwrap(); - let lookup_count = lookup.inode.inner.sync.read().unwrap().lookup_count; - assert_eq!(lookup_count, 1); - let ino = lookup.inode.ino(); - - superblock.forget(ino, 1); - - let lookup_count = lookup.inode.inner.sync.read().unwrap().lookup_count; - assert_eq!(lookup_count, 0); - // This test should now hold the only reference to the inode, so we know it's unreferenced - // and will be freed - assert_eq!(Arc::strong_count(&lookup.inode.inner), 1); - drop(lookup); - - let err = superblock - .getattr(&client, ino, false) - .await - .expect_err("Inode should not be valid"); - assert!(matches!(err, InodeError::InodeDoesNotExist(_))); - - let lookup = superblock.lookup(&client, ROOT_INODE_NO, name.as_ref()).await.unwrap(); - let lookup_count = lookup.inode.inner.sync.read().unwrap().lookup_count; - assert_eq!(lookup_count, 1); - } - - #[tokio::test] - async fn test_forget_shadowed_inode() { - let client_config = MockClientConfig { - bucket: "test_bucket".to_string(), - part_size: 1024 * 1024, - ..Default::default() - }; - let client = Arc::new(MockClient::new(client_config)); - - let name = "foo"; - client.add_object(name, b"foo".into()); - - let superblock = Superblock::new("test_bucket", &Default::default(), Default::default()); - - let lookup = superblock.lookup(&client, ROOT_INODE_NO, name.as_ref()).await.unwrap(); - let lookup_count = lookup.inode.inner.sync.read().unwrap().lookup_count; - assert_eq!(lookup_count, 1); - let ino = lookup.inode.ino(); - drop(lookup); - - client.add_object(&format!("{name}/bar"), b"bar".into()); - - // Should be a directory now, so a different inode - let new_lookup = superblock.lookup(&client, ROOT_INODE_NO, name.as_ref()).await.unwrap(); - assert_ne!(ino, new_lookup.inode.ino()); - - superblock.forget(ino, 1); - - // Lookup still works after forgetting the old inode - let new_lookup2 = superblock.lookup(&client, ROOT_INODE_NO, name.as_ref()).await.unwrap(); - assert_eq!(new_lookup.inode.ino(), new_lookup2.inode.ino()); - } - #[test_case(""; "unprefixed")] #[test_case("test_prefix/"; "prefixed")] #[tokio::test] @@ -2161,12 +1652,7 @@ mod tests { for i in 0..5 { let filename = format!("file{i}.txt"); let new_inode = superblock - .create( - &client, - FUSE_ROOT_INODE, - OsStr::from_bytes(filename.as_bytes()), - InodeKind::File, - ) + .create(&client, FUSE_ROOT_INODE, filename.as_ref(), InodeKind::File) .await .unwrap(); superblock @@ -2218,12 +1704,7 @@ mod tests { for i in 0..5 { let filename = format!("newfile{i}.txt"); let new_inode = superblock - .create( - &client, - FUSE_ROOT_INODE, - OsStr::from_bytes(filename.as_bytes()), - InodeKind::File, - ) + .create(&client, FUSE_ROOT_INODE, filename.as_ref(), InodeKind::File) .await .unwrap(); superblock @@ -2377,12 +1858,7 @@ mod tests { // Open some local keys for filename in ["aaa", "dir3", "dm3", "file3", "zzz"] { let new_inode = superblock - .create( - &client, - FUSE_ROOT_INODE, - OsStr::from_bytes(filename.as_bytes()), - InodeKind::File, - ) + .create(&client, FUSE_ROOT_INODE, filename.as_ref(), InodeKind::File) .await .unwrap(); superblock @@ -2394,12 +1870,7 @@ mod tests { // Create some local directories for dirname in ["dir2", "dm2", "file2"] { let _new_inode = superblock - .create( - &client, - FUSE_ROOT_INODE, - OsStr::from_bytes(dirname.as_bytes()), - InodeKind::Directory, - ) + .create(&client, FUSE_ROOT_INODE, dirname.as_ref(), InodeKind::Directory) .await .unwrap(); } @@ -2580,67 +2051,6 @@ mod tests { assert_eq!(libc::ENOENT, err, "lookup should return no existing entry error"); } - #[tokio::test] - async fn test_unlink_verify_checksum() { - let client_config = MockClientConfig { - bucket: "test_bucket".to_string(), - part_size: 1024 * 1024, - ..Default::default() - }; - let client = Arc::new(MockClient::new(client_config)); - let file_name = "corrupted"; - client.add_object(file_name.as_ref(), MockObject::constant(0xaa, 30, ETag::for_tests())); - - let superblock = Superblock::new("test_bucket", &Default::default(), Default::default()); - - // Create an inode with "corrupted" metadata, i.e. - // checksum not matching ino + full key. - let parent_ino = FUSE_ROOT_INODE; - let bad_checksum = Crc32c::new(42); - let inode = Inode { - inner: Arc::new(InodeInner { - ino: 42, - parent: parent_ino, - name: file_name.into(), - full_key: file_name.into(), - kind: InodeKind::File, - checksum: bad_checksum, - sync: RwLock::new(InodeState { - stat: InodeStat::for_file( - 0, - OffsetDateTime::now_utc(), - Some(ETag::for_tests().as_str().to_owned()), - None, - None, - NEVER_EXPIRE_TTL, - ), - write_status: WriteStatus::Remote, - kind_data: InodeKindData::File {}, - lookup_count: 1, - reader_count: 0, - }), - }), - }; - - // Manually add the corrupted inode to the superblock and root directory. - { - let mut inodes = superblock.inner.inodes.write().unwrap(); - inodes.insert(inode.ino(), inode.clone()); - let parent = inodes.get(&parent_ino).unwrap(); - let mut parent_state = parent.get_mut_inode_state().unwrap(); - match &mut parent_state.kind_data { - InodeKindData::File {} => panic!("root is always a directory"), - InodeKindData::Directory { children, .. } => _ = children.insert(file_name.into(), inode.clone()), - } - } - - let err = superblock - .unlink(&client, parent_ino, file_name.as_ref()) - .await - .expect_err("unlink of a corrupted inode should fail"); - assert!(matches!(err, InodeError::CorruptedMetadata(_))); - } - #[tokio::test] async fn test_finish_writing_convert_parent_local_dirs_to_remote() { let client_config = MockClientConfig { @@ -2677,12 +2087,7 @@ mod tests { // Create object under leaf dir let filename = "newfile.txt"; let new_inode = superblock - .create( - &client, - leaf_dir_ino, - OsStr::from_bytes(filename.as_bytes()), - InodeKind::File, - ) + .create(&client, leaf_dir_ino, filename.as_ref(), InodeKind::File) .await .unwrap(); @@ -2715,21 +2120,21 @@ mod tests { for _ in 0..2 { let dir1_1 = superblock - .lookup(&client, FUSE_ROOT_INODE, OsStr::from_bytes("dir1".as_bytes())) + .lookup(&client, FUSE_ROOT_INODE, "dir1".as_ref()) .await .unwrap(); let dir1_2 = superblock - .lookup(&client, FUSE_ROOT_INODE, OsStr::from_bytes("dir1".as_bytes())) + .lookup(&client, FUSE_ROOT_INODE, "dir1".as_ref()) .await .unwrap(); assert_eq!(dir1_1.inode.ino(), dir1_2.inode.ino()); let file1_1 = superblock - .lookup(&client, dir1_1.inode.ino(), OsStr::from_bytes("file1.txt".as_bytes())) + .lookup(&client, dir1_1.inode.ino(), "file1.txt".as_ref()) .await .unwrap(); let file1_2 = superblock - .lookup(&client, dir1_1.inode.ino(), OsStr::from_bytes("file1.txt".as_bytes())) + .lookup(&client, dir1_1.inode.ino(), "file1.txt".as_ref()) .await .unwrap(); assert_eq!(file1_1.inode.ino(), file1_2.inode.ino()); @@ -2770,7 +2175,7 @@ mod tests { ); let dir = superblock - .lookup(&client, FUSE_ROOT_INODE, OsStr::from_bytes("dir".as_bytes())) + .lookup(&client, FUSE_ROOT_INODE, "dir".as_ref()) .await .unwrap(); assert_eq!(dir.inode.full_key(), OsString::from("dir/")); @@ -2827,9 +2232,7 @@ mod tests { // Neither of these keys should exist in the directory for key in ["/", "."] { - let lookup = superblock - .lookup(&client, dir1_ino, OsStr::from_bytes(key.as_bytes())) - .await; + let lookup = superblock.lookup(&client, dir1_ino, key.as_ref()).await; assert!(matches!(lookup, Err(InodeError::InvalidFileName(_)))); } } @@ -2850,12 +2253,7 @@ mod tests { // Create a new file let filename = "newfile.txt"; let new_inode = superblock - .create( - &client, - FUSE_ROOT_INODE, - OsStr::from_bytes(filename.as_bytes()), - InodeKind::File, - ) + .create(&client, FUSE_ROOT_INODE, filename.as_ref(), InodeKind::File) .await .unwrap(); @@ -2894,59 +2292,6 @@ mod tests { assert!(matches!(result, Err(InodeError::SetAttrNotPermittedOnRemoteInode(_)))); } - #[tokio::test] - async fn test_setattr_invalid_stat() { - let client_config = MockClientConfig { - bucket: "test_bucket".to_string(), - part_size: 1024 * 1024, - ..Default::default() - }; - let client = Arc::new(MockClient::new(client_config)); - let superblock = Superblock::new("test_bucket", &Default::default(), Default::default()); - - let ino: u64 = 42; - let inode_name = "made-up-inode"; - let mut hasher = crc32c::Hasher::new(); - hasher.update(ino.to_be_bytes().as_ref()); - hasher.update(inode_name.as_bytes()); - let checksum = hasher.finalize(); - let inode = Inode { - inner: Arc::new(InodeInner { - ino, - parent: ROOT_INODE_NO, - name: inode_name.to_owned(), - full_key: inode_name.to_owned(), - kind: InodeKind::File, - checksum, - sync: RwLock::new(InodeState { - write_status: WriteStatus::LocalOpen, - stat: InodeStat::for_file(0, OffsetDateTime::UNIX_EPOCH, None, None, None, Default::default()), - kind_data: InodeKindData::File {}, - lookup_count: 5, - reader_count: 0, - }), - }), - }; - superblock.inner.inodes.write().unwrap().insert(ino, inode.clone()); - - // Verify that the stat is invalid - let inode = superblock.inner.get(ino).unwrap(); - let stat = inode.get_inode_state().unwrap().stat.clone(); - assert!(!stat.is_valid()); - - // Should be able to reset expiry back and make stat valid when calling setattr - let atime = OffsetDateTime::UNIX_EPOCH + Duration::days(90); - let mtime = OffsetDateTime::UNIX_EPOCH + Duration::days(60); - let lookup = superblock - .setattr(&client, ino, Some(atime), Some(mtime)) - .await - .expect("setattr should be successful"); - let stat = lookup.stat; - assert_eq!(stat.atime, atime); - assert_eq!(stat.mtime, mtime); - assert!(stat.is_valid()); - } - #[test] fn test_inodestat_constructors() { let ts = OffsetDateTime::UNIX_EPOCH + Duration::days(90); @@ -2963,53 +2308,4 @@ mod tests { assert_eq!(file_inodestat.ctime, ts); assert_eq!(file_inodestat.mtime, ts); } - - #[cfg(feature = "shuttle")] - mod shuttle_tests { - use mountpoint_s3_client::mock_client::{MockClient, MockClientConfig}; - use shuttle::{check_pct, check_random, thread}; - use shuttle::{future::block_on, sync::Arc}; - - use super::*; - - #[test] - fn test_create_and_forget_race_condition() { - async fn test_helper() { - let client_config = MockClientConfig { - bucket: "test_bucket".to_string(), - part_size: 1024 * 1024, - ..Default::default() - }; - let client = Arc::new(MockClient::new(client_config)); - - let name = "foo"; - client.add_object(name, b"foo".into()); - - let superblock = Arc::new(Superblock::new("test_bucket", &Default::default(), Default::default())); - - let lookup = superblock.lookup(&client, ROOT_INODE_NO, name.as_ref()).await.unwrap(); - let lookup_count = lookup.inode.inner.sync.read().unwrap().lookup_count; - assert_eq!(lookup_count, 1); - let ino = lookup.inode.ino(); - - let superblock_clone = superblock.clone(); - let forget_task = thread::spawn(move || { - superblock_clone.forget(ino, 1); - }); - - let file_name = "bar"; - superblock - .create(&client, ROOT_INODE_NO, file_name.as_ref(), InodeKind::File) - .await - .unwrap(); - - forget_task.join().unwrap(); - let lookup_count = lookup.inode.inner.sync.read().unwrap().lookup_count; - assert_eq!(lookup_count, 0); - } - - check_random(|| block_on(test_helper()), 1000); - check_pct(|| block_on(test_helper()), 1000, 3); - } - } } diff --git a/mountpoint-s3/src/inode/expiry.rs b/mountpoint-s3/src/superblock/expiry.rs similarity index 100% rename from mountpoint-s3/src/inode/expiry.rs rename to mountpoint-s3/src/superblock/expiry.rs diff --git a/mountpoint-s3/src/superblock/inode.rs b/mountpoint-s3/src/superblock/inode.rs new file mode 100644 index 000000000..44dea7065 --- /dev/null +++ b/mountpoint-s3/src/superblock/inode.rs @@ -0,0 +1,710 @@ +use std::collections::{HashMap, HashSet}; +use std::ffi::OsStr; +use std::fmt::{Debug, Display}; +use std::os::unix::ffi::OsStrExt as _; +use std::time::{Duration, SystemTime}; + +use fuser::FileType; +use mountpoint_s3_client::types::RestoreStatus; +use mountpoint_s3_crt::checksums::crc32c::{self, Crc32c}; +use time::OffsetDateTime; +use tracing::trace; + +use crate::sync::atomic::{AtomicBool, Ordering}; +use crate::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; + +use super::Expiry; +use super::InodeError; + +pub type InodeNo = u64; + +#[derive(Debug, Clone)] +pub struct Inode { + inner: Arc, +} + +const ROOT_INODE_NO: InodeNo = crate::fs::FUSE_ROOT_INODE; + +// 200 years seems long enough +const NEVER_EXPIRE_TTL: Duration = Duration::from_secs(200 * 365 * 24 * 60 * 60); + +#[derive(Debug)] +struct InodeInner { + // Immutable inode state -- any changes to these requires a new inode + ino: InodeNo, + parent: InodeNo, + name: String, + // TODO deduplicate keys by string interning or something -- many keys will have common prefixes + full_key: String, + kind: InodeKind, + checksum: Crc32c, + + /// Mutable inode state. This lock should also be held to serialize operations on an inode (like + /// creating a new child). + /// + /// When taking a lock across multiple [Inode]s, + /// we must always acquire the locks in the following order to avoid deadlock: + /// + /// - Any ancestors in descending order, if they need to be locked. + /// - Otherwise, ascending order by [InodeNo]. + /// This reflects similar behavior in the Kernel's VFS named 'inode pointer order', + /// described in https://www.kernel.org/doc/html/next/filesystems/directory-locking.html + sync: RwLock, +} + +impl Inode { + pub fn ino(&self) -> InodeNo { + self.inner.ino + } + + pub fn parent(&self) -> InodeNo { + self.inner.parent + } + + pub fn name(&self) -> &str { + &self.inner.name + } + + pub fn kind(&self) -> InodeKind { + self.inner.kind + } + + pub fn full_key(&self) -> &str { + &self.inner.full_key + } + + /// Increment lookup count for [Inode] by 1, returning the new value. + /// This should be called whenever we pass a `fuse_reply_entry` or `fuse_reply_create` struct to the FUSE driver. + /// + /// Locks [InodeState] for writing. + pub fn inc_lookup_count(&self) -> u64 { + let mut state = self.inner.sync.write().unwrap(); + let lookup_count = &mut state.lookup_count; + *lookup_count += 1; + trace!( + ino = self.ino(), + new_lookup_count = lookup_count, + "incremented lookup count", + ); + *lookup_count + } + + /// Decrement lookup count by `n` for [Inode], returning the new value. + /// + /// Locks [InodeState] for writing. + pub fn dec_lookup_count(&self, n: u64) -> u64 { + let mut state = self.inner.sync.write().unwrap(); + let lookup_count = &mut state.lookup_count; + debug_assert!(n <= *lookup_count, "lookup count cannot go negative"); + *lookup_count = lookup_count.saturating_sub(n); + trace!( + ino = self.ino(), + new_lookup_count = lookup_count, + "decremented lookup count", + ); + *lookup_count + } + + pub fn is_remote(&self) -> Result { + let state = self.get_inode_state()?; + Ok(state.write_status == WriteStatus::Remote) + } + + /// return Inode State with read lock after checking whether the directory inode is deleted or not. + pub fn get_inode_state(&self) -> Result, InodeError> { + let inode_state = self.inner.sync.read().unwrap(); + match &inode_state.kind_data { + InodeKindData::Directory { deleted, .. } if *deleted => Err(InodeError::InodeDoesNotExist(self.ino())), + _ => Ok(inode_state), + } + } + + /// return Inode State with write lock after checking whether the directory inode is deleted or not. + pub fn get_mut_inode_state(&self) -> Result, InodeError> { + let inode_state = self.inner.sync.write().unwrap(); + match &inode_state.kind_data { + InodeKindData::Directory { deleted, .. } if *deleted => Err(InodeError::InodeDoesNotExist(self.ino())), + _ => Ok(inode_state), + } + } + + /// return Inode State with write lock without checking whether the directory inode is deleted or not. + pub fn get_mut_inode_state_no_check(&self) -> RwLockWriteGuard { + self.inner.sync.write().unwrap() + } + + pub fn new( + ino: InodeNo, + parent: InodeNo, + name: String, + full_key: String, + kind: InodeKind, + state: InodeState, + ) -> Self { + let checksum = Self::compute_checksum(ino, &full_key); + let sync = RwLock::new(state); + let inner = InodeInner { + ino, + parent, + name, + full_key, + kind, + checksum, + sync, + }; + Self { inner: inner.into() } + } + + pub fn new_root(prefix: String, mount_time: OffsetDateTime) -> Self { + Self::new( + ROOT_INODE_NO, + ROOT_INODE_NO, + String::new(), + prefix, + InodeKind::Directory, + InodeState { + // The root inode never expires because there's no remote to consult for its + // metadata, and it always exists. + stat: InodeStat::for_directory(mount_time, NEVER_EXPIRE_TTL), + write_status: WriteStatus::Remote, + kind_data: InodeKindData::default_for(InodeKind::Directory), + lookup_count: 1, + reader_count: 0, + }, + ) + } + + /// Verify [Inode] has the expected inode number and the inode content is valid for its checksum. + pub fn verify_inode(&self, expected_ino: InodeNo) -> Result<(), InodeError> { + let computed = Self::compute_checksum(self.ino(), self.full_key()); + if computed == self.inner.checksum && self.ino() == expected_ino { + Ok(()) + } else { + Err(InodeError::CorruptedMetadata(self.err())) + } + } + + /// Verify [Inode] has the expected inode number, expected parent inode number, + /// and the inode's content is valid for its checksum. + pub fn verify_child(&self, expected_parent: InodeNo, expected_name: &str) -> Result<(), InodeError> { + let computed = Self::compute_checksum(self.ino(), self.full_key()); + if computed == self.inner.checksum && self.parent() == expected_parent && self.name() == expected_name { + Ok(()) + } else { + Err(InodeError::CorruptedMetadata(self.err())) + } + } + + fn compute_checksum(ino: InodeNo, full_key: &str) -> Crc32c { + let mut hasher = crc32c::Hasher::new(); + hasher.update(ino.to_be_bytes().as_ref()); + hasher.update(full_key.as_bytes()); + hasher.finalize() + } + + /// Produce a description of this Inode for use in errors + pub fn err(&self) -> InodeErrorInfo { + InodeErrorInfo(self.clone()) + } +} + +pub fn valid_inode_name>(name: T) -> bool { + let name = name.as_ref(); + // Names cannot be empty + !name.is_empty() && + // "." and ".." are reserved names (presented by the filesystem layer) + name != "." && + name != ".." && + // The delimiter / can never appear in a name + !name.as_bytes().contains(&b'/') && + // NUL is invalid in POSIX names + !name.as_bytes().contains(&b'\0') +} + +/// A wrapper that prints useful customer-facing error messages for inodes by including the object +/// key rather than just the inode number. +pub struct InodeErrorInfo(Inode); + +impl Display for InodeErrorInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} (full key {:?})", self.0.ino(), self.0.full_key()) + } +} + +impl Debug for InodeErrorInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +#[derive(Debug)] +pub struct InodeState { + pub stat: InodeStat, + pub write_status: WriteStatus, + pub kind_data: InodeKindData, + /// Number of references the kernel is holding to the [Inode]. + /// A number of FS operations increment this, while the kernel calls [`Inode::forget(ino, n)`] to decrement. + lookup_count: u64, + /// Number of active prefetching streams on the [Inode]. + pub reader_count: u64, +} + +impl InodeState { + pub fn new(stat: &InodeStat, kind: InodeKind, write_status: WriteStatus) -> Self { + Self { + stat: stat.clone(), + kind_data: InodeKindData::default_for(kind), + write_status, + lookup_count: 0, + reader_count: 0, + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum InodeKind { + File, + Directory, +} + +impl InodeKind { + pub fn as_str(&self) -> &'static str { + match self { + InodeKind::File => "file", + InodeKind::Directory => "directory", + } + } +} + +impl From for FileType { + fn from(kind: InodeKind) -> Self { + match kind { + InodeKind::File => FileType::RegularFile, + InodeKind::Directory => FileType::Directory, + } + } +} + +#[derive(Debug)] +pub enum InodeKindData { + File {}, + Directory { + /// Mapping from child names to previously seen [Inode]s. + /// + /// The existence of a child or lack thereof does not imply the object does not exist, + /// nor that it currently exists in S3 in that state. + children: HashMap, + + /// A set of inode numbers that have been opened for write but not completed yet. + /// This should be a subset of the [children](Self::Directory::children) field. + writing_children: HashSet, + + /// True if this directory has been deleted (`rmdir`) from its parent + deleted: bool, + }, +} + +impl InodeKindData { + pub fn default_for(kind: InodeKind) -> Self { + match kind { + InodeKind::File => Self::File {}, + InodeKind::Directory => Self::Directory { + children: Default::default(), + writing_children: Default::default(), + deleted: false, + }, + } + } +} + +#[derive(Debug, Clone)] +pub struct InodeStat { + /// Time this stat becomes invalid and needs to be refreshed + pub expiry: Expiry, + + /// Size in bytes + pub size: usize, + + /// Time of last file content modification + pub mtime: OffsetDateTime, + /// Time of last file metadata (or content) change + pub ctime: OffsetDateTime, + /// Time of last access + pub atime: OffsetDateTime, + /// Etag for the file (object) + pub etag: Option, + /// Inodes corresponding to S3 objects with GLACIER or DEEP_ARCHIVE storage classes + /// are only readable after restoration. For objects with other storage classes + /// this field should be always `true`. + pub is_readable: bool, +} + +/// Inode write status (local vs remote) +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum WriteStatus { + /// Local inode created but not yet opened + LocalUnopened, + /// Local inode already opened + LocalOpen, + /// Remote inode + Remote, +} + +impl InodeStat { + pub fn is_valid(&self) -> bool { + !self.expiry.is_expired() + } + + /// Objects in flexible retrieval storage classes can't be accessed via GetObject unless they are + /// restored, and so we override their permissions to 000 and reject reads to them. We also warn + /// the first time we see an object like this, because FUSE enforces the 000 permissions on our + /// behalf so we might not see an attempted `open` call. + fn is_readable(storage_class: Option, restore_status: Option) -> bool { + static HAS_SENT_WARNING: AtomicBool = AtomicBool::new(false); + match storage_class.as_deref() { + Some("GLACIER") | Some("DEEP_ARCHIVE") => { + let restored = + matches!(restore_status, Some(RestoreStatus::Restored { expiry }) if expiry > SystemTime::now()); + if !restored && !HAS_SENT_WARNING.swap(true, Ordering::SeqCst) { + tracing::warn!( + "objects in the GLACIER and DEEP_ARCHIVE storage classes are only accessible if restored" + ); + } + restored + } + _ => true, + } + } + + /// Initialize an [InodeStat] for a file, given some metadata. + pub fn for_file( + size: usize, + datetime: OffsetDateTime, + etag: Option, + storage_class: Option, + restore_status: Option, + validity: Duration, + ) -> InodeStat { + let is_readable = Self::is_readable(storage_class, restore_status); + InodeStat { + expiry: Expiry::from_now(validity), + size, + atime: datetime, + ctime: datetime, + mtime: datetime, + etag, + is_readable, + } + } + + /// Initialize an [InodeStat] for a directory, given some metadata. + pub fn for_directory(datetime: OffsetDateTime, validity: Duration) -> InodeStat { + InodeStat { + expiry: Expiry::from_now(validity), + size: 0, + atime: datetime, + ctime: datetime, + mtime: datetime, + etag: None, + is_readable: true, + } + } + + pub fn update_validity(&mut self, validity: Duration) { + self.expiry = Expiry::from_now(validity); + } +} + +#[cfg(test)] +mod tests { + use crate::superblock::Superblock; + use mountpoint_s3_client::{ + mock_client::{MockClient, MockClientConfig, MockObject}, + types::ETag, + }; + use time::Duration; + + use super::*; + + #[tokio::test] + async fn test_forget() { + let superblock = Superblock::new("test_bucket", &Default::default(), Default::default()); + let ino = 42; + let inode_name = "made-up-inode"; + let inode = Inode::new( + ino, + ROOT_INODE_NO, + inode_name.to_owned(), + inode_name.to_owned(), + InodeKind::File, + InodeState { + write_status: WriteStatus::Remote, + stat: InodeStat::for_file(0, OffsetDateTime::now_utc(), None, None, None, Default::default()), + kind_data: InodeKindData::File {}, + lookup_count: 5, + reader_count: 0, + }, + ); + superblock.inner.inodes.write().unwrap().insert(ino, inode.clone()); + + superblock.forget(ino, 3); + let lookup_count = { + let inode_state = inode.inner.sync.read().unwrap(); + inode_state.lookup_count + }; + assert_eq!(lookup_count, 2, "lookup should have been reduced"); + assert!( + superblock.inner.get(ino).is_ok(), + "inode should be present in superblock" + ); + + superblock.forget(ino, 2); + let lookup_count = { + let inode_state = inode.inner.sync.read().unwrap(); + inode_state.lookup_count + }; + assert_eq!(lookup_count, 0, "lookup should have been reduced"); + assert!( + superblock.inner.inodes.read().unwrap().get(&ino).is_none(), + "inode should not be present in superblock" + ); + + // Make sure we didn't leak the inode anywhere else + assert_eq!(Arc::strong_count(&inode.inner), 1); + } + + #[tokio::test] + async fn test_forget_can_remove_inodes() { + let client_config = MockClientConfig { + bucket: "test_bucket".to_string(), + part_size: 1024 * 1024, + ..Default::default() + }; + let client = Arc::new(MockClient::new(client_config)); + + let name = "foo"; + client.add_object(name, b"foo".into()); + + let superblock = Superblock::new("test_bucket", &Default::default(), Default::default()); + + let lookup = superblock.lookup(&client, ROOT_INODE_NO, name.as_ref()).await.unwrap(); + let lookup_count = lookup.inode.inner.sync.read().unwrap().lookup_count; + assert_eq!(lookup_count, 1); + let ino = lookup.inode.ino(); + + superblock.forget(ino, 1); + + let lookup_count = lookup.inode.inner.sync.read().unwrap().lookup_count; + assert_eq!(lookup_count, 0); + // This test should now hold the only reference to the inode, so we know it's unreferenced + // and will be freed + assert_eq!(Arc::strong_count(&lookup.inode.inner), 1); + drop(lookup); + + let err = superblock + .getattr(&client, ino, false) + .await + .expect_err("Inode should not be valid"); + assert!(matches!(err, InodeError::InodeDoesNotExist(_))); + + let lookup = superblock.lookup(&client, ROOT_INODE_NO, name.as_ref()).await.unwrap(); + let lookup_count = lookup.inode.inner.sync.read().unwrap().lookup_count; + assert_eq!(lookup_count, 1); + } + + #[tokio::test] + async fn test_forget_shadowed_inode() { + let client_config = MockClientConfig { + bucket: "test_bucket".to_string(), + part_size: 1024 * 1024, + ..Default::default() + }; + let client = Arc::new(MockClient::new(client_config)); + + let name = "foo"; + client.add_object(name, b"foo".into()); + + let superblock = Superblock::new("test_bucket", &Default::default(), Default::default()); + + let lookup = superblock.lookup(&client, ROOT_INODE_NO, name.as_ref()).await.unwrap(); + let lookup_count = lookup.inode.inner.sync.read().unwrap().lookup_count; + assert_eq!(lookup_count, 1); + let ino = lookup.inode.ino(); + drop(lookup); + + client.add_object(&format!("{name}/bar"), b"bar".into()); + + // Should be a directory now, so a different inode + let new_lookup = superblock.lookup(&client, ROOT_INODE_NO, name.as_ref()).await.unwrap(); + assert_ne!(ino, new_lookup.inode.ino()); + + superblock.forget(ino, 1); + + // Lookup still works after forgetting the old inode + let new_lookup2 = superblock.lookup(&client, ROOT_INODE_NO, name.as_ref()).await.unwrap(); + assert_eq!(new_lookup.inode.ino(), new_lookup2.inode.ino()); + } + + #[tokio::test] + async fn test_unlink_verify_checksum() { + let client_config = MockClientConfig { + bucket: "test_bucket".to_string(), + part_size: 1024 * 1024, + ..Default::default() + }; + let client = Arc::new(MockClient::new(client_config)); + let file_name = "corrupted"; + client.add_object(file_name.as_ref(), MockObject::constant(0xaa, 30, ETag::for_tests())); + + let superblock = Superblock::new("test_bucket", &Default::default(), Default::default()); + + // Create an inode with "corrupted" metadata, i.e. + // checksum not matching ino + full key. + let parent_ino = ROOT_INODE_NO; + let bad_checksum = Crc32c::new(42); + let inode = Inode { + inner: Arc::new(InodeInner { + ino: 42, + parent: parent_ino, + name: file_name.into(), + full_key: file_name.into(), + kind: InodeKind::File, + checksum: bad_checksum, + sync: RwLock::new(InodeState { + stat: InodeStat::for_file( + 0, + OffsetDateTime::now_utc(), + Some(ETag::for_tests().as_str().to_owned()), + None, + None, + NEVER_EXPIRE_TTL, + ), + write_status: WriteStatus::Remote, + kind_data: InodeKindData::File {}, + lookup_count: 1, + reader_count: 0, + }), + }), + }; + + // Manually add the corrupted inode to the superblock and root directory. + { + let mut inodes = superblock.inner.inodes.write().unwrap(); + inodes.insert(inode.ino(), inode.clone()); + let parent = inodes.get(&parent_ino).unwrap(); + let mut parent_state = parent.get_mut_inode_state().unwrap(); + match &mut parent_state.kind_data { + InodeKindData::File {} => panic!("root is always a directory"), + InodeKindData::Directory { children, .. } => _ = children.insert(file_name.into(), inode.clone()), + } + } + + let err = superblock + .unlink(&client, parent_ino, file_name.as_ref()) + .await + .expect_err("unlink of a corrupted inode should fail"); + assert!(matches!(err, InodeError::CorruptedMetadata(_))); + } + + #[tokio::test] + async fn test_setattr_invalid_stat() { + let client_config = MockClientConfig { + bucket: "test_bucket".to_string(), + part_size: 1024 * 1024, + ..Default::default() + }; + let client = Arc::new(MockClient::new(client_config)); + let superblock = Superblock::new("test_bucket", &Default::default(), Default::default()); + + let ino: u64 = 42; + let inode_name = "made-up-inode"; + let mut hasher = crc32c::Hasher::new(); + hasher.update(ino.to_be_bytes().as_ref()); + hasher.update(inode_name.as_bytes()); + let checksum = hasher.finalize(); + let inode = Inode { + inner: Arc::new(InodeInner { + ino, + parent: ROOT_INODE_NO, + name: inode_name.to_owned(), + full_key: inode_name.to_owned(), + kind: InodeKind::File, + checksum, + sync: RwLock::new(InodeState { + write_status: WriteStatus::LocalOpen, + stat: InodeStat::for_file(0, OffsetDateTime::UNIX_EPOCH, None, None, None, Default::default()), + kind_data: InodeKindData::File {}, + lookup_count: 5, + reader_count: 0, + }), + }), + }; + superblock.inner.inodes.write().unwrap().insert(ino, inode.clone()); + + // Verify that the stat is invalid + let inode = superblock.inner.get(ino).unwrap(); + let stat = inode.get_inode_state().unwrap().stat.clone(); + assert!(!stat.is_valid()); + + // Should be able to reset expiry back and make stat valid when calling setattr + let atime = OffsetDateTime::UNIX_EPOCH + Duration::days(90); + let mtime = OffsetDateTime::UNIX_EPOCH + Duration::days(60); + let lookup = superblock + .setattr(&client, ino, Some(atime), Some(mtime)) + .await + .expect("setattr should be successful"); + let stat = lookup.stat; + assert_eq!(stat.atime, atime); + assert_eq!(stat.mtime, mtime); + assert!(stat.is_valid()); + } + + #[cfg(feature = "shuttle")] + mod shuttle_tests { + use mountpoint_s3_client::mock_client::{MockClient, MockClientConfig}; + use shuttle::{check_pct, check_random, thread}; + use shuttle::{future::block_on, sync::Arc}; + + use super::*; + + #[test] + fn test_create_and_forget_race_condition() { + async fn test_helper() { + let client_config = MockClientConfig { + bucket: "test_bucket".to_string(), + part_size: 1024 * 1024, + ..Default::default() + }; + let client = Arc::new(MockClient::new(client_config)); + + let name = "foo"; + client.add_object(name, b"foo".into()); + + let superblock = Arc::new(Superblock::new("test_bucket", &Default::default(), Default::default())); + + let lookup = superblock.lookup(&client, ROOT_INODE_NO, name.as_ref()).await.unwrap(); + let lookup_count = lookup.inode.inner.sync.read().unwrap().lookup_count; + assert_eq!(lookup_count, 1); + let ino = lookup.inode.ino(); + + let superblock_clone = superblock.clone(); + let forget_task = thread::spawn(move || { + superblock_clone.forget(ino, 1); + }); + + let file_name = "bar"; + superblock + .create(&client, ROOT_INODE_NO, file_name.as_ref(), InodeKind::File) + .await + .unwrap(); + + forget_task.join().unwrap(); + let lookup_count = lookup.inode.inner.sync.read().unwrap().lookup_count; + assert_eq!(lookup_count, 0); + } + + check_random(|| block_on(test_helper()), 1000); + check_pct(|| block_on(test_helper()), 1000, 3); + } + } +} diff --git a/mountpoint-s3/src/inode/negative_cache.rs b/mountpoint-s3/src/superblock/negative_cache.rs similarity index 100% rename from mountpoint-s3/src/inode/negative_cache.rs rename to mountpoint-s3/src/superblock/negative_cache.rs diff --git a/mountpoint-s3/src/inode/readdir.rs b/mountpoint-s3/src/superblock/readdir.rs similarity index 99% rename from mountpoint-s3/src/inode/readdir.rs rename to mountpoint-s3/src/superblock/readdir.rs index a6dc5a7d9..0a7a4ca07 100644 --- a/mountpoint-s3/src/inode/readdir.rs +++ b/mountpoint-s3/src/superblock/readdir.rs @@ -51,7 +51,8 @@ use tracing::{error, trace, warn}; use crate::sync::{Arc, AsyncMutex, Mutex}; use super::{ - valid_inode_name, InodeError, InodeKind, InodeKindData, InodeNo, InodeStat, LookedUp, RemoteLookup, SuperblockInner, + inode::valid_inode_name, InodeError, InodeKind, InodeKindData, InodeNo, InodeStat, LookedUp, RemoteLookup, + SuperblockInner, }; /// Handle for an inflight directory listing