From bc9b19c7652048d9461f5985ce8738fb7d640e25 Mon Sep 17 00:00:00 2001 From: Isabella Bosia Date: Mon, 1 Dec 2025 16:14:13 +0000 Subject: [PATCH 1/7] implement O_TMPFILE --- kmod/inode.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/kmod/inode.c b/kmod/inode.c index 7fd90ec3..017e3b21 100644 --- a/kmod/inode.c +++ b/kmod/inode.c @@ -607,6 +607,30 @@ static int COMPAT_FUNC_UNS_IMP(ternfs_symlink, struct inode* dir, struct dentry* return 0; } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,6,0) +static int COMPAT_FUNC_UNS_IMP(ternfs_tmpfile, struct inode* dir, struct file* file, umode_t mode) { + struct dentry* dentry = file->f_path.dentry; // dentry with a "fake" name +#else +static int COMPAT_FUNC_UNS_IMP(ternfs_tmpfile, struct inode* dir, struct dentry* dentry, umode_t mode) { +#endif + ternfs_debug("ternfs_tempfile: name: %s", dentry->d_name.name); + struct ternfs_inode* enode = ternfs_create_internal(dir, TERNFS_INODE_FILE, dentry); + if (IS_ERR(enode)) { return PTR_ERR(enode); } + + // the file is created in writing status by ternfs_create_internal + // and it remains unlinked until linkat() is called + // once link is called we can then assign a name + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,6,0) + // NOTE(review): this path makes no d_tmpfile() call, unlike the older-kernel branch below; it assumes the dentry is already instantiated when finish_open_simple() runs (TODO confirm) + return finish_open_simple(file, 0); +#else + // for older kernels, we need to instantiate as unhashed + d_tmpfile(dentry, &enode->inode); + return 0; +#endif +} + static const char* ternfs_get_link(struct dentry* dentry, struct inode* inode, struct delayed_call* destructor) { // Can't be bothered to think about RCU if (dentry == NULL) { return ERR_PTR(-ECHILD); } @@ -631,6 +655,7 @@ static const struct inode_operations ternfs_dir_inode_ops = { .rename = ternfs_rename, .getattr = ternfs_getattr, .symlink = ternfs_symlink, + .tmpfile = ternfs_tmpfile, }; static const struct inode_operations ternfs_file_inode_ops = { From c443e117a6317f5b7ebe69e05680e89b2d417575 Mon Sep 17 00:00:00
2001 From: Isabella Bosia Date: Mon, 1 Dec 2025 21:26:56 +0000 Subject: [PATCH 2/7] impl link --- kmod/file.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++ kmod/file.h | 1 + kmod/inode.c | 1 + 3 files changed, 152 insertions(+) diff --git a/kmod/file.c b/kmod/file.c index 6e9496c0..9276ca8a 100644 --- a/kmod/file.c +++ b/kmod/file.c @@ -1007,6 +1007,156 @@ int ternfs_file_flush(struct ternfs_inode* enode, struct dentry* dentry) { return err; } +static int flush_and_link(struct ternfs_inode *enode, struct dentry *parent, unsigned long i_ino, const char *name, size_t name_len) { + int err = 0; + bool file_is_alive_and_flushing = false; + + err = atomic_read(&enode->file.transient_err); + if (err < 0) return err; + + err = start_flushing(enode, false); + if (err < 0) return err; + + down(&enode->file.flushing_span_sema); + file_is_alive_and_flushing = true; + + err = atomic_read(&enode->file.transient_err); + if (err < 0) return err; + + // finally link the file + ternfs_debug("linking file"); + err = ternfs_error_to_linux(ternfs_shard_link_file( + (struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, + enode->file.cookie, i_ino, name, name_len, + &enode->edge_creation_time + )); + if (err < 0) return err; + + // update timestamps + inode_set_mtime(&enode->inode, enode->edge_creation_time / 1000000000, enode->edge_creation_time % 1000000000); + inode_set_ctime(&enode->inode, enode->edge_creation_time / 1000000000, enode->edge_creation_time % 1000000000); + + // file is now flushed and immutable + enode->file.status = TERNFS_FILE_STATUS_READING; + smp_store_release(&enode->getattr_expiry, 0); + + // expire parent directory listing + { + WRITE_ONCE(TERNFS_I(d_inode(parent))->dir.mtime_expiry, 0); + dput(parent); + } + return err; +} + +int ternfs_link(struct dentry* old_dentry, struct inode* dir, struct dentry* new_dentry) { + struct inode* inode = d_inode(old_dentry); + struct ternfs_inode* enode = TERNFS_I(inode); + + 
inode_lock(&enode->inode); + + int err = 0; + + // TODO: there are probably cases in which this could be allowed (e.g. cross directory things that happen to use identical storage) + if (!old_dentry->d_parent || old_dentry->d_parent->d_inode != dir) { + ternfs_debug("tried to link a file not in the right directory"); + err = -EXDEV; + goto out_early; + } + + if (inode->i_nlink > 0) { + ternfs_debug("tried to hardlink an existing file"); + err = -EINVAL; + goto out_early; + } + + // O_TMPFILE files have I_LINKABLE set (it's unset on e.g. deleted files) + // TODO: do we even get called if I_LINKABLE is not set? + if (!(inode->i_state & I_LINKABLE)) { + ternfs_debug("file not linkable"); + err = -EINVAL; + goto out_early; + } + + if (enode->file.status != TERNFS_FILE_STATUS_WRITING) { + ternfs_debug("status=%d, won't link", enode->file.status); + err = -EINVAL; + goto out_early; + } + + if (enode->file.owner != current->group_leader) { + ternfs_debug("owner=%p != group_leader=%p, won't link", enode->file.owner, current->group_leader); + err = -EPERM; + goto out_early; + } + + bool file_is_alive_and_flushing = false; + + err = atomic_read(&enode->file.transient_err); + if (err < 0) { goto out; } + + err = start_flushing(enode, false); + if (err < 0) { goto out; } + + down(&enode->file.flushing_span_sema); + file_is_alive_and_flushing = true; + + err = atomic_read(&enode->file.transient_err); + if (err < 0) { goto out; } + + // finally link the file + ternfs_debug("linking file"); + err = ternfs_error_to_linux(ternfs_shard_link_file( + (struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, + enode->file.cookie, dir->i_ino, new_dentry->d_name.name, new_dentry->d_name.len, + &enode->edge_creation_time + )); + if (err < 0) { goto out; } + + // update timestamps + inode_set_mtime(&enode->inode, enode->edge_creation_time / 1000000000, enode->edge_creation_time % 1000000000); + inode_set_ctime(&enode->inode, enode->edge_creation_time / 1000000000, 
enode->edge_creation_time % 1000000000); + + // file is now flushed and immutable + enode->file.status = TERNFS_FILE_STATUS_READING; + smp_store_release(&enode->getattr_expiry, 0); + + // expire parent directory listing + { + struct dentry* parent = dget_parent(new_dentry); + WRITE_ONCE(TERNFS_I(d_inode(parent))->dir.mtime_expiry, 0); + dput(parent); + } + + // increment inode refcount and instantiate dentry + ihold(&enode->inode); + d_instantiate(new_dentry, &enode->inode); + +out: + if (err) { + atomic_cmpxchg(&enode->file.transient_err, 0, err); + } + if (!file_is_alive_and_flushing) { + down(&enode->file.flushing_span_sema); + up(&enode->file.flushing_span_sema); + } + // clean up writing span + if (enode->file.writing_span != NULL) { + BUG_ON(!put_transient_span(enode->file.writing_span)); + enode->file.writing_span = NULL; + } + BUG_ON(enode->file.writing_span != NULL); + if (enode->file.mm) { + mmdrop(enode->file.mm); + } + enode->file.mm = NULL; + inode_unlock(&enode->inode); + return err; + +out_early: + inode_unlock(&enode->inode); + return err; +} + static int file_flush_internal(struct file* filp, fl_owner_t id) { // can we get write while this is in progress? struct ternfs_inode* enode = TERNFS_I(filp->f_inode); struct dentry* dentry = filp->f_path.dentry; diff --git a/kmod/file.h b/kmod/file.h index 49fbae82..fcc49570 100644 --- a/kmod/file.h +++ b/kmod/file.h @@ -16,6 +16,7 @@ extern int ternfs_file_getattr_refresh_time_jiffies; // this is only relevant fo ssize_t ternfs_file_write(struct ternfs_inode* enode, int flags, loff_t* ppos, struct iov_iter* from); int ternfs_file_flush(struct ternfs_inode* enode, struct dentry* dentry); +int ternfs_link(struct dentry* old_dentry, struct inode* dir, struct dentry* new_dentry); // Also used in ternfs_do_ftruncate to fill the end of the file. 
ssize_t ternfs_file_write_internal(struct ternfs_inode* enode, int flags, loff_t* ppos, struct iov_iter* from, size_t count); diff --git a/kmod/inode.c b/kmod/inode.c index 017e3b21..6c6c47e4 100644 --- a/kmod/inode.c +++ b/kmod/inode.c @@ -649,6 +649,7 @@ static const char* ternfs_get_link(struct dentry* dentry, struct inode* inode, s static const struct inode_operations ternfs_dir_inode_ops = { .create = ternfs_create, .lookup = ternfs_lookup, + .link = ternfs_link, .unlink = ternfs_unlink, .mkdir = ternfs_mkdir, .rmdir = ternfs_rmdir, From 20e7a2331ef4df7fa1c69ec62c9a58a4fd153b42 Mon Sep 17 00:00:00 2001 From: Isabella Bosia Date: Mon, 1 Dec 2025 22:28:01 +0000 Subject: [PATCH 3/7] simplify code path --- kmod/file.c | 162 ++++++++++------------------------------------------ 1 file changed, 29 insertions(+), 133 deletions(-) diff --git a/kmod/file.c b/kmod/file.c index 9276ca8a..e8c6e7e3 100644 --- a/kmod/file.c +++ b/kmod/file.c @@ -899,23 +899,13 @@ static ssize_t file_write_iter(struct kiocb* iocb, struct iov_iter* from) { return res; } -int ternfs_file_flush(struct ternfs_inode* enode, struct dentry* dentry) { - inode_lock(&enode->inode); +// shared functionality of ternfs_flush and ternfs_link +// takes the file we're trying to flush/link, the directory we're going to put it in, and a name/len pair to assign +static int flush_and_link(struct ternfs_inode *enode, struct dentry *parent, const char *name, size_t name_len) { + BUG_ON(!inode_is_locked(&enode->inode)); int err = 0; - // Not writing, there's nothing to do, there's nothing to do, files are immutable - if (enode->file.status != TERNFS_FILE_STATUS_WRITING) { - ternfs_debug("status=%d, won't flush", enode->file.status); - goto out_early; - } - - // We are in another process, skip - if (enode->file.owner != current->group_leader) { - ternfs_debug("owner=%p != group_leader=%p, won't flush", enode->file.owner, current->group_leader); - goto out_early; - } - bool file_is_alive_and_flushing = false; // if 
we've errored out already, just exit @@ -948,7 +938,7 @@ int ternfs_file_flush(struct ternfs_inode* enode, struct dentry* dentry) { ternfs_debug("linking file"); err = ternfs_error_to_linux(ternfs_shard_link_file( (struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, - enode->file.cookie, dentry->d_parent->d_inode->i_ino, dentry->d_name.name, dentry->d_name.len, + enode->file.cookie, parent->d_inode->i_ino, name, name_len, &enode->edge_creation_time )); if (err < 0) { goto out; } @@ -964,7 +954,6 @@ int ternfs_file_flush(struct ternfs_inode* enode, struct dentry* dentry) { // expire the directory listing -- we know for a fact that it // is wrong, it now contains this file. { - struct dentry* parent = dget_parent(dentry); WRITE_ONCE(TERNFS_I(d_inode(parent))->dir.mtime_expiry, 0); dput(parent); } @@ -999,52 +988,30 @@ int ternfs_file_flush(struct ternfs_inode* enode, struct dentry* dentry) { mmdrop(enode->file.mm); } enode->file.mm = NULL; - inode_unlock(&enode->inode); - return err; -out_early: - inode_unlock(&enode->inode); return err; } -static int flush_and_link(struct ternfs_inode *enode, struct dentry *parent, unsigned long i_ino, const char *name, size_t name_len) { +int ternfs_file_flush(struct ternfs_inode* enode, struct dentry* dentry) { int err = 0; - bool file_is_alive_and_flushing = false; - - err = atomic_read(&enode->file.transient_err); - if (err < 0) return err; - - err = start_flushing(enode, false); - if (err < 0) return err; - - down(&enode->file.flushing_span_sema); - file_is_alive_and_flushing = true; - - err = atomic_read(&enode->file.transient_err); - if (err < 0) return err; + inode_lock(&enode->inode); - // finally link the file - ternfs_debug("linking file"); - err = ternfs_error_to_linux(ternfs_shard_link_file( - (struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, - enode->file.cookie, i_ino, name, name_len, - &enode->edge_creation_time - )); - if (err < 0) return err; + // Not writing, there's 
nothing to do, there's nothing to do, files are immutable + if (enode->file.status != TERNFS_FILE_STATUS_WRITING) { + ternfs_debug("status=%d, won't flush", enode->file.status); + goto out; + } - // update timestamps - inode_set_mtime(&enode->inode, enode->edge_creation_time / 1000000000, enode->edge_creation_time % 1000000000); - inode_set_ctime(&enode->inode, enode->edge_creation_time / 1000000000, enode->edge_creation_time % 1000000000); + // We are in another process, skip + if (enode->file.owner != current->group_leader) { + ternfs_debug("owner=%p != group_leader=%p, won't flush", enode->file.owner, current->group_leader); + goto out; + } - // file is now flushed and immutable - enode->file.status = TERNFS_FILE_STATUS_READING; - smp_store_release(&enode->getattr_expiry, 0); + err = flush_and_link(enode, dentry->d_parent, dentry->d_name.name, dentry->d_name.len); - // expire parent directory listing - { - WRITE_ONCE(TERNFS_I(d_inode(parent))->dir.mtime_expiry, 0); - dput(parent); - } +out: + inode_unlock(&enode->inode); return err; } @@ -1058,101 +1025,30 @@ int ternfs_link(struct dentry* old_dentry, struct inode* dir, struct dentry* new // TODO: there are probably cases in which this could be allowed (e.g. cross directory things that happen to use identical storage) if (!old_dentry->d_parent || old_dentry->d_parent->d_inode != dir) { - ternfs_debug("tried to link a file not in the right directory"); + ternfs_debug("tried to link a file in a different directory than the one it was opened in"); err = -EXDEV; - goto out_early; - } - - if (inode->i_nlink > 0) { - ternfs_debug("tried to hardlink an existing file"); - err = -EINVAL; - goto out_early; - } - - // O_TMPFILE files have I_LINKABLE set (it's unset on e.g. deleted files) - // TODO: do we even get called if I_LINKABLE is not set? 
- if (!(inode->i_state & I_LINKABLE)) { - ternfs_debug("file not linkable"); - err = -EINVAL; - goto out_early; + goto out; } + // linking existing files is not allowed + // TODO: check i_nlink once we actually start reporting link counts if (enode->file.status != TERNFS_FILE_STATUS_WRITING) { ternfs_debug("status=%d, won't link", enode->file.status); err = -EINVAL; - goto out_early; + goto out; } + // this is not an error in normal flush (because other processes could close the fd) but linking would be weird if (enode->file.owner != current->group_leader) { ternfs_debug("owner=%p != group_leader=%p, won't link", enode->file.owner, current->group_leader); err = -EPERM; - goto out_early; - } - - bool file_is_alive_and_flushing = false; - - err = atomic_read(&enode->file.transient_err); - if (err < 0) { goto out; } - - err = start_flushing(enode, false); - if (err < 0) { goto out; } - - down(&enode->file.flushing_span_sema); - file_is_alive_and_flushing = true; - - err = atomic_read(&enode->file.transient_err); - if (err < 0) { goto out; } - - // finally link the file - ternfs_debug("linking file"); - err = ternfs_error_to_linux(ternfs_shard_link_file( - (struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, - enode->file.cookie, dir->i_ino, new_dentry->d_name.name, new_dentry->d_name.len, - &enode->edge_creation_time - )); - if (err < 0) { goto out; } - - // update timestamps - inode_set_mtime(&enode->inode, enode->edge_creation_time / 1000000000, enode->edge_creation_time % 1000000000); - inode_set_ctime(&enode->inode, enode->edge_creation_time / 1000000000, enode->edge_creation_time % 1000000000); - - // file is now flushed and immutable - enode->file.status = TERNFS_FILE_STATUS_READING; - smp_store_release(&enode->getattr_expiry, 0); - - // expire parent directory listing - { - struct dentry* parent = dget_parent(new_dentry); - WRITE_ONCE(TERNFS_I(d_inode(parent))->dir.mtime_expiry, 0); - dput(parent); + goto out; } - // increment inode 
refcount and instantiate dentry - ihold(&enode->inode); - d_instantiate(new_dentry, &enode->inode); + struct dentry* parent = dget_parent(new_dentry); + err = flush_and_link(enode, parent, new_dentry->d_name.name, new_dentry->d_name.len); out: - if (err) { - atomic_cmpxchg(&enode->file.transient_err, 0, err); - } - if (!file_is_alive_and_flushing) { - down(&enode->file.flushing_span_sema); - up(&enode->file.flushing_span_sema); - } - // clean up writing span - if (enode->file.writing_span != NULL) { - BUG_ON(!put_transient_span(enode->file.writing_span)); - enode->file.writing_span = NULL; - } - BUG_ON(enode->file.writing_span != NULL); - if (enode->file.mm) { - mmdrop(enode->file.mm); - } - enode->file.mm = NULL; - inode_unlock(&enode->inode); - return err; - -out_early: inode_unlock(&enode->inode); return err; } From 29ed28ac24ef714f5db3c4988f691d5bac5fa79d Mon Sep 17 00:00:00 2001 From: Isabella Bosia Date: Mon, 1 Dec 2025 23:26:40 +0000 Subject: [PATCH 4/7] fix double locking in link --- kmod/file.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kmod/file.c b/kmod/file.c index e8c6e7e3..5bff2005 100644 --- a/kmod/file.c +++ b/kmod/file.c @@ -1019,12 +1019,15 @@ int ternfs_link(struct dentry* old_dentry, struct inode* dir, struct dentry* new struct inode* inode = d_inode(old_dentry); struct ternfs_inode* enode = TERNFS_I(inode); - inode_lock(&enode->inode); + // should be done by vfs_link + BUG_ON(!inode_is_locked(inode)); int err = 0; + struct dentry* parent = dget_parent(old_dentry); + // TODO: there are probably cases in which this could be allowed (e.g. 
cross directory things that happen to use identical storage) - if (!old_dentry->d_parent || old_dentry->d_parent->d_inode != dir) { + if (!parent || parent->d_inode != dir) { ternfs_debug("tried to link a file in a different directory than the one it was opened in"); err = -EXDEV; goto out; @@ -1045,11 +1048,9 @@ int ternfs_link(struct dentry* old_dentry, struct inode* dir, struct dentry* new goto out; } - struct dentry* parent = dget_parent(new_dentry); err = flush_and_link(enode, parent, new_dentry->d_name.name, new_dentry->d_name.len); out: - inode_unlock(&enode->inode); return err; } From b8c23d022b7ea0ee6b77ce34c1de098d2902feed Mon Sep 17 00:00:00 2001 From: Isabella Bosia Date: Tue, 2 Dec 2025 00:04:39 +0000 Subject: [PATCH 5/7] don't link tmpfiles on close --- kmod/file.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kmod/file.c b/kmod/file.c index 5bff2005..ebfa4d6c 100644 --- a/kmod/file.c +++ b/kmod/file.c @@ -1057,6 +1057,11 @@ int ternfs_link(struct dentry* old_dentry, struct inode* dir, struct dentry* new static int file_flush_internal(struct file* filp, fl_owner_t id) { // can we get write while this is in progress? 
struct ternfs_inode* enode = TERNFS_I(filp->f_inode); struct dentry* dentry = filp->f_path.dentry; + + // ternfs_file_flush also links, but tmpfiles are only linked when linkat is called + if (unlikely(filp->f_flags & __O_TMPFILE)) + return 0; + return ternfs_file_flush(enode, dentry); } From cc79221543984620c77be3fb1eeb0599356c6d63 Mon Sep 17 00:00:00 2001 From: Isabella Bosia Date: Tue, 2 Dec 2025 10:21:46 +0000 Subject: [PATCH 6/7] address comment --- kmod/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kmod/file.c b/kmod/file.c index ebfa4d6c..68ab9487 100644 --- a/kmod/file.c +++ b/kmod/file.c @@ -1026,7 +1026,7 @@ int ternfs_link(struct dentry* old_dentry, struct inode* dir, struct dentry* new struct dentry* parent = dget_parent(old_dentry); - // TODO: there are probably cases in which this could be allowed (e.g. cross directory things that happen to use identical storage) + // TODO: there are probably cases in which this could be allowed (e.g. cross directory things that happen to be in the same shard) if (!parent || parent->d_inode != dir) { ternfs_debug("tried to link a file in a different directory than the one it was opened in"); err = -EXDEV; From 43450dff76dca30c9aa98f93be70df137e160eab Mon Sep 17 00:00:00 2001 From: Isabella Bosia Date: Tue, 2 Dec 2025 10:30:02 +0000 Subject: [PATCH 7/7] always dget the parent and always dput it --- kmod/file.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/kmod/file.c b/kmod/file.c index 68ab9487..2f0f3dc6 100644 --- a/kmod/file.c +++ b/kmod/file.c @@ -901,6 +901,7 @@ static ssize_t file_write_iter(struct kiocb* iocb, struct iov_iter* from) { // shared functionality of ternfs_flush and ternfs_link // takes the file we're trying to flush/link, the directory we're going to put it in, and a name/len pair to assign +// note: caller must take care of dget_parent/dput static int flush_and_link(struct ternfs_inode *enode, struct dentry *parent, const char *name, 
size_t name_len) { BUG_ON(!inode_is_locked(&enode->inode)); @@ -953,10 +954,7 @@ static int flush_and_link(struct ternfs_inode *enode, struct dentry *parent, con // expire the directory listing -- we know for a fact that it // is wrong, it now contains this file. - { - WRITE_ONCE(TERNFS_I(d_inode(parent))->dir.mtime_expiry, 0); - dput(parent); - } + WRITE_ONCE(TERNFS_I(d_inode(parent))->dir.mtime_expiry, 0); out: if (err) { @@ -1008,7 +1006,13 @@ int ternfs_file_flush(struct ternfs_inode* enode, struct dentry* dentry) { goto out; } - err = flush_and_link(enode, dentry->d_parent, dentry->d_name.name, dentry->d_name.len); + + struct dentry *parent = dget_parent(dentry); + + err = flush_and_link(enode, parent, dentry->d_name.name, dentry->d_name.len); + + if (parent) + dput(parent); out: inode_unlock(&enode->inode); @@ -1051,6 +1055,9 @@ int ternfs_link(struct dentry* old_dentry, struct inode* dir, struct dentry* new err = flush_and_link(enode, parent, new_dentry->d_name.name, new_dentry->d_name.len); out: + if (parent) + dput(parent); + return err; }