diff --git a/kmod/file.c b/kmod/file.c index 0c1bb1d5..ec6765f7 100644 --- a/kmod/file.c +++ b/kmod/file.c @@ -78,7 +78,6 @@ static int file_open(struct inode* inode, struct file* filp) { struct ternfs_inode* enode = TERNFS_I(inode); ternfs_debug("enode=%p status=%d owner=%p", enode, enode->file.status, current->group_leader); - int err = 0; if ((filp->f_mode&FMODE_WRITE) && (enode->file.status == TERNFS_FILE_STATUS_WRITING)) { // this is the "common" writing case, we've just created a file to write it. @@ -90,51 +89,11 @@ static int file_open(struct inode* inode, struct file* filp) { // to files) are attempted. the reason is that some workflows (such as open write + // setattr) _will_ work. enode->file.status = TERNFS_FILE_STATUS_READING; - // also, set atime, if requested - if (!(filp->f_flags&O_NOATIME)) { - u64 atime_ns = ktime_get_real_ns(); - struct timespec64 atime_ts = ns_to_timespec64(atime_ns); - u64 diff = atime_ts.tv_sec - min(inode_get_atime_sec(&enode->inode), atime_ts.tv_sec); - if (diff < ternfs_atime_update_interval_sec) { - // we don't think we should update - goto out; - } - - // https://internal-repo/issues/292 - // we might have cached data and another client updated atime. 
- // ternfs_do_getattr is orders of magnitude cheaper than ternfs_shard_set_time, - // so we might as well refresh and re-check - int err = ternfs_do_getattr(enode, ATTR_CACHE_NO_TIMEOUT); - if (err) { - goto out; - } - diff = atime_ts.tv_sec - min(inode_get_atime_sec(&enode->inode), atime_ts.tv_sec); - if (diff < ternfs_atime_update_interval_sec) { - // out local time changed and we see we don't need to update - goto out; - } - - if ((inode_get_atime_sec(&enode->inode) > atime_ts.tv_sec) || - (inode_get_atime_sec(&enode->inode) == atime_ts.tv_sec && - inode_get_atime_nsec(&enode->inode) == atime_ts.tv_nsec - ) - ) { - // we don't want atime to go into the past don't update - goto out; - } - u64 atime = atime_ns | (1ull<<63); - err = ternfs_shard_set_time((struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, inode->i_ino, 0, atime); - if (err) { - goto out; - } - // we updated time. we don't need to refresh it now but allow refresh on next stat by getattr_expiry - smp_store_release(&enode->getattr_expiry, 0); - } } -out: + inode_unlock(inode); trace_eggsfs_inode_lock(inode, TERNFS_INODE_UNLOCK, "file_open"); - return err; + return 0; } static void init_transient_span(void* p) { 
diff --git a/kmod/inode.c b/kmod/inode.c index d1ed5eb1..8754c0db 100644 --- a/kmod/inode.c +++ b/kmod/inode.c @@ -626,6 +626,73 @@ static const char* ternfs_get_link(struct dentry* dentry, struct inode* inode, s return buf; } +// Called by the VFS (via touch_atime -> inode_update_time) on actual data access +// of a regular file: generic_file_read_iter hits it on read/pread/readv, and +// generic_file_mmap() hits it at mmap time. atime_needs_update() in fs/inode.c +// already honours O_NOATIME, MNT_NOATIME, and relatime before reaching us, so +// this function only needs to apply the ternfs_atime_update_interval_sec +// throttle on top and push the update to the shard. +// +// Only wired into ternfs_file_inode_ops: directory atime is TernFS-internal +// state (dentry cache invalidation) and must not be touched by VFS access +// paths -- SB_NODIRATIME keeps the VFS out. 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) +static int ternfs_update_time(struct inode* inode, struct timespec64* now, int flags) { +#else +static int ternfs_update_time(struct inode* inode, int flags) { +#endif + struct ternfs_inode* enode = TERNFS_I(inode); + + int non_atime = flags & ~S_ATIME; + if (non_atime) { + ternfs_generic_update_time(inode, non_atime); + } + if (!(flags & S_ATIME)) { return 0; } + + u64 now_ns = ktime_get_real_ns(); + struct timespec64 now_ts = ns_to_timespec64(now_ns); + + time64_t cur_sec = inode_get_atime_sec(inode); + u64 diff = now_ts.tv_sec - min(cur_sec, now_ts.tv_sec); + if (diff < ternfs_atime_update_interval_sec) { + // within throttle window: bump in-memory atime only + inode_set_atime_to_ts(inode, now_ts); + return 0; + } + + // https://internal-repo/issues/292 + // a peer client may have already bumped atime. ternfs_do_getattr is orders + // of magnitude cheaper than ternfs_shard_set_time, so refresh before RPC. + int err = ternfs_do_getattr(enode, ATTR_CACHE_NO_TIMEOUT); + if (err) { + // non-fatal: atime errors shouldn't break reads + ternfs_warn("file=%016lx update_time getattr failed err=%d", inode->i_ino, err); + return 0; + } + cur_sec = inode_get_atime_sec(inode); + diff = now_ts.tv_sec - min(cur_sec, now_ts.tv_sec); + if (diff < ternfs_atime_update_interval_sec) { return 0; } + + if ((cur_sec > now_ts.tv_sec) || + (cur_sec == now_ts.tv_sec && + inode_get_atime_nsec(inode) >= now_ts.tv_nsec)) { + // don't let atime go backwards + return 0; + } + + u64 atime = now_ns | (1ull << 63); + err = ternfs_shard_set_time( + (struct ternfs_fs_info*)inode->i_sb->s_fs_info, + inode->i_ino, 0, atime); + if (err) { + ternfs_warn("file=%016lx update_time shard_set_time failed err=%d", inode->i_ino, err); + return 0; + } + inode_set_atime_to_ts(inode, now_ts); + smp_store_release(&enode->getattr_expiry, 0); + return 0; +} + static const struct inode_operations ternfs_dir_inode_ops = { .create = ternfs_create, .lookup = 
ternfs_lookup, @@ -640,6 +707,7 @@ static const struct inode_operations ternfs_dir_inode_ops = { static const struct inode_operations ternfs_file_inode_ops = { .getattr = ternfs_getattr, .setattr = ternfs_setattr, + .update_time = ternfs_update_time, }; static const struct inode_operations ternfs_symlink_inode_ops = { diff --git a/kmod/inode_compat.h b/kmod/inode_compat.h index a1c4a004..042c24bd 100644 --- a/kmod/inode_compat.h +++ b/kmod/inode_compat.h @@ -125,4 +125,20 @@ static inline struct timespec64 inode_set_mtime(struct inode *inode, #endif +// The inode_operations->update_time signature lost the `struct timespec64 *now` +// parameter in mainline 6.6; filesystems compute the timestamp themselves now. +// ternfs_generic_update_time hides the difference so callers can pass just (inode, flags). +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) +static inline int ternfs_generic_update_time(struct inode *inode, int flags) +{ + struct timespec64 now = current_time(inode); + return generic_update_time(inode, &now, flags); +} +#else +static inline int ternfs_generic_update_time(struct inode *inode, int flags) +{ + return generic_update_time(inode, flags); +} +#endif + #endif /* _TERNFS_INODE_COMPAT_H */ diff --git a/kmod/super.c b/kmod/super.c index d3e17b2e..f4ccd4c6 100644 --- a/kmod/super.c +++ b/kmod/super.c @@ -409,7 +409,9 @@ static struct dentry* ternfs_mount(struct file_system_type* fs_type, int flags, sb->s_fs_info = info; - sb->s_flags = SB_NOSUID | SB_NODEV | SB_NOEXEC | SB_NOATIME | SB_NODIRATIME; + // SB_NODIRATIME needs to be set: directory atime is TernFS-internal state used + // for dentry-cache invalidation and must not be touched by the VFS. + sb->s_flags = SB_NOSUID | SB_NODEV | SB_NOEXEC | SB_NODIRATIME; sb->s_iflags = SB_I_NOEXEC | SB_I_NODEV; sb->s_op = &ternfs_super_ops;