Add a real_fs field to task_struct that always mirrors the fs field. This lays the groundwork for distinguishing between a task's permanent fs_struct and one that is temporarily overridden via scoped_with_init_fs(). When a kthread temporarily overrides current->fs for path lookup, we need to know the original fs_struct for operations like exit_fs() and unshare_fs_struct() that must operate on the real, permanent fs. For now real_fs is always equal to fs. It is maintained alongside fs in all the relevant paths: exit_fs(), unshare_fs_struct(), switch_fs_struct(), and copy_fs(). Signed-off-by: Christian Brauner --- fs/fs_struct.c | 11 ++++++++--- fs/proc/array.c | 4 ++-- fs/proc/base.c | 8 ++++---- fs/proc_namespace.c | 4 ++-- include/linux/sched.h | 1 + init/init_task.c | 1 + kernel/fork.c | 8 +++++++- kernel/kcmp.c | 2 +- 8 files changed, 26 insertions(+), 13 deletions(-) diff --git a/fs/fs_struct.c b/fs/fs_struct.c index fcecf209f1a9..c03a574ed65a 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -61,7 +61,7 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root) read_lock(&tasklist_lock); for_each_process_thread(g, p) { task_lock(p); - fs = p->fs; + fs = p->real_fs; if (fs) { int hits = 0; write_seqlock(&fs->seq); @@ -89,12 +89,13 @@ void free_fs_struct(struct fs_struct *fs) void exit_fs(struct task_struct *tsk) { - struct fs_struct *fs = tsk->fs; + struct fs_struct *fs = tsk->real_fs; if (fs) { int kill; task_lock(tsk); read_seqlock_excl(&fs->seq); + tsk->real_fs = NULL; tsk->fs = NULL; kill = !--fs->users; read_sequnlock_excl(&fs->seq); @@ -126,7 +127,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) int unshare_fs_struct(void) { - struct fs_struct *fs = current->fs; + struct fs_struct *fs = current->real_fs; struct fs_struct *new_fs = copy_fs_struct(fs); int kill; @@ -135,8 +136,10 @@ int unshare_fs_struct(void) task_lock(current); read_seqlock_excl(&fs->seq); + VFS_WARN_ON_ONCE(fs != current->fs); kill = !--fs->users; 
current->fs = new_fs; + current->real_fs = new_fs; read_sequnlock_excl(&fs->seq); task_unlock(current); @@ -177,8 +180,10 @@ struct fs_struct *switch_fs_struct(struct fs_struct *new_fs) scoped_guard(task_lock, current) { fs = current->fs; + VFS_WARN_ON_ONCE(fs != current->real_fs); read_seqlock_excl(&fs->seq); current->fs = new_fs; + current->real_fs = new_fs; if (--fs->users) new_fs = NULL; else diff --git a/fs/proc/array.c b/fs/proc/array.c index f447e734612a..10d792b8f170 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -168,8 +168,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, cred = get_task_cred(p); task_lock(p); - if (p->fs) - umask = p->fs->umask; + if (p->real_fs) + umask = p->real_fs->umask; if (p->files) max_fds = files_fdtable(p->files)->max_fds; task_unlock(p); diff --git a/fs/proc/base.c b/fs/proc/base.c index 4c863d17dfb4..28067e77b820 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -210,8 +210,8 @@ static int get_task_root(struct task_struct *task, struct path *root) int result = -ENOENT; task_lock(task); - if (task->fs) { - get_fs_root(task->fs, root); + if (task->real_fs) { + get_fs_root(task->real_fs, root); result = 0; } task_unlock(task); @@ -225,8 +225,8 @@ static int proc_cwd_link(struct dentry *dentry, struct path *path) if (task) { task_lock(task); - if (task->fs) { - get_fs_pwd(task->fs, path); + if (task->real_fs) { + get_fs_pwd(task->real_fs, path); result = 0; } task_unlock(task); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 5c555db68aa2..036356c0a55b 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -254,13 +254,13 @@ static int mounts_open_common(struct inode *inode, struct file *file, } ns = nsp->mnt_ns; get_mnt_ns(ns); - if (!task->fs) { + if (!task->real_fs) { task_unlock(task); put_task_struct(task); ret = -ENOENT; goto err_put_ns; } - get_fs_root(task->fs, &root); + get_fs_root(task->real_fs, &root); task_unlock(task); put_task_struct(task); diff --git 
a/include/linux/sched.h b/include/linux/sched.h index a7b4a980eb2f..5c7b9df92ebb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1179,6 +1179,7 @@ struct task_struct { unsigned long last_switch_time; #endif /* Filesystem information: */ + struct fs_struct *real_fs; struct fs_struct *fs; /* Open file information: */ diff --git a/init/init_task.c b/init/init_task.c index 5c838757fc10..7d0b4a5927eb 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -152,6 +152,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { RCU_POINTER_INITIALIZER(cred, &init_cred), .comm = INIT_TASK_COMM, .thread = INIT_THREAD, + .real_fs = &init_fs, .fs = &init_fs, .files = &init_files, #ifdef CONFIG_IO_URING diff --git a/kernel/fork.c b/kernel/fork.c index 67e57ee44548..154703cf7d3d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1593,6 +1593,8 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk) static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; + + VFS_WARN_ON_ONCE(current->fs != current->real_fs); if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ read_seqlock_excl(&fs->seq); @@ -1605,7 +1607,7 @@ static int copy_fs(u64 clone_flags, struct task_struct *tsk) read_sequnlock_excl(&fs->seq); return 0; } - tsk->fs = copy_fs_struct(fs); + tsk->real_fs = tsk->fs = copy_fs_struct(fs); if (!tsk->fs) return -ENOMEM; return 0; @@ -3152,6 +3154,10 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + /* No unsharing with overridden fs state */ + VFS_WARN_ON_ONCE(unshare_flags & (CLONE_NEWNS | CLONE_FS) && + current->fs != current->real_fs); + err = check_unshare_flags(unshare_flags); if (err) goto bad_unshare_out; diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 7c1a65bd5f8d..76476aeee067 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -186,7 +186,7 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, ret = 
kcmp_ptr(task1->files, task2->files, KCMP_FILES); break; case KCMP_FS: - ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS); + ret = kcmp_ptr(task1->real_fs, task2->real_fs, KCMP_FS); break; case KCMP_SIGHAND: ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND); -- 2.47.3