Leave all kthreads isolated in nullfs and move userspace init into its separate fs_struct that any kthread can grab on demand to perform lookup. This isolates kthreads from userspace filesystem state quite a bit and makes it hard for anyone to mess up when performing filesystem operations from kthreads. Without LOOKUP_IN_INIT they will just not be able to do anything at all: no lookup or creation. Add a new struct kernel_clone_args extension that allows to create a task that shares init's filesystem state. This is only going to be used by user_mode_thread() which execute stuff in init's filesystem state. That concept should go away. Signed-off-by: Christian Brauner --- fs/fs_struct.c | 49 +++++++++++++++++++++++++++++++++++++++++++--- fs/namei.c | 4 ++-- fs/namespace.c | 4 ---- include/linux/fs_struct.h | 1 + include/linux/init_task.h | 1 + include/linux/sched/task.h | 1 + init/main.c | 10 +++++++++- kernel/fork.c | 26 +++++++++++++++++++++--- 8 files changed, 83 insertions(+), 13 deletions(-) diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 64b5840131cb..164139c27380 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -8,6 +8,7 @@ #include #include #include "internal.h" +#include "mount.h" /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. @@ -160,13 +161,30 @@ EXPORT_SYMBOL_GPL(unshare_fs_struct); * fs_struct state. Breaking that contract sucks for both sides. * So just don't bother with extra work for this. No sane init * system should ever do this. + * + * On older kernels if PID 1 unshared its filesystem state with us the + * kernel simply used the stale fs_struct state implicitly pinning + * anything that PID 1 had last used. Even if PID 1 might've moved on to + * some completely different fs_struct state and might've even unmounted + * the old root. + * + * This has hilarious consequences: Think continuing to dump coredump + * state into an implicitly pinned directory somewhere. Calling random + * binaries in the old rootfs via usermodehelpers. + * + * Be aggressive about this: We simply reject operating on stale + * fs_struct state by reverting to nullfs. Every kworker that does + * lookups after this point will fail. Every usermodehelper call will + * fail. Tough luck but let's be kind and emit a warning to userspace. */ static inline bool nullfs_userspace_init(void) { struct fs_struct *fs = current->fs; - if (unlikely(current->pid == 1) && fs != &init_fs) { + if (unlikely(current->pid == 1) && fs != &userspace_init_fs) { pr_warn("VFS: Pid 1 stopped sharing filesystem state\n"); + set_fs_root(&userspace_init_fs, &init_fs.root); + set_fs_pwd(&userspace_init_fs, &init_fs.root); return true; } @@ -186,7 +204,9 @@ struct fs_struct *switch_fs_struct(struct fs_struct *new_fs) new_fs = fs; read_sequnlock_excl(&fs->seq); - nullfs_userspace_init(); + /* one reference belongs to us */ + if (nullfs_userspace_init()) + return NULL; return new_fs; } @@ -197,8 +217,31 @@ struct fs_struct init_fs = { .umask = 0022, }; +struct fs_struct userspace_init_fs = { + .users = 1, + .seq = __SEQLOCK_UNLOCKED(userspace_init_fs.seq), + .umask = 0022, +}; + void init_root(struct path *root) { - get_fs_root(&init_fs, root); + get_fs_root(&userspace_init_fs, root); } EXPORT_SYMBOL_GPL(init_root); + +void __init init_userspace_fs(void) +{ + struct mount *m; + struct path root; + + /* Move PID 1 from nullfs into the initramfs. */ + m = topmost_overmount(current->nsproxy->mnt_ns->root); + root.mnt = &m->mnt; + root.dentry = root.mnt->mnt_root; + + VFS_WARN_ON_ONCE(current->fs != &init_fs); + VFS_WARN_ON_ONCE(current->pid != 1); + set_fs_root(&userspace_init_fs, &root); + set_fs_pwd(&userspace_init_fs, &root); + switch_fs_struct(&userspace_init_fs); +} diff --git a/fs/namei.c b/fs/namei.c index 976b1e9f7032..6cc53040e9eb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1102,7 +1102,7 @@ static int set_root(struct nameidata *nd) struct fs_struct *fs; if (nd->flags & LOOKUP_IN_INIT) - fs = &init_fs; + fs = &userspace_init_fs; else fs = current->fs; @@ -2724,7 +2724,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) struct fs_struct *fs; if (nd->flags & LOOKUP_IN_INIT) - fs = &init_fs; + fs = &userspace_init_fs; else fs = current->fs; diff --git a/fs/namespace.c b/fs/namespace.c index 854f4fc66469..10056ac1dcd2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6190,10 +6190,6 @@ static void __init init_mount_tree(void) init_task.nsproxy->mnt_ns = &init_mnt_ns; get_mnt_ns(&init_mnt_ns); - - /* The root and pwd always point to the mutable rootfs. */ - root.mnt = mnt; - root.dentry = mnt->mnt_root; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 8ff1acd8389d..5c40fdc39550 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -50,5 +50,6 @@ static inline int current_umask(void) } void init_root(struct path *root); +void __init init_userspace_fs(void); #endif /* _LINUX_FS_STRUCT_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index a6cb241ea00c..f27f88598394 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -24,6 +24,7 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; +extern struct fs_struct userspace_init_fs; extern struct nsproxy init_nsproxy; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 41ed884cffc9..e0c1ca8c6a18 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -31,6 +31,7 @@ struct kernel_clone_args { u32 io_thread:1; u32 user_worker:1; u32 no_files:1; + u32 umh:1; unsigned long stack; unsigned long stack_size; unsigned long tls; diff --git a/init/main.c b/init/main.c index 1cb395dd94e4..ca0d0914c63e 100644 --- a/init/main.c +++ b/init/main.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include #include @@ -713,6 +714,11 @@ static __initdata DECLARE_COMPLETION(kthreadd_done); static noinline void __ref __noreturn rest_init(void) { + struct kernel_clone_args init_args = { + .flags = (CLONE_FS | CLONE_VM | CLONE_UNTRACED), + .fn = kernel_init, + .fn_arg = NULL, + }; struct task_struct *tsk; int pid; @@ -722,7 +728,7 @@ static noinline void __ref __noreturn rest_init(void) * the init task will end up wanting to create kthreads, which, if * we schedule it before we create kthreadd, will OOPS. */ - pid = user_mode_thread(kernel_init, NULL, CLONE_FS); + pid = kernel_clone(&init_args); /* * Pin init on the boot CPU. Task migration is not properly working * until sched_init_smp() has been run. It will set the allowed @@ -1574,6 +1580,8 @@ static int __ref kernel_init(void *unused) { int ret; + init_userspace_fs(); + /* * Wait until kthreadd is all set-up. */ diff --git a/kernel/fork.c b/kernel/fork.c index 583078c69bbd..121538f58272 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1590,9 +1590,28 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk) return 0; } -static int copy_fs(u64 clone_flags, struct task_struct *tsk) +static int copy_fs(u64 clone_flags, struct task_struct *tsk, bool umh) { - struct fs_struct *fs = current->fs; + struct fs_struct *fs; + + /* + * Usermodehelper may use userspace_init_fs filesystem state but + * they don't get to create mount namespaces, share the + * filesystem state, or be started from a non-initial mount + * namespace. + */ + if (umh) { + if (clone_flags & (CLONE_NEWNS | CLONE_FS)) + return -EINVAL; + if (current->nsproxy->mnt_ns != &init_mnt_ns) + return -EINVAL; + } + + if (umh) + fs = &userspace_init_fs; + else + fs = current->fs; + if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ read_seqlock_excl(&fs->seq); @@ -2211,7 +2230,7 @@ __latent_entropy struct task_struct *copy_process( retval = copy_files(clone_flags, p, args->no_files); if (retval) goto bad_fork_cleanup_semundo; - retval = copy_fs(clone_flags, p); + retval = copy_fs(clone_flags, p, args->umh); if (retval) goto bad_fork_cleanup_files; retval = copy_sighand(clone_flags, p); @@ -2725,6 +2744,7 @@ pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, + .umh = 1, }; return kernel_clone(&args); -- 2.47.3