Change userspace_init_fs from a declared-but-unused extern struct to a dynamically initialized pointer. Add init_userspace_fs() which is called early in kernel_init() (PID 1) to record PID 1's fs_struct as the canonical userspace filesystem state. Wire up __override_init_fs() and __revert_init_fs() to actually swap current->fs to/from userspace_init_fs. Previously these were no-ops that stored current->fs back to itself. Fix nullfs_userspace_init() to compare against userspace_init_fs instead of &init_fs. When PID 1 unshares its filesystem state, revert userspace_init_fs to init_fs's root (nullfs) so that stale filesystem state is not silently inherited by kworkers and usermodehelpers. At this stage PID 1's fs still points to rootfs (set by init_mount_tree), so userspace_init_fs points to rootfs and scoped_with_init_fs() is functionally equivalent to its previous no-op behavior. Signed-off-by: Christian Brauner --- fs/fs_struct.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/fs_struct.h | 5 +++-- include/linux/init_task.h | 1 + init/main.c | 3 +++ 4 files changed, 52 insertions(+), 3 deletions(-) diff --git a/fs/fs_struct.c b/fs/fs_struct.c index b9b9a327f299..c1afa7513e34 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -8,6 +8,7 @@ #include #include #include "internal.h" +#include "mount.h" /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. @@ -163,15 +164,32 @@ EXPORT_SYMBOL_GPL(unshare_fs_struct); * fs_struct state. Breaking that contract sucks for both sides. * So just don't bother with extra work for this. No sane init * system should ever do this. + * + * On older kernels if PID 1 unshared its filesystem state with us the + * kernel simply used the stale fs_struct state implicitly pinning + * anything that PID 1 had last used. Even if PID 1 might've moved on to + * some completely different fs_struct state and might've even unmounted + * the old root. 
+ * + * This has hilarious consequences: Think continuing to dump coredump + * state into an implicitly pinned directory somewhere. Calling random + * binaries in the old rootfs via usermodehelpers. + * + * Be aggressive about this: We simply reject operating on stale + * fs_struct state by reverting to nullfs. Every kworker that does + * lookups after this point will fail. Every usermodehelper call will + * fail. Tough luck but let's be kind and emit a warning to userspace. */ static inline void nullfs_userspace_init(struct fs_struct *old_fs) { if (likely(current->pid != 1)) return; /* @old_fs may be dangling but for comparison it's fine */ - if (old_fs != &init_fs) + if (old_fs != userspace_init_fs) return; pr_warn("VFS: Pid 1 stopped sharing filesystem state\n"); + set_fs_root(userspace_init_fs, &init_fs.root); + set_fs_pwd(userspace_init_fs, &init_fs.root); } struct fs_struct *switch_fs_struct(struct fs_struct *new_fs) @@ -198,3 +216,29 @@ struct fs_struct init_fs = { .seq = __SEQLOCK_UNLOCKED(init_fs.seq), .umask = 0022, }; + +struct fs_struct *userspace_init_fs __ro_after_init; +EXPORT_SYMBOL_GPL(userspace_init_fs); + +void __init init_userspace_fs(void) +{ + struct mount *m; + struct path root; + + /* Move PID 1 from nullfs into the initramfs. */ + m = topmost_overmount(current->nsproxy->mnt_ns->root); + root.mnt = &m->mnt; + root.dentry = root.mnt->mnt_root; + + VFS_WARN_ON_ONCE(current->pid != 1); + + set_fs_root(current->fs, &root); + set_fs_pwd(current->fs, &root); + + /* Hold a reference for the global pointer. 
*/ + read_seqlock_excl(&current->fs->seq); + current->fs->users++; + read_sequnlock_excl(&current->fs->seq); + + userspace_init_fs = current->fs; +} diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index ff525a1e45d4..51d335924029 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -17,6 +17,7 @@ struct fs_struct { } __randomize_layout; extern struct kmem_cache *fs_cachep; +extern struct fs_struct *userspace_init_fs; extern void exit_fs(struct task_struct *); extern void set_fs_root(struct fs_struct *, const struct path *); @@ -60,13 +61,13 @@ static inline struct fs_struct *__override_init_fs(void) struct fs_struct *fs; fs = current->fs; - smp_store_release(&current->fs, current->fs); + smp_store_release(&current->fs, userspace_init_fs); return fs; } static inline void __revert_init_fs(struct fs_struct *revert_fs) { - VFS_WARN_ON_ONCE(current->fs != current->fs); + VFS_WARN_ON_ONCE(current->fs != userspace_init_fs); smp_store_release(&current->fs, revert_fs); } diff --git a/include/linux/init_task.h b/include/linux/init_task.h index a6cb241ea00c..61536be773f5 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -24,6 +24,7 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; +extern struct fs_struct *userspace_init_fs; extern struct nsproxy init_nsproxy; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE diff --git a/init/main.c b/init/main.c index 1cb395dd94e4..5ccc642a5aa7 100644 --- a/init/main.c +++ b/init/main.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include #include @@ -1574,6 +1575,8 @@ static int __ref kernel_init(void *unused) { int ret; + init_userspace_fs(); + /* * Wait until kthreadd is all set-up. */ -- 2.47.3