procfs has a number of mounting restrictions that are not documented anywhere. Signed-off-by: Alexey Gladkov --- Documentation/filesystems/proc.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 8256e857e2d7..c8864fcbdec7 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -52,6 +52,7 @@ fixes/update part 1.1 Stefani Seibold June 9 2009 4 Configuring procfs 4.1 Mount options + 4.2 Mount restrictions 5 Filesystem behavior @@ -2410,6 +2411,19 @@ will use the calling process's active pid namespace. Note that the pid namespace of an existing procfs instance cannot be modified (attempting to do so will give an `-EBUSY` error). +4.2 Mount restrictions +-------------------------- + +If user namespaces are in use, the kernel additionally checks the instances of +procfs available to the mounter and will not allow procfs to be mounted if: + + 1. This mount is not fully visible. + + a. It's root directory is not the root directory of the filesystem. + b. If any file or non-empty procfs directory is hidden by another mount. + + 2. A new mount overrides the readonly option or any option from atime family. + Chapter 5: Filesystem behavior ============================== -- 2.53.0 Cache the mounter's credentials and allow access to the net directories contingent on the permissions of the mounter of proc. Do not show /proc/self/net when proc is mounted with the subset=pid option and the mounter does not have CAP_NET_ADMIN. To avoid inadvertently allowing access to /proc/<pid>/net, updating mounter credentials is not supported. 
Signed-off-by: Alexey Gladkov --- fs/proc/proc_net.c | 8 ++++++++ fs/proc/root.c | 2 ++ include/linux/proc_fs.h | 1 + 3 files changed, 11 insertions(+) diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 52f0b75cbce2..6e0ccef0169f 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "internal.h" @@ -270,6 +271,7 @@ static struct net *get_proc_task_net(struct inode *dir) struct task_struct *task; struct nsproxy *ns; struct net *net = NULL; + struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb); rcu_read_lock(); task = pid_task(proc_pid(dir), PIDTYPE_PID); @@ -282,6 +284,12 @@ static struct net *get_proc_task_net(struct inode *dir) } rcu_read_unlock(); + if (net && (fs_info->pidonly == PROC_PIDONLY_ON) && + security_capable(fs_info->mounter_cred, net->user_ns, CAP_NET_ADMIN, CAP_OPT_NONE) < 0) { + put_net(net); + net = NULL; + } + return net; } diff --git a/fs/proc/root.c b/fs/proc/root.c index d8ca41d823e4..c4af3a9b1a44 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -254,6 +254,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) return -ENOMEM; fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + fs_info->mounter_cred = get_cred(fc->cred); proc_apply_options(fs_info, fc, current_user_ns()); /* User space would break if executables or devices appear on proc */ @@ -350,6 +351,7 @@ static void proc_kill_sb(struct super_block *sb) kill_anon_super(sb); if (fs_info) { put_pid_ns(fs_info->pid_ns); + put_cred(fs_info->mounter_cred); kfree_rcu(fs_info, rcu); } } diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 19d1c5e5f335..ec123c277d49 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -67,6 +67,7 @@ enum proc_pidonly { struct proc_fs_info { struct pid_namespace *pid_ns; kgid_t pid_gid; + const struct cred *mounter_cred; enum proc_hidepid hide_pid; enum proc_pidonly pidonly; struct rcu_head rcu; -- 2.53.0 When procfs is mounted with 
subset=pid option, there is no way to remount it with this option removed. This is done in order not to make visible whatever was hidden, since some checks occur during mount. This patch makes the limitation explicit and prints an error message. Signed-off-by: Alexey Gladkov --- fs/proc/root.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/proc/root.c b/fs/proc/root.c index c4af3a9b1a44..535a168046e3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -223,7 +223,7 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } -static void proc_apply_options(struct proc_fs_info *fs_info, +static int proc_apply_options(struct proc_fs_info *fs_info, struct fs_context *fc, struct user_namespace *user_ns) { @@ -233,13 +233,17 @@ static void proc_apply_options(struct proc_fs_info *fs_info, fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) fs_info->hide_pid = ctx->hidepid; - if (ctx->mask & (1 << Opt_subset)) + if (ctx->mask & (1 << Opt_subset)) { + if (ctx->pidonly != PROC_PIDONLY_ON && fs_info->pidonly == PROC_PIDONLY_ON) + return invalf(fc, "proc: subset=pid cannot be unset\n"); fs_info->pidonly = ctx->pidonly; + } if (ctx->mask & (1 << Opt_pidns) && !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) { put_pid_ns(fs_info->pid_ns); fs_info->pid_ns = get_pid_ns(ctx->pid_ns); } + return 0; } static int proc_fill_super(struct super_block *s, struct fs_context *fc) @@ -255,7 +259,9 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) fs_info->pid_ns = get_pid_ns(ctx->pid_ns); fs_info->mounter_cred = get_cred(fc->cred); - proc_apply_options(fs_info, fc, current_user_ns()); + ret = proc_apply_options(fs_info, fc, current_user_ns()); + if (ret) + return ret; /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; @@ -304,8 +310,7 @@ static int proc_reconfigure(struct 
fs_context *fc) sync_filesystem(sb); - proc_apply_options(fs_info, fc, current_user_ns()); - return 0; + return proc_apply_options(fs_info, fc, current_user_ns()); } static int proc_get_tree(struct fs_context *fc) -- 2.53.0 When /proc is mounted with the subset=pid option, all system files from the root of the file system are not accessible in userspace. Only dynamic information about processes is available, which cannot be hidden with overmount. For this reason, checking for full visibility is not relevant if mounting is performed with the subset=pid option. Signed-off-by: Alexey Gladkov --- fs/namespace.c | 29 ++++++++++++++++------------- fs/proc/root.c | 17 ++++++++++------- include/linux/fs/super_types.h | 2 ++ 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index c58674a20cad..7daa86315c05 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6116,7 +6116,8 @@ static bool mnt_already_visible(struct mnt_namespace *ns, /* This mount is not fully visible if it's root directory * is not the root directory of the filesystem. */ - if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) + if (!(sb->s_iflags & SB_I_USERNS_ALLOW_REVEALING) && + mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; /* A local view of the mount flags */ @@ -6136,18 +6137,20 @@ static bool mnt_already_visible(struct mnt_namespace *ns, ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) continue; - /* This mount is not fully visible if there are any - * locked child mounts that cover anything except for - * empty directories. - */ - list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { - struct inode *inode = child->mnt_mountpoint->d_inode; - /* Only worry about locked mounts */ - if (!(child->mnt.mnt_flags & MNT_LOCKED)) - continue; - /* Is the directory permanently empty? 
*/ - if (!is_empty_dir_inode(inode)) - goto next; + if (!(sb->s_iflags & SB_I_USERNS_ALLOW_REVEALING)) { + /* This mount is not fully visible if there are any + * locked child mounts that cover anything except for + * empty directories. + */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + struct inode *inode = child->mnt_mountpoint->d_inode; + /* Only worry about locked mounts */ + if (!IS_MNT_LOCKED(child)) + continue; + /* Is the directory permanently empty? */ + if (!is_empty_dir_inode(inode)) + goto next; + } } /* Preserve the locked attributes */ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ diff --git a/fs/proc/root.c b/fs/proc/root.c index 535a168046e3..e029d3587494 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -223,18 +223,21 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } -static int proc_apply_options(struct proc_fs_info *fs_info, +static int proc_apply_options(struct super_block *s, struct fs_context *fc, struct user_namespace *user_ns) { struct proc_fs_context *ctx = fc->fs_private; + struct proc_fs_info *fs_info = proc_sb_info(s); if (ctx->mask & (1 << Opt_gid)) fs_info->pid_gid = make_kgid(user_ns, ctx->gid); if (ctx->mask & (1 << Opt_hidepid)) fs_info->hide_pid = ctx->hidepid; if (ctx->mask & (1 << Opt_subset)) { - if (ctx->pidonly != PROC_PIDONLY_ON && fs_info->pidonly == PROC_PIDONLY_ON) + if (ctx->pidonly == PROC_PIDONLY_ON) + s->s_iflags |= SB_I_USERNS_ALLOW_REVEALING; + else if (fs_info->pidonly == PROC_PIDONLY_ON) return invalf(fc, "proc: subset=pid cannot be unset\n"); fs_info->pidonly = ctx->pidonly; } @@ -259,9 +262,6 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) fs_info->pid_ns = get_pid_ns(ctx->pid_ns); fs_info->mounter_cred = get_cred(fc->cred); - ret = proc_apply_options(fs_info, fc, current_user_ns()); - if (ret) - return ret; /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_USERNS_VISIBLE | 
SB_I_NOEXEC | SB_I_NODEV; @@ -273,6 +273,10 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) s->s_time_gran = 1; s->s_fs_info = fs_info; + ret = proc_apply_options(s, fc, current_user_ns()); + if (ret) + return ret; + /* * procfs isn't actually a stacking filesystem; however, there is * too much magic going on inside it to permit stacking things on @@ -306,11 +310,10 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) static int proc_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; - struct proc_fs_info *fs_info = proc_sb_info(sb); sync_filesystem(sb); - return proc_apply_options(fs_info, fc, current_user_ns()); + return proc_apply_options(sb, fc, current_user_ns()); } static int proc_get_tree(struct fs_context *fc) diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 6bd3009e09b3..5e640b9140df 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -333,4 +333,6 @@ struct super_block { #define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ #define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ +#define SB_I_USERNS_ALLOW_REVEALING 0x00008000 /* Skip full visibility check */ + #endif /* _LINUX_FS_SUPER_TYPES_H */ -- 2.53.0 Signed-off-by: Alexey Gladkov --- Documentation/filesystems/proc.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index c8864fcbdec7..3acf178c1202 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -2417,7 +2417,8 @@ so will give an `-EBUSY` error). If user namespaces are in use, the kernel additionally checks the instances of procfs available to the mounter and will not allow procfs to be mounted if: - 1. This mount is not fully visible. + 1. 
This mount is not fully visible unless the new procfs is going to be + mounted with subset=pid option. a. It's root directory is not the root directory of the filesystem. b. If any file or non-empty procfs directory is hidden by another mount. -- 2.53.0