Right now we only support trusted.* xattrs which require CAP_SYS_ADMIN which doesn't really require any meaningful permission checking. But in order to support user.* xattrs and custom pidfs.* xattrs in the future we need permission checking for pidfs inodes. Add baseline permission checking that can later be extended with additional write-time checks for specific pidfs.* xattrs. Make the effective {u,g}id of the task the owner of the pidfs inode (like procfs does). The ownership is set when the dentry is first stashed and reported dynamically via getattr since credentials may change due to setuid() and similar operations. For kernel threads use root, for exited tasks use the credentials saved at exit time. The inode's ownership is updated via WRITE_ONCE() from the getattr() and permission() callbacks. This doesn't serialize against inode->i_op->setattr() but since pidfs rejects setattr() this isn't currently an issue. A seqcount-based approach can be used if setattr() support is added in the future [1]. Save the task's credentials and thread group pid inode number at exit time so that ownership and permission checks remain functional after the task has been reaped. Add a permission callback that checks access in two steps: (1) Verify the caller is either in the same thread group as the target or has equivalent signal permissions. This reuses the same uid-based logic as kill() by extracting may_signal_creds() from kill_ok_by_cred() so it can operate on credential pointers directly. For exited tasks the check uses the saved exit credentials and compares thread group identity. (2) Perform standard POSIX permission checking via generic_permission() against the inode's ownership and mode bits. This is intentionally less strict than ptrace_may_access() because pidfs currently does not allow operating on data that is completely private to the process such as its mm or file descriptors. Additional checks will be needed once that changes. Link: https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git/log/?h=work.inode.seqcount [1] Signed-off-by: Christian Brauner --- fs/pidfs.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++---- include/linux/cred.h | 2 + kernel/signal.c | 19 ++++---- 3 files changed, 136 insertions(+), 18 deletions(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index 318253344b5c..16a3cfa84af4 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -46,15 +47,23 @@ enum pidfs_attr_mask_bits { PIDFS_ATTR_BIT_COREDUMP = 1, }; +struct pidfs_exit_attr { + __u64 cgroupid; + __s32 exit_code; + const struct cred *exit_cred; + u64 exit_tgid_ino; +}; + +struct pidfs_coredump_attr { + __u32 coredump_mask; + __u32 coredump_signal; +}; + struct pidfs_attr { unsigned long attr_mask; struct simple_xattrs *xattrs; - struct /* exit info */ { - __u64 cgroupid; - __s32 exit_code; - }; - __u32 coredump_mask; - __u32 coredump_signal; + struct pidfs_exit_attr; + struct pidfs_coredump_attr; }; static struct rhashtable pidfs_ino_ht; @@ -200,6 +209,7 @@ void pidfs_free_pid(struct pid *pid) if (IS_ERR(attr)) return; + put_cred(attr->exit_cred); xattrs = no_free_ptr(attr->xattrs); if (xattrs) simple_xattrs_free(xattrs, NULL); @@ -703,12 +713,14 @@ void pidfs_exit(struct task_struct *tsk) * is put */ -#ifdef CONFIG_CGROUPS rcu_read_lock(); +#ifdef CONFIG_CGROUPS cgrp = task_dfl_cgroup(tsk); attr->cgroupid = cgroup_id(cgrp); - rcu_read_unlock(); #endif + attr->exit_cred = get_cred(__task_cred(tsk)); + rcu_read_unlock(); + attr->exit_tgid_ino = task_tgid(tsk)->ino; attr->exit_code = tsk->exit_code; /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ @@ -741,6 +753,47 @@ void pidfs_coredump(const struct coredump_params *cprm) static struct vfsmount *pidfs_mnt __ro_after_init; +static void pidfs_update_owner(struct inode *inode) +{ + struct pid *pid = inode->i_private; + struct task_struct *task; + const struct cred *cred; + kuid_t kuid; + kgid_t kgid; + + VFS_WARN_ON_ONCE(!pid); + + guard(rcu)(); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + struct pidfs_attr *attr = READ_ONCE(pid->attr); + + VFS_WARN_ON_ONCE(!attr); + /* + * During copy_process() with CLONE_PIDFD the + * task hasn't been attached to the pid yet so + * pid_task() returns NULL and there's no + * exit_cred as the task obviously hasn't + * exited. Use the parent's credentials. + */ + cred = attr->exit_cred; + if (!cred) + cred = current_cred(); + kuid = cred->euid; + kgid = cred->egid; + } else if (unlikely(task->flags & PF_KTHREAD)) { + kuid = GLOBAL_ROOT_UID; + kgid = GLOBAL_ROOT_GID; + } else { + cred = __task_cred(task); + kuid = cred->euid; + kgid = cred->egid; + } + + WRITE_ONCE(inode->i_uid, kuid); + WRITE_ONCE(inode->i_gid, kgid); +} + /* * The vfs falls back to simple_setattr() if i_op->setattr() isn't * implemented. Let's reject it completely until we have a clean @@ -756,7 +809,11 @@ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - return anon_inode_getattr(idmap, path, stat, request_mask, query_flags); + struct inode *inode = d_inode(path->dentry); + + pidfs_update_owner(inode); + anon_inode_getattr(idmap, path, stat, request_mask, query_flags); + return 0; } static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size) @@ -773,10 +830,64 @@ static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size) return simple_xattr_list(inode, xattrs, buf, size); } +static int pidfs_permission(struct mnt_idmap *idmap, struct inode *inode, + int mask) +{ + struct pid *pid = inode->i_private; + struct task_struct *task; + const struct cred *cred; + u64 pid_tg_ino; + + scoped_guard(rcu) { + task = pid_task(pid, PIDTYPE_PID); + if (task) { + if (unlikely(task->flags & PF_KTHREAD)) + return -EPERM; + + cred = __task_cred(task); + pid_tg_ino = task_tgid(task)->ino; + } else { + struct pidfs_attr *attr; + + attr = READ_ONCE(pid->attr); + VFS_WARN_ON_ONCE(!attr); + /* + * During copy_process() with CLONE_PIDFD the + * task hasn't been attached to the pid yet so + * pid_task() returns NULL and there's no + * exit_cred as the task obviously hasn't + * exited. Use the parent's credentials. + */ + cred = attr->exit_cred; + if (!cred) + cred = current_cred(); + pid_tg_ino = attr->exit_tgid_ino; + } + + /* + * If the caller and the target are in the same + * thread-group or the caller can signal the target + * we're good. + */ + if (pid_tg_ino != task_tgid(current)->ino && + !may_signal_creds(current_cred(), cred)) + return -EPERM; + + /* + * This is racy but not more racy then what we generally + * do for permission checking. + */ + WRITE_ONCE(inode->i_uid, cred->euid); + WRITE_ONCE(inode->i_gid, cred->egid); + } + return generic_permission(&nop_mnt_idmap, inode, mask); +} + static const struct inode_operations pidfs_inode_operations = { .getattr = pidfs_getattr, .setattr = pidfs_setattr, .listxattr = pidfs_listxattr, + .permission = pidfs_permission, }; static void pidfs_evict_inode(struct inode *inode) @@ -983,7 +1094,8 @@ static struct dentry *pidfs_stash_dentry(struct dentry **stashed, struct dentry *dentry) { int ret; - struct pid *pid = d_inode(dentry)->i_private; + struct inode *inode = d_inode(dentry); + struct pid *pid = inode->i_private; VFS_WARN_ON_ONCE(stashed != &pid->stashed); @@ -991,6 +1103,7 @@ static struct dentry *pidfs_stash_dentry(struct dentry **stashed, if (ret) return ERR_PTR(ret); + pidfs_update_owner(inode); return stash_dentry(stashed, dentry); } diff --git a/include/linux/cred.h b/include/linux/cred.h index ed1609d78cd7..d14b29fe9fee 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -168,6 +168,8 @@ extern int set_create_files_as(struct cred *, struct inode *); extern int cred_fscmp(const struct cred *, const struct cred *); extern void __init cred_init(void); extern int set_cred_ucounts(struct cred *); +bool may_signal_creds(const struct cred *signaler_cred, + const struct cred *signalee_cred); static inline bool cap_ambient_invariant_ok(const struct cred *cred) { diff --git a/kernel/signal.c b/kernel/signal.c index d65d0fe24bfb..e20dabf143c2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -777,19 +777,22 @@ static inline bool si_fromuser(const struct kernel_siginfo *info) (!is_si_special(info) && SI_FROMUSER(info)); } +bool may_signal_creds(const struct cred *signaler_cred, + const struct cred *signalee_cred) +{ + return uid_eq(signaler_cred->euid, signalee_cred->suid) || + uid_eq(signaler_cred->euid, signalee_cred->uid) || + uid_eq(signaler_cred->uid, signalee_cred->suid) || + uid_eq(signaler_cred->uid, signalee_cred->uid) || + ns_capable(signalee_cred->user_ns, CAP_KILL); +} + /* * called with RCU read lock from check_kill_permission() */ static bool kill_ok_by_cred(struct task_struct *t) { - const struct cred *cred = current_cred(); - const struct cred *tcred = __task_cred(t); - - return uid_eq(cred->euid, tcred->suid) || - uid_eq(cred->euid, tcred->uid) || - uid_eq(cred->uid, tcred->suid) || - uid_eq(cred->uid, tcred->uid) || - ns_capable(tcred->user_ns, CAP_KILL); + return may_signal_creds(current_cred(), __task_cred(t)); } /* -- 2.47.3