In c8db08110cbe ("Merge tag 'vfs-7.1-rc1.xattr' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs") we added support for extended attributes for sockets. This comes in two flavors: sockfs and non-sockfs/filesystem sockets. Filesystem sockets are actual filesystem objects so reading xattrs must use dedicated fs helpers such as bpf_get_dentry_xattr() and bpf_get_file_xattr(). Those are inherently sleeping operations. Sockfs sockets on the other hand don't need to use sleeping operations as the underlying data structure is lockless. In addition, retrieval of sockfs extended attributes often happens from LSM hooks that only provide struct socket and it's completely nonsensical to grab a reference to a file, then force a sleeping operation to retrieve the xattr and drop the reference. We know that the sockfs file cannot go away while the LSM hook runs. This series adds a bpf_sock_read_xattr() kfunc that, given a struct socket, reads a user.* extended attribute from the socket's sockfs inode into a bpf_dynptr. Together with fsetxattr() from userspace this lets a process label a socket with a user.* xattr and have a BPF LSM program retrieve that label locklessly. The kfunc mirrors the existing bpf_cgroup_read_xattr(), including the restriction to the user.* namespace. systemd uses user.* xattrs on sockets to implement socket rate limiting and to tag sockets for other purposes [1] such as implementing a varlink registry. There is currently no efficient way for a BPF program to read those labels back. The new helper allows the extended attribute marking a listening socket to be read back during bind/connect so that a BPF program can then act on the connect()ing socket. Extended attributes make it possible for an unprivileged user manager such as systemd --user to mark sockets from userspace and then rediscover them or implement policies. The kfunc is registered as KF_RCU and only for BPF LSM programs. 
A struct socket is only guaranteed to live in sockfs when an LSM socket hook hands it out, which is what keeps SOCK_INODE() valid. Sockets that embed struct socket outside sockfs (tun, tap) are only reachable from tracing programs and are excluded by the registration. (Btw, for consistency it would be nice to force allocation of struct socket from sockfs instead of simply embedding it in e.g., struct tun_file which makes the SOCKFS_I() pattern a hazard - at least outside of sockfs functions.) The read never sleeps and takes no lock. For sockfs the value lives in the inode's in-memory xattr store and simple_xattr_get() resolves it with an RCU-protected rhashtable lookup, taking neither the inode lock nor any xattr lock. The kfunc is therefore usable from both sleepable and non-sleepable LSM hooks. Link: https://github.com/systemd/systemd/pull/40559 [1] Signed-off-by: Christian Brauner (Amutable) --- fs/bpf_fs_kfuncs.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/net.h | 1 + net/socket.c | 25 +++++++++++++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index 11841c3d4260..85fc9519d1ff 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -11,6 +11,7 @@ #include #include #include +#include #include __bpf_kfunc_start_defs(); @@ -359,6 +360,39 @@ __bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__s } #endif /* CONFIG_CGROUPS */ +#ifdef CONFIG_NET +/** + * bpf_sock_read_xattr - read xattr of a socket's inode in sockfs + * @sock: socket to get xattr from + * @name__str: name of the xattr + * @value_p: output buffer of the xattr value + * + * Get xattr *name__str* of *sock* and store the output in *value_p*. + * + * For security reasons, only *name__str* with prefix "user." is allowed. + * + * Return: length of the xattr value on success, a negative value on error. 
+ */ +__bpf_kfunc int bpf_sock_read_xattr(struct socket *sock, const char *name__str, + struct bpf_dynptr *value_p) +{ + struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p; + u32 value_len; + void *value; + + /* Only allow reading "user.*" xattrs */ + if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + return -EPERM; + + value_len = __bpf_dynptr_size(value_ptr); + value = __bpf_dynptr_data_rw(value_ptr, value_len); + if (!value) + return -EINVAL; + + return sock_read_xattr(sock, name__str, value, value_len); +} +#endif /* CONFIG_NET */ + /** * bpf_real_inode - get the real inode backing a dentry * @dentry: dentry to resolve @@ -385,6 +419,9 @@ BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_real_inode, KF_SLEEPABLE | KF_RET_NULL) +#ifdef CONFIG_NET +BTF_ID_FLAGS(func, bpf_sock_read_xattr, KF_RCU) +#endif BTF_KFUNCS_END(bpf_fs_kfunc_set_ids) static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/include/linux/net.h b/include/linux/net.h index f268f395ce47..fdcf9956805c 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -285,6 +285,7 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags); struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname); struct socket *sockfd_lookup(int fd, int *err); struct socket *sock_from_file(struct file *file); +int sock_read_xattr(struct socket *sock, const char *name, void *value, size_t size); #define sockfd_put(sock) fput(sock->file) int net_ratelimit(void); diff --git a/net/socket.c b/net/socket.c index 9e8dc769ff7a..3566f8c8ea3f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -465,6 +465,31 @@ static const struct xattr_handler sockfs_user_xattr_handler = { .set = sockfs_user_xattr_set, }; +/** + * sock_read_xattr - read a user.* xattr from a socket's sockfs inode + * @sock: 
socket whose inode holds the xattr + * @name: full xattr name, e.g. "user.bpf_test" + * @value: output buffer + * @size: size of @value in bytes + * + * SOCK_INODE() is valid only for sockfs sockets; sock_from_file() rejects + * anything else (e.g. tun, tap). + * Lockless: simple_xattr_get() looks up the value under RCU, no inode lock. + * + * Return: length of the value on success, a negative errno on error. + */ +int sock_read_xattr(struct socket *sock, const char *name, void *value, size_t size) +{ + struct file *file = sock->file; + struct sockfs_inode *si; + + if (!file || sock_from_file(file) != sock) + return -EOPNOTSUPP; + + si = SOCKFS_I(SOCK_INODE(sock)); + return simple_xattr_get(&sockfs_xa_cache, &si->xattrs, name, value, size); +} + static const struct xattr_handler * const sockfs_xattr_handlers[] = { &sockfs_xattr_handler, &sockfs_security_xattr_handler, -- 2.47.3