Add support for extended attributes on bpffs inodes so that user space
and BPF LSM programs can attach metadata - for example, a content hash
or a security label - to a pinned object or directory. BPF LSM or user
space tooling can then uniformly look at this (e.g. security.bpf.*) in
a similar way to other filesystems. The store is in-memory and
non-persistent: it lives only for the lifetime of the mount, like
everything else in bpffs. The modelling is similar to tmpfs.

bpf_fill_super() now open-codes the superblock setup instead of calling
simple_fill_super(), because the root inode must be allocated through
bpf_fs_alloc_inode(), i.e. it must carry the bpf_fs_inode wrapper and
come from the right cache. This requires s_op (and s_xattr) to be
installed before the first inode is created.

bpf_fs_listxattr() is only reachable via i_op->listxattr, which is wired
up for bpffs directories, symlinks and pinned prog/map/link inodes. The
BPF token inode is left untouched. Name-based fsetxattr()/fgetxattr() on
a token fd still work since the get/set handlers are installed on the
superblock.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Christian Brauner <brauner@kernel.org>
---
 kernel/bpf/inode.c | 254 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 237 insertions(+), 17 deletions(-)

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 25c06a011825..638aa996cbaf 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -21,6 +21,9 @@
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
 #include <linux/kstrtox.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+
 #include "preload/bpf_preload.h"
 
 enum bpf_type {
@@ -30,6 +33,22 @@ enum bpf_type {
 	BPF_TYPE_LINK,
 };
 
+struct bpf_fs_inode {
+	struct simple_xattrs	*xattrs;
+	struct inode		vfs_inode;
+};
+
+static inline struct bpf_fs_inode *BPF_FS_I(struct inode *inode)
+{
+	return container_of(inode, struct bpf_fs_inode, vfs_inode);
+}
+
+static struct kmem_cache *bpf_fs_inode_cachep __ro_after_init;
+
+static int bpf_fs_initxattrs(struct inode *inode,
+			     const struct xattr *xattr_array, void *fs_info);
+static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size);
+
 static void *bpf_any_get(void *raw, enum bpf_type type)
 {
 	switch (type) {
@@ -94,10 +113,17 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
 }
 
 static const struct inode_operations bpf_dir_iops;
+static const struct inode_operations bpf_symlink_iops;
 
-static const struct inode_operations bpf_prog_iops = { };
-static const struct inode_operations bpf_map_iops  = { };
-static const struct inode_operations bpf_link_iops  = { };
+static const struct inode_operations bpf_prog_iops = {
+	.listxattr	= bpf_fs_listxattr,
+};
+static const struct inode_operations bpf_map_iops  = {
+	.listxattr	= bpf_fs_listxattr,
+};
+static const struct inode_operations bpf_link_iops  = {
+	.listxattr	= bpf_fs_listxattr,
+};
 
 struct inode *bpf_get_inode(struct super_block *sb,
 			    const struct inode *dir,
@@ -153,11 +179,19 @@ static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 				struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
+	int ret;
 
 	inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
+	ret = security_inode_init_security(inode, dir, &dentry->d_name,
+					   bpf_fs_initxattrs, NULL);
+	if (ret && ret != -EOPNOTSUPP) {
+		iput(inode);
+		return ERR_PTR(ret);
+	}
+
 	inode->i_op = &bpf_dir_iops;
 	inode->i_fop = &simple_dir_operations;
 
@@ -330,10 +364,20 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
 			 const struct file_operations *fops)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
-	struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
+	struct inode *inode;
+	int ret;
+
+	inode = bpf_get_inode(dir->i_sb, dir, mode);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
+	ret = security_inode_init_security(inode, dir, &dentry->d_name,
+					   bpf_fs_initxattrs, NULL);
+	if (ret && ret != -EOPNOTSUPP) {
+		iput(inode);
+		return ret;
+	}
+
 	inode->i_op = iops;
 	inode->i_fop = fops;
 	inode->i_private = raw;
@@ -382,9 +426,11 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
 static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
 		       struct dentry *dentry, const char *target)
 {
-	char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
 	struct inode *inode;
+	char *link;
+	int ret;
 
+	link = kstrdup(target, GFP_USER | __GFP_NOWARN);
 	if (!link)
 		return -ENOMEM;
 
@@ -394,13 +440,25 @@ static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
 		return PTR_ERR(inode);
 	}
 
-	inode->i_op = &simple_symlink_inode_operations;
+	inode->i_op = &bpf_symlink_iops;
 	inode->i_link = link;
 
+	ret = security_inode_init_security(inode, dir, &dentry->d_name,
+					   bpf_fs_initxattrs, NULL);
+	if (ret && ret != -EOPNOTSUPP) {
+		iput(inode);
+		return ret;
+	}
+
 	bpf_dentry_finalize(dentry, inode, dir);
 	return 0;
 }
 
+static const struct inode_operations bpf_symlink_iops = {
+	.get_link	= simple_get_link,
+	.listxattr	= bpf_fs_listxattr,
+};
+
 static const struct inode_operations bpf_dir_iops = {
 	.lookup		= bpf_lookup,
 	.mkdir		= bpf_mkdir,
@@ -409,6 +467,7 @@ static const struct inode_operations bpf_dir_iops = {
 	.rename		= simple_rename,
 	.link		= simple_link,
 	.unlink		= simple_unlink,
+	.listxattr	= bpf_fs_listxattr,
 };
 
 /* pin iterator link into bpffs */
@@ -762,22 +821,151 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 	return 0;
 }
 
+static struct inode *bpf_fs_alloc_inode(struct super_block *sb)
+{
+	struct bpf_fs_inode *bi;
+
+	bi = alloc_inode_sb(sb, bpf_fs_inode_cachep, GFP_KERNEL);
+	if (!bi)
+		return NULL;
+	bi->xattrs = NULL;
+	return &bi->vfs_inode;
+}
+
 static void bpf_destroy_inode(struct inode *inode)
 {
+	struct bpf_fs_inode *bi = BPF_FS_I(inode);
 	enum bpf_type type;
 
 	if (S_ISLNK(inode->i_mode))
 		kfree(inode->i_link);
 	if (!bpf_inode_type(inode, &type))
 		bpf_any_put(inode->i_private, type);
-	free_inode_nonrcu(inode);
+	if (bi->xattrs) {
+		simple_xattrs_free(bi->xattrs, NULL);
+		kfree(bi->xattrs);
+	}
+}
+
+static void bpf_free_inode(struct inode *inode)
+{
+	kmem_cache_free(bpf_fs_inode_cachep, BPF_FS_I(inode));
+}
+
+static int bpf_fs_xattr_get(const struct xattr_handler *handler,
+			    struct dentry *unused, struct inode *inode,
+			    const char *name, void *value, size_t size)
+{
+	struct simple_xattrs *xattrs;
+
+	name = xattr_full_name(handler, name);
+	xattrs = READ_ONCE(BPF_FS_I(inode)->xattrs);
+	if (!xattrs)
+		return -ENODATA;
+	return simple_xattr_get(xattrs, name, value, size);
+}
+
+static int bpf_fs_xattr_set(const struct xattr_handler *handler,
+			    struct mnt_idmap *idmap, struct dentry *unused,
+			    struct inode *inode, const char *name,
+			    const void *value, size_t size, int flags)
+{
+	struct bpf_fs_inode *bi = BPF_FS_I(inode);
+	struct simple_xattrs *xattrs;
+	struct simple_xattr *old;
+
+	name = xattr_full_name(handler, name);
+	xattrs = simple_xattrs_lazy_alloc(&bi->xattrs, value, flags);
+	if (IS_ERR_OR_NULL(xattrs))
+		return PTR_ERR(xattrs);
+	old = simple_xattr_set(xattrs, name, value, size, flags);
+	if (IS_ERR(old))
+		return PTR_ERR(old);
+	simple_xattr_free_rcu(old);
+	inode_set_ctime_current(inode);
+	return 0;
+}
+
+static const struct xattr_handler bpf_fs_trusted_xattr_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.get	= bpf_fs_xattr_get,
+	.set	= bpf_fs_xattr_set,
+};
+
+static const struct xattr_handler bpf_fs_security_xattr_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.get	= bpf_fs_xattr_get,
+	.set	= bpf_fs_xattr_set,
+};
+
+static const struct xattr_handler bpf_fs_user_xattr_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.get	= bpf_fs_xattr_get,
+	.set	= bpf_fs_xattr_set,
+};
+
+static const struct xattr_handler * const bpf_fs_xattr_handlers[] = {
+	&bpf_fs_trusted_xattr_handler,
+	&bpf_fs_security_xattr_handler,
+	&bpf_fs_user_xattr_handler,
+	NULL,
+};
+
+static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+	struct inode *inode = d_inode(dentry);
+
+	return simple_xattr_list(inode, READ_ONCE(BPF_FS_I(inode)->xattrs),
+				 buf, size);
+}
+
+static int bpf_fs_initxattrs(struct inode *inode,
+			     const struct xattr *xattr_array, void *fs_info)
+{
+	struct bpf_fs_inode *bi = BPF_FS_I(inode);
+	const struct xattr *xattr;
+	size_t len;
+
+	CLASS(simple_xattrs, xattrs)();
+	if (IS_ERR(xattrs))
+		return PTR_ERR(xattrs);
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len);
+		if (IS_ERR(new_xattr))
+			break;
+
+		len = strlen(xattr->name) + 1;
+		new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
+					  GFP_KERNEL_ACCOUNT);
+		if (!new_xattr->name)
+			break;
+
+		memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
+		       XATTR_SECURITY_PREFIX_LEN);
+		memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
+		       xattr->name, len);
+
+		if (simple_xattr_add(xattrs, new_xattr))
+			break;
+		retain_and_null_ptr(new_xattr);
+	}
+
+	if (xattr->name != NULL)
+		return -ENOMEM;
+
+	/* Paired with: READ_ONCE() on ->xattrs, release/dependency pair. */
+	smp_store_release(&bi->xattrs, no_free_ptr(xattrs));
+	return 0;
 }
 
 const struct super_operations bpf_super_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= inode_just_drop,
 	.show_options	= bpf_show_options,
+	.alloc_inode	= bpf_fs_alloc_inode,
 	.destroy_inode	= bpf_destroy_inode,
+	.free_inode	= bpf_free_inode,
 };
 
 enum {
@@ -996,25 +1184,38 @@ static int populate_bpffs(struct dentry *parent)
 
 static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
 {
-	static const struct tree_descr bpf_rfiles[] = { { "" } };
 	struct bpf_mount_opts *opts = sb->s_fs_info;
 	struct inode *inode;
-	int ret;
 
 	/* Mounting an instance of BPF FS requires privileges */
 	if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
-	if (ret)
-		return ret;
-
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
+	sb->s_magic = BPF_FS_MAGIC;
 	sb->s_op = &bpf_super_ops;
+	sb->s_xattr = bpf_fs_xattr_handlers;
+	sb->s_iflags |= SB_I_NOEXEC;
+	sb->s_iflags |= SB_I_NODEV;
+	sb->s_time_gran = 1;
+
+	inode = bpf_get_inode(sb, NULL, S_IFDIR | 0777);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_ino = 1;
+	inode->i_op = &bpf_dir_iops;
+	inode->i_fop = &simple_dir_operations;
+	set_nlink(inode, 2);
+
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root)
+		return -ENOMEM;
 
-	inode = sb->s_root->d_inode;
+	inode = d_inode(sb->s_root);
 	inode->i_uid = opts->uid;
 	inode->i_gid = opts->gid;
-	inode->i_op = &bpf_dir_iops;
 	inode->i_mode &= ~S_IALLUGO;
 	populate_bpffs(sb->s_root);
 	inode->i_mode |= S_ISVTX | opts->mode;
@@ -1080,18 +1281,37 @@ static struct file_system_type bpf_fs_type = {
 	.fs_flags	= FS_USERNS_MOUNT,
 };
 
+static void bpf_fs_inode_init_once(void *foo)
+{
+	struct bpf_fs_inode *bi = foo;
+
+	inode_init_once(&bi->vfs_inode);
+}
+
 static int __init bpf_init(void)
 {
 	int ret;
 
+	bpf_fs_inode_cachep = kmem_cache_create("bpf_fs_inode_cache",
+						sizeof(struct bpf_fs_inode), 0,
+						SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
+						bpf_fs_inode_init_once);
+	if (!bpf_fs_inode_cachep)
+		return -ENOMEM;
+
 	ret = sysfs_create_mount_point(fs_kobj, "bpf");
 	if (ret)
-		return ret;
+		goto out_cache;
 
 	ret = register_filesystem(&bpf_fs_type);
-	if (ret)
+	if (ret) {
 		sysfs_remove_mount_point(fs_kobj, "bpf");
+		goto out_cache;
+	}
 
+	return 0;
+out_cache:
+	kmem_cache_destroy(bpf_fs_inode_cachep);
 	return ret;
 }
 fs_initcall(bpf_init);
-- 
2.43.0