Add FSMOUNT_NAMESPACE flag to fsmount() that creates a new mount
namespace with the newly created filesystem attached to a copy of the
real rootfs. This returns a namespace file descriptor instead of an
O_PATH mount fd, similar to how OPEN_TREE_NAMESPACE works for open_tree().

This allows creating a new filesystem and immediately placing it in a
new mount namespace in a single operation, which is useful for container
runtimes and other namespace-based isolation mechanisms.

The rootfs mount is created before copying the real rootfs for the new
namespace meaning that the mount namespace id for the mount of the root
of the namespace is bigger than the child mounted on top of it. We've
never explicitly given the guarantee for such ordering and I doubt
anyone relies on it. Accepting that lets us avoid copying the mount
again and also avoids having to massage may_copy_tree() to grant an
exception for fsmount->mnt->mnt_ns being NULL.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c             | 53 +++++++++++++++++++++++++++++++++-------------
 include/uapi/linux/mount.h |  1 +
 2 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 46d2eb1c9c3d..30f2991b4a7f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3068,8 +3068,13 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
 DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
 	    if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
 
+enum open_newns_flags_t {
+	OPEN_NEWNS_RECURSIVE	= BIT(0),
+	OPEN_NEWNS_CLONE	= BIT(1),
+};
+
 static struct mnt_namespace *create_new_namespace(struct path *path,
-						  bool recurse)
+						  enum open_newns_flags_t flags)
 {
 	struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
 	struct path to_path __free(path_put) = {};
@@ -3080,6 +3085,9 @@ static struct mnt_namespace *create_new_namespace(struct path *path,
 	unsigned int copy_flags = 0;
 	bool locked = false;
 
+	if ((flags & (OPEN_NEWNS_RECURSIVE | OPEN_NEWNS_CLONE)) == OPEN_NEWNS_RECURSIVE)
+		return ERR_PTR(-EINVAL);
+
 	if (user_ns != ns->user_ns)
 		copy_flags |= CL_SLAVE;
 
@@ -3122,14 +3130,18 @@ static struct mnt_namespace *create_new_namespace(struct path *path,
 	if (unlikely(IS_ERR(mp.parent)))
 		return ERR_CAST(mp.parent);
 
-	/*
-	 * We don't emulate unshare()ing a mount namespace. We stick to the
-	 * restrictions of creating detached bind-mounts. It has a lot
-	 * saner and simpler semantics.
-	 */
-	mnt = __do_loopback(path, recurse, copy_flags);
-	if (IS_ERR(mnt))
-		return ERR_CAST(mnt);
+	if (flags & OPEN_NEWNS_CLONE) {
+		/*
+		 * We don't emulate unshare()ing a mount namespace. We stick to
+		 * the restrictions of creating detached bind-mounts. It has a
+		 * lot saner and simpler semantics.
+		 */
+		mnt = __do_loopback(path, flags & OPEN_NEWNS_RECURSIVE, copy_flags);
+		if (IS_ERR(mnt))
+			return ERR_CAST(mnt);
+	} else {
+		mnt = real_mount(mntget(path->mnt));
+	}
 
 	scoped_guard(mount_writer) {
 		if (locked)
@@ -3154,11 +3166,12 @@ static struct mnt_namespace *create_new_namespace(struct path *path,
 	return no_free_ptr(new_ns);
 }
 
-static struct file *open_new_namespace(struct path *path, bool recurse)
+static struct file *open_new_namespace(struct path *path,
+				       enum open_newns_flags_t flags)
 {
 	struct mnt_namespace *new_ns;
 
-	new_ns = create_new_namespace(path, recurse);
+	new_ns = create_new_namespace(path, flags);
 	if (IS_ERR(new_ns))
 		return ERR_CAST(new_ns);
 	return open_namespace_file(to_ns_common(new_ns));
@@ -3208,7 +3221,9 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
 		return ERR_PTR(ret);
 
 	if (flags & OPEN_TREE_NAMESPACE)
-		return open_new_namespace(&path, (flags & AT_RECURSIVE));
+		return open_new_namespace(&path,
+				((flags & AT_RECURSIVE) ? OPEN_NEWNS_RECURSIVE : 0) |
+				OPEN_NEWNS_CLONE);
 
 	if (flags & OPEN_TREE_CLONE)
 		return open_detached_copy(&path, flags);
@@ -4395,11 +4410,15 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
 	unsigned int mnt_flags = 0;
 	long ret;
 
-	if (!may_mount())
+	if ((flags & ~(FSMOUNT_CLOEXEC | FSMOUNT_NAMESPACE)) != 0)
+		return -EINVAL;
+
+	if ((flags & FSMOUNT_NAMESPACE) &&
+	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
-		return -EINVAL;
+	if (!(flags & FSMOUNT_NAMESPACE) && !may_mount())
+		return -EPERM;
 
 	if (attr_flags & ~FSMOUNT_VALID_FLAGS)
 		return -EINVAL;
@@ -4466,6 +4485,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
 	 */
 	vfs_clean_context(fc);
 
+	if (flags & FSMOUNT_NAMESPACE)
+		return FD_ADD((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0,
+			      open_new_namespace(&new_path, 0));
+
 	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
 	if (IS_ERR(ns))
 		return PTR_ERR(ns);
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index d9d86598d100..2204708dbf7a 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -110,6 +110,7 @@ enum fsconfig_command {
  * fsmount() flags.
  */
 #define FSMOUNT_CLOEXEC		0x00000001
+#define FSMOUNT_NAMESPACE	0x00000002	/* Create the mount in a new mount namespace */
 
 /*
  * Mount attributes.

-- 
2.47.3