From: Khalid Aziz Add a pseudo filesystem that contains files and page table sharing information that enables processes to share page table entries. This patch adds the basic filesystem that can be mounted, a CONFIG_MSHARE option to enable the feature, and documentation. Signed-off-by: Khalid Aziz Signed-off-by: Anthony Yznaga --- Documentation/filesystems/index.rst | 1 + Documentation/filesystems/msharefs.rst | 96 +++++++++++++++++++++++++ include/uapi/linux/magic.h | 1 + mm/Kconfig | 11 +++ mm/Makefile | 4 ++ mm/mshare.c | 97 ++++++++++++++++++++++++++ 6 files changed, 210 insertions(+) create mode 100644 Documentation/filesystems/msharefs.rst create mode 100644 mm/mshare.c diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 11a599387266..dcd6605eb228 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -102,6 +102,7 @@ Documentation for filesystem implementations. fuse-passthrough inotify isofs + msharefs nilfs2 nfs/index ntfs3 diff --git a/Documentation/filesystems/msharefs.rst b/Documentation/filesystems/msharefs.rst new file mode 100644 index 000000000000..3e5b7d531821 --- /dev/null +++ b/Documentation/filesystems/msharefs.rst @@ -0,0 +1,96 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================================== +Msharefs - A filesystem to support shared page tables +===================================================== + +What is msharefs? +----------------- + +msharefs is a pseudo filesystem that allows multiple processes to +share page table entries for shared pages. To enable support for +msharefs the kernel must be compiled with CONFIG_MSHARE set. + +msharefs is typically mounted like this:: + + mount -t msharefs none /sys/fs/mshare + +A file created on msharefs creates a new shared region where all +processes mapping that region will map it using shared page table +entries. Once the size of the region has been established via +ftruncate() or fallocate(), the region can be mapped into processes +and ioctls used to map and unmap objects within it. Note that an +msharefs file is a control file and accessing mapped objects within +a shared region through read or write of the file is not permitted. + +How to use mshare +----------------- + +Here are the basic steps for using mshare: + + 1. Mount msharefs on /sys/fs/mshare:: + + mount -t msharefs msharefs /sys/fs/mshare + + 2. mshare regions have alignment and size requirements. Start + address for the region must be aligned to an address boundary and + be a multiple of fixed size. This alignment and size requirement + can be obtained by reading the file ``/sys/fs/mshare/mshare_info`` + which returns a number in text format. mshare regions must be + aligned to this boundary and be a multiple of this size. + + 3. For the process creating an mshare region: + + a. Create a file on /sys/fs/mshare, for example:: + + fd = open("/sys/fs/mshare/shareme", + O_RDWR|O_CREAT|O_EXCL, 0600); + + b. Establish the size of the region:: + + fallocate(fd, 0, 0, BUF_SIZE); + + or:: + + ftruncate(fd, BUF_SIZE); + + c. Map some memory in the region:: + + struct mshare_create mcreate; + + mcreate.region_offset = 0; + mcreate.size = BUF_SIZE; + mcreate.offset = 0; + mcreate.prot = PROT_READ | PROT_WRITE; + mcreate.flags = MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED; + mcreate.fd = -1; + + ioctl(fd, MSHAREFS_CREATE_MAPPING, &mcreate); + + d. Map the mshare region into the process:: + + mmap(NULL, BUF_SIZE, + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + + e. 
Write and read to mshared region normally. + + + 4. For processes attaching an mshare region: + + a. Open the msharefs file, for example:: + + fd = open("/sys/fs/mshare/shareme", O_RDWR); + + b. Get the size of the mshare region from the file:: + + fstat(fd, &sb); + mshare_size = sb.st_size; + + c. Map the mshare region into the process:: + + mmap(NULL, mshare_size, + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + + 5. To delete the mshare region:: + + unlink("/sys/fs/mshare/shareme"); diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index bb575f3ab45e..e53dd6063cba 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -103,5 +103,6 @@ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #define PID_FS_MAGIC 0x50494446 /* "PIDF" */ +#define MSHARE_MAGIC 0x4d534852 /* "MSHR" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/mm/Kconfig b/mm/Kconfig index 4108bcd96784..8b50e9785729 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1400,6 +1400,17 @@ config PT_RECLAIM config FIND_NORMAL_PAGE def_bool n +config MSHARE + bool "Mshare" + depends on MMU + help + Enable msharefs: A pseudo filesystem that allows multiple processes + to share kernel resources for mapping shared pages. A file created on + msharefs represents a shared region where all processes mapping that + region will map objects within it with shared page table entries and + VMAs. Ioctls are used to configure and map objects into the shared + region. + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index ef54aa615d9d..4af111b29c68 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -48,6 +48,10 @@ ifdef CONFIG_64BIT mmu-$(CONFIG_MMU) += mseal.o endif +ifdef CONFIG_MSHARE +mmu-$(CONFIG_MMU) += mshare.o +endif + obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page-writeback.o folio-compat.o \ readahead.o swap.o truncate.o vmscan.o shrinker.o \ diff --git a/mm/mshare.c b/mm/mshare.c new file mode 100644 index 000000000000..f703af49ec81 --- /dev/null +++ b/mm/mshare.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Enable cooperating processes to share page table between + * them to reduce the extra memory consumed by multiple copies + * of page tables. + * + * This code adds an in-memory filesystem - msharefs. + * msharefs is used to manage page table sharing + * + * + * Copyright (C) 2024 Oracle Corp. All rights reserved. 
+ * Author: Khalid Aziz + * + */ + +#include +#include +#include + +static const struct file_operations msharefs_file_operations = { + .open = simple_open, +}; + +static const struct super_operations mshare_s_ops = { + .statfs = simple_statfs, +}; + +static int +msharefs_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct inode *inode; + + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_magic = MSHARE_MAGIC; + sb->s_op = &mshare_s_ops; + sb->s_time_gran = 1; + + inode = new_inode(sb); + if (!inode) + return -ENOMEM; + + inode->i_ino = 1; + inode->i_mode = S_IFDIR | 0777; + simple_inode_init_ts(inode); + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + set_nlink(inode, 2); + + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + + return 0; +} + +static int +msharefs_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, msharefs_fill_super); +} + +static const struct fs_context_operations msharefs_context_ops = { + .get_tree = msharefs_get_tree, +}; + +static int +mshare_init_fs_context(struct fs_context *fc) +{ + fc->ops = &msharefs_context_ops; + return 0; +} + +static struct file_system_type mshare_fs = { + .name = "msharefs", + .init_fs_context = mshare_init_fs_context, + .kill_sb = kill_litter_super, +}; + +static int __init +mshare_init(void) +{ + int ret; + + ret = sysfs_create_mount_point(fs_kobj, "mshare"); + if (ret) + return ret; + + ret = register_filesystem(&mshare_fs); + if (ret) + sysfs_remove_mount_point(fs_kobj, "mshare"); + + return ret; +} + +core_initcall(mshare_init); -- 2.47.1 From: Khalid Aziz Users of mshare need to know the size and alignment requirement for shared regions. Pre-populate msharefs with a file, mshare_info, that provides this information. For now, pagetable sharing is hardcoded to be at the PUD level. 
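For illustration, a consumer of this file might read the value like so. This is only a sketch; it assumes nothing beyond the /sys/fs/mshare mount point and the plain decimal text format described in the msharefs documentation:

    #include <stdio.h>

    /* Return the mshare alignment/size unit in bytes, or 0 on error. */
    static unsigned long read_mshare_info(void)
    {
            FILE *f = fopen("/sys/fs/mshare/mshare_info", "r");
            unsigned long align = 0;

            if (!f)
                    return 0;
            if (fscanf(f, "%lu", &align) != 1)
                    align = 0;
            fclose(f);
            return align;
    }

Region sizes and any MAP_FIXED addresses are then rounded to a multiple of this value before sizing or mapping a region.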
Signed-off-by: Khalid Aziz Signed-off-by: Anthony Yznaga --- mm/mshare.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/mm/mshare.c b/mm/mshare.c index f703af49ec81..d666471bc94b 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -17,18 +17,74 @@ #include #include +const unsigned long mshare_align = P4D_SIZE; + static const struct file_operations msharefs_file_operations = { .open = simple_open, }; +struct msharefs_info { + struct dentry *info_dentry; +}; + +static ssize_t +mshare_info_read(struct file *file, char __user *buf, size_t nbytes, + loff_t *ppos) +{ + char s[80]; + + sprintf(s, "%ld\n", mshare_align); + return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); +} + +static const struct file_operations mshare_info_ops = { + .read = mshare_info_read, + .llseek = noop_llseek, +}; + static const struct super_operations mshare_s_ops = { .statfs = simple_statfs, }; +static int +msharefs_create_mshare_info(struct super_block *sb) +{ + struct msharefs_info *info = sb->s_fs_info; + struct dentry *root = sb->s_root; + struct dentry *dentry; + struct inode *inode; + int ret; + + ret = -ENOMEM; + inode = new_inode(sb); + if (!inode) + goto out; + + inode->i_ino = 2; + simple_inode_init_ts(inode); + inode_init_owner(&nop_mnt_idmap, inode, NULL, S_IFREG | 0444); + inode->i_fop = &mshare_info_ops; + + dentry = d_alloc_name(root, "mshare_info"); + if (!dentry) + goto out; + + info->info_dentry = dentry; + d_add(dentry, inode); + + return 0; +out: + iput(inode); + + return ret; +} + static int msharefs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct msharefs_info *info; struct inode *inode; + int ret; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -37,6 +93,12 @@ msharefs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &mshare_s_ops; sb->s_time_gran = 1; + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + sb->s_fs_info = info; + inode = new_inode(sb); if (!inode) return -ENOMEM; @@ -52,7 +114,9 @@ msharefs_fill_super(struct super_block *sb, struct fs_context *fc) if (!sb->s_root) return -ENOMEM; - return 0; + ret = msharefs_create_mshare_info(sb); + + return ret; } static int @@ -72,10 +136,19 @@ mshare_init_fs_context(struct fs_context *fc) return 0; } +static void +msharefs_kill_super(struct super_block *sb) +{ + struct msharefs_info *info = sb->s_fs_info; + + kfree(info); + kill_litter_super(sb); +} + static struct file_system_type mshare_fs = { .name = "msharefs", .init_fs_context = mshare_init_fs_context, - .kill_sb = kill_litter_super, + .kill_sb = msharefs_kill_super, }; static int __init -- 2.47.1 From: Khalid Aziz Make msharefs filesystem writable and allow creating directories to support better access control to mshare'd regions defined in msharefs. 
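As a hypothetical illustration of the intended access control (the directory and file names below are made up), an application could keep its region files under a directory with a restrictive mode so that only selected users can reach them:

    #include <sys/stat.h>
    #include <fcntl.h>
    #include <errno.h>

    static int create_private_region(void)
    {
            /* Only the owning user may traverse this directory. */
            if (mkdir("/sys/fs/mshare/myapp", 0700) && errno != EEXIST)
                    return -1;

            /* Region files are then reachable only through that directory. */
            return open("/sys/fs/mshare/myapp/region",
                        O_RDWR | O_CREAT | O_EXCL, 0600);
    }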
Signed-off-by: Khalid Aziz Signed-off-by: Anthony Yznaga --- mm/mshare.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 115 insertions(+), 1 deletion(-) diff --git a/mm/mshare.c b/mm/mshare.c index d666471bc94b..c43b53a7323a 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -19,14 +19,128 @@ const unsigned long mshare_align = P4D_SIZE; +static const struct inode_operations msharefs_dir_inode_ops; +static const struct inode_operations msharefs_file_inode_ops; + static const struct file_operations msharefs_file_operations = { .open = simple_open, }; +static struct inode +*msharefs_get_inode(struct mnt_idmap *idmap, struct super_block *sb, + const struct inode *dir, umode_t mode) +{ + struct inode *inode = new_inode(sb); + + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_ino = get_next_ino(); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); + simple_inode_init_ts(inode); + + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_op = &msharefs_file_inode_ops; + inode->i_fop = &msharefs_file_operations; + break; + case S_IFDIR: + inode->i_op = &msharefs_dir_inode_ops; + inode->i_fop = &simple_dir_operations; + inc_nlink(inode); + break; + default: + iput(inode); + return ERR_PTR(-EINVAL); + } + + return inode; +} + +static int +msharefs_mknod(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + inode = msharefs_get_inode(idmap, dir->i_sb, dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + d_instantiate(dentry, inode); + dget(dentry); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + + return 0; +} + +static int +msharefs_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + return msharefs_mknod(idmap, dir, dentry, mode | S_IFREG); +} + +static struct dentry * +msharefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + int ret = msharefs_mknod(idmap, dir, dentry, mode | S_IFDIR); + + if (!ret) + inc_nlink(dir); + return ERR_PTR(ret); +} + struct msharefs_info { struct dentry *info_dentry; }; +static inline bool +is_msharefs_info_file(const struct dentry *dentry) +{ + struct msharefs_info *info = dentry->d_sb->s_fs_info; + + return info->info_dentry == dentry; +} + +static int +msharefs_rename(struct mnt_idmap *idmap, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (is_msharefs_info_file(old_dentry) || + is_msharefs_info_file(new_dentry)) + return -EPERM; + + return simple_rename(idmap, old_dir, old_dentry, new_dir, + new_dentry, flags); +} + +static int +msharefs_unlink(struct inode *dir, struct dentry *dentry) +{ + if (is_msharefs_info_file(dentry)) + return -EPERM; + + return simple_unlink(dir, dentry); +} + +static const struct inode_operations msharefs_file_inode_ops = { + .setattr = simple_setattr, +}; + +static const struct inode_operations msharefs_dir_inode_ops = { + .create = msharefs_create, + .lookup = simple_lookup, + .link = simple_link, + .unlink = msharefs_unlink, + .mkdir = msharefs_mkdir, + .rmdir = simple_rmdir, + .rename = msharefs_rename, +}; + static ssize_t mshare_info_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) @@ -106,7 +220,7 @@ msharefs_fill_super(struct super_block *sb, struct fs_context *fc) inode->i_ino = 1; inode->i_mode = S_IFDIR | 0777; simple_inode_init_ts(inode); - inode->i_op = &simple_dir_inode_operations; + inode->i_op = 
&msharefs_dir_inode_ops; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); -- 2.47.1 When a new file is created under msharefs, allocate a new mm_struct to be associated with it for the lifetime of the file. The mm_struct will hold the VMAs and pagetables for the mshare region the file represents. Signed-off-by: Khalid Aziz Signed-off-by: Anthony Yznaga --- mm/mshare.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/mm/mshare.c b/mm/mshare.c index c43b53a7323a..400f198c0791 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -19,6 +19,11 @@ const unsigned long mshare_align = P4D_SIZE; +struct mshare_data { + struct mm_struct *mm; + refcount_t ref; +}; + static const struct inode_operations msharefs_dir_inode_ops; static const struct inode_operations msharefs_file_inode_ops; @@ -26,11 +31,55 @@ static const struct file_operations msharefs_file_operations = { .open = simple_open, }; +static int +msharefs_fill_mm(struct inode *inode) +{ + struct mm_struct *mm; + struct mshare_data *m_data = NULL; + int ret = -ENOMEM; + + mm = mm_alloc(); + if (!mm) + return -ENOMEM; + + mm->mmap_base = mm->task_size = 0; + + m_data = kzalloc(sizeof(*m_data), GFP_KERNEL); + if (!m_data) + goto err_free; + m_data->mm = mm; + + refcount_set(&m_data->ref, 1); + inode->i_private = m_data; + return 0; + +err_free: + mmput(mm); + kfree(m_data); + return ret; +} + +static void +msharefs_delmm(struct mshare_data *m_data) +{ + mmput(m_data->mm); + kfree(m_data); +} + +static void mshare_data_putref(struct mshare_data *m_data) +{ + if (!refcount_dec_and_test(&m_data->ref)) + return; + + msharefs_delmm(m_data); +} + static struct inode *msharefs_get_inode(struct mnt_idmap *idmap, struct super_block *sb, const struct inode *dir, umode_t mode) { struct inode *inode = new_inode(sb); + int ret; if (!inode) return ERR_PTR(-ENOMEM); @@ -43,6 +92,11 @@ static struct inode case S_IFREG: inode->i_op = &msharefs_file_inode_ops; inode->i_fop = &msharefs_file_operations; + ret = msharefs_fill_mm(inode); + if (ret) { + iput(inode); + inode = ERR_PTR(ret); + } break; case S_IFDIR: inode->i_op = &msharefs_dir_inode_ops; @@ -141,6 +195,19 @@ static const struct inode_operations msharefs_dir_inode_ops = { .rename = msharefs_rename, }; +static void +msharefs_evict_inode(struct inode *inode) +{ + struct mshare_data *m_data = inode->i_private; + + if (!m_data) + goto out; + + mshare_data_putref(m_data); +out: + clear_inode(inode); +} + static ssize_t mshare_info_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) @@ -158,6 +225,7 @@ static const struct file_operations mshare_info_ops = { static const struct super_operations mshare_s_ops = { .statfs = simple_statfs, + .evict_inode = msharefs_evict_inode, }; static int -- 2.47.1 Add file and inode operations to allow the size of an mshare region to be set fallocate() or ftruncate(). 
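For example (a sketch only; the caller is assumed to pass a size that is a multiple of the value reported by mshare_info), the creating process establishes the size exactly once before the region can be mapped:

    #define _GNU_SOURCE
    #include <fcntl.h>

    /* Establish the region size; this may only succeed once per file. */
    static int size_region(int fd, off_t size)
    {
            /*
             * The size must be a multiple of the mshare alignment and the
             * offset must be 0, otherwise EINVAL is returned.
             * ftruncate(fd, size) is an equivalent alternative.
             */
            return fallocate(fd, 0, 0, size);
    }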
Signed-off-by: Anthony Yznaga --- mm/mshare.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/mm/mshare.c b/mm/mshare.c index 400f198c0791..bf859b176e09 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -16,19 +16,78 @@ #include #include #include +#include const unsigned long mshare_align = P4D_SIZE; +#define MSHARE_INITIALIZED 0x1 + struct mshare_data { struct mm_struct *mm; refcount_t ref; + unsigned long size; + unsigned long flags; }; +static inline bool mshare_is_initialized(struct mshare_data *m_data) +{ + return test_bit(MSHARE_INITIALIZED, &m_data->flags); +} + +static int msharefs_set_size(struct mshare_data *m_data, unsigned long size) +{ + int error = -EINVAL; + + if (mshare_is_initialized(m_data)) + goto out; + + if (m_data->size || (size & (mshare_align - 1))) + goto out; + + m_data->mm->task_size = m_data->size = size; + + set_bit(MSHARE_INITIALIZED, &m_data->flags); + error = 0; +out: + return error; +} + +static long msharefs_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct mshare_data *m_data = inode->i_private; + int error; + + if (mode != FALLOC_FL_ALLOCATE_RANGE) + return -EOPNOTSUPP; + + if (offset) + return -EINVAL; + + inode_lock(inode); + + error = inode_newsize_ok(inode, len); + if (error) + goto out; + + error = msharefs_set_size(m_data, len); + if (error) + goto out; + + i_size_write(inode, len); +out: + inode_unlock(inode); + + return error; +} + static const struct inode_operations msharefs_dir_inode_ops; static const struct inode_operations msharefs_file_inode_ops; static const struct file_operations msharefs_file_operations = { .open = simple_open, + .fallocate = msharefs_fallocate, }; static int @@ -128,6 +187,32 @@ msharefs_mknod(struct mnt_idmap *idmap, struct inode *dir, return 0; } +static int msharefs_setattr(struct mnt_idmap *idmap, + struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct mshare_data *m_data = inode->i_private; + unsigned int ia_valid = attr->ia_valid; + int error; + + error = setattr_prepare(idmap, dentry, attr); + if (error) + return error; + + if (ia_valid & ATTR_SIZE) { + loff_t newsize = attr->ia_size; + + error = msharefs_set_size(m_data, newsize); + if (error) + return error; + + i_size_write(inode, newsize); + } + + setattr_copy(idmap, inode, attr); + return 0; +} + static int msharefs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) @@ -182,7 +267,7 @@ msharefs_unlink(struct inode *dir, struct dentry *dentry) } static const struct inode_operations msharefs_file_inode_ops = { - .setattr = simple_setattr, + .setattr = msharefs_setattr, }; static const struct inode_operations msharefs_dir_inode_ops = { -- 2.47.1 From: Khalid Aziz An mshare region contains zero or more actual vmas that map objects in the mshare range with shared page tables. 
Signed-off-by: Khalid Aziz Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Anthony Yznaga --- include/linux/mm.h | 19 +++++++++++++++++++ include/trace/events/mmflags.h | 7 +++++++ 2 files changed, 26 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 76ee2bfaa8bd..aca853b4c5dc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -431,6 +431,13 @@ extern unsigned int kobjsize(const void *objp); #define VM_SEALED VM_NONE #endif +#ifdef CONFIG_MSHARE +#define VM_MSHARE_BIT 43 +#define VM_MSHARE BIT(VM_MSHARE_BIT) +#else +#define VM_MSHARE VM_NONE +#endif + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) @@ -991,6 +998,18 @@ static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; int vma_is_stack_for_current(struct vm_area_struct *vma); +#ifdef CONFIG_MSHARE +static inline bool vma_is_mshare(const struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_MSHARE; +} +#else +static inline bool vma_is_mshare(const struct vm_area_struct *vma) +{ + return false; +} +#endif + /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index aa441f593e9a..a9b13a8513d0 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -200,6 +200,12 @@ IF_HAVE_PG_ARCH_3(arch_3) # define IF_HAVE_VM_DROPPABLE(flag, name) #endif +#ifdef CONFIG_MSHARE +# define IF_HAVE_VM_MSHARE(flag, name) {flag, name}, +#else +# define IF_HAVE_VM_MSHARE(flag, name) +#endif + #define __def_vmaflag_names \ {VM_READ, "read" }, \ {VM_WRITE, "write" }, \ @@ -233,6 +239,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ {VM_HUGEPAGE, "hugepage" }, \ {VM_NOHUGEPAGE, "nohugepage" }, \ IF_HAVE_VM_DROPPABLE(VM_DROPPABLE, "droppable" ) \ +IF_HAVE_VM_MSHARE(VM_MSHARE, "mshare" ) \ {VM_MERGEABLE, "mergeable" } \ #define show_vma_flags(flags) \ -- 2.47.1 From: Khalid Aziz Add support for mapping an mshare region into a process after the region has been established in msharefs. Disallow operations that could split the resulting msharefs vma such as partial unmaps and protection changes. Fault handling, mapping, unmapping, and protection changes for objects mapped into an mshare region will be done using the shared vmas created for them in the host mm. This functionality will be added in later patches. 
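To sketch the resulting userspace behaviour (assuming, as in the documentation above, that the file has already been created and sized to region_size):

    #include <stddef.h>
    #include <sys/mman.h>

    static void *attach_region(int fd, size_t region_size)
    {
            /* The mapping must be MAP_SHARED and cover the whole region. */
            void *p = mmap(NULL, region_size, PROT_READ | PROT_WRITE,
                           MAP_SHARED, fd, 0);

            if (p == MAP_FAILED)
                    return NULL;

            /*
             * The resulting vma cannot be split: a partial munmap() or a
             * protection change via mprotect() is expected to fail with
             * EINVAL.
             */
            return p;
    }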
Signed-off-by: Khalid Aziz Signed-off-by: Anthony Yznaga --- mm/mshare.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 132 insertions(+), 1 deletion(-) diff --git a/mm/mshare.c b/mm/mshare.c index bf859b176e09..e0dc42602f7f 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -15,16 +15,19 @@ #include #include +#include #include #include const unsigned long mshare_align = P4D_SIZE; +const unsigned long mshare_base = mshare_align; #define MSHARE_INITIALIZED 0x1 struct mshare_data { struct mm_struct *mm; refcount_t ref; + unsigned long start; unsigned long size; unsigned long flags; }; @@ -34,6 +37,130 @@ static inline bool mshare_is_initialized(struct mshare_data *m_data) return test_bit(MSHARE_INITIALIZED, &m_data->flags); } +static int mshare_vm_op_split(struct vm_area_struct *vma, unsigned long addr) +{ + return -EINVAL; +} + +static int mshare_vm_op_mprotect(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags) +{ + return -EINVAL; +} + +static const struct vm_operations_struct msharefs_vm_ops = { + .may_split = mshare_vm_op_split, + .mprotect = mshare_vm_op_mprotect, +}; + +/* + * msharefs_mmap() - mmap an mshare region + */ +static int +msharefs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct mshare_data *m_data = file->private_data; + + vma->vm_private_data = m_data; + vm_flags_set(vma, VM_MSHARE | VM_DONTEXPAND); + vma->vm_ops = &msharefs_vm_ops; + + return 0; +} + +static unsigned long +msharefs_get_unmapped_area_bottomup(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct vm_unmapped_area_info info = {}; + + info.length = len; + info.low_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_end(addr, len, flags); + info.align_mask = PAGE_MASK & (mshare_align - 1); + return vm_unmapped_area(&info); +} + +static unsigned long +msharefs_get_unmapped_area_topdown(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct vm_unmapped_area_info info = {}; + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); + info.align_mask = PAGE_MASK & (mshare_align - 1); + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. 
+ */ + if (unlikely(offset_in_page(addr))) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_end(addr, len, flags); + addr = vm_unmapped_area(&info); + } + + return addr; +} + +static unsigned long +msharefs_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mshare_data *m_data = file->private_data; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + unsigned long mshare_start, mshare_size; + const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); + + mmap_assert_write_locked(mm); + + if ((flags & MAP_TYPE) == MAP_PRIVATE) + return -EINVAL; + + if (!mshare_is_initialized(m_data)) + return -EINVAL; + + mshare_start = m_data->start; + mshare_size = m_data->size; + + if (len != mshare_size) + return -EINVAL; + + if (len > mmap_end - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) { + if (!IS_ALIGNED(addr, mshare_align)) + return -EINVAL; + return addr; + } + + if (addr) { + addr = ALIGN(addr, mshare_align); + vma = find_vma_prev(mm, addr, &prev); + if (mmap_end - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) + return addr; + } + + if (!mm_flags_test(MMF_TOPDOWN, mm)) + return msharefs_get_unmapped_area_bottomup(file, addr, len, + pgoff, flags); + else + return msharefs_get_unmapped_area_topdown(file, addr, len, + pgoff, flags); +} + static int msharefs_set_size(struct mshare_data *m_data, unsigned long size) { int error = -EINVAL; @@ -87,6 +214,8 @@ static const struct inode_operations msharefs_file_inode_ops; static const struct file_operations msharefs_file_operations = { .open = simple_open, + .mmap = msharefs_mmap, + .get_unmapped_area = msharefs_get_unmapped_area, .fallocate = msharefs_fallocate, }; @@ -101,12 +230,14 @@ msharefs_fill_mm(struct inode *inode) if (!mm) return -ENOMEM; - mm->mmap_base = mm->task_size = 0; + mm->mmap_base = mshare_base; + mm->task_size = 0; m_data = kzalloc(sizeof(*m_data), GFP_KERNEL); if (!m_data) goto err_free; m_data->mm = mm; + m_data->start = mshare_base; refcount_set(&m_data->ref, 1); inode->i_private = m_data; -- 2.47.1 Unlike the mm of a task, an mshare host mm is not updated on context switch. In particular this means that mm_cpumask is never updated which results in TLB flushes for updates to mshare PTEs only being done on the local CPU. To ensure entries are flushed for non-local TLBs, set up an mmu notifier on the mshare mm and use the .arch_invalidate_secondary_tlbs callback to flush all TLBs. arch_invalidate_secondary_tlbs guarantees that TLB entries will be flushed before pages are freed when unmapping pages in an mshare region. 
Signed-off-by: Anthony Yznaga --- mm/mshare.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/mshare.c b/mm/mshare.c index e0dc42602f7f..be7cae739225 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -16,8 +16,10 @@ #include #include #include +#include #include #include +#include const unsigned long mshare_align = P4D_SIZE; const unsigned long mshare_base = mshare_align; @@ -30,6 +32,7 @@ struct mshare_data { unsigned long start; unsigned long size; unsigned long flags; + struct mmu_notifier mn; }; static inline bool mshare_is_initialized(struct mshare_data *m_data) @@ -37,6 +40,16 @@ static inline bool mshare_is_initialized(struct mshare_data *m_data) return test_bit(MSHARE_INITIALIZED, &m_data->flags); } +static void mshare_invalidate_tlbs(struct mmu_notifier *mn, struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + flush_tlb_all(); +} + +static const struct mmu_notifier_ops mshare_mmu_ops = { + .arch_invalidate_secondary_tlbs = mshare_invalidate_tlbs, +}; + static int mshare_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { return -EINVAL; @@ -238,6 +251,10 @@ msharefs_fill_mm(struct inode *inode) goto err_free; m_data->mm = mm; m_data->start = mshare_base; + m_data->mn.ops = &mshare_mmu_ops; + ret = mmu_notifier_register(&m_data->mn, mm); + if (ret) + goto err_free; refcount_set(&m_data->ref, 1); inode->i_private = m_data; -- 2.47.1 Scanning an msharefs vma results in changes to the shared page table but with TLB flushes incorrectly only going to the process with the vma. Signed-off-by: Anthony Yznaga --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e256793b9a08..6f28395991cd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3379,7 +3379,8 @@ static void task_numa_work(struct callback_head *work) for (; vma; vma = vma_next(&vmi)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || - is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { + is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP) || + vma_is_mshare(vma)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); continue; } -- 2.47.1 This will be used to support mshare functionality where the read lock on an mshare host mm is taken while holding the lock on a process mm. Signed-off-by: Anthony Yznaga --- include/linux/mmap_lock.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 2c9fffa58714..3cf7219306a1 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -369,6 +369,13 @@ static inline void mmap_read_lock(struct mm_struct *mm) __mmap_lock_trace_acquire_returned(mm, false, true); } +static inline void mmap_read_lock_nested(struct mm_struct *mm, int subclass) +{ + __mmap_lock_trace_start_locking(mm, false); + down_read_nested(&mm->mmap_lock, subclass); + __mmap_lock_trace_acquire_returned(mm, false, true); +} + static inline int mmap_read_lock_killable(struct mm_struct *mm) { int ret; -- 2.47.1 Special handling is needed when unmapping a hugetlb vma and will be needed when unmapping an msharefs vma once support is added for handling faults in an mshare region. 
Signed-off-by: Anthony Yznaga --- include/linux/mm.h | 10 ++++++++++ ipc/shm.c | 17 +++++++++++++++++ mm/hugetlb.c | 25 +++++++++++++++++++++++++ mm/memory.c | 36 +++++++++++++----------------------- 4 files changed, 65 insertions(+), 23 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index aca853b4c5dc..96440082a633 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -43,6 +43,7 @@ struct anon_vma_chain; struct user_struct; struct pt_regs; struct folio_batch; +struct zap_details; void arch_mm_preinit(void); void mm_core_init(void); @@ -681,8 +682,17 @@ struct vm_operations_struct { struct page *(*find_normal_page)(struct vm_area_struct *vma, unsigned long addr); #endif /* CONFIG_FIND_NORMAL_PAGE */ + void (*unmap_page_range)(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details); }; +void __unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details); + #ifdef CONFIG_NUMA_BALANCING static inline void vma_numab_state_init(struct vm_area_struct *vma) { diff --git a/ipc/shm.c b/ipc/shm.c index a9310b6dbbc3..14376b63d46a 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -588,6 +588,22 @@ static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, } #endif +static void shm_unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + struct file *file = vma->vm_file; + struct shm_file_data *sfd = shm_file_data(file); + + if (sfd->vm_ops->unmap_page_range) { + sfd->vm_ops->unmap_page_range(tlb, vma, addr, end, details); + return; + } + + __unmap_page_range(tlb, vma, addr, end, details); +} + static int shm_mmap(struct file *file, struct vm_area_struct *vma) { struct shm_file_data *sfd = shm_file_data(file); @@ -688,6 +704,7 @@ static const struct vm_operations_struct shm_vm_ops = { .set_policy = shm_set_policy, .get_policy = shm_get_policy, #endif + .unmap_page_range = shm_unmap_page_range, }; /** diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 514fab5a20ef..3fc6eb8a5858 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5461,6 +5461,30 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) return 0; } +static void hugetlb_vm_op_unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + zap_flags_t zap_flags = details ? details->zap_flags : 0; + + /* + * It is undesirable to test vma->vm_file as it + * should be non-null for valid hugetlb area. + * However, vm_file will be NULL in the error + * cleanup path of mmap_region. When + * hugetlbfs ->mmap method fails, + * mmap_region() nullifies vma->vm_file + * before calling this function to clean up. + * Since no pte has actually been setup, it is + * safe to do nothing in this case. + */ + if (!vma->vm_file) + return; + + __unmap_hugepage_range(tlb, vma, addr, end, NULL, zap_flags); +} + /* * When a new function is introduced to vm_operations_struct and added * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. 
@@ -5474,6 +5498,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { .close = hugetlb_vm_op_close, .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, + .unmap_page_range = hugetlb_vm_op_unmap_page_range, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, diff --git a/mm/memory.c b/mm/memory.c index 002c28795d8b..dbc299aa82c2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1999,7 +1999,7 @@ static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, return addr; } -void unmap_page_range(struct mmu_gather *tlb, +void __unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) @@ -2019,6 +2019,16 @@ void unmap_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } +void unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + if (vma->vm_ops && vma->vm_ops->unmap_page_range) + vma->vm_ops->unmap_page_range(tlb, vma, addr, end, details); + else + __unmap_page_range(tlb, vma, addr, end, details); +} static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, @@ -2037,28 +2047,8 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) uprobe_munmap(vma, start, end); - if (start != end) { - if (unlikely(is_vm_hugetlb_page(vma))) { - /* - * It is undesirable to test vma->vm_file as it - * should be non-null for valid hugetlb area. - * However, vm_file will be NULL in the error - * cleanup path of mmap_region. When - * hugetlbfs ->mmap method fails, - * mmap_region() nullifies vma->vm_file - * before calling this function to clean up. - * Since no pte has actually been setup, it is - * safe to do nothing in this case. - */ - if (vma->vm_file) { - zap_flags_t zap_flags = details ? - details->zap_flags : 0; - __unmap_hugepage_range(tlb, vma, start, end, - NULL, zap_flags); - } - } else - unmap_page_range(tlb, vma, start, end, details); - } + if (start != end) + unmap_page_range(tlb, vma, start, end, details); } /** -- 2.47.1 Once an mshare shared page table has been linked with one or more process page tables it becomes necessary to ensure that the shared page table is not completely freed when objects in it are unmapped in order to avoid a potential UAF bug. To do this, introduce and use a reference count for PUD pages. Signed-off-by: Anthony Yznaga --- include/linux/mm.h | 1 + include/linux/mm_types.h | 36 ++++++++++++++++++++++++++++++++++-- mm/memory.c | 21 +++++++++++++++++++-- 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 96440082a633..c8dfa5c6e7d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3217,6 +3217,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) { + ptdesc_pud_pts_init(ptdesc); __pagetable_ctor(ptdesc); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c8f4d2a2c60b..da5a7a31a81d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -537,7 +537,7 @@ FOLIO_MATCH(compound_head, _head_3); * @pt_index: Used for s390 gmap. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. - * @pt_share_count: Used for HugeTLB PMD page table share count. + * @pt_share_count: Used for HugeTLB PMD or Mshare PUD page table share count. 
* @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. @@ -564,7 +564,7 @@ struct ptdesc { pgoff_t pt_index; struct mm_struct *pt_mm; atomic_t pt_frag_refcount; -#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING +#if defined(CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING) || defined(CONFIG_MSHARE) atomic_t pt_share_count; #endif }; @@ -638,6 +638,38 @@ static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) } #endif +#ifdef CONFIG_MSHARE +static inline void ptdesc_pud_pts_init(struct ptdesc *ptdesc) +{ + atomic_set(&ptdesc->pt_share_count, 0); +} + +static inline void ptdesc_pud_pts_inc(struct ptdesc *ptdesc) +{ + atomic_inc(&ptdesc->pt_share_count); +} + +static inline void ptdesc_pud_pts_dec(struct ptdesc *ptdesc) +{ + atomic_dec(&ptdesc->pt_share_count); +} + +static inline int ptdesc_pud_pts_count(struct ptdesc *ptdesc) +{ + return atomic_read(&ptdesc->pt_share_count); +} +#else +static inline void ptdesc_pud_pts_init(struct ptdesc *ptdesc) +{ +} + +static inline int ptdesc_pud_pts_count(struct ptdesc *ptdesc) +{ + return 0; +} +#endif + + /* * Used for sizing the vmemmap region on some architectures */ diff --git a/mm/memory.c b/mm/memory.c index dbc299aa82c2..4e3bb49b95e2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -228,9 +228,18 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, mm_dec_nr_pmds(tlb->mm); } +static inline bool pud_range_is_shared(pud_t *pud) +{ + if (ptdesc_pud_pts_count(virt_to_ptdesc(pud))) + return true; + + return false; +} + static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling) + unsigned long floor, unsigned long ceiling, + bool *pud_is_shared) { pud_t *pud; unsigned long next; @@ -257,6 +266,10 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, return; pud = pud_offset(p4d, start); + if (unlikely(pud_range_is_shared(pud))) { + *pud_is_shared = true; + return; + } p4d_clear(p4d); pud_free_tlb(tlb, pud, start); mm_dec_nr_puds(tlb->mm); @@ -269,6 +282,7 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, p4d_t *p4d; unsigned long next; unsigned long start; + bool pud_is_shared = false; start = addr; p4d = p4d_offset(pgd, addr); @@ -276,7 +290,8 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; - free_pud_range(tlb, p4d, addr, next, floor, ceiling); + free_pud_range(tlb, p4d, addr, next, floor, ceiling, + &pud_is_shared); } while (p4d++, addr = next, addr != end); start &= PGDIR_MASK; @@ -290,6 +305,8 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, if (end - 1 > ceiling - 1) return; + if (unlikely(pud_is_shared)) + return; p4d = p4d_offset(pgd, start); pgd_clear(pgd); p4d_free_tlb(tlb, p4d, start); -- 2.47.1 From: Khalid Aziz In preparation for enabling the handling of page faults in an mshare region provide a way to link an mshare shared page table to a process page table and otherwise find the actual vma in order to handle a page fault. Implement an unmap_page_range vm_ops function for msharefs VMAs to unlink shared page tables when a process exits or an mshare region is explicitly unmapped. 
Signed-off-by: Khalid Aziz Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Anthony Yznaga --- include/linux/mm.h | 6 +++ mm/memory.c | 6 +++ mm/mshare.c | 107 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index c8dfa5c6e7d4..3a8dddb5925a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1009,11 +1009,17 @@ static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; int vma_is_stack_for_current(struct vm_area_struct *vma); #ifdef CONFIG_MSHARE +vm_fault_t find_shared_vma(struct vm_area_struct **vma, unsigned long *addrp); static inline bool vma_is_mshare(const struct vm_area_struct *vma) { return vma->vm_flags & VM_MSHARE; } #else +static inline vm_fault_t find_shared_vma(struct vm_area_struct **vma, unsigned long *addrp) +{ + WARN_ON_ONCE(1); + return VM_FAULT_SIGBUS; +} static inline bool vma_is_mshare(const struct vm_area_struct *vma) { return false; diff --git a/mm/memory.c b/mm/memory.c index 4e3bb49b95e2..177eb53475cb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6475,6 +6475,12 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (ret) goto out; + if (unlikely(vma_is_mshare(vma))) { + WARN_ON_ONCE(1); + ret = VM_FAULT_SIGBUS; + goto out; + } + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) { diff --git a/mm/mshare.c b/mm/mshare.c index be7cae739225..f7b7904f0405 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -21,6 +21,8 @@ #include #include +#include + const unsigned long mshare_align = P4D_SIZE; const unsigned long mshare_base = mshare_align; @@ -50,6 +52,66 @@ static const struct mmu_notifier_ops mshare_mmu_ops = { .arch_invalidate_secondary_tlbs = mshare_invalidate_tlbs, }; +static p4d_t *walk_to_p4d(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + + pgd = pgd_offset(mm, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; + + return p4d; +} + +/* Returns holding the host mm's lock for read. Caller must release. */ +vm_fault_t +find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp) +{ + struct vm_area_struct *vma, *guest = *vmap; + struct mshare_data *m_data = guest->vm_private_data; + struct mm_struct *host_mm = m_data->mm; + unsigned long host_addr; + p4d_t *p4d, *guest_p4d; + + mmap_read_lock_nested(host_mm, SINGLE_DEPTH_NESTING); + host_addr = *addrp - guest->vm_start + host_mm->mmap_base; + p4d = walk_to_p4d(host_mm, host_addr); + guest_p4d = walk_to_p4d(guest->vm_mm, *addrp); + if (!p4d_same(*guest_p4d, *p4d)) { + spinlock_t *guest_ptl = &guest->vm_mm->page_table_lock; + + spin_lock(guest_ptl); + if (!p4d_same(*guest_p4d, *p4d)) { + pud_t *pud = p4d_pgtable(*p4d); + + ptdesc_pud_pts_inc(virt_to_ptdesc(pud)); + set_p4d(guest_p4d, *p4d); + spin_unlock(guest_ptl); + mmap_read_unlock(host_mm); + return VM_FAULT_NOPAGE; + } + spin_unlock(guest_ptl); + } + + *addrp = host_addr; + vma = find_vma(host_mm, host_addr); + + /* XXX: expand stack? 
*/ + if (vma && vma->vm_start > host_addr) + vma = NULL; + + *vmap = vma; + + /* + * release host mm lock unless a matching vma is found + */ + if (!vma) + mmap_read_unlock(host_mm); + return 0; +} + static int mshare_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { return -EINVAL; @@ -61,9 +123,54 @@ static int mshare_vm_op_mprotect(struct vm_area_struct *vma, unsigned long start return -EINVAL; } +/* + * Unlink any shared page tables in the range and ensure TLBs are flushed. + * Pages in the mshare region itself are not unmapped. + */ +static void mshare_vm_op_unshare_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl = &mm->page_table_lock; + unsigned long sz = mshare_align; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + WARN_ON(!vma_is_mshare(vma)); + + tlb_start_vma(tlb, vma); + + for (; addr < end ; addr += sz) { + spin_lock(ptl); + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) { + spin_unlock(ptl); + continue; + } + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) { + spin_unlock(ptl); + continue; + } + pud = p4d_pgtable(*p4d); + ptdesc_pud_pts_dec(virt_to_ptdesc(pud)); + + p4d_clear(p4d); + spin_unlock(ptl); + tlb_flush_p4d_range(tlb, addr, sz); + } + + tlb_end_vma(tlb, vma); +} + static const struct vm_operations_struct msharefs_vm_ops = { .may_split = mshare_vm_op_split, .mprotect = mshare_vm_op_mprotect, + .unmap_page_range = mshare_vm_op_unshare_page_range, }; /* -- 2.47.1 Enable x86 support for handling page faults in an mshare region by redirecting page faults to operate on the mshare mm_struct and vmas contained in it. Some permissions checks are done using vma flags in architecture-specfic fault handling code so the actual vma needed to complete the handling is acquired before calling handle_mm_fault(). Because of this an ARCH_SUPPORTS_MSHARE config option is added. 
Signed-off-by: Anthony Yznaga --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + arch/x86/mm/fault.c | 40 +++++++++++++++++++++++++++++++++++++++- mm/Kconfig | 2 +- 4 files changed, 44 insertions(+), 2 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index d1b4ffd6e085..2e10a11fc442 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1676,6 +1676,9 @@ config HAVE_ARCH_PFN_VALID config ARCH_SUPPORTS_DEBUG_PAGEALLOC bool +config ARCH_SUPPORTS_MSHARE + bool + config ARCH_SUPPORTS_PAGE_TABLE_CHECK bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100..1ad252eec417 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -124,6 +124,7 @@ config X86 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_HUGETLBFS + select ARCH_SUPPORTS_MSHARE if X86_64 select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 998bd807fc7b..2a7df3aa13b4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1215,6 +1215,8 @@ void do_user_addr_fault(struct pt_regs *regs, struct mm_struct *mm; vm_fault_t fault; unsigned int flags = FAULT_FLAG_DEFAULT; + bool is_shared_vma; + unsigned long addr; tsk = current; mm = tsk->mm; @@ -1328,6 +1330,12 @@ void do_user_addr_fault(struct pt_regs *regs, if (!vma) goto lock_mmap; + /* mshare does not support per-VMA locks yet */ + if (vma_is_mshare(vma)) { + vma_end_read(vma); + goto lock_mmap; + } + if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, NULL, vma); count_vm_vma_lock_event(VMA_LOCK_SUCCESS); @@ -1356,17 +1364,38 @@ void do_user_addr_fault(struct pt_regs *regs, lock_mmap: retry: + addr = address; + is_shared_vma = false; vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { bad_area_nosemaphore(regs, error_code, address); return; } + if (unlikely(vma_is_mshare(vma))) { + fault = find_shared_vma(&vma, &addr); + + if (fault) { + mmap_read_unlock(mm); + goto done; + } + + if (!vma) { + mmap_read_unlock(mm); + bad_area_nosemaphore(regs, error_code, address); + return; + } + + is_shared_vma = true; + } + /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ if (unlikely(access_error(error_code, vma))) { + if (unlikely(is_shared_vma)) + mmap_read_unlock(vma->vm_mm); bad_area_access_error(regs, error_code, address, mm, vma); return; } @@ -1384,7 +1413,14 @@ void do_user_addr_fault(struct pt_regs *regs, * userland). The return to userland is identified whenever * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. */ - fault = handle_mm_fault(vma, address, flags, regs); + fault = handle_mm_fault(vma, addr, flags, regs); + + /* + * If the lock on the shared mm has been released, release the lock + * on the task's mm now. 
+ */ + if (unlikely(is_shared_vma) && (fault & (VM_FAULT_COMPLETED | VM_FAULT_RETRY))) + mmap_read_unlock(mm); if (fault_signal_pending(fault, regs)) { /* @@ -1412,6 +1448,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto retry; } + if (unlikely(is_shared_vma)) + mmap_read_unlock(vma->vm_mm); mmap_read_unlock(mm); done: if (likely(!(fault & VM_FAULT_ERROR))) diff --git a/mm/Kconfig b/mm/Kconfig index 8b50e9785729..824da2a481f9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1402,7 +1402,7 @@ config FIND_NORMAL_PAGE config MSHARE bool "Mshare" - depends on MMU + depends on MMU && ARCH_SUPPORTS_MSHARE help Enable msharefs: A pseudo filesystem that allows multiple processes to share kernel resources for mapping shared pages. A file created on -- 2.47.1 In preparation for mapping objects into an mshare region, create __do_mmap() to allow mapping into a specified mm. There are no functional changes otherwise. Signed-off-by: Anthony Yznaga --- include/linux/mm.h | 16 ++++++++++++++++ mm/mmap.c | 10 +++++----- mm/vma.c | 12 ++++++------ mm/vma.h | 2 +- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a8dddb5925a..07e0a15a4618 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3434,10 +3434,26 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return __get_unmapped_area(file, addr, len, pgoff, flags, 0); } +#ifdef CONFIG_MMU +unsigned long __do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, unsigned long flags, + vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, + struct list_head *uf, struct mm_struct *mm); +static inline unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, unsigned long flags, + vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, + struct list_head *uf) +{ + return __do_mmap(file, addr, len, prot, flags, vm_flags, pgoff, + populate, uf, current->mm); +} +#else extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); +#endif + extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); diff --git a/mm/mmap.c b/mm/mmap.c index 7a057e0e8da9..18f266a511e2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -275,7 +275,7 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode, } /** - * do_mmap() - Perform a userland memory mapping into the current process + * __do_mmap() - Perform a userland memory mapping into the current process * address space of length @len with protection bits @prot, mmap flags @flags * (from which VMA flags will be inferred), and any additional VMA flags to * apply @vm_flags. If this is a file-backed mapping then the file is specified @@ -327,17 +327,17 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode, * @uf: An optional pointer to a list head to track userfaultfd unmap events * should unmapping events arise. If provided, it is up to the caller to manage * this. + * @mm: The mm_struct * * Returns: Either an error, or the address at which the requested mapping has * been performed. 
*/ -unsigned long do_mmap(struct file *file, unsigned long addr, +unsigned long __do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, - struct list_head *uf) + struct list_head *uf, struct mm_struct *mm) { - struct mm_struct *mm = current->mm; int pkey = 0; *populate = 0; @@ -555,7 +555,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_NORESERVE; } - addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf, mm); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) diff --git a/mm/vma.c b/mm/vma.c index 3b12c7579831..a7fbd339d259 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2637,9 +2637,8 @@ static bool can_set_ksm_flags_early(struct mmap_state *map) static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) + struct list_head *uf, struct mm_struct *mm) { - struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; int error; bool have_mmap_prepare = file && file->f_op->mmap_prepare; @@ -2706,18 +2705,19 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, * the virtual page offset in memory of the anonymous mapping. * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap * events. + * @mm: The mm struct * * Returns: Either an error, or the address at which the requested mapping has * been performed. */ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) + struct list_head *uf, struct mm_struct *mm) { unsigned long ret; bool writable_file_mapping = false; - mmap_assert_write_locked(current->mm); + mmap_assert_write_locked(mm); /* Check to see if MDWE is applicable. */ if (map_deny_write_exec(vm_flags, vm_flags)) @@ -2736,13 +2736,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, writable_file_mapping = true; } - ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); + ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf, mm); /* Clear our write mapping regardless of error. */ if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); - validate_mm(current->mm); + validate_mm(mm); return ret; } diff --git a/mm/vma.h b/mm/vma.h index bcdc261c5b15..20fc1c2a32fd 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -352,7 +352,7 @@ void mm_drop_all_locks(struct mm_struct *mm); unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf); + struct list_head *uf, struct mm_struct *mm); int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); -- 2.47.1 Allow unmap to work with an mshare host mm. 
Signed-off-by: Anthony Yznaga --- mm/vma.c | 10 ++++++---- mm/vma.h | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/vma.c b/mm/vma.c index a7fbd339d259..c09b2e1a08e6 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -1265,7 +1265,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, struct vm_area_struct *vma; struct mm_struct *mm; - mm = current->mm; + mm = vms->mm; mm->map_count -= vms->vma_count; mm->locked_vm -= vms->locked_vm; if (vms->unlock) @@ -1473,13 +1473,15 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, * @start: The aligned start address to munmap * @end: The aligned end address to munmap * @uf: The userfaultfd list_head + * @mm: The mm struct * @unlock: Unlock after the operation. Only unlocked on success */ static void init_vma_munmap(struct vma_munmap_struct *vms, struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf, - bool unlock) + struct mm_struct *mm, bool unlock) { + vms->mm = mm; vms->vmi = vmi; vms->vma = vma; if (vma) { @@ -1523,7 +1525,7 @@ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vma_munmap_struct vms; int error; - init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock); + init_vma_munmap(&vms, vmi, vma, start, end, uf, mm, unlock); error = vms_gather_munmap_vmas(&vms, &mas_detach); if (error) goto gather_failed; @@ -2346,7 +2348,7 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) /* Find the first overlapping VMA and initialise unmap state. */ vms->vma = vma_find(vmi, map->end); - init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf, + init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf, map->mm, /* unlock = */ false); /* OK, we have overlapping VMAs - prepare to unmap them. */ diff --git a/mm/vma.h b/mm/vma.h index 20fc1c2a32fd..4946d7dc13fd 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -51,6 +51,7 @@ struct vma_munmap_struct { unsigned long exec_vm; unsigned long stack_vm; unsigned long data_vm; + struct mm_struct *mm; }; enum vma_merge_state { -- 2.47.1 Ownership of an mshare region is assigned to the process that creates it. Establishing ownership ensures that accounting the memory in an mshare region is applied to the owner and not spread among the processes sharing the memory. It also provides a means for freeing mshare memory in an OOM situation. Once an mshare owner exits, access to the memory by a non-owner process results in a SIGSEGV. For this initial implementation ownership is not shared or transferred through forking or other means. 
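As a sketch of what this means for a non-owning process (the file name used below is the one from the documentation example):

    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <unistd.h>

    /* Attach to a region created, and therefore owned, by another process. */
    static void *attach_existing(const char *path, size_t *sizep)
    {
            struct stat sb;
            int fd = open(path, O_RDWR);

            if (fd < 0)
                    return NULL;
            if (fstat(fd, &sb) < 0) {
                    close(fd);
                    return NULL;
            }
            *sizep = sb.st_size;

            /*
             * The memory is charged to the owner.  Per this patch, once the
             * owner has exited, touching the mapping raises SIGSEGV in this
             * non-owning process.
             */
            return mmap(NULL, sb.st_size, PROT_READ | PROT_WRITE,
                        MAP_SHARED, fd, 0);
    }

A caller would use something like attach_existing("/sys/fs/mshare/shareme", &len).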
Signed-off-by: Anthony Yznaga --- include/linux/mshare.h | 25 +++++++++++++ include/linux/sched.h | 5 +++ kernel/exit.c | 1 + kernel/fork.c | 1 + mm/mshare.c | 83 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 115 insertions(+) create mode 100644 include/linux/mshare.h diff --git a/include/linux/mshare.h b/include/linux/mshare.h new file mode 100644 index 000000000000..b62f0e54cf84 --- /dev/null +++ b/include/linux/mshare.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MSHARE_H_ +#define _LINUX_MSHARE_H_ + +#include + +struct task_struct; + +#ifdef CONFIG_MSHARE + +void exit_mshare(struct task_struct *task); +#define mshare_init_task(task) INIT_LIST_HEAD(&(task)->mshare_mem) + +#else + +static inline void exit_mshare(struct task_struct *task) +{ +} +static inline void mshare_init_task(struct task_struct *task) +{ +} + +#endif + +#endif /* _LINUX_MSHARE_H_ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b272382673d..17f2f3c0b465 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -48,6 +48,7 @@ #include #include #include +#include #include /* task_struct member predeclarations (sorted alphabetically): */ @@ -1654,6 +1655,10 @@ struct task_struct { /* CPU-specific state of this task: */ struct thread_struct thread; +#ifdef CONFIG_MSHARE + struct list_head mshare_mem; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/kernel/exit.c b/kernel/exit.c index 343eb97543d5..24445109865d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -951,6 +951,7 @@ void __noreturn do_exit(long code) if (group_dead) acct_process(); + exit_mshare(tsk); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 5115be549234..eba6bd709c6e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2143,6 +2143,7 @@ __latent_entropy struct task_struct *copy_process( #endif unwind_task_init(p); + mshare_init_task(p); /* Perform scheduler related setup. Assign this task to a CPU. 
*/ retval = sched_fork(clone_flags, p); diff --git a/mm/mshare.c b/mm/mshare.c index f7b7904f0405..8a23b391fa11 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ const unsigned long mshare_align = P4D_SIZE; const unsigned long mshare_base = mshare_align; #define MSHARE_INITIALIZED 0x1 +#define MSHARE_HAS_OWNER 0x2 struct mshare_data { struct mm_struct *mm; @@ -35,6 +37,7 @@ struct mshare_data { unsigned long size; unsigned long flags; struct mmu_notifier mn; + struct list_head list; }; static inline bool mshare_is_initialized(struct mshare_data *m_data) @@ -42,6 +45,65 @@ static inline bool mshare_is_initialized(struct mshare_data *m_data) return test_bit(MSHARE_INITIALIZED, &m_data->flags); } +static inline bool mshare_has_owner(struct mshare_data *m_data) +{ + return test_bit(MSHARE_HAS_OWNER, &m_data->flags); +} + +static bool mshare_data_getref(struct mshare_data *m_data); +static void mshare_data_putref(struct mshare_data *m_data); + +void exit_mshare(struct task_struct *task) +{ + for (;;) { + struct mshare_data *m_data; + int error; + + task_lock(task); + + if (list_empty(&task->mshare_mem)) { + task_unlock(task); + break; + } + + m_data = list_first_entry(&task->mshare_mem, struct mshare_data, + list); + + WARN_ON_ONCE(!mshare_data_getref(m_data)); + + list_del_init(&m_data->list); + task_unlock(task); + + /* + * The owner of an mshare region is going away. Unmap + * everything in the region and prevent more mappings from + * being created. + * + * XXX + * The fact that the unmap can possibly fail is problematic. + * One alternative is doing a subset of what exit_mmap() does. + * If it's preferrable to preserve the mappings then another + * approach is to fail any further faults on the mshare region + * and unlink the shared page tables from the page tables of + * each sharing process by walking the rmap via the msharefs + * inode. + * Unmapping everything means mshare memory is freed up when + * the owner exits which may be preferrable for OOM situations. 
+ */ + + clear_bit(MSHARE_HAS_OWNER, &m_data->flags); + + mmap_write_lock(m_data->mm); + error = do_munmap(m_data->mm, m_data->start, m_data->size, NULL); + mmap_write_unlock(m_data->mm); + + if (error) + pr_warn("%s: do_munmap returned %d\n", __func__, error); + + mshare_data_putref(m_data); + } +} + static void mshare_invalidate_tlbs(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) { @@ -362,6 +424,11 @@ msharefs_fill_mm(struct inode *inode) ret = mmu_notifier_register(&m_data->mn, mm); if (ret) goto err_free; + INIT_LIST_HEAD(&m_data->list); + task_lock(current); + list_add(&m_data->list, ¤t->mshare_mem); + task_unlock(current); + set_bit(MSHARE_HAS_OWNER, &m_data->flags); refcount_set(&m_data->ref, 1); inode->i_private = m_data; @@ -380,6 +447,11 @@ msharefs_delmm(struct mshare_data *m_data) kfree(m_data); } +static bool mshare_data_getref(struct mshare_data *m_data) +{ + return refcount_inc_not_zero(&m_data->ref); +} + static void mshare_data_putref(struct mshare_data *m_data) { if (!refcount_dec_and_test(&m_data->ref)) @@ -543,6 +615,17 @@ msharefs_evict_inode(struct inode *inode) if (!m_data) goto out; + rcu_read_lock(); + + if (!list_empty(&m_data->list)) { + struct task_struct *owner = m_data->mm->owner; + + task_lock(owner); + list_del_init(&m_data->list); + task_unlock(owner); + } + rcu_read_unlock(); + mshare_data_putref(m_data); out: clear_inode(inode); -- 2.47.1 From: Khalid Aziz Reserve a range of ioctls for msharefs and add an ioctl for mapping objects within an mshare region. The arguments are the same as mmap() except that the start of the mapping is specified as an offset into the mshare region instead of as an address. System-selected addresses are not supported so MAP_FIXED must be specified. Only shared anonymous memory is supported initially. Signed-off-by: Khalid Aziz Signed-off-by: Anthony Yznaga --- .../userspace-api/ioctl/ioctl-number.rst | 1 + include/uapi/linux/msharefs.h | 31 ++++++++ mm/mshare.c | 76 ++++++++++++++++++- 3 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/msharefs.h diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 406a9f4d0869..cb7377f40696 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -308,6 +308,7 @@ Code Seq# Include File Comments 'v' 20-27 arch/powerpc/include/uapi/asm/vas-api.h VAS API 'v' C0-FF linux/meye.h conflict! 'w' all CERN SCI driver +'x' 00-1F linux/msharefs.h msharefs filesystem 'y' 00-1F packet based user level communications 'z' 00-3F CAN bus card conflict! diff --git a/include/uapi/linux/msharefs.h b/include/uapi/linux/msharefs.h new file mode 100644 index 000000000000..ad129beeef62 --- /dev/null +++ b/include/uapi/linux/msharefs.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * msharefs defines a memory region that is shared across processes. + * ioctl is used on files created under msharefs to set various + * attributes on these shared memory regions + * + * + * Copyright (C) 2024 Oracle Corp. All rights reserved. 
+ * Author: Khalid Aziz + */ + +#ifndef _UAPI_LINUX_MSHAREFS_H +#define _UAPI_LINUX_MSHAREFS_H + +#include +#include + +/* + * msharefs specific ioctl commands + */ +#define MSHAREFS_CREATE_MAPPING _IOW('x', 0, struct mshare_create) + +struct mshare_create { + __u64 region_offset; + __u64 size; + __u64 offset; + __u32 prot; + __u32 flags; + __u32 fd; +}; +#endif diff --git a/mm/mshare.c b/mm/mshare.c index 8a23b391fa11..ebec51e655e4 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -10,6 +10,7 @@ * * Copyright (C) 2024 Oracle Corp. All rights reserved. * Author: Khalid Aziz + * Author: Matthew Wilcox * */ @@ -19,6 +20,7 @@ #include #include #include +#include #include #include @@ -308,7 +310,7 @@ msharefs_get_unmapped_area(struct file *file, unsigned long addr, if ((flags & MAP_TYPE) == MAP_PRIVATE) return -EINVAL; - if (!mshare_is_initialized(m_data)) + if (!mshare_is_initialized(m_data) || !mshare_has_owner(m_data)) return -EINVAL; mshare_start = m_data->start; @@ -343,6 +345,77 @@ msharefs_get_unmapped_area(struct file *file, unsigned long addr, pgoff, flags); } +static long +msharefs_create_mapping(struct mshare_data *m_data, struct mshare_create *mcreate) +{ + struct mm_struct *host_mm = m_data->mm; + unsigned long mshare_start, mshare_end; + unsigned long region_offset = mcreate->region_offset; + unsigned long size = mcreate->size; + unsigned int fd = mcreate->fd; + int flags = mcreate->flags; + int prot = mcreate->prot; + unsigned long populate = 0; + unsigned long mapped_addr; + unsigned long addr; + vm_flags_t vm_flags; + int error = -EINVAL; + + mshare_start = m_data->start; + mshare_end = mshare_start + m_data->size; + addr = mshare_start + region_offset; + + if ((addr < mshare_start) || (addr >= mshare_end) || + (addr + size > mshare_end)) + goto out; + + /* + * Only anonymous shared memory at fixed addresses is allowed for now. + */ + if ((flags & (MAP_SHARED | MAP_FIXED)) != (MAP_SHARED | MAP_FIXED)) + goto out; + if (fd != -1) + goto out; + + if (mmap_write_lock_killable(host_mm)) { + error = -EINTR; + goto out; + } + + error = 0; + mapped_addr = __do_mmap(NULL, addr, size, prot, flags, vm_flags, + 0, &populate, NULL, host_mm); + + if (IS_ERR_VALUE(mapped_addr)) + error = (long)mapped_addr; + + mmap_write_unlock(host_mm); +out: + return error; +} + +static long +msharefs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct mshare_data *m_data = filp->private_data; + struct mshare_create mcreate; + + if (!mshare_is_initialized(m_data)) + return -EINVAL; + + switch (cmd) { + case MSHAREFS_CREATE_MAPPING: + if (copy_from_user(&mcreate, (struct mshare_create __user *)arg, + sizeof(mcreate))) + return -EFAULT; + + return msharefs_create_mapping(m_data, &mcreate); + + default: + return -ENOTTY; + } +} + static int msharefs_set_size(struct mshare_data *m_data, unsigned long size) { int error = -EINVAL; @@ -398,6 +471,7 @@ static const struct file_operations msharefs_file_operations = { .open = simple_open, .mmap = msharefs_mmap, .get_unmapped_area = msharefs_get_unmapped_area, + .unlocked_ioctl = msharefs_ioctl, .fallocate = msharefs_fallocate, }; -- 2.47.1 The arguments are the same as munmap() except that the start of the mapping is specified as an offset into the mshare region instead of as an address. 
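A short, hypothetical userspace sketch of the new ioctl. Here fd is assumed to be an open msharefs control file, and obj_offset/obj_size describe an object that was mapped into the region earlier::

    #include <sys/ioctl.h>
    #include <linux/types.h>
    #include <linux/msharefs.h>

    static int unmap_mshare_object(int fd, __u64 obj_offset, __u64 obj_size)
    {
            struct mshare_unmap req = {
                    .region_offset  = obj_offset,   /* offset into the region, not a virtual address */
                    .size           = obj_size,
            };

            return ioctl(fd, MSHAREFS_UNMAP, &req);
    }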
Signed-off-by: Anthony Yznaga --- include/uapi/linux/msharefs.h | 7 +++++++ mm/mshare.c | 37 +++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/include/uapi/linux/msharefs.h b/include/uapi/linux/msharefs.h index ad129beeef62..fb0235d1e384 100644 --- a/include/uapi/linux/msharefs.h +++ b/include/uapi/linux/msharefs.h @@ -19,6 +19,7 @@ * msharefs specific ioctl commands */ #define MSHAREFS_CREATE_MAPPING _IOW('x', 0, struct mshare_create) +#define MSHAREFS_UNMAP _IOW('x', 1, struct mshare_unmap) struct mshare_create { __u64 region_offset; @@ -28,4 +29,10 @@ struct mshare_create { __u32 flags; __u32 fd; }; + +struct mshare_unmap { + __u64 region_offset; + __u64 size; +}; + #endif diff --git a/mm/mshare.c b/mm/mshare.c index ebec51e655e4..b1e02f5e1f60 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -394,11 +394,41 @@ msharefs_create_mapping(struct mshare_data *m_data, struct mshare_create *mcreat return error; } +static long +msharefs_unmap(struct mshare_data *m_data, struct mshare_unmap *munmap) +{ + struct mm_struct *host_mm = m_data->mm; + unsigned long mshare_start, mshare_end, mshare_size; + unsigned long region_offset = munmap->region_offset; + unsigned long size = munmap->size; + unsigned long addr; + int error; + + mshare_start = m_data->start; + mshare_size = m_data->size; + mshare_end = mshare_start + mshare_size; + addr = mshare_start + region_offset; + + if ((size > mshare_size) || (region_offset >= mshare_size) || + (addr + size > mshare_end)) + return -EINVAL; + + if (mmap_write_lock_killable(host_mm)) + return -EINTR; + + error = do_munmap(host_mm, addr, size, NULL); + + mmap_write_unlock(host_mm); + + return error; +} + static long msharefs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct mshare_data *m_data = filp->private_data; struct mshare_create mcreate; + struct mshare_unmap munmap; if (!mshare_is_initialized(m_data)) return -EINVAL; @@ -411,6 +441,13 @@ msharefs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return msharefs_create_mapping(m_data, &mcreate); + case MSHAREFS_UNMAP: + if (copy_from_user(&munmap, (struct mshare_unmap __user *)arg, + sizeof(munmap))) + return -EFAULT; + + return msharefs_unmap(m_data, &munmap); + default: return -ENOTTY; } -- 2.47.1 The astute reader will notice that the code is largely copied from ksys_mmap_pgoff() with key differences being that mapping an mshare region within an mshare region is disallowed and that the possibly modified size is checked to ensure the new mapping does not exceed the bounds of the mshare region. 
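Since the diff below lifts the anonymous-only restriction, a file descriptor can now be supplied in struct mshare_create. Here is a hypothetical sketch that uses a memfd as the backing object; the names, offsets, and size are illustrative, and whether a particular backing file is accepted is subject to the checks added below::

    #define _GNU_SOURCE
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include <linux/types.h>
    #include <linux/msharefs.h>

    static int map_memfd_into_region(int msharefs_fd, __u64 size)
    {
            int memfd, ret;

            memfd = memfd_create("mshare-backing", 0);
            if (memfd < 0)
                    return -1;
            if (ftruncate(memfd, size) < 0) {
                    close(memfd);
                    return -1;
            }

            struct mshare_create mc = {
                    .region_offset  = 0,            /* hypothetical placement in the region */
                    .size           = size,
                    .offset         = 0,            /* offset into the backing file */
                    .prot           = PROT_READ | PROT_WRITE,
                    .flags          = MAP_SHARED | MAP_FIXED,  /* no MAP_ANONYMOUS, so fd is used */
                    .fd             = memfd,
            };

            ret = ioctl(msharefs_fd, MSHAREFS_CREATE_MAPPING, &mc);
            close(memfd);   /* on success the mapping in the host mm holds its own reference */
            return ret;
    }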
Signed-off-by: Anthony Yznaga --- mm/mshare.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/mm/mshare.c b/mm/mshare.c index b1e02f5e1f60..ddcf7bb2e956 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -14,8 +14,10 @@ * */ +#include #include #include +#include #include #include #include @@ -345,12 +347,15 @@ msharefs_get_unmapped_area(struct file *file, unsigned long addr, pgoff, flags); } +static const struct file_operations msharefs_file_operations; + static long msharefs_create_mapping(struct mshare_data *m_data, struct mshare_create *mcreate) { struct mm_struct *host_mm = m_data->mm; unsigned long mshare_start, mshare_end; unsigned long region_offset = mcreate->region_offset; + unsigned long pgoff = mcreate->offset >> PAGE_SHIFT; unsigned long size = mcreate->size; unsigned int fd = mcreate->fd; int flags = mcreate->flags; @@ -359,37 +364,87 @@ msharefs_create_mapping(struct mshare_data *m_data, struct mshare_create *mcreat unsigned long mapped_addr; unsigned long addr; vm_flags_t vm_flags; + struct file *file = NULL; int error = -EINVAL; mshare_start = m_data->start; mshare_end = mshare_start + m_data->size; addr = mshare_start + region_offset; - if ((addr < mshare_start) || (addr >= mshare_end) || - (addr + size > mshare_end)) + /* + * Check the size later after size has possibly been + * adjusted. + */ + if ((addr < mshare_start) || (addr >= mshare_end)) goto out; /* - * Only anonymous shared memory at fixed addresses is allowed for now. + * Only shared memory at fixed addresses is allowed for now. */ if ((flags & (MAP_SHARED | MAP_FIXED)) != (MAP_SHARED | MAP_FIXED)) goto out; - if (fd != -1) - goto out; + + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) { + error = -EBADF; + goto out; + } + if (is_file_hugepages(file)) { + size = ALIGN(size, huge_page_size(hstate_file(file))); + } else if (unlikely(flags & MAP_HUGETLB)) { + error = -EINVAL; + goto out_fput; + } else if (file->f_op == &msharefs_file_operations) { + error = -EINVAL; + goto out_fput; + } + } else if (flags & MAP_HUGETLB) { + struct hstate *hs; + + hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); + if (!hs) + return -EINVAL; + + size = ALIGN(size, huge_page_size(hs)); + /* + * VM_NORESERVE is used because the reservations will be + * taken when vm_ops->mmap() is called + */ + file = hugetlb_file_setup(HUGETLB_ANON_FILE, size, + VM_NORESERVE, + HUGETLB_ANONHUGE_INODE, + (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out; + } + } + + if (addr + size > mshare_end) + goto out_fput; + + error = security_mmap_file(file, prot, flags); + if (error) + goto out_fput; if (mmap_write_lock_killable(host_mm)) { error = -EINTR; - goto out; + goto out_fput; } error = 0; - mapped_addr = __do_mmap(NULL, addr, size, prot, flags, vm_flags, - 0, &populate, NULL, host_mm); + mapped_addr = __do_mmap(file, addr, size, prot, flags, vm_flags, + pgoff, &populate, NULL, host_mm); if (IS_ERR_VALUE(mapped_addr)) error = (long)mapped_addr; mmap_write_unlock(host_mm); + +out_fput: + if (file) + fput(file); out: return error; } -- 2.47.1 Add new mm flag, MMF_MSHARE. 
Signed-off-by: Anthony Yznaga --- include/linux/mm_types.h | 2 ++ mm/mshare.c | 1 + 2 files changed, 3 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index da5a7a31a81d..4586a3f384f1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1847,6 +1847,8 @@ enum { #define MMF_TOPDOWN 31 /* mm searches top down by default */ #define MMF_TOPDOWN_MASK _BITUL(MMF_TOPDOWN) +#define MMF_MSHARE 32 /* mm is an mshare host mm */ + #define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) diff --git a/mm/mshare.c b/mm/mshare.c index ddcf7bb2e956..22e2aedb74d3 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -578,6 +578,7 @@ msharefs_fill_mm(struct inode *inode) if (!mm) return -ENOMEM; + mm_flags_set(MMF_MSHARE, mm); mm->mmap_base = mshare_base; mm->task_size = 0; -- 2.47.1 When handling a fault in an mshare range, redirect charges for page tables and other allocations to the mshare owner rather than the current task. Signed-off-by: Anthony Yznaga --- mm/memory.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index 177eb53475cb..127db0b9932c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6468,9 +6468,17 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, struct mm_struct *mm = vma->vm_mm; vm_fault_t ret; bool is_droppable; + bool is_mshare = mm_flags_test(MMF_MSHARE, mm); + struct mem_cgroup *mshare_memcg; + struct mem_cgroup *memcg; __set_current_state(TASK_RUNNING); + if (unlikely(is_mshare)) { + mshare_memcg = get_mem_cgroup_from_mm(vma->vm_mm); + memcg = set_active_memcg(mshare_memcg); + } + ret = sanitize_fault_flags(vma, &flags); if (ret) goto out; @@ -6530,6 +6538,11 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, out: mm_account_fault(mm, regs, address, flags, ret); + if (unlikely(is_mshare)) { + set_active_memcg(memcg); + mem_cgroup_put(mshare_memcg); + } + return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); -- 2.47.1