From: Vernon Yang <yanglincheng@kylinos.cn>

Introducing bpf_mthp_ops enables eBPF programs to register the
mthp_choose callback function via cgroup-ebpf.

Use cgroup-bpf to customize the mTHP size for different scenarios by
automatically selecting different mTHP sizes for different cgroups,
making mTHP truly transparent.

Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
---
 MAINTAINERS                     |   3 +
 include/linux/bpf_huge_memory.h |  35 +++++++
 include/linux/cgroup-defs.h     |   1 +
 include/linux/huge_mm.h         |   6 ++
 mm/Kconfig                      |  14 +++
 mm/Makefile                     |   1 +
 mm/bpf_huge_memory.c            | 169 ++++++++++++++++++++++++++++++++
 7 files changed, 229 insertions(+)
 create mode 100644 include/linux/bpf_huge_memory.h
 create mode 100644 mm/bpf_huge_memory.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 27a073f53cea..39f00676eeb7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4887,7 +4887,10 @@ M:	Shakeel Butt <shakeel.butt@linux.dev>
 L:	bpf@vger.kernel.org
 L:	linux-mm@kvack.org
 S:	Maintained
+F:	include/linux/bpf_huge_memory.h
+F:	mm/bpf_huge_memory.c
 F:	mm/bpf_memcontrol.c
+F:	samples/bpf/mthp_ext.*
 
 BPF [MISC]
 L:	bpf@vger.kernel.org
diff --git a/include/linux/bpf_huge_memory.h b/include/linux/bpf_huge_memory.h
new file mode 100644
index 000000000000..1c8a6f7ad8f1
--- /dev/null
+++ b/include/linux/bpf_huge_memory.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef __BPF_HUGE_MEMORY_H
+#define __BPF_HUGE_MEMORY_H
+
+/**
+ * struct bpf_mthp_ops - BPF callbacks for mTHP operations
+ * @mthp_choose: Given a cgroup and the candidate mTHP orders, return the
+ *               orders to use for that cgroup.
+ *
+ * Interface that BPF struct_ops programs implement to customize mTHP behavior.
+ */
+struct bpf_mthp_ops {
+	unsigned long (*mthp_choose)(struct cgroup *cgrp, unsigned long orders);
+};
+
+#if defined(CONFIG_BPF_TRANSPARENT_HUGEPAGE) && defined(CONFIG_BPF_SYSCALL)
+/**
+ * bpf_mthp_choose() - Choose the custom mTHP orders using BPF
+ * @mm: task mm_struct
+ * @orders: original orders
+ *
+ * Return: suited mTHP orders; the stub below returns @orders unchanged.
+ */
+unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders);
+#else
+static inline unsigned long bpf_mthp_choose(struct mm_struct *mm,
+					    unsigned long orders)
+{
+	return orders;
+}
+#endif /* CONFIG_BPF_TRANSPARENT_HUGEPAGE && CONFIG_BPF_SYSCALL */
+
+#endif /* __BPF_HUGE_MEMORY_H */
+
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index f42563739d2e..78854d0e06ab 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -628,6 +628,7 @@ struct cgroup {
 
 #ifdef CONFIG_BPF_SYSCALL
 	struct bpf_local_storage __rcu  *bpf_cgrp_storage;
+	struct bpf_mthp_ops *mthp_ops;
 #endif
 #ifdef CONFIG_EXT_SUB_SCHED
 	struct scx_sched __rcu *scx_sched;
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2949e5acff35..80ec622213df 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -3,6 +3,7 @@
 #define _LINUX_HUGE_MM_H
 
 #include <linux/mm_types.h>
+#include <linux/bpf_huge_memory.h>
 
 #include <linux/fs.h> /* only for vma_is_dax() */
 #include <linux/kobject.h>
@@ -291,6 +292,11 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 				       enum tva_type type,
 				       unsigned long orders)
 {
+	/* The eBPF-specified orders overrides which order is selected. */
+	orders &= bpf_mthp_choose(vma->vm_mm, orders);
+	if (!orders)
+		return 0;
+
 	/*
 	 * Optimization to check if required orders are enabled early. Only
 	 * forced collapse ignores sysfs configs.
diff --git a/mm/Kconfig b/mm/Kconfig
index e8bf1e9e6ad9..12382431ddc7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -963,6 +963,20 @@ config NO_PAGE_MAPCOUNT
 
 	  EXPERIMENTAL because the impact of some changes is still unclear.
 
+config BPF_TRANSPARENT_HUGEPAGE
+	bool "BPF-based transparent hugepage (EXPERIMENTAL)"
+	depends on TRANSPARENT_HUGEPAGE && BPF_SYSCALL && MEMCG
+	help
+	  Use cgroup-bpf to customize the mTHP size for different scenarios
+	  by automatically selecting different mTHP sizes for different
+	  cgroups, making mTHP truly transparent.
+
+	  This is an experimental feature that might go away at any time.
+	  Please do not rely on it in any production environment.
+
+	  EXPERIMENTAL because the BPF interface is unstable and may be removed
+	  at any time.
+
 endif # TRANSPARENT_HUGEPAGE
 
 # simple helper to make the code a bit easier to read
diff --git a/mm/Makefile b/mm/Makefile
index 8ad2ab08244e..b474c21c3253 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -108,6 +108,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o
 endif
 ifdef CONFIG_BPF_SYSCALL
 obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
+obj-$(CONFIG_BPF_TRANSPARENT_HUGEPAGE) += bpf_huge_memory.o
 endif
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_GUP_TEST) += gup_test.o
diff --git a/mm/bpf_huge_memory.c b/mm/bpf_huge_memory.c
new file mode 100644
index 000000000000..e34e0a35edac
--- /dev/null
+++ b/mm/bpf_huge_memory.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Huge memory related BPF code
+ *
+ * Author: Vernon Yang <yanglincheng@kylinos.cn>
+ */
+
+#include <linux/bpf.h>
+#include <linux/srcu.h>
+
+/* Readers of cgrp->mthp_ops run under this SRCU; unreg waits on it. */
+DEFINE_STATIC_SRCU(mthp_bpf_srcu);
+
+/* Let the BPF program attached to @mm's memcg cgroup choose the orders. */
+unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders)
+{
+	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+	struct bpf_mthp_ops *ops;
+	struct cgroup *cgrp;
+	int idx;
+
+	if (!memcg)
+		return orders;
+	cgrp = memcg->css.cgroup;
+
+	/*
+	 * Load ops inside the SRCU section; synchronize_srcu() in unreg does
+	 * not wait for a reader that fetched the pointer before locking.
+	 */
+	idx = srcu_read_lock(&mthp_bpf_srcu);
+	ops = READ_ONCE(cgrp->mthp_ops);
+	if (unlikely(ops && ops->mthp_choose))
+		orders = ops->mthp_choose(cgrp, orders);
+	srcu_read_unlock(&mthp_bpf_srcu, idx);
+	mem_cgroup_put(memcg);
+	return orders;
+}
+
+static int bpf_mthp_ops_btf_struct_access(struct bpf_verifier_log *log,
+		const struct bpf_reg_state *reg, int off, int size)
+{
+	return -EACCES;	/* unconditionally refused, arguments are ignored */
+}
+
+static bool bpf_mthp_ops_is_valid_access(int off, int size, enum bpf_access_type type,
+		const struct bpf_prog *prog, struct bpf_insn_access_aux *info)
+{
+	return bpf_tracing_btf_ctx_access(off, size, type, prog, info); /* pure delegation */
+}
+
+static const struct bpf_verifier_ops bpf_mthp_verifier_ops = {
+	.get_func_proto = bpf_base_func_proto,
+	.btf_struct_access = bpf_mthp_ops_btf_struct_access, /* always -EACCES */
+	.is_valid_access = bpf_mthp_ops_is_valid_access,
+};
+
+/* Install @kdata on the link's cgroup and on descendants that have no ops. */
+static int bpf_mthp_ops_reg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct cgroup_subsys_state *pos;
+	struct bpf_mthp_ops *ops = kdata;
+
+	/* Link-less registration is unsupported; test before dereferencing. */
+	if (!link)
+		return -EOPNOTSUPP;
+
+	/* The link is not yet fully initialized, but cgroup should be set */
+	st_link = (struct bpf_struct_ops_link *)link;
+
+	cgroup_lock();
+	css_for_each_descendant_pre(pos, &st_link->cgroup->self) {
+		struct cgroup *child = pos->cgroup;
+
+		/*
+		 * TODO: Do not destroy the cgroup hierarchy property. If an
+		 * eBPF program already exists in the sub-cgroup, trigger an
+		 * error and clear the already set bpf_mthp_ops data.
+		 */
+		if (!READ_ONCE(child->mthp_ops))
+			WRITE_ONCE(child->mthp_ops, ops);
+	}
+	cgroup_unlock();
+
+	return 0;
+}
+
+/* Clear @kdata from the link's cgroup subtree and wait for SRCU readers. */
+static void bpf_mthp_ops_unreg(void *kdata, struct bpf_link *link)
+{
+	struct cgroup *root = ((struct bpf_struct_ops_link *)link)->cgroup;
+	struct cgroup_subsys_state *css;
+
+	cgroup_lock();
+	css_for_each_descendant_pre(css, &root->self) {
+		struct cgroup *cg = css->cgroup;
+
+		if (READ_ONCE(cg->mthp_ops) != kdata)
+			continue;
+		WRITE_ONCE(cg->mthp_ops, NULL);
+	}
+	cgroup_unlock();
+
+	synchronize_srcu(&mthp_bpf_srcu);
+}
+
+/*
+ * Validate a program bound to a bpf_mthp_ops member: only the mthp_choose
+ * slot is accepted, and the program must not be sleepable.
+ *
+ * Returns 0 when acceptable, -EINVAL otherwise.
+ */
+static int bpf_mthp_ops_check_member(const struct btf_type *t,
+				     const struct btf_member *member,
+				     const struct bpf_prog *prog)
+{
+	const u32 member_off = __btf_member_bit_offset(t, member) / 8;
+
+	if (member_off != offsetof(struct bpf_mthp_ops, mthp_choose))
+		return -EINVAL;
+
+	/* Sleepable programs are refused. */
+	return prog->sleepable ? -EINVAL : 0;
+}
+
+static int bpf_mthp_ops_init_member(const struct btf_type *t,
+				    const struct btf_member *member,
+				    void *kdata, const void *udata)
+{
+	return 0;	/* no member gets special handling here */
+}
+
+static int bpf_mthp_ops_init(struct btf *btf)
+{
+	return 0;	/* nothing to set up; @btf is unused */
+}
+
+static unsigned long cfi_mthp_choose(struct cgroup *cgrp, unsigned long orders)
+{
+	return 0;	/* ignores its arguments; wired in via cfi_bpf_mthp_ops */
+}
+
+static struct bpf_mthp_ops cfi_bpf_mthp_ops = {
+	.mthp_choose = cfi_mthp_choose,	/* handed to struct_ops as .cfi_stubs */
+};
+
+static struct bpf_struct_ops bso_bpf_mthp_ops = {
+	.verifier_ops = &bpf_mthp_verifier_ops,
+	.reg = bpf_mthp_ops_reg,		/* sets ops on a cgroup subtree */
+	.unreg = bpf_mthp_ops_unreg,		/* clears them, then waits on SRCU */
+	.check_member = bpf_mthp_ops_check_member,
+	.init_member = bpf_mthp_ops_init_member,
+	.init = bpf_mthp_ops_init,
+	.name = "bpf_mthp_ops",
+	.owner = THIS_MODULE,
+	.cfi_stubs = &cfi_bpf_mthp_ops,
+};
+
+/* Register the bpf_mthp_ops struct_ops type; a failure is logged and returned. */
+static int __init bpf_huge_memory_init(void)
+{
+	int ret = register_bpf_struct_ops(&bso_bpf_mthp_ops, bpf_mthp_ops);
+
+	if (ret)
+		pr_warn("Registration of bpf_mthp_ops failed, err %d\n", ret);
+
+	return ret;
+}
+late_initcall(bpf_huge_memory_init);
-- 
2.53.0