From: Vernon Yang Introducing bpf_mthp_ops enables eBPF programs to register the mthp_choose callback function via cgroup-ebpf. Using cgroup-bpf to customize mTHP size for different scenarios, automatically select different mTHP sizes for different cgroups, let's focus on making them truly transparent. Signed-off-by: Vernon Yang --- MAINTAINERS | 3 + include/linux/bpf_huge_memory.h | 35 +++++++ include/linux/cgroup-defs.h | 1 + include/linux/huge_mm.h | 6 ++ mm/Kconfig | 14 +++ mm/Makefile | 1 + mm/bpf_huge_memory.c | 169 ++++++++++++++++++++++++++++++++ 7 files changed, 229 insertions(+) create mode 100644 include/linux/bpf_huge_memory.h create mode 100644 mm/bpf_huge_memory.c diff --git a/MAINTAINERS b/MAINTAINERS index 27a073f53cea..39f00676eeb7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4887,7 +4887,10 @@ M: Shakeel Butt L: bpf@vger.kernel.org L: linux-mm@kvack.org S: Maintained +F: include/linux/bpf_huge_memory.h +F: mm/bpf_huge_memory.c F: mm/bpf_memcontrol.c +F: samples/bpf/mthp_ext.* BPF [MISC] L: bpf@vger.kernel.org diff --git a/include/linux/bpf_huge_memory.h b/include/linux/bpf_huge_memory.h new file mode 100644 index 000000000000..1c8a6f7ad8f1 --- /dev/null +++ b/include/linux/bpf_huge_memory.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef __BPF_HUGE_MEMORY_H +#define __BPF_HUGE_MEMORY_H + +/** + * struct bpf_mthp_ops - BPF callbacks for mTHP operations + * @mthp_choose: Choose the custom mTHP orders + * + * This structure defines the interface for BPF programs to customize + * mTHP behavior through struct_ops programs. + */ +struct bpf_mthp_ops { + unsigned long (*mthp_choose)(struct cgroup *cgrp, unsigned long orders); +}; + +#if defined(CONFIG_BPF_TRANSPARENT_HUGEPAGE) && defined(CONFIG_BPF_SYSCALL) +/** + * bpf_mthp_choose: Choose the custom mTHP orders using bpf + * @mm: task mm_struct + * @orders: original orders + * + * Return suited mTHP orders. + */ +unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders); +#else +static inline unsigned long bpf_mthp_choose(struct mm_struct *mm, + unsigned long orders) +{ + return orders; +} +#endif /* CONFIG_BPF_SYSCALL */ + +#endif /* __BPF_HUGE_MEMORY_H */ + diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index f42563739d2e..78854d0e06ab 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -628,6 +628,7 @@ struct cgroup { #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *bpf_cgrp_storage; + struct bpf_mthp_ops *mthp_ops; #endif #ifdef CONFIG_EXT_SUB_SCHED struct scx_sched __rcu *scx_sched; diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2949e5acff35..80ec622213df 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -3,6 +3,7 @@ #define _LINUX_HUGE_MM_H #include +#include #include /* only for vma_is_dax() */ #include @@ -291,6 +292,11 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, enum tva_type type, unsigned long orders) { + /* The eBPF-specified orders overrides which order is selected. */ + orders &= bpf_mthp_choose(vma->vm_mm, orders); + if (!orders) + return 0; + /* * Optimization to check if required orders are enabled early. Only * forced collapse ignores sysfs configs. diff --git a/mm/Kconfig b/mm/Kconfig index e8bf1e9e6ad9..12382431ddc7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -963,6 +963,20 @@ config NO_PAGE_MAPCOUNT EXPERIMENTAL because the impact of some changes is still unclear. +config BPF_TRANSPARENT_HUGEPAGE + bool "BPF-based transparent hugepage (EXPERIMENTAL)" + depends on TRANSPARENT_HUGEPAGE + help + Using cgroup-bpf to customize mTHP size for different scenarios, + automatically select different mTHP sizes for different cgroups, + let's focus on making them truly transparent. + + This is an experimental feature, that might go away at any time, + Please do not rely any production environment. + + EXPERIMENTAL because the BPF interface is unstable and may be removed + at any time. + endif # TRANSPARENT_HUGEPAGE # simple helper to make the code a bit easier to read diff --git a/mm/Makefile b/mm/Makefile index 8ad2ab08244e..b474c21c3253 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -108,6 +108,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o endif ifdef CONFIG_BPF_SYSCALL obj-$(CONFIG_MEMCG) += bpf_memcontrol.o +obj-$(CONFIG_BPF_TRANSPARENT_HUGEPAGE) += bpf_huge_memory.o endif obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_GUP_TEST) += gup_test.o diff --git a/mm/bpf_huge_memory.c b/mm/bpf_huge_memory.c new file mode 100644 index 000000000000..e34e0a35edac --- /dev/null +++ b/mm/bpf_huge_memory.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Huge memory related BPF code + * + * Author: Vernon Yang + */ + +#include +#include + +/* Protects cgrp->mthp_ops pointer for read and write. */ +DEFINE_SRCU(mthp_bpf_srcu); + +unsigned long bpf_mthp_choose(struct mm_struct *mm, unsigned long orders) +{ + struct cgroup *cgrp; + struct mem_cgroup *memcg; + struct bpf_mthp_ops *ops; + int idx; + + memcg = get_mem_cgroup_from_mm(mm); + if (!memcg) + return orders; + + cgrp = memcg->css.cgroup; + ops = READ_ONCE(cgrp->mthp_ops); + if (unlikely(ops)) { + idx = srcu_read_lock(&mthp_bpf_srcu); + if (ops->mthp_choose) + orders = ops->mthp_choose(cgrp, orders); + srcu_read_unlock(&mthp_bpf_srcu, idx); + } + + mem_cgroup_put(memcg); + + return orders; +} + +static int bpf_mthp_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, int off, int size) +{ + return -EACCES; +} + +static bool bpf_mthp_ops_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops bpf_mthp_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .btf_struct_access = bpf_mthp_ops_btf_struct_access, + .is_valid_access = bpf_mthp_ops_is_valid_access, +}; + +static int bpf_mthp_ops_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *st_link = (struct bpf_struct_ops_link *)link; + struct bpf_mthp_ops *ops = kdata; + struct cgroup *cgrp = st_link->cgroup; + struct cgroup_subsys_state *pos; + + /* The link is not yet fully initialized, but cgroup should be set */ + if (!link) + return -EOPNOTSUPP; + + cgroup_lock(); + css_for_each_descendant_pre(pos, &cgrp->self) { + struct cgroup *child = pos->cgroup; + + if (READ_ONCE(child->mthp_ops)) { + /* TODO + * Do not destroy the cgroup hierarchy property. + * If an eBPF program already exists in the sub-cgroup, + * trigger an error and clear the already set + * bpf_mthp_ops data. + */ + continue; + } + WRITE_ONCE(child->mthp_ops, ops); + } + cgroup_unlock(); + + return 0; +} + +static void bpf_mthp_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *st_link = (struct bpf_struct_ops_link *)link; + struct bpf_mthp_ops *ops = kdata; + struct cgroup *cgrp = st_link->cgroup; + struct cgroup_subsys_state *pos; + + cgroup_lock(); + css_for_each_descendant_pre(pos, &cgrp->self) { + struct cgroup *child = pos->cgroup; + + if (READ_ONCE(child->mthp_ops) == ops) + WRITE_ONCE(child->mthp_ops, NULL); + } + cgroup_unlock(); + + synchronize_srcu(&mthp_bpf_srcu); +} + +static int bpf_mthp_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct bpf_mthp_ops, mthp_choose): + break; + default: + return -EINVAL; + } + + if (prog->sleepable) + return -EINVAL; + + return 0; +} + +static int bpf_mthp_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static int bpf_mthp_ops_init(struct btf *btf) +{ + return 0; +} + +static unsigned long cfi_mthp_choose(struct cgroup *cgrp, unsigned long orders) +{ + return 0; +} + +static struct bpf_mthp_ops cfi_bpf_mthp_ops = { + .mthp_choose = cfi_mthp_choose, +}; + +static struct bpf_struct_ops bso_bpf_mthp_ops = { + .verifier_ops = &bpf_mthp_verifier_ops, + .reg = bpf_mthp_ops_reg, + .unreg = bpf_mthp_ops_unreg, + .check_member = bpf_mthp_ops_check_member, + .init_member = bpf_mthp_ops_init_member, + .init = bpf_mthp_ops_init, + .name = "bpf_mthp_ops", + .owner = THIS_MODULE, + .cfi_stubs = &cfi_bpf_mthp_ops, +}; + +static int __init bpf_huge_memory_init(void) +{ + int err; + + err = register_bpf_struct_ops(&bso_bpf_mthp_ops, bpf_mthp_ops); + if (err) + pr_warn("Registration of bpf_mthp_ops failed, err %d\n", err); + + return err; +} +late_initcall(bpf_huge_memory_init); -- 2.53.0