From: Hui Zhu Add BPF struct_ops support to the memory controller, enabling custom control over memory pressure through the memcg_nr_pages_over_high mechanism. This patch introduces a new memcg_bpf_ops struct_ops type that allows BPF programs to report additional "pages over high" for specific memory cgroups. This additional count is integrated into the existing memory pressure calculation, causing affected cgroups to be throttled more aggressively. Key components: 1. memcg_bpf_ops structure: - memcg_nr_pages_over_high: Returns custom over-high count - handle_cgroup_online: Called when cgroup comes online - handle_cgroup_offline: Called when cgroup goes offline 2. Integration points: - reclaim_high(): Check BPF hook before reclaim decisions - calculate_overage(): Add BPF-reported overage to calculation - __mem_cgroup_handle_over_high(): Include BPF overage in throttling decisions - try_charge_memcg(): Trigger handling if BPF reports high overage 3. Lifecycle management: - Programs inherit from parent during cgroup online - SRCU protection for safe concurrent access - Clean detachment during cgroup offline - Hierarchy-wide attachment/detachment Use case: A high-priority cgroup experiencing memory pressure can trigger BPF logic to report additional overage for low-priority cgroups, causing them to be throttled and free up memory. This builds upon Roman Gushchin's BPF OOM patch series: https://lore.kernel.org/lkml/20251027231727.472628-1-roman.gushchin@linux.dev/ Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- MAINTAINERS | 1 + include/linux/memcontrol.h | 2 + mm/bpf_memcontrol.c | 241 ++++++++++++++++++++++++++++++++++++- mm/bpf_memcontrol.h | 73 +++++++++++ mm/memcontrol.c | 27 +++-- 5 files changed, 335 insertions(+), 9 deletions(-) create mode 100644 mm/bpf_memcontrol.h diff --git a/MAINTAINERS b/MAINTAINERS index 229b5fae7a91..158f3ba63ee7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6349,6 +6349,7 @@ L: linux-mm@kvack.org S: Maintained F: include/linux/memcontrol.h F: include/linux/page_counter.h +F: mm/bpf_memcontrol.h F: mm/memcontrol.c F: mm/memcontrol-v1.c F: mm/memcontrol-v1.h diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b9e08dddd7ad..ea623aee4ca5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -229,6 +229,8 @@ struct mem_cgroup { #ifdef CONFIG_BPF_SYSCALL struct bpf_oom_ops *bpf_oom; + + struct memcg_bpf_ops *bpf_ops; #endif int swappiness; diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index 458ad022b036..9a788b1414ad 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -8,6 +8,11 @@ #include #include +#include "bpf_memcontrol.h" + +/* Protects memcg->bpf_ops pointer for read and write. */ +DEFINE_SRCU(memcg_bpf_srcu); + __bpf_kfunc_start_defs(); /** @@ -162,15 +167,247 @@ static const struct btf_kfunc_id_set bpf_memcontrol_kfunc_set = { .set = &bpf_memcontrol_kfuncs, }; +/** + * memcontrol_bpf_online - Inherit BPF programs for a new online cgroup. + * @memcg: The memory cgroup that is coming online. + * + * When a new memcg is brought online, it inherits the BPF programs + * attached to its parent. This ensures consistent BPF-based memory + * control policies throughout the cgroup hierarchy. + * + * After inheriting, if the BPF program has an online handler, it is + * invoked for the new memcg. + */ +void memcontrol_bpf_online(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_bpf_ops *ops; + struct mem_cgroup *parent_memcg; + + /* The root cgroup does not inherit from a parent. 
*/ + if (mem_cgroup_is_root(memcg)) + return; + + parent_memcg = parent_mem_cgroup(memcg); + + idx = srcu_read_lock(&memcg_bpf_srcu); + + /* Inherit the BPF program from the parent cgroup. */ + ops = READ_ONCE(parent_memcg->bpf_ops); + if (!ops) + goto out; + + WRITE_ONCE(memcg->bpf_ops, ops); + + /* + * If the BPF program implements it, call the online handler to + * allow the program to perform setup tasks for the new cgroup. + */ + if (!ops->handle_cgroup_online) + goto out; + + ops->handle_cgroup_online(memcg); + +out: + srcu_read_unlock(&memcg_bpf_srcu, idx); +} + +/** + * memcontrol_bpf_offline - Run BPF cleanup for an offline cgroup. + * @memcg: The memory cgroup that is going offline. + * + * If a BPF program is attached and implements an offline handler, + * it is invoked to perform cleanup tasks before the memcg goes + * completely offline. + */ +void memcontrol_bpf_offline(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_bpf_ops *ops; + + idx = srcu_read_lock(&memcg_bpf_srcu); + + ops = READ_ONCE(memcg->bpf_ops); + if (!ops || !ops->handle_cgroup_offline) + goto out; + + ops->handle_cgroup_offline(memcg); + +out: + srcu_read_unlock(&memcg_bpf_srcu, idx); +} + +static int memcg_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + return -EACCES; +} + +static bool memcg_ops_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops bpf_memcg_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .btf_struct_access = memcg_ops_btf_struct_access, + .is_valid_access = memcg_ops_is_valid_access, +}; + +static unsigned int cfi_memcg_nr_pages_over_high(struct mem_cgroup *memcg) +{ + return -EINVAL; +} + +static void cfi_handle_cgroup_online(struct mem_cgroup *memcg) +{ +} + +static void cfi_handle_cgroup_offline(struct mem_cgroup *memcg) +{ +} + +static struct memcg_bpf_ops cfi_bpf_memcg_ops = { + .memcg_nr_pages_over_high = cfi_memcg_nr_pages_over_high, + .handle_cgroup_online = cfi_handle_cgroup_online, + .handle_cgroup_offline = cfi_handle_cgroup_offline, +}; + +static int bpf_memcg_ops_init(struct btf *btf) +{ + return 0; +} + +static int bpf_memcg_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct memcg_bpf_ops, memcg_nr_pages_over_high): + break; + case offsetof(struct memcg_bpf_ops, handle_cgroup_online): + break; + case offsetof(struct memcg_bpf_ops, handle_cgroup_offline): + break; + default: + if (prog->sleepable) + return -EINVAL; + } + + return 0; +} + +static int bpf_memcg_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +/** + * clean_memcg_bpf_ops - Detach BPF programs from a cgroup hierarchy. + * @memcg: The root of the cgroup hierarchy to clean. + * @ops: The specific ops struct to detach. If NULL, detach any ops. + * + * Iterates through all descendant cgroups of @memcg (including itself) + * and clears their bpf_ops pointer. This is used when a BPF program + * is detached or if attachment fails midway. 
+ */ +static void clean_memcg_bpf_ops(struct mem_cgroup *memcg, + struct memcg_bpf_ops *ops) +{ + struct mem_cgroup *iter = NULL; + + while ((iter = mem_cgroup_iter(memcg, iter, NULL))) { + if (ops) { + if (!WARN_ON(READ_ONCE(iter->bpf_ops) != ops)) + WRITE_ONCE(iter->bpf_ops, NULL); + } else + WRITE_ONCE(iter->bpf_ops, NULL); + } +} + +static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link + = container_of(link, struct bpf_struct_ops_link, link); + struct memcg_bpf_ops *ops = kdata; + struct mem_cgroup *memcg, *iter = NULL; + int err = 0; + + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id); + if (IS_ERR_OR_NULL(memcg)) + return PTR_ERR(memcg); + + cgroup_lock(); + while ((iter = mem_cgroup_iter(memcg, iter, NULL))) { + if (READ_ONCE(iter->bpf_ops)) { + mem_cgroup_iter_break(memcg, iter); + err = -EBUSY; + break; + } + WRITE_ONCE(iter->bpf_ops, ops); + } + if (err) + clean_memcg_bpf_ops(memcg, NULL); + cgroup_unlock(); + + mem_cgroup_put(memcg); + return err; +} + +/* Unregister the struct ops instance */ +static void bpf_memcg_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link + = container_of(link, struct bpf_struct_ops_link, link); + struct memcg_bpf_ops *ops = kdata; + struct mem_cgroup *memcg; + + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id); + if (IS_ERR_OR_NULL(memcg)) + goto out; + + cgroup_lock(); + clean_memcg_bpf_ops(memcg, ops); + cgroup_unlock(); + + mem_cgroup_put(memcg); + +out: + synchronize_srcu(&memcg_bpf_srcu); +} + +static struct bpf_struct_ops bpf_memcg_bpf_ops = { + .verifier_ops = &bpf_memcg_verifier_ops, + .init = bpf_memcg_ops_init, + .check_member = bpf_memcg_ops_check_member, + .init_member = bpf_memcg_ops_init_member, + .reg = bpf_memcg_ops_reg, + .unreg = bpf_memcg_ops_unreg, + .name = "memcg_bpf_ops", + .owner = THIS_MODULE, + .cfi_stubs = &cfi_bpf_memcg_ops, +}; + static int __init bpf_memcontrol_init(void) { - int err; + int err, err2; err = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_memcontrol_kfunc_set); if (err) pr_warn("error while registering bpf memcontrol kfuncs: %d", err); - return err; + err2 = register_bpf_struct_ops(&bpf_memcg_bpf_ops, memcg_bpf_ops); + if (err2) + pr_warn("error while registering memcontrol bpf ops: %d", err2); + + return err ? err : err2; } late_initcall(bpf_memcontrol_init); diff --git a/mm/bpf_memcontrol.h b/mm/bpf_memcontrol.h new file mode 100644 index 000000000000..72598461a922 --- /dev/null +++ b/mm/bpf_memcontrol.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* bpf_memcontrol.h - Memory Controller eBPF support + * + * Author: Hui Zhu + * Copyright (C) 2025 KylinSoft Corporation. + */ + +#ifndef _LINUX_BPF_MEMCONTROL_H +#define _LINUX_BPF_MEMCONTROL_H +#ifdef CONFIG_BPF_SYSCALL + +/** + * struct memcg_bpf_ops - BPF hooks for the memory controller. + * + * These hooks allow a BPF program to extend or modify the behavior of + * the memory controller for a cgroup. + * + * @memcg_nr_pages_over_high: A BPF hook to report additional pages over + * the high limit. This can be used to + * implement custom pressure calculation. + * @handle_cgroup_online: Called when a cgroup with this program + * attached comes online. + * @handle_cgroup_offline: Called when a cgroup with this program + * attached goes offline. 
+ */ +struct memcg_bpf_ops { + unsigned int (*memcg_nr_pages_over_high)(struct mem_cgroup *memcg); + + void (*handle_cgroup_online)(struct mem_cgroup *memcg); + + void (*handle_cgroup_offline)(struct mem_cgroup *memcg); +}; + +extern struct srcu_struct memcg_bpf_srcu; + +/* + * Calls the BPF program to get a custom "over high" page count, which + * contributes to memory pressure calculation. + */ +static inline unsigned int +bpf_memcg_nr_pages_over_high(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_bpf_ops *ops; + unsigned int nr_pages = 0; + + idx = srcu_read_lock(&memcg_bpf_srcu); + + ops = READ_ONCE(memcg->bpf_ops); + if (!ops || !ops->memcg_nr_pages_over_high) + goto out; + + nr_pages = ops->memcg_nr_pages_over_high(memcg); + +out: + srcu_read_unlock(&memcg_bpf_srcu, idx); + return nr_pages; +} + +extern void memcontrol_bpf_online(struct mem_cgroup *memcg); +extern void memcontrol_bpf_offline(struct mem_cgroup *memcg); + +#else /* CONFIG_BPF_SYSCALL */ + +static inline unsigned int +bpf_memcg_nr_pages_over_high(struct mem_cgroup *memcg) { return 0; } +static inline void memcontrol_bpf_online(struct mem_cgroup *memcg) { } +static inline void memcontrol_bpf_offline(struct mem_cgroup *memcg) { } + +#endif /* CONFIG_BPF_SYSCALL */ + +#endif /* _LINUX_BPF_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d44c1f293e16..5cbb0f343bc3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -69,6 +69,7 @@ #include #include "slab.h" #include "memcontrol-v1.h" +#include "bpf_memcontrol.h" #include @@ -2038,7 +2039,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, unsigned long pflags; if (page_counter_read(&memcg->memory) <= - READ_ONCE(memcg->memory.high)) + READ_ONCE(memcg->memory.high) && + !bpf_memcg_nr_pages_over_high(memcg)) continue; memcg_memory_event(memcg, MEMCG_HIGH); @@ -2116,11 +2118,12 @@ static void high_work_func(struct work_struct *work) #define MEMCG_DELAY_PRECISION_SHIFT 20 #define MEMCG_DELAY_SCALING_SHIFT 14 -static u64 calculate_overage(unsigned long usage, unsigned long high) +static u64 calculate_overage(unsigned long usage, unsigned long high, + unsigned long bpf_over_high) { u64 overage; - if (usage <= high) + if (!bpf_over_high && usage <= high) return 0; /* @@ -2130,6 +2133,7 @@ static u64 calculate_overage(unsigned long usage, unsigned long high) high = max(high, 1UL); overage = usage - high; + overage = max(overage, bpf_over_high); overage <<= MEMCG_DELAY_PRECISION_SHIFT; return div64_u64(overage, high); } @@ -2140,7 +2144,8 @@ static u64 mem_find_max_overage(struct mem_cgroup *memcg) do { overage = calculate_overage(page_counter_read(&memcg->memory), - READ_ONCE(memcg->memory.high)); + READ_ONCE(memcg->memory.high), + bpf_memcg_nr_pages_over_high(memcg)); max_overage = max(overage, max_overage); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2154,7 +2159,7 @@ static u64 swap_find_max_overage(struct mem_cgroup *memcg) do { overage = calculate_overage(page_counter_read(&memcg->swap), - READ_ONCE(memcg->swap.high)); + READ_ONCE(memcg->swap.high), 0); if (overage) memcg_memory_event(memcg, MEMCG_SWAP_HIGH); max_overage = max(overage, max_overage); @@ -2210,12 +2215,14 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask) unsigned long penalty_jiffies; unsigned long pflags; unsigned long nr_reclaimed; - unsigned int nr_pages = current->memcg_nr_pages_over_high; + unsigned int nr_pages; int nr_retries = MAX_RECLAIM_RETRIES; struct mem_cgroup *memcg; bool in_retry = false; memcg = 
get_mem_cgroup_from_mm(current->mm); + nr_pages = max(current->memcg_nr_pages_over_high, + bpf_memcg_nr_pages_over_high(memcg)); current->memcg_nr_pages_over_high = 0; retry_reclaim: @@ -2309,6 +2316,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, bool raised_max_event = false; unsigned long pflags; bool allow_spinning = gfpflags_allow_spinning(gfp_mask); + struct mem_cgroup *orig_memcg; retry: if (consume_stock(memcg, nr_pages)) @@ -2434,6 +2442,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); + orig_memcg = memcg; /* * If the hierarchy is above the normal consumption range, schedule * reclaim on returning to userland. We can perform reclaim here @@ -2483,7 +2492,8 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, * kernel. If this is successful, the return path will see it * when it rechecks the overage and simply bail out. */ - if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && + if ((current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH || + bpf_memcg_nr_pages_over_high(orig_memcg) > MEMCG_CHARGE_BATCH) && !(current->flags & PF_MEMALLOC) && gfpflags_allow_blocking(gfp_mask)) __mem_cgroup_handle_over_high(gfp_mask); @@ -3867,6 +3877,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) */ xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL); + memcontrol_bpf_online(memcg); + return 0; offline_kmem: memcg_offline_kmem(memcg); @@ -3887,6 +3899,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) zswap_memcg_offline_cleanup(memcg); bpf_oom_memcg_offline(memcg); + memcontrol_bpf_offline(memcg); memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); -- 2.43.0 From: Hui Zhu Add comprehensive selftests for the memcg_bpf_ops struct_ops functionality. The test creates a cgroup hierarchy with high and low priority cgroups, attaches a BPF program that monitors PGSCAN events on the high-priority cgroup, and verifies that low-priority tasks are throttled when the BPF program reports additional overage. Test flow: 1. Create /memcg_ops_test/high and /memcg_ops_test/low cgroups 2. Attach BPF program to monitor high cgroup's PGSCAN events 3. When PGSCAN events exceed threshold (64 pages/sec), report 512 pages over high for low cgroup 4. Run memory-intensive workloads in both cgroups 5. 
Verify low-priority workload is significantly slower The BPF program uses: - Tracepoint to monitor memcg:count_memcg_events - One-second sliding window for PGSCAN aggregation - Trigger mechanism with configurable threshold and overage This validates that: - BPF programs can be attached to cgroup hierarchies - memcg_nr_pages_over_high hook is called correctly - Memory pressure calculation includes BPF-reported overage - Throttling behavior works as expected This test does not use PSI for triggering due to the reasons discussed in: https://lore.kernel.org/lkml/1d9a162605a3f32ac215430131f7745488deaa34@linux.dev/ Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- MAINTAINERS | 2 + .../selftests/bpf/prog_tests/memcg_ops.c | 340 ++++++++++++++++++ .../selftests/bpf/progs/memcg_ops_over_high.c | 95 +++++ 3 files changed, 437 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/memcg_ops.c create mode 100644 tools/testing/selftests/bpf/progs/memcg_ops_over_high.c diff --git a/MAINTAINERS b/MAINTAINERS index 158f3ba63ee7..10508c90136a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6356,6 +6356,8 @@ F: mm/memcontrol-v1.h F: mm/page_counter.c F: mm/swap_cgroup.c F: samples/cgroup/* +F: tools/testing/selftests/bpf/prog_tests/memcg_ops.c +F: tools/testing/selftests/bpf/progs/memcg_ops_over_high.c F: tools/testing/selftests/cgroup/memcg_protection.m F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c F: tools/testing/selftests/cgroup/test_kmem.c diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c new file mode 100644 index 000000000000..48f0ca4a032b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory controller eBPF struct ops test + */ + +#include +#include +#include "cgroup_helpers.h" + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + unsigned int over_high; +} local_config; + +#include "memcg_ops_over_high.skel.h" + +#define OVER_HIGH_THRESHOLD 64 +#define OVER_HIGH_NUM 512 +#define FILE_SIZE (512 * 1024 * 1024ul) +#define BUFFER_SIZE (128 * 1024) +#define READ_ITERATIONS 5 +#define CG_LIMIT (512 * 1024 * 1024ul) + +#define CG_DIR "/memcg_ops_test" +#define CG_HIGH_DIR CG_DIR "/high" +#define CG_LOW_DIR CG_DIR "/low" + + +static int setup_cgroup(int *high_cgroup_id, int *low_cgroup_fd) +{ + int ret; + char limit_buf[20]; + + ret = setup_cgroup_environment(); + if (!ASSERT_OK(ret, "setup_cgroup_environment")) + goto cleanup; + + ret = create_and_get_cgroup(CG_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_DIR)) + goto cleanup; + close(ret); + ret = enable_controllers(CG_DIR, "memory"); + if (!ASSERT_OK(ret, "enable_controllers")) + goto cleanup; + snprintf(limit_buf, 20, "%ld", CG_LIMIT); + ret = write_cgroup_file(CG_DIR, "memory.max", limit_buf); + if (!ASSERT_OK(ret, "write_cgroup_file")) + goto cleanup; + + ret = create_and_get_cgroup(CG_HIGH_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_HIGH_DIR)) + goto cleanup; + close(ret); + ret = (int)get_cgroup_id(CG_HIGH_DIR); + if (!ASSERT_GE(ret, 0, "get_cgroup_id")) + goto cleanup; + *high_cgroup_id = ret; + + ret = create_and_get_cgroup(CG_LOW_DIR); + if (!ASSERT_GE(ret, 0, "create_and_get_cgroup "CG_LOW_DIR)) + goto cleanup; + *low_cgroup_fd = ret; + + return 0; + +cleanup: + cleanup_cgroup_environment(); + return -1; +} + +int write_file(const char *filename) +{ + int ret = -1; + size_t written = 0; + char *buffer; + FILE *fp; + + fp = 
fopen(filename, "wb"); + if (!fp) + goto out; + + buffer = malloc(BUFFER_SIZE); + if (!buffer) + goto cleanup_fp; + + memset(buffer, 'A', BUFFER_SIZE); + + while (written < FILE_SIZE) { + size_t to_write = (FILE_SIZE - written < BUFFER_SIZE) ? + (FILE_SIZE - written) : + BUFFER_SIZE; + + if (fwrite(buffer, 1, to_write, fp) != to_write) + goto cleanup; + written += to_write; + } + + ret = 0; +cleanup: + free(buffer); +cleanup_fp: + fclose(fp); +out: + return ret; +} + +int read_file(const char *filename, int iterations) +{ + int ret = -1; + char *buffer; + + buffer = malloc(BUFFER_SIZE); + if (!buffer) + goto out; + + for (int iter = 0; iter < iterations; iter++) { + FILE *fp = fopen(filename, "rb"); + + if (!fp) + goto cleanup; + + size_t total_read = 0; + size_t bytes_read; + + while ((bytes_read = fread(buffer, 1, BUFFER_SIZE, fp)) > 0) + total_read += bytes_read; + + fclose(fp); + + if (env.verbosity >= VERBOSE_NORMAL) + printf("%s %d %d done\n", + __func__, getpid(), iter); + } + + ret = 0; +cleanup: + free(buffer); +out: + return ret; +} + +static void real_test_memcg_ops_over_high_child_work(const char *cgroup_path, + char *data_filename, + char *time_filename) +{ + struct timeval start, end; + double elapsed; + FILE *fp; + + if (!ASSERT_OK(join_parent_cgroup(cgroup_path), "join_parent_cgroup")) + goto out; + + if (env.verbosity >= VERBOSE_NORMAL) + printf("%s %d begin\n", __func__, getpid()); + + gettimeofday(&start, NULL); + + if (!ASSERT_OK(write_file(data_filename), "write_file")) + goto out; + + if (env.verbosity >= VERBOSE_NORMAL) + printf("%s %d write_file done\n", __func__, getpid()); + + if (!ASSERT_OK(read_file(data_filename, READ_ITERATIONS), "read_file")) + goto out; + + gettimeofday(&end, NULL); + + elapsed = (end.tv_sec - start.tv_sec) + + (end.tv_usec - start.tv_usec) / 1000000.0; + + if (env.verbosity >= VERBOSE_NORMAL) + printf("%s %d end %.6f\n", __func__, getpid(), elapsed); + + fp = fopen(time_filename, "w"); + if (!ASSERT_OK_PTR(fp, "fopen")) + goto out; + fprintf(fp, "%.6f", elapsed); + fclose(fp); + +out: + exit(0); +} + +static int get_time(char *time_filename, double *time) +{ + int ret = -1; + FILE *fp; + + fp = fopen(time_filename, "r"); + if (!fp) + goto out; + + if (fscanf(fp, "%lf", time) < 0) + goto cleanup; + + ret = 0; +cleanup: + fclose(fp); +out: + return ret; +} + +static void real_test_memcg_ops_over_high(void) +{ + int ret; + char data_file1[] = "/tmp/test_data_XXXXXX"; + char data_file2[] = "/tmp/test_data_XXXXXX"; + char time_file1[] = "/tmp/test_time_XXXXXX"; + char time_file2[] = "/tmp/test_time_XXXXXX"; + pid_t pid1, pid2; + double time1, time2; + + ret = mkstemp(data_file1); + if (!ASSERT_GT(ret, 0, "mkstemp")) + return; + close(ret); + ret = mkstemp(data_file2); + if (!ASSERT_GT(ret, 0, "mkstemp")) + goto cleanup_data_file1; + close(ret); + ret = mkstemp(time_file1); + if (!ASSERT_GT(ret, 0, "mkstemp")) + goto cleanup_data_file2; + close(ret); + ret = mkstemp(time_file2); + if (!ASSERT_GT(ret, 0, "mkstemp")) + goto cleanup_time_file1; + close(ret); + + pid1 = fork(); + if (!ASSERT_GE(pid1, 0, "fork")) + goto cleanup; + if (pid1 == 0) + real_test_memcg_ops_over_high_child_work(CG_LOW_DIR, + data_file1, + time_file1); + + pid2 = fork(); + if (!ASSERT_GE(pid1, 0, "fork")) + goto cleanup; + if (pid2 == 0) + real_test_memcg_ops_over_high_child_work(CG_HIGH_DIR, + data_file2, + time_file2); + + ret = waitpid(pid1, NULL, 0); + if (!ASSERT_GT(ret, 0, "waitpid")) + goto cleanup; + + ret = waitpid(pid2, NULL, 0); + if (!ASSERT_GT(ret, 0, "waitpid")) 
+ goto cleanup; + + if (CHECK_FAIL(get_time(time_file1, &time1))) + goto cleanup; + + if (CHECK_FAIL(get_time(time_file2, &time2))) + goto cleanup; + + ASSERT_TRUE(time1 > time2 && time1 - time2 > 1, + "low fast compare"); + +cleanup: + unlink(time_file2); +cleanup_time_file1: + unlink(time_file1); +cleanup_data_file2: + unlink(data_file2); +cleanup_data_file1: + unlink(data_file1); +} + +void test_memcg_ops_over_high(void) +{ + int err, map_fd; + struct memcg_ops_over_high *skel; + struct bpf_map *map; + size_t bss_sz; + struct memcg_ops_over_high__bss *bss_data; + __u32 key = 0; + struct bpf_program *prog = NULL; + struct bpf_link *link = NULL, *link2 = NULL; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int high_cgroup_id, low_cgroup_fd; + + err = setup_cgroup(&high_cgroup_id, &low_cgroup_fd); + if (!ASSERT_OK(err, "setup_cgroup")) + goto out; + + skel = memcg_ops_over_high__open_and_load(); + if (!ASSERT_OK_PTR(skel, "memcg_ops_over_high__open_and_load")) + goto out; + + map = bpf_object__find_map_by_name(skel->obj, ".bss"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name .bss")) + goto out; + + map_fd = bpf_map__fd(map); + bss_sz = bpf_map__value_size(map); + bss_data = malloc(bpf_map__value_size(map)); + if (!ASSERT_OK_PTR(bss_data, "malloc(bpf_map__value_size(map))")) + goto out; + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id = high_cgroup_id; + bss_data->local_config.threshold = OVER_HIGH_THRESHOLD; + bss_data->local_config.over_high = OVER_HIGH_NUM; + err = bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + prog = bpf_object__find_program_by_name(skel->obj, + "handle_count_memcg_events"); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto out; + + link = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + goto out; + + map = bpf_object__find_map_by_name(skel->obj, "mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name mcg_ops")) + goto out; + + opts.relative_fd = low_cgroup_fd; + link2 = bpf_map__attach_struct_ops_opts(map, &opts); + if (!ASSERT_OK_PTR(link2, "bpf_map__attach_struct_ops_opts")) + goto out; + + real_test_memcg_ops_over_high(); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link2); + memcg_ops_over_high__detach(skel); + close(low_cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/memcg_ops_over_high.c b/tools/testing/selftests/bpf/progs/memcg_ops_over_high.c new file mode 100644 index 000000000000..5c9651ec96d4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/memcg_ops_over_high.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +#define ONE_SECOND_NS 1000000000 + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + unsigned int over_high; +} local_config; + +struct AggregationData { + u64 sum; + u64 window_start_ts; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct AggregationData); +} aggregation_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u64); +} trigger_ts_map SEC(".maps"); + +SEC("tp/memcg/count_memcg_events") +int +handle_count_memcg_events(struct trace_event_raw_memcg_rstat_events *ctx) +{ + u32 key = 0; + struct AggregationData *data; + u64 current_ts; + + if (ctx->id != 
local_config.high_cgroup_id || + (ctx->item != PGSCAN_KSWAPD && ctx->item != PGSCAN_DIRECT)) + goto out; + + data = bpf_map_lookup_elem(&aggregation_map, &key); + if (!data) + goto out; + + current_ts = bpf_ktime_get_ns(); + + if (current_ts - data->window_start_ts < ONE_SECOND_NS) { + data->sum += ctx->val; + } else { + data->window_start_ts = current_ts; + data->sum = ctx->val; + } + + if (data->sum > local_config.threshold) { + bpf_map_update_elem(&trigger_ts_map, &key, ¤t_ts, + BPF_ANY); + data->sum = 0; + data->window_start_ts = current_ts; + } + +out: + return 0; +} + +SEC("struct_ops/memcg_nr_pages_over_high") +unsigned int memcg_nr_pages_over_high_impl(struct mem_cgroup *memcg) +{ + u32 key = 0; + u64 *trigger_ts; + unsigned int ret = 0; + + trigger_ts = bpf_map_lookup_elem(&trigger_ts_map, &key); + if (!trigger_ts || *trigger_ts == 0) + goto out; + + u64 current_ts = bpf_ktime_get_ns(); + + if (current_ts - *trigger_ts < ONE_SECOND_NS) + ret = local_config.over_high; + +out: + return ret; +} + +SEC(".struct_ops.link") +struct memcg_bpf_ops mcg_ops = { + .memcg_nr_pages_over_high = (void *)memcg_nr_pages_over_high_impl, +}; + +char LICENSE[] SEC("license") = "GPL"; -- 2.43.0 From: Hui Zhu Add a sample program demonstrating priority-based memory throttling using memcg_bpf_ops struct_ops. This sample consists of: 1. memcg.bpf.c: BPF program that monitors PGSCAN events on a high-priority cgroup. When page scan activity exceeds a threshold, it reports additional "over high" pages for a low-priority cgroup, causing it to be throttled. 2. memcg.c: Userspace loader that configures and attaches the BPF program. Takes parameters: - low_path: Path to low-priority cgroup - high_path: Path to high-priority cgroup - threshold: PGSCAN threshold to trigger throttling - over_high: Number of pages to report as over-high Usage example: # mkdir /sys/fs/cgroup/high # mkdir /sys/fs/cgroup/low # ./memcg /sys/fs/cgroup/low /sys/fs/cgroup/high 100 1024 When the high-priority cgroup experiences memory pressure (>100 PGSCAN events/sec), the low-priority cgroup will be throttled as if it were 1024 pages over its memory.high limit. 
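For illustration only, one way to exercise the sample (the memory.max value, the /tmp file names and the dd-based workload below are just an example and not part of this patch) is to cap the high-priority cgroup so that it starts reclaiming, then run a memory-heavy task in each cgroup:

# echo 256M > /sys/fs/cgroup/high/memory.max
# echo $$ > /sys/fs/cgroup/high/cgroup.procs
# dd if=/dev/zero of=/tmp/high.bin bs=1M count=2048
(and, from a second shell)
# echo $$ > /sys/fs/cgroup/low/cgroup.procs
# dd if=/dev/zero of=/tmp/low.bin bs=1M count=2048

Once reclaim in the high-priority cgroup pushes its PGSCAN rate past the configured threshold, the dd running in the low-priority cgroup slows down noticeably.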
Real-world test results on x86_64 QEMU (10 CPU, 4GB RAM): - High-priority: 347,825 ops/sec (unaffected) - Low-priority: 177 ops/sec (throttled by ~99.9%) This test does not use PSI for triggering due to the reasons discussed in: https://lore.kernel.org/lkml/1d9a162605a3f32ac215430131f7745488deaa34@linux.dev/ Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu --- MAINTAINERS | 2 + samples/bpf/.gitignore | 1 + samples/bpf/Makefile | 9 +- samples/bpf/memcg.bpf.c | 95 +++++++++++++++++++ samples/bpf/memcg.c | 204 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 310 insertions(+), 1 deletion(-) create mode 100644 samples/bpf/memcg.bpf.c create mode 100644 samples/bpf/memcg.c diff --git a/MAINTAINERS b/MAINTAINERS index 10508c90136a..91af1f28eb14 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6355,6 +6355,8 @@ F: mm/memcontrol-v1.c F: mm/memcontrol-v1.h F: mm/page_counter.c F: mm/swap_cgroup.c +F: samples/bpf/memcg.bpf.c +F: samples/bpf/memcg.c F: samples/cgroup/* F: tools/testing/selftests/bpf/prog_tests/memcg_ops.c F: tools/testing/selftests/bpf/progs/memcg_ops_over_high.c diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 0002cd359fb1..0de6569cdefd 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -49,3 +49,4 @@ iperf.* /vmlinux.h /bpftool/ /libbpf/ +memcg diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 95a4fa1f1e44..6416c8aa3034 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -37,6 +37,7 @@ tprogs-y += xdp_fwd tprogs-y += task_fd_query tprogs-y += ibumad tprogs-y += hbm +tprogs-y += memcg # Libbpf dependencies LIBBPF_SRC = $(TOOLS_PATH)/lib/bpf @@ -122,6 +123,7 @@ always-y += task_fd_query_kern.o always-y += ibumad_kern.o always-y += hbm_out_kern.o always-y += hbm_edt_kern.o +always-y += memcg.bpf.o COMMON_CFLAGS = $(TPROGS_USER_CFLAGS) TPROGS_LDFLAGS = $(TPROGS_USER_LDFLAGS) @@ -289,6 +291,8 @@ $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h $(obj)/hbm.o: $(src)/hbm.h $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h +memcg: $(obj)/memcg.skel.h + # Override includes for xdp_sample_user.o because $(srctree)/usr/include in # TPROGS_CFLAGS causes conflicts XDP_SAMPLE_CFLAGS += -Wall -O2 \ @@ -347,11 +351,13 @@ $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/x -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \ -c $(filter %.bpf.c,$^) -o $@ -LINKED_SKELS := xdp_router_ipv4.skel.h +LINKED_SKELS := xdp_router_ipv4.skel.h memcg.skel.h clean-files += $(LINKED_SKELS) xdp_router_ipv4.skel.h-deps := xdp_router_ipv4.bpf.o xdp_sample.bpf.o +memcg.skel.h-deps := memcg.bpf.o + LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) BPF_SRCS_LINKED := $(notdir $(wildcard $(src)/*.bpf.c)) @@ -360,6 +366,7 @@ BPF_SKELS_LINKED := $(addprefix $(obj)/,$(LINKED_SKELS)) $(BPF_SKELS_LINKED): $(BPF_OBJS_LINKED) $(BPFTOOL) @echo " BPF GEN-OBJ " $(@:.skel.h=) + echo $(Q)$(BPFTOOL) gen object $(@:.skel.h=.lbpf.o) $(addprefix $(obj)/,$($(@F)-deps)) $(Q)$(BPFTOOL) gen object $(@:.skel.h=.lbpf.o) $(addprefix $(obj)/,$($(@F)-deps)) @echo " BPF GEN-SKEL" $(@:.skel.h=) $(Q)$(BPFTOOL) gen skeleton $(@:.skel.h=.lbpf.o) name $(notdir $(@:.skel.h=)) > $@ diff --git a/samples/bpf/memcg.bpf.c b/samples/bpf/memcg.bpf.c new file mode 100644 index 000000000000..5c9651ec96d4 --- /dev/null +++ b/samples/bpf/memcg.bpf.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +#define ONE_SECOND_NS 1000000000 + +struct local_config { + u64 threshold; 
+ u64 high_cgroup_id; + unsigned int over_high; +} local_config; + +struct AggregationData { + u64 sum; + u64 window_start_ts; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct AggregationData); +} aggregation_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u64); +} trigger_ts_map SEC(".maps"); + +SEC("tp/memcg/count_memcg_events") +int +handle_count_memcg_events(struct trace_event_raw_memcg_rstat_events *ctx) +{ + u32 key = 0; + struct AggregationData *data; + u64 current_ts; + + if (ctx->id != local_config.high_cgroup_id || + (ctx->item != PGSCAN_KSWAPD && ctx->item != PGSCAN_DIRECT)) + goto out; + + data = bpf_map_lookup_elem(&aggregation_map, &key); + if (!data) + goto out; + + current_ts = bpf_ktime_get_ns(); + + if (current_ts - data->window_start_ts < ONE_SECOND_NS) { + data->sum += ctx->val; + } else { + data->window_start_ts = current_ts; + data->sum = ctx->val; + } + + if (data->sum > local_config.threshold) { + bpf_map_update_elem(&trigger_ts_map, &key, ¤t_ts, + BPF_ANY); + data->sum = 0; + data->window_start_ts = current_ts; + } + +out: + return 0; +} + +SEC("struct_ops/memcg_nr_pages_over_high") +unsigned int memcg_nr_pages_over_high_impl(struct mem_cgroup *memcg) +{ + u32 key = 0; + u64 *trigger_ts; + unsigned int ret = 0; + + trigger_ts = bpf_map_lookup_elem(&trigger_ts_map, &key); + if (!trigger_ts || *trigger_ts == 0) + goto out; + + u64 current_ts = bpf_ktime_get_ns(); + + if (current_ts - *trigger_ts < ONE_SECOND_NS) + ret = local_config.over_high; + +out: + return ret; +} + +SEC(".struct_ops.link") +struct memcg_bpf_ops mcg_ops = { + .memcg_nr_pages_over_high = (void *)memcg_nr_pages_over_high_impl, +}; + +char LICENSE[] SEC("license") = "GPL"; diff --git a/samples/bpf/memcg.c b/samples/bpf/memcg.c new file mode 100644 index 000000000000..08124f08f3ad --- /dev/null +++ b/samples/bpf/memcg.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __MEMCG_RSTAT_SIMPLE_BPF_SKEL_H__ +#define u64 uint64_t +#endif + +struct local_config { + u64 threshold; + u64 high_cgroup_id; + unsigned int over_high; +} local_config; + +#include "memcg.skel.h" + +static bool exiting; + +static void sig_handler(int sig) +{ + exiting = true; +} + +static void usage(char *name) +{ + fprintf(stderr, + "Usage: %s \n", + name); + fprintf(stderr, "low_path: low priority memcgroup path.\n"); + fprintf(stderr, "high_path: high priority memcgroup path.\n"); + fprintf(stderr, "threshold: The sum of 'val' PGSCAN of high\n" + " priority memcgroup in 1 sec to trigger\n" + " low priority cgroup over_high.\n"); + fprintf(stderr, "over_high: low_path over_high value.\n"); +} + +static uint64_t get_cgroup_id(const char *cgroup_path) +{ + struct stat st; + + if (cgroup_path == NULL) { + fprintf(stderr, "Error: cgroup_path is NULL\n"); + return 0; + } + + if (stat(cgroup_path, &st) < 0) { + fprintf(stderr, "Error: stat(%s) failed: %d\n", + cgroup_path, errno); + return 0; + } + + return (uint64_t)st.st_ino; +} + +int main(int argc, char **argv) +{ + int low_cgroup_fd = -1; + uint64_t threshold, high_cgroup_id; + unsigned int over_high; + const char *bpf_obj_file = "memcg.bpf.o"; + struct bpf_object *obj = NULL; + struct bpf_program *prog = NULL; + struct bpf_link *link = NULL, *link2 = NULL; + struct bpf_map *map; + struct 
memcg__bss *bss_data; + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + int err = -EINVAL; + int map_fd; + + if (argc < 5) { +usage_err: + usage(argv[0]); + goto out; + } + + low_cgroup_fd = open(argv[1], O_RDONLY); + if (low_cgroup_fd < 0) { + fprintf(stderr, + "ERROR: open low cgroup '%s' failed: %d\n", + argv[1], errno); + err = -errno; + goto out; + } + + high_cgroup_id = get_cgroup_id(argv[2]); + if (!high_cgroup_id) + goto out; + + threshold = strtoull(argv[3], NULL, 10); + over_high = strtoull(argv[4], NULL, 10); + if (!threshold || !over_high) + goto usage_err; + + obj = bpf_object__open_file(bpf_obj_file, NULL); + err = libbpf_get_error(obj); + if (err) { + fprintf(stderr, + "ERROR: opening BPF object file '%s' failed: %d\n", + bpf_obj_file, err); + goto out; + } + + map = bpf_object__find_map_by_name(obj, ".bss"); + if (!map) { + fprintf(stderr, "ERROR: Failed to find .data map\n"); + err = -ESRCH; + goto out; + } + + err = bpf_object__load(obj); + if (err) { + fprintf(stderr, + "ERROR: loading BPF object file failed: %d\n", + err); + goto out; + } + + map_fd = bpf_map__fd(map); + bss_data = malloc(bpf_map__value_size(map)); + if (bss_data) { + __u32 key = 0; + + memset(bss_data, 0, sizeof(struct local_config)); + bss_data->local_config.high_cgroup_id = high_cgroup_id; + bss_data->local_config.threshold = threshold; + bss_data->local_config.over_high = over_high; + + err = bpf_map_update_elem(map_fd, &key, bss_data, BPF_EXIST); + free(bss_data); + if (err) { + fprintf(stderr, + "ERROR: update config failed: %d\n", + err); + goto out; + } + } else { + fprintf(stderr, + "ERROR: allocate memory failed\n"); + err = -ENOMEM; + goto out; + } + + prog = bpf_object__find_program_by_name(obj, + "handle_count_memcg_events"); + if (!prog) { + fprintf(stderr, + "ERROR: finding a prog in BPF object file failed\n"); + goto out; + } + + link = bpf_program__attach(prog); + err = libbpf_get_error(link); + if (err) { + fprintf(stderr, + "ERROR: bpf_program__attach failed: %d\n", + err); + goto out; + } + + map = bpf_object__find_map_by_name(obj, "mcg_ops"); + if (!map) { + fprintf(stderr, "ERROR: Failed to find mcg_ops map\n"); + err = -ESRCH; + goto out; + } + + opts.relative_fd = low_cgroup_fd; + link2 = bpf_map__attach_struct_ops_opts(map, &opts); + if (!link2) { + fprintf(stderr, + "Failed to attach struct ops mcg_ops: %d\n", errno); + err = -errno; + goto out; + } + + printf("Successfully attached!\n"); + + signal(SIGINT, sig_handler); + signal(SIGTERM, sig_handler); + + while (!exiting) + pause(); + + printf("Exiting...\n"); + +out: + bpf_link__destroy(link); + bpf_link__destroy(link2); + bpf_object__close(obj); + close(low_cgroup_fd); + return err; +} -- 2.43.0