To use memcg_page_state_output() in bpf_memcontrol.c, move the
declaration from the v1-specific memcontrol-v1.h to memcontrol.h.

Signed-off-by: Roman Gushchin
Acked-by: Michal Hocko
Acked-by: Shakeel Butt
---
 include/linux/memcontrol.h | 6 ++++++
 mm/memcontrol-v1.h         | 1 -
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0651865a4564..7bef427d5a82 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -950,6 +950,7 @@ static inline void mod_memcg_page_state(struct page *page,
 }
 
 unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
+unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
 unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
 unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                       enum node_stat_item idx);
@@ -1373,6 +1374,11 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
         return 0;
 }
 
+static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item)
+{
+        return 0;
+}
+
 static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                               enum node_stat_item idx)
 {
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 6358464bb416..a304ad418cdf 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -27,7 +27,6 @@ unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
 void drain_all_stock(struct mem_cgroup *root_memcg);
 
 unsigned long memcg_events(struct mem_cgroup *memcg, int event);
-unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
 int memory_stat_show(struct seq_file *m, void *v);
 
 void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n);
-- 
2.52.0

To effectively operate with memory cgroups in BPF there is a need to
convert css pointers to memcg pointers. A simple container_of cast,
which is used in the kernel code, can't be used in BPF because from
the verifier's point of view it's an out-of-bounds memory access.

Introduce helper get/put kfuncs which can be used to get a refcounted
memcg pointer from the css pointer:
  - bpf_get_mem_cgroup,
  - bpf_put_mem_cgroup.

bpf_get_mem_cgroup() can take both the memcg's css and the
corresponding cgroup's "self" css. This allows it to be used with the
existing cgroup iterator, which iterates over the cgroup tree, not the
memcg tree.
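A minimal usage sketch (illustrative only, not part of this patch; it
mirrors the selftest added later in this series and assumes the kfunc
declarations are visible to the program, e.g. via vmlinux.h):

SEC("iter/cgroup")
int dump_memcg(struct bpf_iter__cgroup *ctx)
{
        struct cgroup *cgrp = ctx->cgroup;
        struct mem_cgroup *memcg;

        if (!cgrp)
                return 1;

        /* the cgroup's "self" css is fine, no memcg-specific css needed */
        memcg = bpf_get_mem_cgroup(&cgrp->self);
        if (!memcg)
                return 1;

        /* ... inspect memcg here ... */

        bpf_put_mem_cgroup(memcg);
        return 0;
}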
Signed-off-by: Roman Gushchin
---
 mm/Makefile         |  3 ++
 mm/bpf_memcontrol.c | 88 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 mm/bpf_memcontrol.c

diff --git a/mm/Makefile b/mm/Makefile
index 2d0570a16e5b..bf46fe31dc14 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -106,6 +106,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
 ifdef CONFIG_SWAP
 obj-$(CONFIG_MEMCG) += swap_cgroup.o
 endif
+ifdef CONFIG_BPF_SYSCALL
+obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
+endif
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_GUP_TEST) += gup_test.o
 obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o
diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
new file mode 100644
index 000000000000..82eb95de77b7
--- /dev/null
+++ b/mm/bpf_memcontrol.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Memory Controller-related BPF kfuncs and auxiliary code
+ *
+ * Author: Roman Gushchin
+ */
+
+#include <linux/bpf.h>
+#include <linux/memcontrol.h>
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_get_mem_cgroup - Get a reference to a memory cgroup
+ * @css: pointer to the css structure
+ *
+ * It's fine to pass a css which belongs to any cgroup controller,
+ * e.g. the unified hierarchy's main css.
+ *
+ * Implements KF_ACQUIRE semantics.
+ *
+ * Return: A pointer to a mem_cgroup structure after bumping
+ * the corresponding css's reference counter.
+ */
+__bpf_kfunc struct mem_cgroup *
+bpf_get_mem_cgroup(struct cgroup_subsys_state *css)
+{
+        struct mem_cgroup *memcg = NULL;
+        bool rcu_unlock = false;
+
+        if (mem_cgroup_disabled() || !root_mem_cgroup)
+                return NULL;
+
+        if (root_mem_cgroup->css.ss != css->ss) {
+                struct cgroup *cgroup = css->cgroup;
+                int ssid = root_mem_cgroup->css.ss->id;
+
+                rcu_read_lock();
+                rcu_unlock = true;
+                css = rcu_dereference_raw(cgroup->subsys[ssid]);
+        }
+
+        if (css && css_tryget(css))
+                memcg = container_of(css, struct mem_cgroup, css);
+
+        if (rcu_unlock)
+                rcu_read_unlock();
+
+        return memcg;
+}
+
+/**
+ * bpf_put_mem_cgroup - Put a reference to a memory cgroup
+ * @memcg: memory cgroup to release
+ *
+ * Releases a previously acquired memcg reference.
+ * Implements KF_RELEASE semantics.
+ */
+__bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg)
+{
+        css_put(&memcg->css);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_memcontrol_kfuncs)
+BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
+BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE)
+
+BTF_KFUNCS_END(bpf_memcontrol_kfuncs)
+
+static const struct btf_kfunc_id_set bpf_memcontrol_kfunc_set = {
+        .owner = THIS_MODULE,
+        .set = &bpf_memcontrol_kfuncs,
+};
+
+static int __init bpf_memcontrol_init(void)
+{
+        int err;
+
+        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC,
+                                        &bpf_memcontrol_kfunc_set);
+        if (err)
+                pr_warn("error while registering bpf memcontrol kfuncs: %d\n", err);
+
+        return err;
+}
+late_initcall(bpf_memcontrol_init);
-- 
2.52.0

Introduce a BPF kfunc to get a trusted pointer to the root memory
cgroup. It's very handy for traversing the full memcg tree, e.g. for
handling a system-wide OOM.

It's possible to obtain this pointer by traversing the memcg tree up
from any known memcg, but it's sub-optimal and makes BPF programs more
complex and less efficient.

bpf_get_root_mem_cgroup() has KF_ACQUIRE | KF_RET_NULL semantics;
however, in reality it's not necessary to bump the corresponding
reference counter: the root memory cgroup is immortal and reference
counting is skipped for it (see css_get()). Once set, root_mem_cgroup
is always a valid memcg pointer.

It's safe to call bpf_put_mem_cgroup() on the pointer obtained with
bpf_get_root_mem_cgroup(); it's effectively a no-op.
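For illustration, a hypothetical sleepable SEC("syscall") program
could do the following (a sketch only; it assumes the kfuncs are
available to the chosen program type):

SEC("syscall")
int handle_oom(void *ctx)
{
        struct mem_cgroup *memcg;

        memcg = bpf_get_root_mem_cgroup();
        if (!memcg)
                return 1;       /* memory cgroups are disabled */

        /* ... traverse the memcg tree down from the root ... */

        bpf_put_mem_cgroup(memcg);      /* effectively a no-op here */
        return 0;
}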
Signed-off-by: Roman Gushchin
---
 mm/bpf_memcontrol.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
index 82eb95de77b7..187919eb2fe2 100644
--- a/mm/bpf_memcontrol.c
+++ b/mm/bpf_memcontrol.c
@@ -10,6 +10,25 @@
 
 __bpf_kfunc_start_defs();
 
+/**
+ * bpf_get_root_mem_cgroup - Returns a pointer to the root memory cgroup
+ *
+ * The function has KF_ACQUIRE semantics, even though the root memory
+ * cgroup is never destroyed after being created and doesn't require
+ * reference counting. It's perfectly safe to pass the returned pointer
+ * to bpf_put_mem_cgroup().
+ *
+ * Return: A pointer to the root memory cgroup.
+ */
+__bpf_kfunc struct mem_cgroup *bpf_get_root_mem_cgroup(void)
+{
+        if (mem_cgroup_disabled())
+                return NULL;
+
+        /* css_get() is not needed */
+        return root_mem_cgroup;
+}
+
 /**
  * bpf_get_mem_cgroup - Get a reference to a memory cgroup
  * @css: pointer to the css structure
@@ -64,6 +83,7 @@ __bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg)
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(bpf_memcontrol_kfuncs)
+BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
 BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE)
-- 
2.52.0

Introduce BPF kfuncs to conveniently access memcg data:
  - bpf_mem_cgroup_vm_events(),
  - bpf_mem_cgroup_memory_events(),
  - bpf_mem_cgroup_usage(),
  - bpf_mem_cgroup_page_state(),
  - bpf_mem_cgroup_flush_stats().

These functions are useful for implementing BPF OOM policies, but can
also be used to accelerate access to memcg data. Reading it through
cgroupfs is much more expensive, roughly 5x, mostly because of the
need to convert the data to text and back.

JP Kobryn:

An experiment was set up to compare the performance of a program that
uses the traditional method of reading memory.stat vs a program using
the new kfuncs. The control program opens the root memory.stat file
and for 1M iterations reads, converts the string values to numeric
data, then seeks back to the beginning. The experimental program sets
up the requisite libbpf objects and for 1M iterations invokes a bpf
program which uses the kfuncs to fetch all available stats for
node_stat_item, memcg_stat_item, and vm_event_item types.

The results showed a significant perf benefit on the experimental
side, outperforming the control side by a margin of 93%. In kernel
mode, elapsed time was reduced by 80%, while in user mode, over 99%
of time was saved.

control: elapsed time
real    0m38.318s
user    0m25.131s
sys     0m13.070s

experiment: elapsed time
real    0m2.789s
user    0m0.187s
sys     0m2.512s

control: perf data
33.43%  a.out  libc.so.6          [.] __vfscanf_internal
 6.88%  a.out  [kernel.kallsyms]  [k] vsnprintf
 6.33%  a.out  libc.so.6          [.] _IO_fgets
 5.51%  a.out  [kernel.kallsyms]  [k] format_decode
 4.31%  a.out  libc.so.6          [.] __GI_____strtoull_l_internal
 3.78%  a.out  [kernel.kallsyms]  [k] string
 3.53%  a.out  [kernel.kallsyms]  [k] number
 2.71%  a.out  libc.so.6          [.] _IO_sputbackc
 2.41%  a.out  [kernel.kallsyms]  [k] strlen
 1.98%  a.out  a.out              [.] main
 1.70%  a.out  libc.so.6          [.] _IO_getline_info
 1.51%  a.out  libc.so.6          [.] __isoc99_sscanf
 1.47%  a.out  [kernel.kallsyms]  [k] memory_stat_format
 1.47%  a.out  [kernel.kallsyms]  [k] memcpy_orig
 1.41%  a.out  [kernel.kallsyms]  [k] seq_buf_printf

experiment: perf data
10.55%  memcgstat  bpf_prog_..._query  [k] bpf_prog_16aab2f19fa982a7_query
 6.90%  memcgstat  [kernel.kallsyms]   [k] memcg_page_state_output
 3.55%  memcgstat  [kernel.kallsyms]   [k] _raw_spin_lock
 3.12%  memcgstat  [kernel.kallsyms]   [k] memcg_events
 2.87%  memcgstat  [kernel.kallsyms]   [k] __memcg_slab_post_alloc_hook
 2.73%  memcgstat  [kernel.kallsyms]   [k] kmem_cache_free
 2.70%  memcgstat  [kernel.kallsyms]   [k] entry_SYSRETQ_unsafe_stack
 2.25%  memcgstat  [kernel.kallsyms]   [k] __memcg_slab_free_hook
 2.06%  memcgstat  [kernel.kallsyms]   [k] get_page_from_freelist
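As a usage sketch (a hypothetical program, loosely following the
selftest added later in this series; the counters and events used are
defined by the kernel's node_stat_item and vm_event_item enums):

SEC("iter.s/cgroup")
int memcg_stats(struct bpf_iter__cgroup *ctx)
{
        struct cgroup *cgrp = ctx->cgroup;
        struct mem_cgroup *memcg;

        if (!cgrp)
                return 1;

        memcg = bpf_get_mem_cgroup(&cgrp->self);
        if (!memcg)
                return 1;

        /* propagate pending updates up the tree before reading */
        bpf_mem_cgroup_flush_stats(memcg);

        bpf_printk("usage: %lu bytes", bpf_mem_cgroup_usage(memcg));
        bpf_printk("file: %lu bytes",
                   bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES));
        bpf_printk("pgfaults: %lu",
                   bpf_mem_cgroup_vm_events(memcg, PGFAULT));

        bpf_put_mem_cgroup(memcg);
        return 0;
}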
Signed-off-by: Roman Gushchin
Co-developed-by: JP Kobryn
Signed-off-by: JP Kobryn
---
 include/linux/memcontrol.h | 14 +++++++
 mm/bpf_memcontrol.c        | 85 ++++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c            | 10 +++++
 3 files changed, 109 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7bef427d5a82..6a5d65487b70 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -949,8 +949,12 @@ static inline void mod_memcg_page_state(struct page *page,
         rcu_read_unlock();
 }
 
+unsigned long memcg_events(struct mem_cgroup *memcg, int event);
+unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
 unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
 unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
+bool memcg_stat_item_valid(int idx);
+bool memcg_vm_event_item_valid(enum vm_event_item idx);
 unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
 unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                       enum node_stat_item idx);
@@ -1379,6 +1383,16 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, in
         return 0;
 }
 
+static inline bool memcg_stat_item_valid(int idx)
+{
+        return false;
+}
+
+static inline bool memcg_vm_event_item_valid(enum vm_event_item idx)
+{
+        return false;
+}
+
 static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                               enum node_stat_item idx)
 {
diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
index 187919eb2fe2..d248162872d3 100644
--- a/mm/bpf_memcontrol.c
+++ b/mm/bpf_memcontrol.c
@@ -80,6 +80,85 @@ __bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg)
         css_put(&memcg->css);
 }
 
+/**
+ * bpf_mem_cgroup_vm_events - Read memory cgroup's vm event counter
+ * @memcg: memory cgroup
+ * @event: event id
+ *
+ * Allows reading memory cgroup event counters.
+ *
+ * Return: The current value of the corresponding event counter.
+ */
+__bpf_kfunc unsigned long bpf_mem_cgroup_vm_events(struct mem_cgroup *memcg,
+                                                   enum vm_event_item event)
+{
+        if (event >= NR_VM_EVENT_ITEMS || !memcg_vm_event_item_valid(event))
+                return (unsigned long)-1;
+
+        return memcg_events(memcg, event);
+}
+
+/**
+ * bpf_mem_cgroup_usage - Read memory cgroup's usage
+ * @memcg: memory cgroup
+ *
+ * Please note that the root memory cgroup is special and is exempt
+ * from memory accounting. The returned value is a sum of sub-cgroups'
+ * usages and does not reflect the size of the root memory cgroup
+ * itself. If you need an approximation, you can use root-level
+ * statistics, e.g. NR_FILE_PAGES + NR_ANON_MAPPED.
+ *
+ * Return: The current memory cgroup size in bytes.
+ */
+__bpf_kfunc unsigned long bpf_mem_cgroup_usage(struct mem_cgroup *memcg)
+{
+        return page_counter_read(&memcg->memory) * PAGE_SIZE;
+}
+
+/**
+ * bpf_mem_cgroup_memory_events - Read memory cgroup's memory event value
+ * @memcg: memory cgroup
+ * @event: memory event id
+ *
+ * Return: The current value of the memory event counter.
+ */
+__bpf_kfunc unsigned long bpf_mem_cgroup_memory_events(struct mem_cgroup *memcg,
+                                                       enum memcg_memory_event event)
+{
+        if (event >= MEMCG_NR_MEMORY_EVENTS)
+                return (unsigned long)-1;
+
+        return atomic_long_read(&memcg->memory_events[event]);
+}
+
+/**
+ * bpf_mem_cgroup_page_state - Read memory cgroup's page state counter
+ * @memcg: memory cgroup
+ * @idx: counter idx
+ *
+ * Allows reading memory cgroup statistics. The output is in bytes.
+ *
+ * Return: The value of the page state counter in bytes.
+ */
+__bpf_kfunc unsigned long bpf_mem_cgroup_page_state(struct mem_cgroup *memcg, int idx)
+{
+        if (idx < 0 || idx >= MEMCG_NR_STAT || !memcg_stat_item_valid(idx))
+                return (unsigned long)-1;
+
+        return memcg_page_state_output(memcg, idx);
+}
+
+/**
+ * bpf_mem_cgroup_flush_stats - Flush memory cgroup's statistics
+ * @memcg: memory cgroup
+ *
+ * Propagate memory cgroup's statistics up the cgroup tree.
+ */
+__bpf_kfunc void bpf_mem_cgroup_flush_stats(struct mem_cgroup *memcg)
+{
+        mem_cgroup_flush_stats(memcg);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(bpf_memcontrol_kfuncs)
@@ -87,6 +166,12 @@ BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
 BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE)
 
+BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_mem_cgroup_usage, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_TRUSTED_ARGS | KF_SLEEPABLE)
+
 BTF_KFUNCS_END(bpf_memcontrol_kfuncs)
 
 static const struct btf_kfunc_id_set bpf_memcontrol_kfunc_set = {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index be810c1fbfc3..188a4df8d7ab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -663,6 +663,11 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
         return x;
 }
 
+bool memcg_stat_item_valid(int idx)
+{
+        return !BAD_STAT_IDX(memcg_stats_index(idx));
+}
+
 static int memcg_page_state_unit(int item);
 
 /*
@@ -860,6 +865,11 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event)
         return READ_ONCE(memcg->vmstats->events[i]);
 }
 
+bool memcg_vm_event_item_valid(enum vm_event_item idx)
+{
+        return !BAD_STAT_IDX(memcg_events_index(idx));
+}
+
 #ifdef CONFIG_MEMCG_V1
 unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 {
-- 
2.52.0

From: JP Kobryn

Add test coverage for the kfuncs that fetch memcg stats. Using some
common stats, set up test scenarios ensuring that the given stat
increases by some arbitrary amount. The stats selected cover the three
categories represented by the enums: node_stat_item, memcg_stat_item,
and vm_event_item.

Since only a subset of all stats is queried, use a static struct made
up of fields for each stat. Write the fetched values to the struct
when the bpf program is invoked and read the fields in the user-mode
program for verification.
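Assuming a kernel built with CONFIG_MEMCG and CONFIG_BPF_SYSCALL, the
new test can be run with the standard selftest runner, e.g.:

  $ cd tools/testing/selftests/bpf
  $ ./test_progs -t cgroup_iter_memcg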
Signed-off-by: JP Kobryn
Signed-off-by: Roman Gushchin
---
 .../testing/selftests/bpf/cgroup_iter_memcg.h |  18 ++
 .../bpf/prog_tests/cgroup_iter_memcg.c        | 223 ++++++++++++++++++
 .../selftests/bpf/progs/cgroup_iter_memcg.c   |  39 +++
 3 files changed, 280 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/cgroup_iter_memcg.h
 create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
 create mode 100644 tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c

diff --git a/tools/testing/selftests/bpf/cgroup_iter_memcg.h b/tools/testing/selftests/bpf/cgroup_iter_memcg.h
new file mode 100644
index 000000000000..3f59b127943b
--- /dev/null
+++ b/tools/testing/selftests/bpf/cgroup_iter_memcg.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#ifndef __CGROUP_ITER_MEMCG_H
+#define __CGROUP_ITER_MEMCG_H
+
+struct memcg_query {
+        /* some node_stat_item's */
+        unsigned long nr_anon_mapped;
+        unsigned long nr_shmem;
+        unsigned long nr_file_pages;
+        unsigned long nr_file_mapped;
+        /* some memcg_stat_item */
+        unsigned long memcg_kmem;
+        /* some vm_event_item */
+        unsigned long pgfault;
+};
+
+#endif /* __CGROUP_ITER_MEMCG_H */
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
new file mode 100644
index 000000000000..a5afd16705f0
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "cgroup_helpers.h"
+#include "cgroup_iter_memcg.h"
+#include "cgroup_iter_memcg.skel.h"
+
+static int read_stats(struct bpf_link *link)
+{
+        int fd, ret = 0;
+        ssize_t bytes;
+
+        fd = bpf_iter_create(bpf_link__fd(link));
+        if (!ASSERT_OK_FD(fd, "bpf_iter_create"))
+                return 1;
+
+        /*
+         * Invoke iter program by reading from its fd. We're not expecting any
+         * data to be written by the bpf program so the result should be zero.
+         * Results will be read directly through the custom data section
+         * accessible through skel->data_query.memcg_query.
+         */
+        bytes = read(fd, NULL, 0);
+        if (!ASSERT_EQ(bytes, 0, "read fd"))
+                ret = 1;
+
+        close(fd);
+        return ret;
+}
+
+static void test_anon(struct bpf_link *link, struct memcg_query *memcg_query)
+{
+        void *map;
+        size_t len;
+
+        len = sysconf(_SC_PAGESIZE) * 1024;
+
+        /*
+         * Increase memcg anon usage by mapping and writing
+         * to a new anon region.
+         */
+        map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+        if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon"))
+                return;
+
+        memset(map, 1, len);
+
+        if (!ASSERT_OK(read_stats(link), "read stats"))
+                goto cleanup;
+
+        ASSERT_GT(memcg_query->nr_anon_mapped, 0, "final anon mapped val");
+
+cleanup:
+        munmap(map, len);
+}
+
+static void test_file(struct bpf_link *link, struct memcg_query *memcg_query)
+{
+        void *map;
+        size_t len;
+        char *path;
+        int fd;
+
+        len = sysconf(_SC_PAGESIZE) * 1024;
+        path = "/tmp/test_cgroup_iter_memcg";
+
+        /*
+         * Increase memcg file usage by creating and writing
+         * to a mapped file.
+         */
+        fd = open(path, O_CREAT | O_RDWR, 0644);
+        if (!ASSERT_OK_FD(fd, "open fd"))
+                return;
+        if (!ASSERT_OK(ftruncate(fd, len), "ftruncate"))
+                goto cleanup_fd;
+
+        map = mmap(NULL, len, PROT_WRITE, MAP_SHARED, fd, 0);
+        if (!ASSERT_NEQ(map, MAP_FAILED, "mmap file"))
+                goto cleanup_fd;
+
+        memset(map, 1, len);
+
+        if (!ASSERT_OK(read_stats(link), "read stats"))
+                goto cleanup_map;
+
+        ASSERT_GT(memcg_query->nr_file_pages, 0, "final file value");
+        ASSERT_GT(memcg_query->nr_file_mapped, 0, "final file mapped value");
+
+cleanup_map:
+        munmap(map, len);
+cleanup_fd:
+        close(fd);
+        unlink(path);
+}
+
+static void test_shmem(struct bpf_link *link, struct memcg_query *memcg_query)
+{
+        size_t len;
+        int fd;
+
+        len = sysconf(_SC_PAGESIZE) * 1024;
+
+        /*
+         * Increase memcg shmem usage by creating and writing
+         * to a shmem object.
+         */
+        fd = shm_open("/tmp_shmem", O_CREAT | O_RDWR, 0644);
+        if (!ASSERT_OK_FD(fd, "shm_open"))
+                return;
+
+        if (!ASSERT_OK(fallocate(fd, 0, 0, len), "fallocate"))
+                goto cleanup;
+
+        if (!ASSERT_OK(read_stats(link), "read stats"))
+                goto cleanup;
+
+        ASSERT_GT(memcg_query->nr_shmem, 0, "final shmem value");
+
+cleanup:
+        close(fd);
+        shm_unlink("/tmp_shmem");
+}
+
+#define NR_PIPES 64
+static void test_kmem(struct bpf_link *link, struct memcg_query *memcg_query)
+{
+        int fds[NR_PIPES][2], i;
+
+        /*
+         * Increase kmem value by creating pipes which will allocate some
+         * kernel buffers.
+         */
+        for (i = 0; i < NR_PIPES; i++) {
+                if (!ASSERT_OK(pipe(fds[i]), "pipe"))
+                        goto cleanup;
+        }
+
+        if (!ASSERT_OK(read_stats(link), "read stats"))
+                goto cleanup;
+
+        ASSERT_GT(memcg_query->memcg_kmem, 0, "kmem value");
+
+cleanup:
+        for (i = i - 1; i >= 0; i--) {
+                close(fds[i][0]);
+                close(fds[i][1]);
+        }
+}
+
+static void test_pgfault(struct bpf_link *link, struct memcg_query *memcg_query)
+{
+        void *map;
+        size_t len;
+
+        len = sysconf(_SC_PAGESIZE) * 1024;
+
+        /* Create region to use for triggering a page fault. */
+        map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+        if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon"))
+                return;
+
+        /* Trigger page fault. */
+        memset(map, 1, len);
+
+        if (!ASSERT_OK(read_stats(link), "read stats"))
+                goto cleanup;
+
+        ASSERT_GT(memcg_query->pgfault, 0, "final pgfault val");
+
+cleanup:
+        munmap(map, len);
+}
+
+void test_cgroup_iter_memcg(void)
+{
+        char *cgroup_rel_path = "/cgroup_iter_memcg_test";
+        struct cgroup_iter_memcg *skel;
+        struct bpf_link *link;
+        int cgroup_fd;
+
+        cgroup_fd = cgroup_setup_and_join(cgroup_rel_path);
+        if (!ASSERT_OK_FD(cgroup_fd, "cgroup_setup_and_join"))
+                return;
+
+        skel = cgroup_iter_memcg__open_and_load();
+        if (!ASSERT_OK_PTR(skel, "cgroup_iter_memcg__open_and_load"))
+                goto cleanup_cgroup_fd;
+
+        DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+        union bpf_iter_link_info linfo = {
+                .cgroup.cgroup_fd = cgroup_fd,
+                .cgroup.order = BPF_CGROUP_ITER_SELF_ONLY,
+        };
+        opts.link_info = &linfo;
+        opts.link_info_len = sizeof(linfo);
+
+        link = bpf_program__attach_iter(skel->progs.cgroup_memcg_query, &opts);
+        if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter"))
+                goto cleanup_skel;
+
+        if (test__start_subtest("cgroup_iter_memcg__anon"))
+                test_anon(link, &skel->data_query->memcg_query);
+        if (test__start_subtest("cgroup_iter_memcg__shmem"))
+                test_shmem(link, &skel->data_query->memcg_query);
+        if (test__start_subtest("cgroup_iter_memcg__file"))
+                test_file(link, &skel->data_query->memcg_query);
+        if (test__start_subtest("cgroup_iter_memcg__kmem"))
+                test_kmem(link, &skel->data_query->memcg_query);
+        if (test__start_subtest("cgroup_iter_memcg__pgfault"))
+                test_pgfault(link, &skel->data_query->memcg_query);
+
+        bpf_link__destroy(link);
+cleanup_skel:
+        cgroup_iter_memcg__destroy(skel);
+cleanup_cgroup_fd:
+        close(cgroup_fd);
+        cleanup_cgroup_environment();
+}
diff --git a/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c
new file mode 100644
index 000000000000..59fb70a3cc50
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "cgroup_iter_memcg.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* The latest values read are stored here. */
+struct memcg_query memcg_query SEC(".data.query");
+
+SEC("iter.s/cgroup")
+int cgroup_memcg_query(struct bpf_iter__cgroup *ctx)
+{
+        struct cgroup *cgrp = ctx->cgroup;
+        struct cgroup_subsys_state *css;
+        struct mem_cgroup *memcg;
+
+        if (!cgrp)
+                return 1;
+
+        css = &cgrp->self;
+        memcg = bpf_get_mem_cgroup(css);
+        if (!memcg)
+                return 1;
+
+        bpf_mem_cgroup_flush_stats(memcg);
+
+        memcg_query.nr_anon_mapped = bpf_mem_cgroup_page_state(memcg, NR_ANON_MAPPED);
+        memcg_query.nr_shmem = bpf_mem_cgroup_page_state(memcg, NR_SHMEM);
+        memcg_query.nr_file_pages = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES);
+        memcg_query.nr_file_mapped = bpf_mem_cgroup_page_state(memcg, NR_FILE_MAPPED);
+        memcg_query.memcg_kmem = bpf_mem_cgroup_page_state(memcg, MEMCG_KMEM);
+        memcg_query.pgfault = bpf_mem_cgroup_vm_events(memcg, PGFAULT);
+
+        bpf_put_mem_cgroup(memcg);
+
+        return 0;
+}
-- 
2.52.0

Let's create a separate MAINTAINERS entry for MM BPF extensions: these
patches often require attention from both the bpf and mm communities.
Signed-off-by: Roman Gushchin
---
 MAINTAINERS | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index e7027fba97db..70c2b73b3941 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4799,6 +4799,15 @@ L: bpf@vger.kernel.org
 S: Maintained
 F: tools/lib/bpf/
 
+BPF [MEMORY MANAGEMENT EXTENSIONS]
+M: Roman Gushchin
+M: JP Kobryn
+M: Shakeel Butt
+L: bpf@vger.kernel.org
+L: linux-mm@kvack.org
+S: Maintained
+F: mm/bpf_memcontrol.c
+
 BPF [MISC]
 L: bpf@vger.kernel.org
 S: Odd Fixes
-- 
2.52.0