When arena allocations were converted from bpf_map_alloc_pages() to
kmalloc_nolock() to support non-sleepable contexts, memcg accounting was
inadvertently lost.

Restore proper memory accounting for all arena-related allocations:
pass __GFP_ACCOUNT to kmalloc_nolock() and wrap the allocation paths
with bpf_map_memcg_enter()/bpf_map_memcg_exit(), so that every
arena-related allocation is accounted to the memcg of the process that
created the bpf_arena.

Signed-off-by: Puranjay Mohan
---
 kernel/bpf/arena.c      | 39 ++++++++++++++++++++++++++++++++++-----
 kernel/bpf/range_tree.c |  5 +++--
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 456ac989269d..cb9451208b0e 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -360,6 +360,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 {
 	struct bpf_map *map = vmf->vma->vm_file->private_data;
 	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+	struct mem_cgroup *memcg, *old_memcg;
 	struct page *page;
 	long kbase, kaddr;
 	unsigned long flags;
@@ -377,6 +378,8 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		/* already have a page vmap-ed */
 		goto out;
 
+	old_memcg = bpf_map_memcg_enter(map, &memcg);
+
 	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
 		/* User space requested to segfault when page is not allocated by bpf prog */
 		goto out_unlock_sigsegv;
@@ -400,12 +403,14 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		goto out_unlock_sigsegv;
 	}
 	flush_vmap_cache(kaddr, PAGE_SIZE);
+	bpf_map_memcg_exit(old_memcg, memcg);
 out:
 	page_ref_add(page, 1);
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 	vmf->page = page;
 	return 0;
 out_unlock_sigsegv:
+	bpf_map_memcg_exit(old_memcg, memcg);
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 	return VM_FAULT_SIGSEGV;
 }
@@ -557,7 +562,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	/* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed.
 	 */
 	alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *));
-	pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), 0, NUMA_NO_NODE);
+	pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE);
 	if (!pages)
 		return 0;
 	data.pages = pages;
@@ -713,7 +718,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
 		return;
 
 defer:
-	s = kmalloc_nolock(sizeof(struct arena_free_span), 0, -1);
+	s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ACCOUNT, -1);
 	if (!s)
 		/*
 		 * If allocation fails in non-sleepable context, pages are intentionally left
@@ -834,49 +839,69 @@ __bpf_kfunc_start_defs();
 __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
 					int node_id, u64 flags)
 {
+	void *ret;
 	struct bpf_map *map = p__map;
+	struct mem_cgroup *memcg, *old_memcg;
 	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
 
 	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
 		return NULL;
 
-	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+	old_memcg = bpf_map_memcg_enter(map, &memcg);
+	ret = (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+	bpf_map_memcg_exit(old_memcg, memcg);
+
+	return ret;
 }
 
 void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
 					  int node_id, u64 flags)
 {
+	void *ret;
 	struct bpf_map *map = p__map;
+	struct mem_cgroup *memcg, *old_memcg;
 	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
 
 	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
 		return NULL;
 
-	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
+	old_memcg = bpf_map_memcg_enter(map, &memcg);
+	ret = (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
+	bpf_map_memcg_exit(old_memcg, memcg);
+
+	return ret;
 }
 
 __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
 {
 	struct bpf_map *map = p__map;
+	struct mem_cgroup *memcg, *old_memcg;
 	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
 
 	if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
 		return;
 
+	old_memcg = bpf_map_memcg_enter(map, &memcg);
 	arena_free_pages(arena, (long)ptr__ign, page_cnt, true);
+	bpf_map_memcg_exit(old_memcg, memcg);
 }
 
 void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
 {
 	struct bpf_map *map = p__map;
+	struct mem_cgroup *memcg, *old_memcg;
 	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
 
 	if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
 		return;
 
+	old_memcg = bpf_map_memcg_enter(map, &memcg);
 	arena_free_pages(arena, (long)ptr__ign, page_cnt, false);
+	bpf_map_memcg_exit(old_memcg, memcg);
 }
 
 __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt)
 {
+	int ret;
 	struct bpf_map *map = p__map;
+	struct mem_cgroup *memcg, *old_memcg;
 	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
 
 	if (map->map_type != BPF_MAP_TYPE_ARENA)
@@ -885,7 +910,11 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c
 	if (!page_cnt)
 		return 0;
 
-	return arena_reserve_pages(arena, (long)ptr__ign, page_cnt);
+	old_memcg = bpf_map_memcg_enter(map, &memcg);
+	ret = arena_reserve_pages(arena, (long)ptr__ign, page_cnt);
+	bpf_map_memcg_exit(old_memcg, memcg);
+
+	return ret;
 }
 
 __bpf_kfunc_end_defs();
diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c
index 99c63d982c5d..2f28886f3ff7 100644
--- a/kernel/bpf/range_tree.c
+++ b/kernel/bpf/range_tree.c
@@ -149,7 +149,8 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
 		range_it_insert(rn, rt);
 
 		/* Add a range */
-		new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
+		new_rn = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT,
+					NUMA_NO_NODE);
 		if (!new_rn)
 			return -ENOMEM;
 		new_rn->rn_start = last + 1;
@@ -234,7 +235,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len)
 		right->rn_start = start;
 		range_it_insert(right, rt);
 	} else {
-		left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
+		left = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT, NUMA_NO_NODE);
 		if (!left)
 			return -ENOMEM;
 		left->rn_start = start;
-- 
2.47.3