From: Chen Ridong

The memcg LRU was originally introduced for global reclaim to enhance
scalability. However, its implementation complexity has led to
performance regressions when dealing with a large number of memory
cgroups [1].

As suggested by Johannes [1], this patch adopts mem_cgroup_iter() with
cookie-based iteration for global reclaim, aligning with the approach
already used in shrink_node_memcgs(). This simplification removes the
dedicated memcg LRU tracking while maintaining the core functionality.

A stress test based on Yu Zhao's methodology [2] was performed on a
1 TB, 4-node NUMA system. The results are summarized below:

                                     memcg LRU    memcg iter
  stddev(pgsteal) / mean(pgsteal)        91.2%         75.7%
  sum(pgsteal) / sum(requested)         216.4%        230.5%

The new implementation significantly improves fairness, reducing the
standard deviation relative to the mean by 15.5 percentage points,
while overscan increases slightly (from 85086871 to 90633890 pages,
about 6.5%).

The primary benefits of this change are:

1. Simplified codebase by removing the custom memcg LRU infrastructure
2. Improved fairness of memory reclaim across multiple cgroups
3. Better performance when creating many memory cgroups

[1] https://lore.kernel.org/r/20251126171513.GC135004@cmpxchg.org
[2] https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com

Signed-off-by: Chen Ridong
---
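For reviewers less familiar with the iterator, below is a minimal sketch
(illustrative only, not part of the patch) of the cookie-based walk that
shrink_many() adopts, mirroring the pattern already used by
shrink_node_memcgs(). The helper name walk_example() is made up, and the
protection checks and the kswapd/full-walk case are omitted for brevity;
the iterator APIs are the existing mem_cgroup_iter() /
mem_cgroup_iter_break() ones:

static void walk_example(struct pglist_data *pgdat, struct scan_control *sc)
{
	struct mem_cgroup *target = sc->target_mem_cgroup;
	/*
	 * The cookie records per-(root memcg, node) progress, so successive
	 * reclaimers resume where the previous walk stopped instead of all
	 * starting from the first cgroup in the tree.
	 */
	struct mem_cgroup_reclaim_cookie reclaim = { .pgdat = pgdat };
	struct mem_cgroup *memcg;

	memcg = mem_cgroup_iter(target, NULL, &reclaim);
	while (memcg) {
		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

		shrink_one(lruvec, sc);

		if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
			/* Drop the reference the iterator holds on memcg. */
			mem_cgroup_iter_break(target, memcg);
			break;
		}
		memcg = mem_cgroup_iter(target, memcg, &reclaim);
	}
}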
 mm/vmscan.c | 117 ++++++++++++++++------------------------------------
 1 file changed, 36 insertions(+), 81 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index fddd168a9737..70b0e7e5393c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4895,27 +4895,14 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	return nr_to_scan < 0;
 }
 
-static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 {
-	bool success;
 	unsigned long scanned = sc->nr_scanned;
 	unsigned long reclaimed = sc->nr_reclaimed;
-	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 
-	/* lru_gen_age_node() called mem_cgroup_calculate_protection() */
-	if (mem_cgroup_below_min(NULL, memcg))
-		return MEMCG_LRU_YOUNG;
-
-	if (mem_cgroup_below_low(NULL, memcg)) {
-		/* see the comment on MEMCG_NR_GENS */
-		if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
-			return MEMCG_LRU_TAIL;
-
-		memcg_memory_event(memcg, MEMCG_LOW);
-	}
-
-	success = try_to_shrink_lruvec(lruvec, sc);
+	try_to_shrink_lruvec(lruvec, sc);
 
 	shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
 
@@ -4924,86 +4911,55 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 		   sc->nr_reclaimed - reclaimed);
 
 	flush_reclaim_state(sc);
-
-	if (success && mem_cgroup_online(memcg))
-		return MEMCG_LRU_YOUNG;
-
-	if (!success && lruvec_is_sizable(lruvec, sc))
-		return 0;
-
-	/* one retry if offlined or too small */
-	return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
-	       MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
 }
 
 static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
 {
-	int op;
-	int gen;
-	int bin;
-	int first_bin;
-	struct lruvec *lruvec;
-	struct lru_gen_folio *lrugen;
+	struct mem_cgroup *target = sc->target_mem_cgroup;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.pgdat = pgdat,
+	};
+	struct mem_cgroup_reclaim_cookie *cookie = &reclaim;
 	struct mem_cgroup *memcg;
-	struct hlist_nulls_node *pos;
 
-	gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
-	bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
-restart:
-	op = 0;
-	memcg = NULL;
-
-	rcu_read_lock();
+	if (current_is_kswapd() || sc->memcg_full_walk)
+		cookie = NULL;
 
-	hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
-		if (op) {
-			lru_gen_rotate_memcg(lruvec, op);
-			op = 0;
-		}
+	memcg = mem_cgroup_iter(target, NULL, cookie);
+	while (memcg) {
+		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
-		mem_cgroup_put(memcg);
-		memcg = NULL;
+		cond_resched();
 
-		if (gen != READ_ONCE(lrugen->gen))
-			continue;
+		mem_cgroup_calculate_protection(target, memcg);
 
-		lruvec = container_of(lrugen, struct lruvec, lrugen);
-		memcg = lruvec_memcg(lruvec);
+		if (mem_cgroup_below_min(target, memcg))
+			goto next;
 
-		if (!mem_cgroup_tryget(memcg)) {
-			lru_gen_release_memcg(memcg);
-			memcg = NULL;
-			continue;
+		if (mem_cgroup_below_low(target, memcg)) {
+			if (!sc->memcg_low_reclaim) {
+				sc->memcg_low_skipped = 1;
+				goto next;
+			}
+			memcg_memory_event(memcg, MEMCG_LOW);
 		}
 
-		rcu_read_unlock();
+		shrink_one(lruvec, sc);
 
-		op = shrink_one(lruvec, sc);
-
-		rcu_read_lock();
-
-		if (should_abort_scan(lruvec, sc))
+		if (should_abort_scan(lruvec, sc)) {
+			if (cookie)
+				mem_cgroup_iter_break(target, memcg);
 			break;
-	}
-
-	rcu_read_unlock();
-
-	if (op)
-		lru_gen_rotate_memcg(lruvec, op);
-
-	mem_cgroup_put(memcg);
-
-	if (!is_a_nulls(pos))
-		return;
+		}
 
-	/* restart if raced with lru_gen_rotate_memcg() */
-	if (gen != get_nulls_value(pos))
-		goto restart;
+next:
+		if (cookie && sc->nr_reclaimed >= sc->nr_to_reclaim) {
+			mem_cgroup_iter_break(target, memcg);
+			break;
+		}
 
-	/* try the rest of the bins of the current generation */
-	bin = get_memcg_bin(bin + 1);
-	if (bin != first_bin)
-		goto restart;
+		memcg = mem_cgroup_iter(target, memcg, cookie);
+	}
 }
 
 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -5019,8 +4975,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 
 	set_mm_walk(NULL, sc->proactive);
 
-	if (try_to_shrink_lruvec(lruvec, sc))
-		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
+	try_to_shrink_lruvec(lruvec, sc);
 
 	clear_mm_walk();
 
-- 
2.34.1