On machines serving multiple workloads whose memory is isolated via the
memory cgroup controller, it is currently impossible to enforce a fair
distribution of toptier memory among the workloads: the only
enforceable limits govern total memory footprint, not where that memory
resides. This makes it difficult to guarantee a consistent performance
baseline, since each workload's performance is heavily impacted by
workload-external factors such as which other workloads are co-located
on the same host and the order in which the workloads are started.

Extend the existing memory.low protection to be tier-aware in charging,
enforcement, and protection calculation, providing a best-effort
attempt at protecting a fair proportion of toptier memory. Updates to
protection and charging are performed in the same paths as their
standard memcontrol equivalents. Enforcement of tier-aware memcg
limits, however, is gated behind the sysctl tier_aware_memcg, so that
enabling tier-aware limits at runtime can account for memory already
present in the system.

Signed-off-by: Joshua Hahn
---
 include/linux/memcontrol.h   | 15 +++++++++++----
 include/linux/page_counter.h |  7 ++++---
 kernel/cgroup/dmem.c         |  2 +-
 mm/memcontrol.c              | 14 ++++++++++++--
 mm/page_counter.c            | 35 ++++++++++++++++++++++++++++++++++-
 mm/vmscan.c                  | 13 +++++++++----
 6 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 900a36112b62..a998a1e3b8b0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -606,7 +606,9 @@ static inline void mem_cgroup_protection(struct mem_cgroup *root,
 }
 
 void mem_cgroup_calculate_protection(struct mem_cgroup *root,
-				     struct mem_cgroup *memcg);
+				     struct mem_cgroup *memcg, bool toptier);
+
+unsigned long mem_cgroup_toptier_usage(struct mem_cgroup *memcg);
 
 void update_memcg_toptier_capacity(void);
 
@@ -623,11 +625,15 @@ static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
 }
 
 static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
-					struct mem_cgroup *memcg)
+					struct mem_cgroup *memcg, bool toptier)
 {
 	if (mem_cgroup_unprotected(target, memcg))
 		return false;
 
+	if (toptier)
+		return READ_ONCE(memcg->memory.etoptier_low) >=
+			mem_cgroup_toptier_usage(memcg);
+
 	return READ_ONCE(memcg->memory.elow) >=
 		page_counter_read(&memcg->memory);
 }
@@ -1114,7 +1120,8 @@ static inline void mem_cgroup_protection(struct mem_cgroup *root,
 }
 
 static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
-						   struct mem_cgroup *memcg)
+						   struct mem_cgroup *memcg,
+						   bool toptier)
 {
 }
 
@@ -1128,7 +1135,7 @@ static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
 	return true;
 }
 static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
-					struct mem_cgroup *memcg)
+					struct mem_cgroup *memcg, bool toptier)
 {
 	return false;
 }
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index ada5f1dd75d4..6635ee7b9575 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -120,15 +120,16 @@ static inline void page_counter_reset_watermark(struct page_counter *counter)
 #if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
 void page_counter_calculate_protection(struct page_counter *root,
 				       struct page_counter *counter,
-				       bool recursive_protection);
+				       bool recursive_protection, bool toptier);
 void page_counter_update_toptier_capacity(struct page_counter *counter,
 					  const nodemask_t *allowed);
 unsigned long page_counter_toptier_high(struct page_counter *counter);
 unsigned long page_counter_toptier_low(struct page_counter *counter);
 #else
 static inline void page_counter_calculate_protection(struct page_counter *root,
-						     struct page_counter *counter,
-						     bool recursive_protection) {}
+						     struct page_counter *counter,
+						     bool recursive_protection,
+						     bool toptier) {}
 #endif
 
 #endif /* _LINUX_PAGE_COUNTER_H */
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index 1ea6afffa985..536d43c42de8 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -277,7 +277,7 @@ dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
 			continue;
 
 		page_counter_calculate_protection(
-			climit, &found_pool->cnt, true);
+			climit, &found_pool->cnt, true, false);
 
 		if (found_pool == test_pool)
 			break;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 07464f02c529..8aa7ae361a73 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4806,12 +4806,13 @@ struct cgroup_subsys memory_cgrp_subsys = {
  * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
  * @root: the top ancestor of the sub-tree being checked
  * @memcg: the memory cgroup to check
+ * @toptier: whether the caller is in a toptier node
  *
  * WARNING: This function is not stateless! It can only be used as part
  * of a top-down tree iteration, not for isolated queries.
  */
 void mem_cgroup_calculate_protection(struct mem_cgroup *root,
-				     struct mem_cgroup *memcg)
+				     struct mem_cgroup *memcg, bool toptier)
 {
 	bool recursive_protection =
 		cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT;
@@ -4822,7 +4823,16 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
 	if (!root)
 		root = root_mem_cgroup;
 
-	page_counter_calculate_protection(&root->memory, &memcg->memory, recursive_protection);
+	page_counter_calculate_protection(&root->memory, &memcg->memory,
+					  recursive_protection, toptier);
+}
+
+unsigned long mem_cgroup_toptier_usage(struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled() || !memcg)
+		return 0;
+
+	return atomic_long_read(&memcg->memory.toptier_usage);
 }
 
 void update_memcg_toptier_capacity(void)
diff --git a/mm/page_counter.c b/mm/page_counter.c
index cf21c72bfd4e..79d46a1c4c0c 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -410,12 +410,39 @@ static unsigned long effective_protection(unsigned long usage,
 	return ep;
 }
 
+static void calculate_protection_toptier(struct page_counter *counter,
+					 bool recursive_protection)
+{
+	struct page_counter *parent = counter->parent;
+	unsigned long toptier_low;
+	unsigned long toptier_usage, parent_toptier_usage;
+	unsigned long toptier_protected, old_toptier_protected;
+	long delta;
+
+	toptier_low = page_counter_toptier_low(counter);
+	toptier_usage = atomic_long_read(&counter->toptier_usage);
+	parent_toptier_usage = atomic_long_read(&parent->toptier_usage);
+
+	/* Propagate toptier low usage to parent for sibling distribution */
+	toptier_protected = min(toptier_usage, toptier_low);
+	old_toptier_protected = atomic_long_xchg(&counter->toptier_low_usage,
+						 toptier_protected);
+	delta = toptier_protected - old_toptier_protected;
+	atomic_long_add(delta, &parent->children_toptier_low_usage);
+
+	WRITE_ONCE(counter->etoptier_low,
+		   effective_protection(toptier_usage, parent_toptier_usage,
+			toptier_low, READ_ONCE(parent->etoptier_low),
+			atomic_long_read(&parent->children_toptier_low_usage),
+			recursive_protection));
+}
+
 /**
  * page_counter_calculate_protection - check if memory consumption is in the normal range
  * @root: the top ancestor of the sub-tree being checked
  * @counter: the page_counter the counter to update
  * @recursive_protection: Whether to use memory_recursiveprot behavior.
+ * @toptier: Whether to calculate toptier-proportional protection
  *
  * Calculates elow/emin thresholds for given page_counter.
  *
@@ -424,7 +451,7 @@ static unsigned long effective_protection(unsigned long usage,
  */
 void page_counter_calculate_protection(struct page_counter *root,
 				       struct page_counter *counter,
-				       bool recursive_protection)
+				       bool recursive_protection, bool toptier)
 {
 	unsigned long usage, parent_usage;
 	struct page_counter *parent = counter->parent;
@@ -446,6 +473,9 @@ void page_counter_calculate_protection(struct page_counter *root,
 	if (parent == root) {
 		counter->emin = READ_ONCE(counter->min);
 		counter->elow = READ_ONCE(counter->low);
+		if (toptier)
+			WRITE_ONCE(counter->etoptier_low,
+				   page_counter_toptier_low(counter));
 		return;
 	}
 
@@ -462,6 +492,9 @@ void page_counter_calculate_protection(struct page_counter *root,
 			READ_ONCE(parent->elow),
 			atomic_long_read(&parent->children_low_usage),
 			recursive_protection));
+
+	if (toptier)
+		calculate_protection_toptier(counter, recursive_protection);
 }
 
 void page_counter_update_toptier_capacity(struct page_counter *counter,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6a87ac7be43c..5b4cb030a477 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4144,6 +4144,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 	struct mem_cgroup *memcg;
 	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
 	bool reclaimable = !min_ttl;
+	bool toptier = node_is_toptier(pgdat->node_id);
 
 	VM_WARN_ON_ONCE(!current_is_kswapd());
 
@@ -4153,7 +4154,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
-		mem_cgroup_calculate_protection(NULL, memcg);
+		mem_cgroup_calculate_protection(NULL, memcg, toptier);
 
 		if (!reclaimable)
 			reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
@@ -4905,12 +4906,14 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 	unsigned long reclaimed = sc->nr_reclaimed;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+	bool toptier = tier_aware_memcg_limits &&
+		       node_is_toptier(pgdat->node_id);
 
 	/* lru_gen_age_node() called mem_cgroup_calculate_protection() */
 	if (mem_cgroup_below_min(NULL, memcg))
 		return MEMCG_LRU_YOUNG;
 
-	if (mem_cgroup_below_low(NULL, memcg)) {
+	if (mem_cgroup_below_low(NULL, memcg, toptier)) {
 		/* see the comment on MEMCG_NR_GENS */
 		if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
 			return MEMCG_LRU_TAIL;
@@ -5960,6 +5963,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 	};
 	struct mem_cgroup_reclaim_cookie *partial = &reclaim;
 	struct mem_cgroup *memcg;
+	bool toptier = node_is_toptier(pgdat->node_id);
 
 	/*
 	 * In most cases, direct reclaimers can do partial walks
@@ -5987,7 +5991,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 		 */
 		cond_resched();
 
-		mem_cgroup_calculate_protection(target_memcg, memcg);
+		mem_cgroup_calculate_protection(target_memcg, memcg, toptier);
 
 		if (mem_cgroup_below_min(target_memcg, memcg)) {
 			/*
@@ -5995,7 +5999,8 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 			 * If there is no reclaimable memory, OOM.
 			 */
 			continue;
-		} else if (mem_cgroup_below_low(target_memcg, memcg)) {
+		} else if (mem_cgroup_below_low(target_memcg, memcg,
+						tier_aware_memcg_limits && toptier)) {
 			/*
 			 * Soft protection.
 			 * Respect the protection only as long as
-- 
2.47.3
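
(Not part of the patch; a usage sketch for reviewers.) Assuming the
sysctl named above ends up under /proc/sys/vm/ -- its exact location is
not shown in this patch -- runtime setup could look roughly like the
following. The toptier low value itself is derived from memory.low by
page_counter_toptier_low(), presumably in proportion to the memcg's
share of toptier capacity as set up earlier in the series.

  # Opt in to tier-aware enforcement; until this is set, the new
  # checks fall back to the regular memory.low semantics.
  echo 1 > /proc/sys/vm/tier_aware_memcg

  # Two co-located jobs, each with 4G of best-effort protection
  # (jobA/jobB are example names). With the sysctl enabled, memory.low
  # also yields an effective toptier protection (etoptier_low), so
  # reclaim running on a toptier node soft-skips a memcg whose toptier
  # usage is still within that effective value.
  mkdir /sys/fs/cgroup/jobA /sys/fs/cgroup/jobB
  echo 4G > /sys/fs/cgroup/jobA/memory.low
  echo 4G > /sys/fs/cgroup/jobB/memory.low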