Commit 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim")
introduces the cpuset.mems_effective check and applies it in
can_demote(). However, it does not apply this check in
demote_folio_list(), which leads to situations where pages are demoted
to nodes that are explicitly excluded from the task's cpuset.mems.

To make demotion targets respect cpuset.mems_effective in
demote_folio_list(), implement a new function get_demotion_targets(),
which returns a preferred demotion target together with all allowed
(fallback) nodes checked against mems_effective, and update
demote_folio_list() and can_demote() to use it.

Furthermore, update some supporting functions:

- Add a parameter to next_demotion_node() to return a copy of
  node_demotion[]->preferred, allowing get_demotion_targets() to select
  the next-best node for demotion.

- Change the parameters of cpuset_node_allowed() and
  mem_cgroup_node_allowed() from nid to nodemask * to allow direct
  logical-AND operations against mems_effective.

Signed-off-by: Bing Jiao
---
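
A note for reviewers: the selection logic of get_demotion_targets() can
be modeled with a small userspace sketch. Everything below is
illustrative and hypothetical, not kernel code -- the two-hop topology
(node 0 prefers node 1, node 1 prefers node 3) is made up, plain
unsigned long bitmasks stand in for nodemask_t, and first_node() stands
in for node_random(), so the same-tier fallback picks the lowest
allowed node rather than a random one:

/*
 * Illustrative only: a userspace model of the selection loop in
 * get_demotion_targets().
 */
#include <stdio.h>

#define NO_NODE -1

/* Hypothetical demotion graph: preferred-target mask per node */
static unsigned long preferred_of(int node)
{
        switch (node) {
        case 0: return 1UL << 1;        /* node 0 prefers node 1 */
        case 1: return 1UL << 3;        /* node 1 prefers node 3 */
        default: return 0;              /* terminal node */
        }
}

static int first_node(unsigned long mask)
{
        return mask ? __builtin_ctzl(mask) : NO_NODE;
}

static int pick_target(int node, unsigned long allowed)
{
        unsigned long pref = preferred_of(node);
        int preferred_node = first_node(pref);

        while (preferred_node != NO_NODE) {
                /* preferred node itself allowed? */
                if (allowed & (1UL << preferred_node))
                        return preferred_node;
                /* same-tier fallback: any allowed preferred node */
                if (pref & allowed)
                        return first_node(pref & allowed);
                /* hop to the next tier of preferred nodes */
                pref = preferred_of(preferred_node);
                preferred_node = first_node(pref);
        }
        return NO_NODE;
}

int main(void)
{
        /* mems_effective masks out node 1: allowed = {2,3} */
        unsigned long allowed = (1UL << 2) | (1UL << 3);

        printf("demote node 0 -> node %d\n", pick_target(0, allowed));
        return 0;
}

With node 1 masked out by mems_effective, the preferred node is
rejected and the same-tier fallback mask is empty, so the loop hops to
node 1's own preferred target, node 3, which is allowed; this prints
"demote node 0 -> node 3".
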
 include/linux/cpuset.h       |  5 +--
 include/linux/memcontrol.h   |  6 +--
 include/linux/memory-tiers.h |  6 +--
 kernel/cgroup/cpuset.c       | 14 +++----
 mm/memcontrol.c              |  5 ++-
 mm/memory-tiers.c            |  8 +++-
 mm/vmscan.c                  | 77 +++++++++++++++++++++++++++++-------
 7 files changed, 87 insertions(+), 34 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a98d3330385c..27a0b6e9fb9d 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -174,7 +174,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
         task_unlock(current);
 }
 
-extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
+extern void cpuset_node_allowed(struct cgroup *cgroup, nodemask_t *nodes);
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
@@ -301,9 +301,8 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
         return false;
 }
 
-static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+static inline void cpuset_node_allowed(struct cgroup *cgroup, nodemask_t *nodes)
 {
-        return true;
 }
 
 #endif /* !CONFIG_CPUSETS */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fd400082313a..a87f008b6600 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1740,7 +1740,7 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
         rcu_read_unlock();
 }
 
-bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
+void mem_cgroup_node_allowed(struct mem_cgroup *memcg, nodemask_t *nodes);
 
 void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg);
 
@@ -1811,9 +1811,9 @@ static inline ino_t page_cgroup_ino(struct page *page)
         return 0;
 }
 
-static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+static inline void mem_cgroup_node_allowed(struct mem_cgroup *memcg,
+                                           nodemask_t *nodes)
 {
-        return true;
 }
 
 static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 7a805796fcfd..2706ebfa94b5 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -53,11 +53,11 @@ struct memory_dev_type *mt_find_alloc_memory_type(int adist,
                                                   struct list_head *memory_types);
 void mt_put_memory_types(struct list_head *memory_types);
 #ifdef CONFIG_MIGRATION
-int next_demotion_node(int node);
+int next_demotion_node(int node, nodemask_t *mask);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
 bool node_is_toptier(int node);
 #else
-static inline int next_demotion_node(int node)
+static inline int next_demotion_node(int node, nodemask_t *mask)
 {
         return NUMA_NO_NODE;
 }
@@ -101,7 +101,7 @@ static inline void clear_node_memory_type(int node, struct memory_dev_type *memt
 }
 
-static inline int next_demotion_node(int node)
+static inline int next_demotion_node(int node, nodemask_t *mask)
 {
         return NUMA_NO_NODE;
 }
 
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6e6eb09b8db6..2d78cfde5911 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4416,11 +4416,10 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
         return allowed;
 }
 
-bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+void cpuset_node_allowed(struct cgroup *cgroup, nodemask_t *nodes)
 {
         struct cgroup_subsys_state *css;
         struct cpuset *cs;
-        bool allowed;
 
         /*
          * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy
@@ -4428,16 +4427,16 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
          * so return true to avoid taking a global lock on the empty check.
          */
         if (!cpuset_v2())
-                return true;
+                return;
 
         css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
         if (!css)
-                return true;
+                return;
 
         /*
          * Normally, accessing effective_mems would require the cpuset_mutex
-         * or callback_lock - but node_isset is atomic and the reference
-         * taken via cgroup_get_e_css is sufficient to protect css.
+         * or callback_lock - but the reference taken via cgroup_get_e_css
+         * is sufficient to protect css.
          *
          * Since this interface is intended for use by migration paths, we
          * relax locking here to avoid taking global locks - while accepting
@@ -4447,9 +4446,8 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
          * cannot make strong isolation guarantees, so this is acceptable.
          */
         cs = container_of(css, struct cpuset, css);
-        allowed = node_isset(nid, cs->effective_mems);
+        nodes_and(*nodes, *nodes, cs->effective_mems);
         css_put(css);
-        return allowed;
 }
 
 /**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 75fc22a33b28..a62c75b136ef 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5597,9 +5597,10 @@ subsys_initcall(mem_cgroup_swap_init);
 
 #endif /* CONFIG_SWAP */
 
-bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+void mem_cgroup_node_allowed(struct mem_cgroup *memcg, nodemask_t *nodes)
 {
-        return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+        if (memcg)
+                cpuset_node_allowed(memcg->css.cgroup, nodes);
 }
 
 void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 20aab9c19c5e..ed0ee9c3ae70 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -320,13 +320,14 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
 /**
  * next_demotion_node() - Get the next node in the demotion path
  * @node: The starting node to lookup the next node
+ * @mask: If not NULL, receives a copy of the preferred demotion nodemask
  *
  * Return: node id for next memory node in the demotion path hierarchy
  * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
  * @node online or guarantee that it *continues* to be the next demotion
  * target.
  */
-int next_demotion_node(int node)
+int next_demotion_node(int node, nodemask_t *mask)
 {
         struct demotion_nodes *nd;
         int target;
@@ -355,7 +356,12 @@ int next_demotion_node(int node)
          * last target node. Or introducing per-cpu data to avoid
          * caching issue, which seems more complicated. So selecting
          * target node randomly seems better until now.
+         *
+         * Copy the preferred nodes as the fallback in case the returned
+         * one does not satisfy some constraints like cpuset.
          */
+        if (mask)
+                nodes_copy(*mask, nd->preferred);
         target = node_random(&nd->preferred);
         rcu_read_unlock();
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8bdb1629b6eb..2ddbf5584af8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -341,22 +341,71 @@ static void flush_reclaim_state(struct scan_control *sc)
         }
 }
 
+/*
+ * Returns a preferred demotion node and all allowed demotion @targets.
+ * Returns NUMA_NO_NODE, leaving @targets meaningless, if none are allowed.
+ */
+static int get_demotion_targets(nodemask_t *targets, struct pglist_data *pgdat,
+                                struct mem_cgroup *memcg)
+{
+        nodemask_t allowed_mask;
+        nodemask_t preferred_mask;
+        int preferred_node;
+
+        if (!pgdat)
+                return NUMA_NO_NODE;
+
+        preferred_node = next_demotion_node(pgdat->node_id, &preferred_mask);
+        if (preferred_node == NUMA_NO_NODE)
+                return NUMA_NO_NODE;
+
+        node_get_allowed_targets(pgdat, &allowed_mask);
+        mem_cgroup_node_allowed(memcg, &allowed_mask);
+        if (nodes_empty(allowed_mask))
+                return NUMA_NO_NODE;
+
+        if (targets)
+                nodes_copy(*targets, allowed_mask);
+
+        do {
+                if (node_isset(preferred_node, allowed_mask))
+                        return preferred_node;
+
+                nodes_and(preferred_mask, preferred_mask, allowed_mask);
+                if (!nodes_empty(preferred_mask))
+                        return node_random(&preferred_mask);
+
+                /*
+                 * Hop to the next tier of preferred nodes. Even if
+                 * preferred_node is not set in allowed_mask, it can still
+                 * be used to query the next-best demotion nodes.
+                 */
+                preferred_node = next_demotion_node(preferred_node,
+                                                    &preferred_mask);
+        } while (preferred_node != NUMA_NO_NODE);
+
+        /*
+         * We should not reach here, as a non-empty allowed_mask ensures
+         * there must be a target node for demotion.
+         * Otherwise, it suggests something wrong in node_demotion[]->preferred,
+         * where same-tier nodes have different preferred targets.
+         * E.g., node 0 identifies both nodes 2 and 3 as preferred targets,
+         * but nodes 2 and 3 themselves have different preferred nodes.
+         */
+        WARN_ON_ONCE(1);
+        return node_random(&allowed_mask);
+}
+
 static bool can_demote(int nid, struct scan_control *sc,
                        struct mem_cgroup *memcg)
 {
-        int demotion_nid;
-
         if (!numa_demotion_enabled)
                 return false;
         if (sc && sc->no_demotion)
                 return false;
 
-        demotion_nid = next_demotion_node(nid);
-        if (demotion_nid == NUMA_NO_NODE)
-                return false;
-
-        /* If demotion node isn't in the cgroup's mems_allowed, fall back */
-        return mem_cgroup_node_allowed(memcg, demotion_nid);
+        return get_demotion_targets(NULL, NODE_DATA(nid), memcg) !=
+               NUMA_NO_NODE;
 }
 
 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -1019,9 +1068,10 @@ static struct folio *alloc_demote_folio(struct folio *src,
  * Folios which are not demoted are left on @demote_folios.
  */
 static unsigned int demote_folio_list(struct list_head *demote_folios,
-                                      struct pglist_data *pgdat)
+                                      struct pglist_data *pgdat,
+                                      struct mem_cgroup *memcg)
 {
-        int target_nid = next_demotion_node(pgdat->node_id);
+        int target_nid;
         unsigned int nr_succeeded;
         nodemask_t allowed_mask;
 
@@ -1033,7 +1083,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
                  */
                 .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
                         __GFP_NOMEMALLOC | GFP_NOWAIT,
-                .nid = target_nid,
                 .nmask = &allowed_mask,
                 .reason = MR_DEMOTION,
         };
@@ -1041,10 +1090,10 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
         if (list_empty(demote_folios))
                 return 0;
 
+        target_nid = get_demotion_targets(&allowed_mask, pgdat, memcg);
         if (target_nid == NUMA_NO_NODE)
                 return 0;
-
-        node_get_allowed_targets(pgdat, &allowed_mask);
+        mtc.nid = target_nid;
 
         /* Demotion ignores all cpuset and mempolicy settings */
         migrate_pages(demote_folios, alloc_demote_folio, NULL,
@@ -1566,7 +1615,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
         /* 'folio_list' is always empty here */
 
         /* Migrate folios selected for demotion */
-        nr_demoted = demote_folio_list(&demote_folios, pgdat);
+        nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg);
         nr_reclaimed += nr_demoted;
         stat->nr_demoted += nr_demoted;
         /* Folios that could not be demoted are still in @demote_folios */
-- 
2.52.0.351.gbe84eed79e-goog