The preferred demotion node (migration_target_control.nid) should be the one closest to the source node to minimize migration latency. Currently, a discrepancy exists where demote_folio_list() randomly selects an allowed node if the preferred node from next_demotion_node() is not set in mems_effective. To address it, update next_demotion_node() to select a preferred target against allowed nodes; and to return the closest demotion target if all preferred nodes are not in mems_effective via next_demotion_node(). It ensures that the preferred demotion target is consistently the closest available node to the source node. Signed-off-by: Bing Jiao --- v7 -> v8: Fix bugs in v7. Remove the while loop of getting the preferred node via next_demotion_node(). Use find_next_best_node() to find the closest demotion target. v8 -> v9: Move allowed node checks and identification of the closest demotion target into next_demotion_node() for better function splitting. include/linux/memory-tiers.h | 6 +++--- mm/memory-tiers.c | 21 ++++++++++++++++----- mm/vmscan.c | 5 ++--- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 7a805796fcfd..96987d9d95a8 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -53,11 +53,11 @@ struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types); void mt_put_memory_types(struct list_head *memory_types); #ifdef CONFIG_MIGRATION -int next_demotion_node(int node); +int next_demotion_node(int node, const nodemask_t *allowed_mask); void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); bool node_is_toptier(int node); #else -static inline int next_demotion_node(int node) +static inline int next_demotion_node(int node, const nodemask_t *allowed_mask) { return NUMA_NO_NODE; } @@ -101,7 +101,7 @@ static inline void clear_node_memory_type(int node, struct memory_dev_type *memt } -static inline int next_demotion_node(int node) +static inline int next_demotion_node(int node, const nodemask_t *allowed_mask) { return NUMA_NO_NODE; } diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 864811fff409..2d6c3754e6a8 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -320,16 +320,17 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) /** * next_demotion_node() - Get the next node in the demotion path * @node: The starting node to lookup the next node + * @allowed_mask: The pointer to allowed node mask * * Return: node id for next memory node in the demotion path hierarchy * from @node; NUMA_NO_NODE if @node is terminal. This does not keep * @node online or guarantee that it *continues* to be the next demotion * target. */ -int next_demotion_node(int node) +int next_demotion_node(int node, const nodemask_t *allowed_mask) { struct demotion_nodes *nd; - int target; + nodemask_t mask; if (!node_demotion) return NUMA_NO_NODE; @@ -344,6 +345,10 @@ int next_demotion_node(int node) * node_demotion[] reads need to be consistent. */ rcu_read_lock(); + /* Filter out nodes that are not in allowed_mask. */ + nodes_and(mask, nd->preferred, *allowed_mask); + rcu_read_unlock(); + /* * If there are multiple target nodes, just select one * target node randomly. @@ -356,10 +361,16 @@ int next_demotion_node(int node) * caching issue, which seems more complicated. So selecting * target node randomly seems better until now. */ - target = node_random(&nd->preferred); - rcu_read_unlock(); + if (!nodes_empty(mask)) + return node_random(&mask); - return target; + /* + * Preferred nodes are not in allowed_mask. Filp bits in + * allowed_mask as used node mask. Then, use it to get the + * closest demotion target. + */ + nodes_complement(mask, *allowed_mask); + return find_next_best_node(node, &mask); } static void disable_all_demotion_targets(void) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5ea1dd2b8cce..7a631de46064 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1048,12 +1048,11 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, if (nodes_empty(allowed_mask)) return 0; - target_nid = next_demotion_node(pgdat->node_id); + target_nid = next_demotion_node(pgdat->node_id, &allowed_mask); if (target_nid == NUMA_NO_NODE) /* No lower-tier nodes or nodes were hot-unplugged. */ return 0; - if (!node_isset(target_nid, allowed_mask)) - target_nid = node_random(&allowed_mask); + mtc.nid = target_nid; /* Demotion ignores all cpuset and mempolicy settings */ -- 2.52.0.457.g6b5491de43-goog