Add NUMA node filtering functionality to page_owner to allow filtering
pages by specific NUMA node(s). This is useful for NUMA-aware memory
allocation analysis and debugging.

The filter supports flexible input formats:
- Single node: nid=0
- Multiple nodes: nid=0,2,3
- Node range: nid=0-3
- Mixed format: nid=0,2-4,7

Example usage:
  # Using the page_owner_filter tool (recommended)
  ./page_owner_filter -n 0-3
  ./page_owner_filter -m stack_handle -n 0,2-4,7

The implementation uses per-file-descriptor filter state stored in
file->private_data, allowing each opener to have independent filter
configuration. It uses nodemask_t for efficient multi-node filtering and
nodelist_parse() for flexible input parsing. Node validity is verified
using nodes_subset() to reject nodes without memory.

Signed-off-by: Zhen Ni
---
Changes in v9:
- Add spinlock protection for NUMA filter state access
- Use memdesc_nid() instead of page_to_nid() to bypass PF_POISONED_CHECK()

Changes in v8:
- Add cond_resched() in page iteration loop to prevent RCU stalls
- Reject empty nid list to avoid enabling an empty filter
- Improve comment: "Commit all filter changes"

Changes in v7:
- per-file-descriptor implementation

Changes in v6:
- Add node validity check using nodes_subset to reject invalid node numbers that don't exist in the system
- Move bool filter_by_nid declaration to top of block
- Use kmalloc_objs instead of kmalloc
- Remove 100 bytes overhead

Changes in v5:
- Optimize nodes_empty() check in page iteration loop
- Add __data_racy qualifier to nid_mask field

Changes in v4:
- Remove "-1" support, use empty string to clear filter
- Use strncpy_from_user() instead of copy_from_user()
- Add concurrency safety documentation for nid_mask access
- Rename fops to page_owner_nid_filter_fops for consistency

Changes in v3:
- Remove READ_ONCE/WRITE_ONCE for nodemask_t (fixes compilation errors)
  * nodemask_t is a large structure (128 bytes) that triggers compile-time asserts
  * Direct assignment is safe for
this use case - Add comment explaining input length calculation formula * 6 bytes = ",NNNNN" (comma + 5-digit node number) - Simplify "-1" check using kstrtoint() instead of dual strcmp() - Move nodemask_t mask read outside PFN iteration loop for performance * Avoids 128-byte structure copy on each iteration Changes in v2: - Use nodemask_t instead of int to support multiple nodes - Implement nodelist_parse() to support flexible input formats * Single node: "0", "2" * Multiple nodes: "0,2,3" * Ranges: "0-3" * Mixed: "0,2-4,7" - Use %*pbl format for output (e.g., "0-2", "0,2-4,7") - Use dynamic memory allocation (kmalloc) to handle variable-length input - Follow cpuset's max_write_len pattern: (100 + 6 * MAX_NUMNODES) v8: https://lore.kernel.org/linux-mm/20260520075641.1931080-3-zhen.ni@easystack.cn/ v7: https://lore.kernel.org/linux-mm/20260515091942.1535677-3-zhen.ni@easystack.cn/ v6: https://lore.kernel.org/linux-mm/20260511033017.747781-3-zhen.ni@easystack.cn/ v5: https://lore.kernel.org/linux-mm/20260507064643.179187-3-zhen.ni@easystack.cn/ v4: https://lore.kernel.org/linux-mm/20260430163247.13628-3-zhen.ni@easystack.cn/ v3: https://lore.kernel.org/linux-mm/20260428071112.1420380-4-zhen.ni@easystack.cn/ v2: https://lore.kernel.org/linux-mm/20260419155540.376847-4-zhen.ni@easystack.cn/ v1: https://lore.kernel.org/linux-mm/20260417154638.22370-4-zhen.ni@easystack.cn/ --- mm/page_owner.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 7595735979bf..9e0fb679303f 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -68,6 +68,8 @@ static const char * const page_owner_print_mode_strings[] = { struct page_owner_filter_state { enum page_owner_print_mode print_mode; + nodemask_t nid_filter; + bool nid_filter_enabled; spinlock_t lock; }; @@ -698,6 +700,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) struct page_owner 
*page_owner; depot_stack_handle_t handle; struct page_owner_filter_state *state = file->private_data; + unsigned long flags; if (!static_branch_unlikely(&page_owner_inited)) return -EINVAL; @@ -774,6 +777,26 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (!handle) goto ext_put_continue; + /* + * NUMA filter: if enabled, only output pages from specified nodes. + * We cannot use page_to_nid() here because it calls + * PF_POISONED_CHECK() which triggers VM_BUG_ON_PGFLAGS() when + * the page is in an inconsistent state during concurrent allocation + * or free. Since we're iterating pages without holding the zone + * lock, we need to extract nid directly from page->flags + * without the poisoned check. + */ + spin_lock_irqsave(&state->lock, flags); + if (state->nid_filter_enabled) { + int page_nid = memdesc_nid(page->flags); + + if (!node_isset(page_nid, state->nid_filter)) { + spin_unlock_irqrestore(&state->lock, flags); + goto ext_put_continue; + } + } + spin_unlock_irqrestore(&state->lock, flags); + /* Record the next PFN to read in the file offset */ *ppos = pfn + 1; @@ -783,6 +806,8 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) &page_owner_tmp, handle, state); ext_put_continue: page_ext_put(page_ext); + if (need_resched()) + cond_resched(); } return 0; @@ -891,6 +916,8 @@ static int page_owner_open(struct inode *inode, struct file *file) spin_lock_init(&state->lock); state->print_mode = PAGE_OWNER_PRINT_STACK; + nodes_clear(state->nid_filter); + state->nid_filter_enabled = false; file->private_data = state; return 0; } @@ -912,13 +939,18 @@ static ssize_t page_owner_write(struct file *file, size_t max_input_len; struct page_owner_filter_state *state = file->private_data; enum page_owner_print_mode new_print_mode; + nodemask_t new_nid_filter; + bool new_nid_filter_enabled; unsigned long flags; /* * Maximum input length for filter commands: - * 32: print_mode command max length is 17 
("mode=stack_handle"). + * - 32: print_mode command max length is 17 ("mode=stack_handle") + * with sufficient buffer + * - 6 * MAX_NUMNODES: worst case for nid list + * Worst case per node: ",NNNNN" (comma + 5-digit node number) = 6 bytes */ - max_input_len = 32; + max_input_len = 32 + 6 * MAX_NUMNODES; if (count > max_input_len) return -EINVAL; @@ -931,6 +963,8 @@ static ssize_t page_owner_write(struct file *file, spin_lock_irqsave(&state->lock, flags); new_print_mode = state->print_mode; + new_nid_filter = state->nid_filter; + new_nid_filter_enabled = state->nid_filter_enabled; spin_unlock_irqrestore(&state->lock, flags); while ((token = strsep(&kbuf, " \t\n")) != NULL) { @@ -943,14 +977,37 @@ static ssize_t page_owner_write(struct file *file, if (ret < 0) goto out_free; new_print_mode = ret; + } else if (!strncmp(token, "nid=", 4)) { + ret = nodelist_parse(token + 4, new_nid_filter); + if (ret < 0) + goto out_free; + + if (nodes_empty(new_nid_filter)) { + ret = -EINVAL; + goto out_free; + } + + /* + * We want to filter memory allocations by numa nodes, so make sure + * that the specified nodes have memory. + */ + if (!nodes_subset(new_nid_filter, node_states[N_MEMORY])) { + ret = -EINVAL; + goto out_free; + } + + new_nid_filter_enabled = true; } else { ret = -EINVAL; goto out_free; } } + /* Commit all filter changes */ spin_lock_irqsave(&state->lock, flags); state->print_mode = new_print_mode; + state->nid_filter = new_nid_filter; + state->nid_filter_enabled = new_nid_filter_enabled; spin_unlock_irqrestore(&state->lock, flags); ret = count; -- 2.20.1