When the page allocator steals a movable pageblock for unmovable or reclaimable allocations (via try_to_claim_block), the remaining movable pages in that block can prevent future unmovable/reclaimable allocations from being concentrated in fewer pageblocks, leading to long-term memory fragmentation. Add a lightweight asynchronous evacuation mechanism: when a movable pageblock is claimed for unmovable/reclaimable use, queue a work item to migrate the remaining movable pages out. This allows future unmovable/reclaimable allocations to be satisfied from the now-evacuated block, keeping those allocation types concentrated and reducing fragmentation. Signed-off-by: Rik van Riel Assisted-by: Claude:claude-opus-4.7 syzkaller --- include/linux/mmzone.h | 4 + mm/page_alloc.c | 223 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 227 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 13e29b2ebb86..90498bbbf60b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1540,6 +1541,9 @@ typedef struct pglist_data { wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; bool proactive_compact_trigger; + struct workqueue_struct *evacuate_wq; + struct llist_head evacuate_pending; + struct irq_work evacuate_irq_work; #endif /* * This is a per-node reserve of pages that are not available diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e01e58aca54..0f3d734bd296 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +61,10 @@ #include "shuffle.h" #include "page_reporting.h" +#ifdef CONFIG_COMPACTION +static void queue_pageblock_evacuate(struct zone *zone, unsigned long pfn); +#endif + /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). 
*/
 typedef int __bitwise fpi_t;
 
@@ -2428,6 +2434,13 @@ try_to_claim_block(struct zone *zone, struct page *page,
 	int free_pages, movable_pages, alike_pages;
 	unsigned long start_pfn;
 
+	/*
+	 * Don't steal from pageblocks that are isolated for
+	 * evacuation -- that would undo the work in progress.
+	 */
+	if (get_pageblock_isolate(page))
+		return NULL;
+
 	/* Take ownership for orders >= pageblock_order */
 	if (current_order >= pageblock_order) {
 		unsigned int nr_added;
@@ -2473,6 +2486,18 @@ try_to_claim_block(struct zone *zone, struct page *page,
 	    page_group_by_mobility_disabled) {
 		__move_freepages_block(zone, start_pfn, block_type, start_type);
 		set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
+#ifdef CONFIG_COMPACTION
+		/*
+		 * A movable pageblock was just claimed for unmovable or
+		 * reclaimable use. Queue async evacuation of the remaining
+		 * movable pages so future unmovable/reclaimable allocations
+		 * can stay concentrated in fewer pageblocks.
+		 */
+		if (block_type == MIGRATE_MOVABLE &&
+		    (start_type == MIGRATE_UNMOVABLE ||
+		     start_type == MIGRATE_RECLAIMABLE))
+			queue_pageblock_evacuate(zone, start_pfn);
+#endif
 		return __rmqueue_smallest(zone, order, start_type);
 	}
 
@@ -7184,6 +7209,253 @@ void __init page_alloc_sysctl_init(void)
 	register_sysctl_init("vm", page_alloc_sysctl_table);
 }
 
+#ifdef CONFIG_COMPACTION
+/*
+ * Pageblock evacuation: asynchronously migrate movable pages out of
+ * pageblocks that were stolen for unmovable/reclaimable allocations.
+ * This keeps unmovable/reclaimable allocations concentrated in fewer
+ * pageblocks, reducing long-term fragmentation.
+ *
+ * Uses a global pool of 64 pre-allocated work items (~3.5KB total)
+ * and a per-pgdat workqueue to keep migration node-local.
+ */
+
+/**
+ * struct evacuate_item - one queued pageblock evacuation request
+ * @work: work item that runs evacuate_work_fn()
+ * @zone: zone containing the pageblock
+ * @start_pfn: start PFN of the pageblock
+ * @free_node: link in the global freelist or in a pgdat's
+ *             evacuate_pending list, depending on the item's state
+ */
+struct evacuate_item {
+	struct work_struct work;
+	struct zone *zone;
+	unsigned long start_pfn;
+	struct llist_node free_node;
+};
+
+#define NR_EVACUATE_ITEMS 64
+static struct evacuate_item evacuate_pool[NR_EVACUATE_ITEMS];
+static struct llist_head evacuate_freelist;
+/* Serializes llist_del_first() callers; llist_add() needs no lock. */
+static DEFINE_SPINLOCK(evacuate_freelist_lock);
+
+/*
+ * Take a free item from the global pool, or return NULL.
+ *
+ * llist_del_first() tolerates concurrent llist_add() but not a second
+ * concurrent llist_del_first(). The pool is shared by all zones, so
+ * consumers are serialized with evacuate_freelist_lock. Evacuation is
+ * best-effort: when the lock is contended or the pool is empty, return
+ * NULL so the caller skips this pageblock instead of spinning inside
+ * the allocator.
+ */
+static struct evacuate_item *evacuate_item_alloc(void)
+{
+	struct llist_node *node;
+	unsigned long flags;
+
+	if (!spin_trylock_irqsave(&evacuate_freelist_lock, flags))
+		return NULL;
+	node = llist_del_first(&evacuate_freelist);
+	spin_unlock_irqrestore(&evacuate_freelist_lock, flags);
+
+	if (!node)
+		return NULL;
+	return container_of(node, struct evacuate_item, free_node);
+}
+
+/* Return @item to the global pool; llist_add() needs no lock. */
+static void evacuate_item_free(struct evacuate_item *item)
+{
+	llist_add(&item->free_node, &evacuate_freelist);
+}
+
+/*
+ * Migrate the movable pages out of the pageblock starting at @start_pfn.
+ *
+ * Pages are isolated in batches; each batch is first offered to
+ * reclaim_clean_pages_from_list() and whatever remains is migrated with
+ * MIGRATE_ASYNC, requesting target pages from the zone's own node. An
+ * isolation error other than -EAGAIN or a non-zero migrate_pages() result
+ * ends the attempt; pages still isolated at that point are put back.
+ */
+static void evacuate_pageblock(struct zone *zone, unsigned long start_pfn)
+{
+	unsigned long end_pfn = start_pfn + pageblock_nr_pages;
+	unsigned long pfn = start_pfn;
+	int nr_reclaimed;
+	int ret = 0;
+	struct compact_control cc = {
+		.nr_migratepages = 0,
+		.order = -1,
+		.zone = zone,
+		.mode = MIGRATE_ASYNC,
+		.gfp_mask = GFP_HIGHUSER_MOVABLE,
+	};
+	struct migration_target_control mtc = {
+		.nid = zone_to_nid(zone),
+		.gfp_mask = GFP_HIGHUSER_MOVABLE,
+	};
+
+	/*
+	 * The block may have become MIGRATE_MOVABLE again since it was
+	 * queued; then there is nothing left to evacuate.
+	 */
+	if (get_pageblock_migratetype(pfn_to_page(start_pfn)) == MIGRATE_MOVABLE)
+		return;
+
+	INIT_LIST_HEAD(&cc.migratepages);
+
+	/*
+	 * Loop through the entire pageblock, isolating and migrating
+	 * in batches. isolate_migratepages_range stops at
+	 * COMPACT_CLUSTER_MAX, so we must loop to cover the full block.
+	 */
+	while (pfn < end_pfn || !list_empty(&cc.migratepages)) {
+		if (list_empty(&cc.migratepages)) {
+			cc.nr_migratepages = 0;
+			cc.migrate_pfn = pfn;
+			ret = isolate_migratepages_range(&cc, pfn, end_pfn);
+			if (ret && ret != -EAGAIN)
+				break;
+			pfn = cc.migrate_pfn;
+			if (list_empty(&cc.migratepages))
+				break;
+		}
+
+		nr_reclaimed = reclaim_clean_pages_from_list(zone,
+						&cc.migratepages);
+		cc.nr_migratepages -= nr_reclaimed;
+
+		if (!list_empty(&cc.migratepages)) {
+			ret = migrate_pages(&cc.migratepages,
+					alloc_migration_target, NULL,
+					(unsigned long)&mtc, cc.mode,
+					MR_COMPACTION, NULL);
+			if (ret) {
+				putback_movable_pages(&cc.migratepages);
+				break;
+			}
+		}
+
+		cond_resched();
+	}
+
+	if (!list_empty(&cc.migratepages))
+		putback_movable_pages(&cc.migratepages);
+}
+
+/* Workqueue callback: evacuate one pageblock, then recycle the item. */
+static void evacuate_work_fn(struct work_struct *work)
+{
+	struct evacuate_item *item = container_of(work, struct evacuate_item,
+						  work);
+
+	evacuate_pageblock(item->zone, item->start_pfn);
+	evacuate_item_free(item);
+}
+
+/**
+ * evacuate_irq_work_fn - IRQ work callback to drain pending evacuations
+ * @work: the irq_work embedded in pg_data_t
+ *
+ * queue_work() can deadlock when called from inside the page allocator
+ * because it may try to allocate memory with locks already held.
+ * Use irq_work to defer the queue_work() calls to a safe context.
+ */
+static void evacuate_irq_work_fn(struct irq_work *work)
+{
+	pg_data_t *pgdat = container_of(work, pg_data_t,
+					evacuate_irq_work);
+	struct llist_node *pending;
+	struct evacuate_item *item, *next;
+
+	if (!pgdat->evacuate_wq)
+		return;
+
+	/*
+	 * Collect all pending items first, then queue them. Use _safe
+	 * because evacuate_work_fn() may run immediately on another
+	 * CPU and free the item before we follow the next pointer.
+	 */
+	pending = llist_del_all(&pgdat->evacuate_pending);
+	llist_for_each_entry_safe(item, next, pending, free_node) {
+		INIT_WORK(&item->work, evacuate_work_fn);
+		queue_work(pgdat->evacuate_wq, &item->work);
+	}
+}
+
+/**
+ * queue_pageblock_evacuate - schedule async evacuation of movable pages
+ * @zone: the zone containing the pageblock
+ * @pfn: start PFN of the pageblock (must be pageblock-aligned)
+ *
+ * Called from the page allocator when a movable pageblock is claimed
+ * for unmovable or reclaimable allocations. Queues the pageblock for
+ * background migration of its remaining movable pages. Uses irq_work
+ * to defer the actual queue_work() call outside the allocator's lock
+ * context.
+ *
+ * Best-effort: the request is silently dropped when the node was not
+ * set up by pageblock_evacuate_init() or no work item can be obtained.
+ */
+static void queue_pageblock_evacuate(struct zone *zone, unsigned long pfn)
+{
+	struct evacuate_item *item;
+	pg_data_t *pgdat = zone->zone_pgdat;
+
+	/* func is set by pageblock_evacuate_init() only after the wq exists */
+	if (!pgdat->evacuate_irq_work.func)
+		return;
+
+	item = evacuate_item_alloc();
+	if (!item)
+		return;
+
+	item->zone = zone;
+	item->start_pfn = pfn;
+	llist_add(&item->free_node, &pgdat->evacuate_pending);
+	irq_work_queue(&pgdat->evacuate_irq_work);
+}
+
+/*
+ * Boot-time setup: fill the global item pool and give every online node
+ * its own workqueue, pending list and irq_work. A node whose workqueue
+ * cannot be created is left without evacuation support.
+ */
+static int __init pageblock_evacuate_init(void)
+{
+	int nid, i;
+
+	/* Initialize the global freelist of work items */
+	init_llist_head(&evacuate_freelist);
+	for (i = 0; i < NR_EVACUATE_ITEMS; i++)
+		llist_add(&evacuate_pool[i].free_node, &evacuate_freelist);
+
+	/* Create a per-pgdat workqueue */
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+
+		/* alloc_workqueue() formats the name itself */
+		pgdat->evacuate_wq = alloc_workqueue("kevacuate/%d",
+						     WQ_MEM_RECLAIM, 1, nid);
+		if (!pgdat->evacuate_wq) {
+			pr_warn("Failed to create evacuate workqueue for node %d\n", nid);
+			continue;
+		}
+
+		init_llist_head(&pgdat->evacuate_pending);
+		init_irq_work(&pgdat->evacuate_irq_work,
+			      evacuate_irq_work_fn);
+	}
+
+	return 0;
+}
+late_initcall(pageblock_evacuate_init);
+#endif /* CONFIG_COMPACTION */
+
 #ifdef CONFIG_CONTIG_ALLOC
 /* Usage: See
admin-guide/dynamic-debug-howto.rst */ static void alloc_contig_dump_pages(struct list_head *page_list) -- 2.54.0