A longstanding problem with having a lot of CMA pageblocks in the
system (through hugetlb_cma) is that this limits the amount of memory
the kernel can use for its own allocations. Kernel allocations are
unmovable and cannot come from CMA pageblocks. This can lead to
situations where kernel allocations cause OOMs, even though there
might still be enough memory available overall.

There isn't much that can be done if the non-CMA part of memory is
already taken up by unmovable allocations; that scenario can be
considered a misconfigured system. But if there are movable
allocations in the non-CMA areas, they are unnecessarily taking away
space from the kernel.

Currently, the page allocator tries to avoid this scenario by
allocating from CMA first if more than half of the free pages in a
zone come from CMA. But that is not a guarantee. For example, take
the case where a lot of memory is taken up by 1G hugetlb pages
allocated from hugetlb_cma, and the hugetlb_cma area has been fully
used by hugetlbfs. New movable allocations will then land in the
non-CMA part of memory, and the kernel may come under memory
pressure. If those allocations are long-lasting, freeing up hugetlb
pages will not reduce that pressure: the kernel can't use the freed
CMA space, and the long-lasting allocations residing in non-CMA
memory will stay put.

To counter this, introduce interfaces to explicitly move pages into
CMA areas. The number of pages moved is derived from cma_first_limit:
that percentage of a zone's free pages is taken as the target number
of free MIGRATE_CMA pages, and the excess of free CMA pages above
that target is the number of pages to move.
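To make the sizing rule concrete, here is a minimal standalone sketch
of the calculation (illustration only, not kernel code; the numbers
are made up, and the result shown is before the additional caps that
balance_zone_cma() applies below):

  #include <stdio.h>

  int main(void)
  {
      unsigned long free_pages = 1000000;  /* free pages in the zone */
      unsigned long free_cma = 700000;     /* of which MIGRATE_CMA */
      unsigned long cma_first_limit = 50;  /* percent */
      unsigned long target_free_cma, nr_to_move;

      target_free_cma = (cma_first_limit * free_pages) / 100;
      nr_to_move = 0;
      if (free_cma > target_free_cma)
          nr_to_move = free_cma - target_free_cma;

      /* Prints: target_free_cma=500000 nr_to_move=200000 */
      printf("target_free_cma=%lu nr_to_move=%lu\n",
             target_free_cma, nr_to_move);
      return 0;
  }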
A later commit will call one of these interfaces to move pages to CMA
if needed, after CMA-allocated hugetlb pages have been freed.

Signed-off-by: Frank van der Linden
---
 include/linux/migrate_mode.h   |   1 +
 include/trace/events/migrate.h |   3 +-
 mm/compaction.c                | 168 +++++++++++++++++++++++++++++++++
 mm/internal.h                  |   4 +
 4 files changed, 175 insertions(+), 1 deletion(-)

diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index 265c4328b36a..3e235499cd73 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -25,6 +25,7 @@ enum migrate_reason {
     MR_LONGTERM_PIN,
     MR_DEMOTION,
     MR_DAMON,
+    MR_CMA_BALANCE,
     MR_TYPES
 };

diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index cd01dd7b3640..53d669ee26be 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -23,7 +23,8 @@
     EM( MR_CONTIG_RANGE,    "contig_range")     \
     EM( MR_LONGTERM_PIN,    "longterm_pin")     \
     EM( MR_DEMOTION,        "demotion")         \
-    EMe(MR_DAMON,           "damon")
+    EM( MR_DAMON,           "damon")            \
+    EMe(MR_CMA_BALANCE,     "cma_balance")

 /*
  * First define the enums in the above macros to be exported to userspace
diff --git a/mm/compaction.c b/mm/compaction.c
index 2e6c30f50b89..3200119b8baf 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "internal.h"

 #ifdef CONFIG_COMPACTION
@@ -2512,6 +2513,173 @@ compaction_suit_allocation_order(struct zone *zone, unsigned int order,
     return COMPACT_CONTINUE;
 }

+#ifdef CONFIG_CMA
+
+static void
+isolate_free_cma_pages(struct compact_control *cc)
+{
+    unsigned long end_pfn, pfn, next_pfn, start_pfn;
+    int i;
+
+    i = -1;
+    end_pfn = 0;
+
+    next_pfn = end_pfn = cc->free_pfn;
+    start_pfn = 0;
+    while (cc->nr_freepages < cc->nr_migratepages) {
+        if (!cma_next_balance_pagerange(cc->zone, cc->cma, &i,
+                                        &start_pfn, &end_pfn))
+            break;
+        for (pfn = start_pfn; pfn < end_pfn; pfn = next_pfn) {
+            next_pfn = pfn + pageblock_nr_pages;
+            isolate_freepages_block(cc, &pfn, next_pfn,
+                                    cc->freepages, 1, false);
+            if (cc->nr_freepages >= cc->nr_migratepages)
+                break;
+        }
+    }
+    cc->free_pfn = next_pfn;
+}
+
+static void balance_zone_cma(struct zone *zone, struct cma *cma)
+{
+    struct compact_control cc = {
+        .zone = zone,
+        .cma = cma,
+        .isolate_freepages = isolate_free_cma_pages,
+        .nr_migratepages = 0,
+        .nr_freepages = 0,
+        .free_pfn = 0,
+        .migrate_pfn = 0,
+        .mode = MIGRATE_SYNC,
+        .ignore_skip_hint = true,
+        .no_set_skip_hint = true,
+        .gfp_mask = GFP_KERNEL,
+        .migrate_large = true,
+        .order = -1,
+    };
+    unsigned long nr_pages;
+    int order;
+    unsigned long free_cma, free_pages, allocated, allocated_noncma;
+    unsigned long target_free_cma;
+    int rindex, ret = 0, n;
+    unsigned long start_pfn, end_pfn, pfn, next_pfn;
+    long nr_migrated;
+
+    if (zone_idx(zone) == ZONE_MOVABLE)
+        return;
+
+    if (!cma && !cma_numranges())
+        return;
+
+    /*
+     * Try to move allocated pages from non-CMA pageblocks
+     * to CMA pageblocks (possibly in a specific CMA area), to
+     * give the kernel more space for unmovable allocations.
+     *
+     * cma_first_limit, the percentage of free pages that are
+     * MIGRATE_CMA, is used to calculate the target number.
+     */
+    free_pages = zone_page_state(zone, NR_FREE_PAGES);
+    free_cma = zone_page_state(zone, NR_FREE_CMA_PAGES);
+    if (!free_cma)
+        return;
+
+    target_free_cma = (cma_first_limit * free_pages) / 100;
+    /*
+     * If we're already below the target, nothing to do.
+     */
+    if (free_cma <= target_free_cma)
+        return;
+
+    /*
+     * To try to avoid scanning too much non-CMA memory,
+     * set the upper bound of pages we want to migrate
+     * to the minimum of:
+     * 1. The number of MIGRATE_CMA pages we want to use.
+     * 2. The space available in the targeted CMA area (if any).
+     * 3. The number of used non-CMA pages.
+     *
+     * This will still likely cause the scanning of more
+     * pageblocks than is strictly needed, but it's the best
+     * that can be done without explicit tracking of the number
+     * of movable allocations in non-CMA memory.
+     */
+    allocated = zone_managed_pages(zone) - free_pages;
+    allocated_noncma = allocated - (zone_cma_pages(zone) - free_cma);
+
+    nr_pages = free_cma - target_free_cma;
+    if (cma)
+        nr_pages = min(nr_pages, cma_get_available(cma));
+    nr_pages = min(allocated_noncma, nr_pages);
+
+    for (order = 0; order < NR_PAGE_ORDERS; order++)
+        INIT_LIST_HEAD(&cc.freepages[order]);
+    INIT_LIST_HEAD(&cc.migratepages);
+
+    rindex = -1;
+    start_pfn = next_pfn = end_pfn = 0;
+    nr_migrated = 0;
+    while (nr_pages > 0) {
+        ret = 0;
+        if (!cma_next_noncma_pagerange(cc.zone, &rindex,
+                                       &start_pfn, &end_pfn))
+            break;
+
+        for (pfn = start_pfn; pfn < end_pfn; pfn = next_pfn) {
+            next_pfn = pfn + pageblock_nr_pages;
+            cc.nr_migratepages = 0;
+
+            if (!pageblock_pfn_to_page(pfn, next_pfn, zone))
+                continue;
+
+            ret = isolate_migratepages_block(&cc, pfn, next_pfn,
+                                             ISOLATE_UNEVICTABLE);
+            if (ret)
+                continue;
+            ret = migrate_pages(&cc.migratepages, compaction_alloc,
+                                compaction_free, (unsigned long)&cc,
+                                cc.mode, MR_CMA_BALANCE, &n);
+            if (ret)
+                putback_movable_pages(&cc.migratepages);
+            nr_migrated += n;
+            if (nr_migrated >= nr_pages)
+                break;
+        }
+
+        nr_pages -= min_t(unsigned long, nr_migrated, nr_pages);
+    }
+
+    if (cc.nr_freepages > 0)
+        release_free_list(cc.freepages);
+}
+
+void balance_node_cma(int nid, struct cma *cma)
+{
+    pg_data_t *pgdat;
+    int zoneid;
+    struct zone *zone;
+
+    if (!cma && !cma_numranges())
+        return;
+
+    if (nid >= MAX_NUMNODES || !node_online(nid))
+        return;
+
+    pgdat = NODE_DATA(nid);
+
+    for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+
+        zone = &pgdat->node_zones[zoneid];
+        if (!populated_zone(zone))
+            continue;
+
+        balance_zone_cma(zone, cma);
+    }
+}
+
+#endif /* CONFIG_CMA */
+
 static enum compact_result
 compact_zone(struct compact_control *cc, struct capture_control *capc)
 {
diff --git a/mm/internal.h b/mm/internal.h
index ffcb3aec05ed..7dcaf7214683 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -857,6 +857,8 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long,

 #if defined CONFIG_COMPACTION || defined CONFIG_CMA

+struct cma;
+
 /*
  * in mm/compaction.c
  */
@@ -887,6 +889,7 @@ struct compact_control {
     unsigned long migrate_pfn;
     unsigned long fast_start_pfn;   /* a pfn to start linear scan from */
     struct zone *zone;
+    struct cma *cma;                /* if moving to a specific CMA area */
     unsigned long total_migrate_scanned;
     unsigned long total_free_scanned;
     unsigned short fast_search_fail;/* failures to use free list searches */
@@ -938,6 +941,7 @@ struct cma;
 #ifdef CONFIG_CMA
 void *cma_reserve_early(struct cma *cma, unsigned long size);
 void init_cma_pageblock(struct page *page);
+void balance_node_cma(int nid, struct cma *cma);
 #else
 static inline void *cma_reserve_early(struct cma *cma, unsigned long size)
 {
--
2.51.0.384.g4c02a37b29-goog
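For illustration only, a hypothetical caller of the new interface
could look like the sketch below. The function name and call site are
assumptions made up for this example, not part of this patch; the
real caller is added by a later commit in the series.

  /*
   * Hypothetical hook (made-up name): rebalance after CMA-backed
   * hugetlb pages have been freed back to their CMA area. A caller
   * under mm/ would get the balance_node_cma() declaration from
   * "internal.h".
   */
  static void hugetlb_cma_pages_freed(int nid, struct cma *cma)
  {
      /*
       * Move movable allocations out of non-CMA memory and into
       * the CMA space that was just freed.
       */
      balance_node_cma(nid, cma);
  }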