Add a tunable to experiment with the circumstances under which movable allocations should use CMA pageblocks first, to avoid false OOM conditions. The limit is the percentage free memory which is being taken up by CMA. If the amount of used memory in CMA pageblocks is above this limit, CMA will be used first. So, 0 would mean always using CMA first, and 100 means never use CMA first. Currently the default is 50, which matches the existing behavior, so there is no functional change. Signed-off-by: Frank van der Linden --- include/linux/mm.h | 4 +++ mm/page_alloc.c | 84 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 74 insertions(+), 14 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1ae97a0b8ec7..313ab38dc398 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3253,6 +3253,10 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); +#ifdef CONFIG_CMA +extern int cma_first_limit; +#endif + /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d1d037f97c5f..d3966d31c039 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2446,6 +2446,54 @@ enum rmqueue_mode { RMQUEUE_STEAL, }; +#ifdef CONFIG_CMA +/* + * The percentage of free CMA pages as part of the total number of free + * pages above which CMA is used first. + * 0 = always, 100 = never + */ +int cma_first_limit __read_mostly = 50; +EXPORT_SYMBOL_GPL(cma_first_limit); + +/* + * Return values: + * + * -1 - never try CMA (!ALLOC_CMA or !IS_ENABLED(CONFIG_CMA)) + * 0 - don't try CMA first + * 1 - try CMA first. + */ +static __always_inline int use_cma_first(struct zone *zone, + unsigned int alloc_flags) +{ + unsigned long free_cma, free_pages, cma_percentage; + + if (!(alloc_flags & ALLOC_CMA)) + return -1; + + free_cma = zone_page_state(zone, NR_FREE_CMA_PAGES); + if (!free_cma) + return -1; + + if (!cma_first_limit) + return 1; + + if (cma_first_limit == 100) + return 0; + + free_pages = zone_page_state(zone, NR_FREE_PAGES); + if (!free_pages) + return 0; + + cma_percentage = (free_cma * 100) / free_pages; + return (cma_percentage > cma_first_limit) ? 1 : 0; +} +#else +static inline int use_cma_first(struct zone *zone, unsigned int alloc_flags) +{ + return -1; +} +#endif + /* * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. @@ -2455,20 +2503,13 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, unsigned int alloc_flags, enum rmqueue_mode *mode) { struct page *page; + int cma_first; - if (IS_ENABLED(CONFIG_CMA)) { - /* - * Balance movable allocations between regular and CMA areas by - * allocating from CMA when over half of the zone's free memory - * is in the CMA area. - */ - if (alloc_flags & ALLOC_CMA && - zone_page_state(zone, NR_FREE_CMA_PAGES) > - zone_page_state(zone, NR_FREE_PAGES) / 2) { - page = __rmqueue_cma_fallback(zone, order); - if (page) - return page; - } + cma_first = use_cma_first(zone, alloc_flags); + if (cma_first > 0) { + page = __rmqueue_cma_fallback(zone, order); + if (page) + return page; } /* @@ -2487,7 +2528,11 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, return page; fallthrough; case RMQUEUE_CMA: - if (alloc_flags & ALLOC_CMA) { + /* + * Try CMA if we should, and haven't done so yet, + * which is indicated by cma_first == 0. 
+ */ + if (cma_first == 0) { page = __rmqueue_cma_fallback(zone, order); if (page) { *mode = RMQUEUE_CMA; @@ -6672,6 +6717,17 @@ static const struct ctl_table page_alloc_sysctl_table[] = { .extra2 = SYSCTL_ONE_HUNDRED, }, #endif +#ifdef CONFIG_CMA + { + .procname = "cma_first_limit", + .data = &cma_first_limit, + .maxlen = sizeof(cma_first_limit), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +#endif }; void __init page_alloc_sysctl_init(void) -- 2.51.0.384.g4c02a37b29-goog Atomic bit operations aren't needed for the cma flags field, so switch their manipulation over to normal AND/OR operations. Also export the bit values in linux/cma.h, as we will be adding publicly used values later. No functional change. Signed-off-by: Frank van der Linden --- include/linux/cma.h | 12 ++++++++++++ mm/cma.c | 16 ++++++++-------- mm/cma.h | 7 ------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index 62d9c1cf6326..5c3fdc5da908 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -20,6 +20,18 @@ #define CMA_MIN_ALIGNMENT_PAGES pageblock_nr_pages #define CMA_MIN_ALIGNMENT_BYTES (PAGE_SIZE * CMA_MIN_ALIGNMENT_PAGES) +enum cma_flags { + __CMA_RESERVE_PAGES_ON_ERROR, + __CMA_ZONES_VALID, + __CMA_ZONES_INVALID, + __CMA_ACTIVATED, +}; + +#define CMA_RESERVE_PAGES_ON_ERROR BIT(__CMA_RESERVE_PAGES_ON_ERROR) +#define CMA_ZONES_VALID BIT(__CMA_ZONES_VALID) +#define CMA_ZONES_INVALID BIT(__CMA_ZONES_INVALID) +#define CMA_ACTIVATED BIT(__CMA_ACTIVATED) + struct cma; extern unsigned long totalcma_pages; diff --git a/mm/cma.c b/mm/cma.c index 2ffa4befb99a..549d85b2e3a3 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -111,8 +111,8 @@ bool cma_validate_zones(struct cma *cma) * check has already been done. If neither is set, the * check has not been performed yet. */ - valid_bit_set = test_bit(CMA_ZONES_VALID, &cma->flags); - if (valid_bit_set || test_bit(CMA_ZONES_INVALID, &cma->flags)) + valid_bit_set = (cma->flags & CMA_ZONES_VALID); + if (valid_bit_set || (cma->flags & CMA_ZONES_INVALID)) return valid_bit_set; for (r = 0; r < cma->nranges; r++) { @@ -126,12 +126,12 @@ bool cma_validate_zones(struct cma *cma) */ WARN_ON_ONCE(!pfn_valid(base_pfn)); if (pfn_range_intersects_zones(cma->nid, base_pfn, cmr->count)) { - set_bit(CMA_ZONES_INVALID, &cma->flags); + cma->flags |= CMA_ZONES_INVALID; return false; } } - set_bit(CMA_ZONES_VALID, &cma->flags); + cma->flags |= CMA_ZONES_VALID; return true; } @@ -176,7 +176,7 @@ static void __init cma_activate_area(struct cma *cma) INIT_HLIST_HEAD(&cma->mem_head); spin_lock_init(&cma->mem_head_lock); #endif - set_bit(CMA_ACTIVATED, &cma->flags); + cma->flags |= CMA_ACTIVATED; return; @@ -185,7 +185,7 @@ static void __init cma_activate_area(struct cma *cma) bitmap_free(cma->ranges[r].bitmap); /* Expose all pages to the buddy, they are useless for CMA. 
*/ - if (!test_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags)) { + if (!(cma->flags & CMA_RESERVE_PAGES_ON_ERROR)) { for (r = 0; r < allocrange; r++) { cmr = &cma->ranges[r]; end_pfn = cmr->base_pfn + cmr->count; @@ -211,7 +211,7 @@ core_initcall(cma_init_reserved_areas); void __init cma_reserve_pages_on_error(struct cma *cma) { - set_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags); + cma->flags |= CMA_RESERVE_PAGES_ON_ERROR; } static int __init cma_new_area(const char *name, phys_addr_t size, @@ -1085,7 +1085,7 @@ void __init *cma_reserve_early(struct cma *cma, unsigned long size) /* * Can only be called early in init. */ - if (test_bit(CMA_ACTIVATED, &cma->flags)) + if (cma->flags & CMA_ACTIVATED) return NULL; if (!IS_ALIGNED(size, CMA_MIN_ALIGNMENT_BYTES)) diff --git a/mm/cma.h b/mm/cma.h index c70180c36559..25b696774c6a 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -64,13 +64,6 @@ struct cma { int nid; }; -enum cma_flags { - CMA_RESERVE_PAGES_ON_ERROR, - CMA_ZONES_VALID, - CMA_ZONES_INVALID, - CMA_ACTIVATED, -}; - extern struct cma cma_areas[MAX_CMA_AREAS]; extern unsigned int cma_area_count; -- 2.51.0.384.g4c02a37b29-goog Add a flags argument to the various CMA init functions, as there will be a need to pass in more flags to control init and runtime behavior other than just the current 'fixed' argument. Replace the fixed argument with a flags argument, and adapt callers. Signed-off-by: Frank van der Linden --- arch/powerpc/kernel/fadump.c | 2 +- arch/powerpc/kvm/book3s_hv_builtin.c | 2 +- drivers/s390/char/vmcp.c | 2 +- include/linux/cma.h | 20 ++++++++++------ kernel/dma/contiguous.c | 10 ++++---- mm/cma.c | 36 ++++++++++++++++++---------- mm/hugetlb_cma.c | 2 +- 7 files changed, 46 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 5782e743fd27..a763419bd1bc 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -112,7 +112,7 @@ void __init fadump_cma_init(void) return; } - rc = cma_init_reserved_mem(base, size, 0, "fadump_cma", &fadump_cma); + rc = cma_init_reserved_mem(base, size, 0, 0, "fadump_cma", &fadump_cma); if (rc) { pr_err("Failed to init cma area for firmware-assisted dump,%d\n", rc); /* diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index fa0e3a22cac0..23dcb67e797a 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -95,7 +95,7 @@ void __init kvm_cma_reserve(void) (unsigned long)selected_size / SZ_1M); align_size = HPT_ALIGN_PAGES << PAGE_SHIFT; cma_declare_contiguous(0, selected_size, 0, align_size, - KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, "kvm_cma", + KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, 0, "kvm_cma", &kvm_cma); } } diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 69899bb86b3e..cd0c0edc496b 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -54,7 +54,7 @@ void __init vmcp_cma_reserve(void) { if (!machine_is_vm()) return; - cma_declare_contiguous(0, vmcp_cma_size, 0, 0, 0, false, "vmcp", &vmcp_cma); + cma_declare_contiguous(0, vmcp_cma_size, 0, 0, 0, 0, "vmcp", &vmcp_cma); } static void vmcp_response_alloc(struct vmcp_session *session) diff --git a/include/linux/cma.h b/include/linux/cma.h index 5c3fdc5da908..ec48f2a11f1d 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -25,12 +25,16 @@ enum cma_flags { __CMA_ZONES_VALID, __CMA_ZONES_INVALID, __CMA_ACTIVATED, + __CMA_FIXED, }; #define CMA_RESERVE_PAGES_ON_ERROR BIT(__CMA_RESERVE_PAGES_ON_ERROR) #define 
CMA_ZONES_VALID BIT(__CMA_ZONES_VALID) #define CMA_ZONES_INVALID BIT(__CMA_ZONES_INVALID) #define CMA_ACTIVATED BIT(__CMA_ACTIVATED) +#define CMA_FIXED BIT(__CMA_FIXED) + +#define CMA_INIT_FLAGS (CMA_FIXED|CMA_RESERVE_PAGES_ON_ERROR) struct cma; @@ -42,23 +46,25 @@ extern const char *cma_get_name(const struct cma *cma); extern int __init cma_declare_contiguous_nid(phys_addr_t base, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, const char *name, struct cma **res_cma, - int nid); + unsigned long flags, const char *name, + struct cma **res_cma, int nid); static inline int __init cma_declare_contiguous(phys_addr_t base, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, const char *name, struct cma **res_cma) + unsigned long flags, const char *name, + struct cma **res_cma) { return cma_declare_contiguous_nid(base, size, limit, alignment, - order_per_bit, fixed, name, res_cma, NUMA_NO_NODE); + order_per_bit, flags, name, res_cma, NUMA_NO_NODE); } extern int __init cma_declare_contiguous_multi(phys_addr_t size, phys_addr_t align, unsigned int order_per_bit, - const char *name, struct cma **res_cma, int nid); + unsigned long flags, const char *name, + struct cma **res_cma, int nid); extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, - const char *name, - struct cma **res_cma); + unsigned long flags, + const char *name, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index d9b9dcba6ff7..7f2eed3b7cc5 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -175,7 +175,7 @@ static void __init dma_numa_cma_reserve(void) cma = &dma_contiguous_pernuma_area[nid]; snprintf(name, sizeof(name), "pernuma%d", nid); ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0, - 0, false, name, cma, nid); + 0, 0, name, cma, nid); if (ret) pr_warn("%s: reservation failed: err %d, node %d", __func__, ret, nid); @@ -185,7 +185,7 @@ static void __init dma_numa_cma_reserve(void) cma = &dma_contiguous_numa_area[nid]; snprintf(name, sizeof(name), "numa%d", nid); - ret = cma_declare_contiguous_nid(0, numa_cma_size[nid], 0, 0, 0, false, + ret = cma_declare_contiguous_nid(0, numa_cma_size[nid], 0, 0, 0, 0, name, cma, nid); if (ret) pr_warn("%s: reservation failed: err %d, node %d", __func__, @@ -279,7 +279,8 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, { int ret; - ret = cma_declare_contiguous(base, size, limit, 0, 0, fixed, + ret = cma_declare_contiguous(base, size, limit, 0, 0, + fixed ? 
CMA_FIXED : 0, "reserved", res_cma); if (ret) return ret; @@ -478,7 +479,8 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) return -EINVAL; } - err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma); + err = cma_init_reserved_mem(rmem->base, rmem->size, 0, 0, rmem->name, + &cma); if (err) { pr_err("Reserved memory: unable to setup CMA region\n"); return err; diff --git a/mm/cma.c b/mm/cma.c index 549d85b2e3a3..00d8d365f0b5 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -35,6 +35,7 @@ struct cma cma_areas[MAX_CMA_AREAS]; unsigned int cma_area_count; +static DEFINE_MUTEX(cma_mutex); phys_addr_t cma_get_base(const struct cma *cma) { @@ -215,7 +216,7 @@ void __init cma_reserve_pages_on_error(struct cma *cma) } static int __init cma_new_area(const char *name, phys_addr_t size, - unsigned int order_per_bit, + unsigned int order_per_bit, unsigned long flags, struct cma **res_cma) { struct cma *cma; @@ -239,6 +240,7 @@ static int __init cma_new_area(const char *name, phys_addr_t size, cma->available_count = cma->count = size >> PAGE_SHIFT; cma->order_per_bit = order_per_bit; + cma->flags = flags; *res_cma = cma; totalcma_pages += cma->count; @@ -265,7 +267,7 @@ static void __init cma_drop_area(struct cma *cma) */ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, - const char *name, + unsigned long flags, const char *name, struct cma **res_cma) { struct cma *cma; @@ -288,7 +290,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES)) return -EINVAL; - ret = cma_new_area(name, size, order_per_bit, &cma); + ret = cma_new_area(name, size, order_per_bit, flags, &cma); if (ret != 0) return ret; @@ -429,12 +431,18 @@ static phys_addr_t __init cma_alloc_mem(phys_addr_t base, phys_addr_t size, static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, const char *name, struct cma **res_cma, - int nid) + unsigned long flags, const char *name, + struct cma **res_cma, int nid) { phys_addr_t memblock_end = memblock_end_of_DRAM(); phys_addr_t base = *basep; int ret; + bool fixed; + + if (flags & ~CMA_INIT_FLAGS) + return -EINVAL; + + fixed = (flags & CMA_FIXED); pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", __func__, &size, &base, &limit, &alignment); @@ -503,7 +511,8 @@ static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, kmemleak_ignore_phys(base); } - ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); + ret = cma_init_reserved_mem(base, size, order_per_bit, flags, + name, res_cma); if (ret) { memblock_phys_free(base, size); return ret; @@ -526,7 +535,8 @@ static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, */ int __init cma_declare_contiguous_multi(phys_addr_t total_size, phys_addr_t align, unsigned int order_per_bit, - const char *name, struct cma **res_cma, int nid) + unsigned long flags, const char *name, + struct cma **res_cma, int nid) { phys_addr_t start = 0, end; phys_addr_t size, sizesum, sizeleft; @@ -543,7 +553,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size, * First, try it the normal way, producing just one range. 
*/ ret = __cma_declare_contiguous_nid(&start, total_size, 0, align, - order_per_bit, false, name, res_cma, nid); + order_per_bit, flags, name, res_cma, nid); if (ret != -ENOMEM) goto out; @@ -567,7 +577,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size, sizesum = 0; failed = NULL; - ret = cma_new_area(name, total_size, order_per_bit, &cma); + ret = cma_new_area(name, total_size, order_per_bit, flags, &cma); if (ret != 0) goto out; @@ -716,7 +726,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size, * @limit: End address of the reserved memory (optional, 0 for any). * @alignment: Alignment for the CMA area, should be power of 2 or zero * @order_per_bit: Order of pages represented by one bit on bitmap. - * @fixed: hint about where to place the reserved area + * @flags: flags controlling various aspects of the area * @name: The name of the area. See function cma_init_reserved_mem() * @res_cma: Pointer to store the created cma region. * @nid: nid of the free area to find, %NUMA_NO_NODE for any node @@ -732,13 +742,13 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size, int __init cma_declare_contiguous_nid(phys_addr_t base, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, const char *name, struct cma **res_cma, - int nid) + unsigned long flags, const char *name, + struct cma **res_cma, int nid) { int ret; ret = __cma_declare_contiguous_nid(&base, size, limit, alignment, - order_per_bit, fixed, name, res_cma, nid); + order_per_bit, flags, name, res_cma, nid); if (ret != 0) pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index f58ef4969e7a..71d0e9a048d4 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -221,7 +221,7 @@ void __init hugetlb_cma_reserve(int order) * huge page demotion. */ res = cma_declare_contiguous_multi(size, PAGE_SIZE << order, - HUGETLB_PAGE_ORDER, name, + HUGETLB_PAGE_ORDER, 0, name, &hugetlb_cma[nid], nid); if (res) { pr_warn("hugetlb_cma: reservation failed: err %d, node %d", -- 2.51.0.384.g4c02a37b29-goog In order to walk through CMA areas efficiently, it is useful to keep a global sorted list of ranges. Create this list when activating the areas. Since users of this list may want to reference the CMA area the range came from, there needs to be a link from the range to that area. So, store a pointer to the CMA structure in the cma_memrange structure. This also reduces the number of arguments to a few internal functions. Signed-off-by: Frank van der Linden --- mm/cma.c | 72 ++++++++++++++++++++++++++++++++++++++++++-------------- mm/cma.h | 6 ++--- 2 files changed, 57 insertions(+), 21 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 00d8d365f0b5..1f5a7bfc9152 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -65,12 +66,11 @@ static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, * Find the offset of the base PFN from the specified align_order. * The value returned is represented in order_per_bits. 
*/ -static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, - const struct cma_memrange *cmr, +static unsigned long cma_bitmap_aligned_offset(const struct cma_memrange *cmr, unsigned int align_order) { return (cmr->base_pfn & ((1UL << align_order) - 1)) - >> cma->order_per_bit; + >> cmr->cma->order_per_bit; } static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, @@ -79,11 +79,12 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; } -static void cma_clear_bitmap(struct cma *cma, const struct cma_memrange *cmr, +static void cma_clear_bitmap(const struct cma_memrange *cmr, unsigned long pfn, unsigned long count) { unsigned long bitmap_no, bitmap_count; unsigned long flags; + struct cma *cma = cmr->cma; bitmap_no = (pfn - cmr->base_pfn) >> cma->order_per_bit; bitmap_count = cma_bitmap_pages_to_bits(cma, count); @@ -147,8 +148,7 @@ static void __init cma_activate_area(struct cma *cma) for (allocrange = 0; allocrange < cma->nranges; allocrange++) { cmr = &cma->ranges[allocrange]; early_pfn[allocrange] = cmr->early_pfn; - cmr->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma, cmr), - GFP_KERNEL); + cmr->bitmap = bitmap_zalloc(cma_bitmap_maxno(cmr), GFP_KERNEL); if (!cmr->bitmap) goto cleanup; } @@ -199,12 +199,45 @@ static void __init cma_activate_area(struct cma *cma) pr_err("CMA area %s could not be activated\n", cma->name); } +static struct cma_memrange **cma_ranges; +static int cma_nranges; + +static int cmprange(const void *a, const void *b) +{ + struct cma_memrange *r1, *r2; + + r1 = *(struct cma_memrange **)a; + r2 = *(struct cma_memrange **)b; + + if (r1->base_pfn < r2->base_pfn) + return -1; + return r1->base_pfn - r2->base_pfn; +} + static int __init cma_init_reserved_areas(void) { - int i; + int i, r, nranges; + struct cma *cma; + struct cma_memrange *cmr; + + nranges = 0; + for (i = 0; i < cma_area_count; i++) { + cma = &cma_areas[i]; + nranges += cma->nranges; + cma_activate_area(cma); + } + + cma_ranges = kcalloc(nranges, sizeof(*cma_ranges), GFP_KERNEL); + cma_nranges = 0; + for (i = 0; i < cma_area_count; i++) { + cma = &cma_areas[i]; + for (r = 0; r < cma->nranges; r++) { + cmr = &cma->ranges[r]; + cma_ranges[cma_nranges++] = cmr; + } + } - for (i = 0; i < cma_area_count; i++) - cma_activate_area(&cma_areas[i]); + sort(cma_ranges, cma_nranges, sizeof(*cma_ranges), cmprange, NULL); return 0; } @@ -297,6 +330,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, cma->ranges[0].base_pfn = PFN_DOWN(base); cma->ranges[0].early_pfn = PFN_DOWN(base); cma->ranges[0].count = cma->count; + cma->ranges[0].cma = cma; cma->nranges = 1; cma->nid = NUMA_NO_NODE; @@ -687,6 +721,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size, cmrp->base_pfn = PHYS_PFN(mlp->base); cmrp->early_pfn = cmrp->base_pfn; cmrp->count = size >> PAGE_SHIFT; + cmrp->cma = cma; sizeleft -= size; if (sizeleft == 0) @@ -772,7 +807,7 @@ static void cma_debug_show_areas(struct cma *cma) for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; - nbits = cma_bitmap_maxno(cma, cmr); + nbits = cma_bitmap_maxno(cmr); pr_info("range %d: ", r); for_each_clear_bitrange(start, end, cmr->bitmap, nbits) { @@ -786,9 +821,9 @@ static void cma_debug_show_areas(struct cma *cma) spin_unlock_irq(&cma->lock); } -static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, - unsigned long count, unsigned int align, - struct page **pagep, gfp_t gfp) +static int cma_range_alloc(struct 
cma_memrange *cmr, + unsigned long count, unsigned int align, + struct page **pagep, gfp_t gfp) { unsigned long mask, offset; unsigned long pfn = -1; @@ -796,10 +831,11 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, unsigned long bitmap_maxno, bitmap_no, bitmap_count; int ret = -EBUSY; struct page *page = NULL; + struct cma *cma = cmr->cma; mask = cma_bitmap_aligned_mask(cma, align); - offset = cma_bitmap_aligned_offset(cma, cmr, align); - bitmap_maxno = cma_bitmap_maxno(cma, cmr); + offset = cma_bitmap_aligned_offset(cmr, align); + bitmap_maxno = cma_bitmap_maxno(cmr); bitmap_count = cma_bitmap_pages_to_bits(cma, count); if (bitmap_count > bitmap_maxno) @@ -840,7 +876,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, break; } - cma_clear_bitmap(cma, cmr, pfn, count); + cma_clear_bitmap(cmr, pfn, count); if (ret != -EBUSY) break; @@ -879,7 +915,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, for (r = 0; r < cma->nranges; r++) { page = NULL; - ret = cma_range_alloc(cma, &cma->ranges[r], count, align, + ret = cma_range_alloc(&cma->ranges[r], count, align, &page, gfp); if (ret != -EBUSY || page) break; @@ -1011,7 +1047,7 @@ bool cma_release(struct cma *cma, const struct page *pages, return false; free_contig_range(pfn, count); - cma_clear_bitmap(cma, cmr, pfn, count); + cma_clear_bitmap(cmr, pfn, count); cma_sysfs_account_release_pages(cma, count); trace_cma_release(cma->name, pfn, pages, count); diff --git a/mm/cma.h b/mm/cma.h index 25b696774c6a..384d1109d438 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -30,6 +30,7 @@ struct cma_memrange { unsigned long early_pfn; unsigned long *bitmap; }; + struct cma *cma; #ifdef CONFIG_CMA_DEBUGFS struct debugfs_u32_array dfs_bitmap; #endif @@ -67,10 +68,9 @@ struct cma { extern struct cma cma_areas[MAX_CMA_AREAS]; extern unsigned int cma_area_count; -static inline unsigned long cma_bitmap_maxno(struct cma *cma, - struct cma_memrange *cmr) +static inline unsigned long cma_bitmap_maxno(struct cma_memrange *cmr) { - return cmr->count >> cma->order_per_bit; + return cmr->count >> cmr->cma->order_per_bit; } #ifdef CONFIG_CMA_SYSFS -- 2.51.0.384.g4c02a37b29-goog Add some CMA helper functions to assist CMA balancing. They are: cma_get_available. - Returns the number of available pages in a CMA area cma_numranges - Returns the total number of CMA ranges. cma_next_balance_pagerange - Get the next CMA page range in a zone that has is available as a target for CMA balancing. This means a range that consists of CMA pageblocks that are managed by the buddy allocator (not allocated through cma_alloc). The array of CMA ranges is walked top down. cma_next_noncma_pagerange - Get the next non-CMA page range in a zone. The zone is traversed bottom up. 
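As a minimal usage sketch (not part of this patch; count_cma_balance_pages is a hypothetical helper shown only to illustrate the calling convention of the iterators added here), both iterators are driven by passing a range index of -1 and a zero start/end window on the first call, then feeding the updated values back in on subsequent calls:

	/*
	 * Illustration only: count how many pages in a zone are currently
	 * available as CMA balancing targets, walking the balance ranges
	 * top down.
	 */
	static unsigned long count_cma_balance_pages(struct zone *zone,
						     struct cma *cma)
	{
		unsigned long start_pfn = 0, end_pfn = 0, total = 0;
		int rindex = -1;

		while (cma_next_balance_pagerange(zone, cma, &rindex,
						  &start_pfn, &end_pfn))
			total += end_pfn - start_pfn;

		return total;
	}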
Signed-off-by: Frank van der Linden --- include/linux/cma.h | 30 +++++++++ mm/cma.c | 161 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+) diff --git a/include/linux/cma.h b/include/linux/cma.h index ec48f2a11f1d..0504580d61d0 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -37,6 +37,7 @@ enum cma_flags { #define CMA_INIT_FLAGS (CMA_FIXED|CMA_RESERVE_PAGES_ON_ERROR) struct cma; +struct zone; extern unsigned long totalcma_pages; extern phys_addr_t cma_get_base(const struct cma *cma); @@ -79,6 +80,12 @@ extern void cma_reserve_pages_on_error(struct cma *cma); struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp); bool cma_free_folio(struct cma *cma, const struct folio *folio); bool cma_validate_zones(struct cma *cma); +int cma_numranges(void); +unsigned long cma_get_available(const struct cma *cma); +bool cma_next_balance_pagerange(struct zone *zone, struct cma *cma, int *rindex, + unsigned long *startpfn, unsigned long *endpfn); +bool cma_next_noncma_pagerange(struct zone *zone, int *rindex, + unsigned long *startpfn, unsigned long *endpfn); #else static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) { @@ -93,6 +100,29 @@ static inline bool cma_validate_zones(struct cma *cma) { return false; } + +static inline int cma_numranges(void) +{ + return 0; +} + +static inline unsigned long cma_get_available(const struct cma *cma) +{ + return 0; +} + +static inline bool cma_next_balance_pagerange(struct zone *zone, + struct cma *cma, int *rindex, unsigned long *start_pfn, + unsigned long *end_pfn) +{ + return false; +} + +static inline bool cma_next_noncma_pagerange(struct zone *zone, int *rindex, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + return false; +} #endif #endif diff --git a/mm/cma.c b/mm/cma.c index 1f5a7bfc9152..53cb1833407b 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -54,6 +54,11 @@ const char *cma_get_name(const struct cma *cma) return cma->name; } +unsigned long cma_get_available(const struct cma *cma) +{ + return cma->available_count; +} + static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, unsigned int align_order) { @@ -202,6 +207,11 @@ static void __init cma_activate_area(struct cma *cma) static struct cma_memrange **cma_ranges; static int cma_nranges; +int cma_numranges(void) +{ + return cma_nranges; +} + static int cmprange(const void *a, const void *b) { struct cma_memrange *r1, *r2; @@ -214,6 +224,157 @@ static int cmprange(const void *a, const void *b) return r1->base_pfn - r2->base_pfn; } +/* + * Provide the next free range in a cma memory range, as derived + * from the bitmap. + * + * @cmr: memory range to scan + * @start_pfn: the beginning of the previous range + * @end_pfn: the end of the previous range, zero for the first call + * + * The caller can adjust *end_pfn end use it as a starting point. 
+ */ +static bool cma_next_free_range(struct cma_memrange *cmr, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + unsigned long zerobit, onebit, start, nbits, offset, base; + struct cma *cma = cmr->cma; + + nbits = cma_bitmap_maxno(cmr); + + if (!*end_pfn) + offset = start = 0; + else { + start = ((*end_pfn - cmr->base_pfn) >> cma->order_per_bit); + if (start >= nbits) + return false; + + offset = *end_pfn - + (cmr->base_pfn + (start << cma->order_per_bit)); + } + + spin_lock_irq(&cma->lock); + zerobit = find_next_zero_bit(cmr->bitmap, nbits, start); + if (zerobit >= nbits) { + spin_unlock_irq(&cma->lock); + return false; + } + onebit = find_next_bit(cmr->bitmap, nbits, zerobit); + spin_unlock_irq(&cma->lock); + + base = (zerobit << cma->order_per_bit) + cmr->base_pfn; + *start_pfn = base + offset; + *end_pfn = base + ((onebit - zerobit) << cma->order_per_bit); + + return true; +} + +static inline bool cma_should_balance_range(struct zone *zone, + struct cma_memrange *cmr) +{ + if (page_zone(pfn_to_page(cmr->base_pfn)) != zone) + return false; + + return true; +} + +/* + * Get the next CMA page range containing pages that have not been + * allocated through cma_alloc. This is just a snapshot, and the caller + * is expected to deal with the changing circumstances. Used to walk + * through CMA pageblocks in a zone in an optimized fashion during + * zone CMA balance compaction. + * + * If @cma is NULL, the global list of ranges is walked, else + * the ranges of the area pointed to by @cma are walked. + */ +bool cma_next_balance_pagerange(struct zone *zone, struct cma *cma, + int *rindex, unsigned long *start_pfn, + unsigned long *end_pfn) +{ + struct cma_memrange *cmr; + int i, nranges; + + if (!cma_nranges) + return false; + + nranges = cma ? cma->nranges : cma_nranges; + + if (*rindex == -1) { + if (*end_pfn != 0) { + for (i = nranges - 1; i >= 0; i--) { + cmr = cma ? &cma->ranges[i] : cma_ranges[i]; + if (!cma_should_balance_range(zone, cmr)) + continue; + if (*end_pfn > cmr->base_pfn && + *end_pfn < (cmr->base_pfn + cmr->count)) + break; + } + } else { + i = nranges - 1; + } + } else { + i = *rindex; + } + + for (; i >= 0; i--) { + cmr = cma ? &cma->ranges[i] : cma_ranges[i]; + if (!cma_should_balance_range(zone, cmr)) + continue; + if (cma_next_free_range(cmr, start_pfn, end_pfn)) { + *rindex = i; + return true; + } + } + + return false; +} + +/* + * Get the next stretch of memory in a zone that is not MIGRATE_CMA + * pageblocks. 
+ */ +bool cma_next_noncma_pagerange(struct zone *zone, int *rindex, + unsigned long *start_pfn, + unsigned long *end_pfn) +{ + struct cma_memrange *cmr; + unsigned long cma_start, cma_end; + int i; + + if (*end_pfn >= zone_end_pfn(zone)) + return false; + + if (*rindex == -1) { + *rindex = 0; + if (*start_pfn == 0) + *start_pfn = zone->zone_start_pfn; + } else { + cmr = cma_ranges[*rindex]; + *start_pfn = cmr->base_pfn + cmr->count; + } + + for (i = *rindex; i < cma_nranges; i++) { + cmr = cma_ranges[i]; + cma_start = cmr->base_pfn; + cma_end = cmr->base_pfn + cmr->count; + if (page_zone(pfn_to_page(cma_start)) != zone) + continue; + if (*start_pfn == cma_start) { + *start_pfn = cma_end; + } else if (*start_pfn < cma_start) { + *rindex = i; + *end_pfn = cma_start; + return true; + } + } + + *rindex = cma_nranges; + *end_pfn = zone_end_pfn(zone); + + return true; +} + static int __init cma_init_reserved_areas(void) { int i, r, nranges; -- 2.51.0.384.g4c02a37b29-goog When the CMA_BALANCE flag is set for a CMA area, it means that it opts in to CMA balancing. This means two things: 1) It allows movable allocations to be migrated in to it in the case of a CMA inbalance (too much free memory in CMA pageblocks as compared to other pageblocks). 2) It is allocated top-down, so that compaction will end up migrating pages in to it. Doing this will make sure that compaction doesn't aggrevate a CMA imbalance, and that it won't fight with CMA balance migration from non-CMA to CMA. Signed-off-by: Frank van der Linden --- include/linux/cma.h | 4 +++- mm/cma.c | 33 ++++++++++++++++++++++++++------- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index 0504580d61d0..6e98a516b336 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -26,6 +26,7 @@ enum cma_flags { __CMA_ZONES_INVALID, __CMA_ACTIVATED, __CMA_FIXED, + __CMA_BALANCE, }; #define CMA_RESERVE_PAGES_ON_ERROR BIT(__CMA_RESERVE_PAGES_ON_ERROR) @@ -33,8 +34,9 @@ enum cma_flags { #define CMA_ZONES_INVALID BIT(__CMA_ZONES_INVALID) #define CMA_ACTIVATED BIT(__CMA_ACTIVATED) #define CMA_FIXED BIT(__CMA_FIXED) +#define CMA_BALANCE BIT(__CMA_BALANCE) -#define CMA_INIT_FLAGS (CMA_FIXED|CMA_RESERVE_PAGES_ON_ERROR) +#define CMA_INIT_FLAGS (CMA_FIXED|CMA_RESERVE_PAGES_ON_ERROR|CMA_BALANCE) struct cma; struct zone; diff --git a/mm/cma.c b/mm/cma.c index 53cb1833407b..6050d57f3c2e 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -272,6 +272,9 @@ static bool cma_next_free_range(struct cma_memrange *cmr, static inline bool cma_should_balance_range(struct zone *zone, struct cma_memrange *cmr) { + if (!(cmr->cma->flags & CMA_BALANCE)) + return false; + if (page_zone(pfn_to_page(cmr->base_pfn)) != zone) return false; @@ -527,6 +530,12 @@ static bool __init basecmp(struct cma_init_memrange *mlp, return mlp->base < mrp->base; } +static bool __init revbasecmp(struct cma_init_memrange *mlp, + struct cma_init_memrange *mrp) +{ + return mlp->base > mrp->base; +} + /* * Helper function to create sorted lists. */ @@ -575,7 +584,8 @@ static int __init cma_fixed_reserve(phys_addr_t base, phys_addr_t size) } static phys_addr_t __init cma_alloc_mem(phys_addr_t base, phys_addr_t size, - phys_addr_t align, phys_addr_t limit, int nid) + phys_addr_t align, phys_addr_t limit, int nid, + unsigned long flags) { phys_addr_t addr = 0; @@ -588,7 +598,8 @@ static phys_addr_t __init cma_alloc_mem(phys_addr_t base, phys_addr_t size, * like DMA/DMA32. 
*/ #ifdef CONFIG_PHYS_ADDR_T_64BIT - if (!memblock_bottom_up() && limit >= SZ_4G + size) { + if (!(flags & CMA_BALANCE) && !memblock_bottom_up() + && limit >= SZ_4G + size) { memblock_set_bottom_up(true); addr = memblock_alloc_range_nid(size, align, SZ_4G, limit, nid, true); @@ -695,7 +706,7 @@ static int __init __cma_declare_contiguous_nid(phys_addr_t *basep, if (ret) return ret; } else { - base = cma_alloc_mem(base, size, alignment, limit, nid); + base = cma_alloc_mem(base, size, alignment, limit, nid, flags); if (!base) return -ENOMEM; @@ -851,7 +862,10 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size, list_for_each_safe(mp, next, &ranges) { mlp = list_entry(mp, struct cma_init_memrange, list); list_del(mp); - list_insert_sorted(&final_ranges, mlp, basecmp); + if (flags & CMA_BALANCE) + list_insert_sorted(&final_ranges, mlp, revbasecmp); + else + list_insert_sorted(&final_ranges, mlp, basecmp); sizesum += mlp->size; if (sizesum >= total_size) break; @@ -866,7 +880,12 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size, list_for_each(mp, &final_ranges) { mlp = list_entry(mp, struct cma_init_memrange, list); size = min(sizeleft, mlp->size); - if (memblock_reserve(mlp->base, size)) { + if (flags & CMA_BALANCE) + start = (mlp->base + mlp->size - size); + else + start = mlp->base; + + if (memblock_reserve(start, size)) { /* * Unexpected error. Could go on to * the next one, but just abort to @@ -877,9 +896,9 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size, } pr_debug("created region %d: %016llx - %016llx\n", - nr, (u64)mlp->base, (u64)mlp->base + size); + nr, (u64)start, (u64)start + size); cmrp = &cma->ranges[nr++]; - cmrp->base_pfn = PHYS_PFN(mlp->base); + cmrp->base_pfn = PHYS_PFN(start); cmrp->early_pfn = cmrp->base_pfn; cmrp->count = size >> PAGE_SHIFT; cmrp->cma = cma; -- 2.51.0.384.g4c02a37b29-goog For CMA balancing, the compaction migration hooks can largely be reused, except a different function to isolate free pages is needed. So, add a pointer to the isolation function in the compact_control structure. If it's not NULL, use it, else use isolate_freepages as usual. No functional change. Signed-off-by: Frank van der Linden --- mm/compaction.c | 5 ++++- mm/internal.h | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index bf021b31c7ec..6a2c06e356c5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1813,7 +1813,10 @@ static struct folio *compaction_alloc_noprof(struct folio *src, unsigned long da if (start_order == NR_PAGE_ORDERS) { if (has_isolated_pages) return NULL; - isolate_freepages(cc); + if (cc->isolate_freepages) + cc->isolate_freepages(cc); + else + isolate_freepages(cc); has_isolated_pages = true; goto again; } diff --git a/mm/internal.h b/mm/internal.h index 45b725c3dc03..7916d8be8922 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -873,6 +873,11 @@ struct compact_control { unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ + /* + * Function to use to isolate free pages, if applicable. If NULL, + * default to isolate_freepages(). + */ + void (*isolate_freepages)(struct compact_control *cc); /* * Acts as an in/out parameter to page isolation for migration. * isolate_migratepages uses it as a search base. 
-- 2.51.0.384.g4c02a37b29-goog The code to isolate pages for migration always checked both cc->alloc_contig and skip_isolation_on_order to determine whether a page could be isolated for migration. Simplify this a little bit by moving the cc->alloc_contig check in to skip_isolation_on_order. Also rename alloc_contig to migrate_large, since there will be an additional user (CMA balancing) of this field soon, not just alloc_contig_range. No functional change. Signed-off-by: Frank van der Linden --- mm/compaction.c | 26 ++++++++++++++------------ mm/internal.h | 2 +- mm/page_alloc.c | 2 +- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 6a2c06e356c5..2e6c30f50b89 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -793,13 +793,15 @@ static bool too_many_isolated(struct compact_control *cc) /** * skip_isolation_on_order() - determine when to skip folio isolation based on * folio order and compaction target order + * @cc: compact control structure containing target order * @order: to-be-isolated folio order - * @target_order: compaction target order * * This avoids unnecessary folio isolations during compaction. */ -static bool skip_isolation_on_order(int order, int target_order) +static bool skip_isolation_on_order(struct compact_control *cc, int order) { + if (cc->migrate_large) + return false; /* * Unless we are performing global compaction (i.e., * is_via_compact_memory), skip any folios that are larger than the @@ -807,7 +809,7 @@ static bool skip_isolation_on_order(int order, int target_order) * the desired target_order, so migrating this folio would likely fail * later. */ - if (!is_via_compact_memory(target_order) && order >= target_order) + if (!is_via_compact_memory(cc->order) && order >= cc->order) return true; /* * We limit memory compaction to pageblocks and won't try @@ -850,6 +852,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long next_skip_pfn = 0; bool skip_updated = false; int ret = 0; + unsigned int order; cc->migrate_pfn = low_pfn; @@ -948,13 +951,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } if (PageHuge(page)) { - const unsigned int order = compound_order(page); /* * skip hugetlbfs if we are not compacting for pages * bigger than its order. THPs and other compound pages * are handled below. */ - if (!cc->alloc_contig) { + if (!cc->migrate_large) { + order = compound_order(page); if (order <= MAX_PAGE_ORDER) { low_pfn += (1UL << order) - 1; @@ -962,7 +965,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } goto isolate_fail; } - /* for alloc_contig case */ + /* for migrate_large case */ if (locked) { unlock_page_lruvec_irqrestore(locked, flags); locked = NULL; @@ -1030,11 +1033,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * skip them at once. The check is racy, but we can consider * only valid values and the only danger is skipping too much. */ - if (PageCompound(page) && !cc->alloc_contig) { - const unsigned int order = compound_order(page); + if (PageCompound(page)) { + order = compound_order(page); /* Skip based on page order and compaction target order. 
*/ - if (skip_isolation_on_order(order, cc->order)) { + if (skip_isolation_on_order(cc, order)) { if (order <= MAX_PAGE_ORDER) { low_pfn += (1UL << order) - 1; nr_scanned += (1UL << order) - 1; @@ -1182,9 +1185,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* * Check LRU folio order under the lock */ - if (unlikely(skip_isolation_on_order(folio_order(folio), - cc->order) && - !cc->alloc_contig)) { + order = folio_order(folio); + if (unlikely(skip_isolation_on_order(cc, order))) { low_pfn += folio_nr_pages(folio) - 1; nr_scanned += folio_nr_pages(folio) - 1; folio_set_lru(folio); diff --git a/mm/internal.h b/mm/internal.h index 7916d8be8922..ffcb3aec05ed 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -909,7 +909,7 @@ struct compact_control { * isolation or migration failures to * ensure forward progress. */ - bool alloc_contig; /* alloc_contig_range allocation */ + bool migrate_large; /* Always migrate large/huge pages */ }; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d3966d31c039..dc59aaa63ae6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6903,7 +6903,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .no_set_skip_hint = true, - .alloc_contig = true, + .migrate_large = true, }; INIT_LIST_HEAD(&cc.migratepages); enum pb_isolate_mode mode = (alloc_flags & ACR_FLAGS_CMA) ? -- 2.51.0.384.g4c02a37b29-goog A longstanding problem with having a lot of CMA pageblocks in the system (through hugetlb_cma), is that this limits the amount of memory that the kernel can use for its allocations. Kernel allocations are unmovable and can not come from CMA pageblocks. This can lead to situations where kernel allocations cause OOMs, when in fact there might still enough memory available. There isn't much that can be done if the non-CMA part of memory is already taken up by unmovable allocations. That scenario can be considered a misconfigured system. But if there are movable allocations in the non-CMA areas, they are unnecessarily taking away space from the kernel. Currently, the page allocator tries to avoid this scenario by allocating from CMA first if more than half of free pages in a zone come from CMA. But that's not a guarantee. For example, take the case where a lot of memory is being taken up by 1G hugetlb pages, allocated from hugetlb_cma, and that the hugetlb_cma area has been fully used by hugetlbfs. This means that new movable allocations will land in the non-CMA part of memory, and that the kernel may come under memory pressure. If those allocations are long-lasting, freeing up hugetlb pages will not reduce that pressure, since the kernel can't use the new space, and the long-lasting allocations residing in non-CMA memory will stay put. To counter this issue, introduce interfaces to explicitly move pages in to CMA areas. The number of pages moved depends on cma_first_limit. It will use that percentage to calculate the target number of pages that should be moved. A later commit will call one of these interfaces to move pages to CMA if needed, after CMA-allocated hugetlb pages have been freed. 
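As a worked example of the target calculation (made-up numbers, assuming 4K pages, mirroring the computation done in balance_zone_cma below): with cma_first_limit at its default of 50 and a zone that has 8 GiB of free pages, 6 GiB of which are free CMA pages:

	free_pages      = 2097152 pages  (8 GiB)
	free_cma        = 1572864 pages  (6 GiB)
	target_free_cma = (50 * free_pages) / 100 = 1048576 pages  (4 GiB)
	nr_pages        = free_cma - target_free_cma = 524288 pages (2 GiB)

So up to 2 GiB worth of movable pages are candidates for migration from non-CMA pageblocks into CMA pageblocks, further capped by the free space in the targeted CMA area (if any) and by the number of allocated non-CMA pages.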
Signed-off-by: Frank van der Linden --- include/linux/migrate_mode.h | 1 + include/trace/events/migrate.h | 3 +- mm/compaction.c | 168 +++++++++++++++++++++++++++++++++ mm/internal.h | 4 + 4 files changed, 175 insertions(+), 1 deletion(-) diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h index 265c4328b36a..3e235499cd73 100644 --- a/include/linux/migrate_mode.h +++ b/include/linux/migrate_mode.h @@ -25,6 +25,7 @@ enum migrate_reason { MR_LONGTERM_PIN, MR_DEMOTION, MR_DAMON, + MR_CMA_BALANCE, MR_TYPES }; diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index cd01dd7b3640..53d669ee26be 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -23,7 +23,8 @@ EM( MR_CONTIG_RANGE, "contig_range") \ EM( MR_LONGTERM_PIN, "longterm_pin") \ EM( MR_DEMOTION, "demotion") \ - EMe(MR_DAMON, "damon") + EM( MR_DAMON, "damon") \ + EMe(MR_CMA_BALANCE, "cma_balance") /* * First define the enums in the above macros to be exported to userspace diff --git a/mm/compaction.c b/mm/compaction.c index 2e6c30f50b89..3200119b8baf 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "internal.h" #ifdef CONFIG_COMPACTION @@ -2512,6 +2513,173 @@ compaction_suit_allocation_order(struct zone *zone, unsigned int order, return COMPACT_CONTINUE; } +#ifdef CONFIG_CMA + +static void +isolate_free_cma_pages(struct compact_control *cc) +{ + unsigned long end_pfn, pfn, next_pfn, start_pfn; + int i; + + i = -1; + end_pfn = 0; + + next_pfn = end_pfn = cc->free_pfn; + start_pfn = 0; + while (cc->nr_freepages < cc->nr_migratepages) { + if (!cma_next_balance_pagerange(cc->zone, cc->cma, &i, + &start_pfn, &end_pfn)) + break; + for (pfn = start_pfn; pfn < end_pfn; pfn = next_pfn) { + next_pfn = pfn + pageblock_nr_pages; + isolate_freepages_block(cc, &pfn, next_pfn, + cc->freepages, 1, false); + if (cc->nr_freepages >= cc->nr_migratepages) + break; + } + } + cc->free_pfn = next_pfn; +} + +static void balance_zone_cma(struct zone *zone, struct cma *cma) +{ + struct compact_control cc = { + .zone = zone, + .cma = cma, + .isolate_freepages = isolate_free_cma_pages, + .nr_migratepages = 0, + .nr_freepages = 0, + .free_pfn = 0, + .migrate_pfn = 0, + .mode = MIGRATE_SYNC, + .ignore_skip_hint = true, + .no_set_skip_hint = true, + .gfp_mask = GFP_KERNEL, + .migrate_large = true, + .order = -1, + }; + unsigned long nr_pages; + int order; + unsigned long free_cma, free_pages, allocated, allocated_noncma; + unsigned long target_free_cma; + int rindex, ret = 0, n; + unsigned long start_pfn, end_pfn, pfn, next_pfn; + long nr_migrated; + + if (zone_idx(zone) == ZONE_MOVABLE) + return; + + if (!cma && !cma_numranges()) + return; + + /* + * Try to move allocated pages from non-CMA pageblocks + * to CMA pageblocks (possibly in a specific CMA area), to + * give the kernel more space for unmovable allocations. + * + * cma_first_limit, the percentage of free pages that are + * MIGRATE_CMA, is used to calculcate the target number. + */ + free_pages = zone_page_state(zone, NR_FREE_PAGES); + free_cma = zone_page_state(zone, NR_FREE_CMA_PAGES); + if (!free_cma) + return; + + target_free_cma = (cma_first_limit * free_pages) / 100; + /* + * If we're already below the target, nothing to do. + */ + if (free_cma <= target_free_cma) + return; + + /* + * To try to avoid scanning too much non-CMA memory, + * set the upper bound of pages we want to migrate + * to the minimum of: + * 1. The number of MIGRATE_CMA pages we want to use. 
+ * 2. The space available in the targeted CMA area (if any). + * 3. The number of used non-CMA pages. + * + * This will still likely cause the scanning of more + * pageblocks than is strictly needed, but it's the best + * that can be done without explicit tracking of the number + * of movable allocations in non-CMA memory. + */ + allocated = zone_managed_pages(zone) - free_pages; + allocated_noncma = allocated - (zone_cma_pages(zone) - free_cma); + + nr_pages = free_cma - target_free_cma; + if (cma) + nr_pages = min(nr_pages, cma_get_available(cma)); + nr_pages = min(allocated_noncma, nr_pages); + + for (order = 0; order < NR_PAGE_ORDERS; order++) + INIT_LIST_HEAD(&cc.freepages[order]); + INIT_LIST_HEAD(&cc.migratepages); + + rindex = -1; + start_pfn = next_pfn = end_pfn = 0; + nr_migrated = 0; + while (nr_pages > 0) { + ret = 0; + if (!cma_next_noncma_pagerange(cc.zone, &rindex, + &start_pfn, &end_pfn)) + break; + + for (pfn = start_pfn; pfn < end_pfn; pfn = next_pfn) { + next_pfn = pfn + pageblock_nr_pages; + cc.nr_migratepages = 0; + + if (!pageblock_pfn_to_page(pfn, next_pfn, zone)) + continue; + + ret = isolate_migratepages_block(&cc, pfn, next_pfn, + ISOLATE_UNEVICTABLE); + if (ret) + continue; + ret = migrate_pages(&cc.migratepages, compaction_alloc, + compaction_free, (unsigned long)&cc, + cc.mode, MR_CMA_BALANCE, &n); + if (ret) + putback_movable_pages(&cc.migratepages); + nr_migrated += n; + if (nr_migrated >= nr_pages) + break; + } + + nr_pages -= min_t(unsigned long, nr_migrated, nr_pages); + } + + if (cc.nr_freepages > 0) + release_free_list(cc.freepages); +} + +void balance_node_cma(int nid, struct cma *cma) +{ + pg_data_t *pgdat; + int zoneid; + struct zone *zone; + + if (!cma && !cma_numranges()) + return; + + if (nid >= MAX_NUMNODES || !node_online(nid)) + return; + + pgdat = NODE_DATA(nid); + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + balance_zone_cma(zone, cma); + } +} + +#endif /* CONFIG_CMA */ + static enum compact_result compact_zone(struct compact_control *cc, struct capture_control *capc) { diff --git a/mm/internal.h b/mm/internal.h index ffcb3aec05ed..7dcaf7214683 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -857,6 +857,8 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long, #if defined CONFIG_COMPACTION || defined CONFIG_CMA +struct cma; + /* * in mm/compaction.c */ @@ -887,6 +889,7 @@ struct compact_control { unsigned long migrate_pfn; unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; + struct cma *cma; /* if moving to a specific CMA area */ unsigned long total_migrate_scanned; unsigned long total_free_scanned; unsigned short fast_search_fail;/* failures to use free list searches */ @@ -938,6 +941,7 @@ struct cma; #ifdef CONFIG_CMA void *cma_reserve_early(struct cma *cma, unsigned long size); void init_cma_pageblock(struct page *page); +void balance_node_cma(int nid, struct cma *cma); #else static inline void *cma_reserve_early(struct cma *cma, unsigned long size) { -- 2.51.0.384.g4c02a37b29-goog CMA areas are normally not very large, but HugeTLB CMA is an exception. hugetlb_cma, used for 'gigantic' pages (usually 1G), can take up many gigabytes of memory. 
As such, it is potentially the largest source of 'false OOM' conditions, situations where the kernel runs out of space for unmovable allocations, because it can't allocate from CMA pageblocks, and non-CMA memory has been tied up by other movable allocations. The normal use case of hugetlb_cma is a system where 1G hugetlb pages are sometimes, but not always, needed, so they need to be created and freed dynamically. As such, the best time to address CMA memory imbalances is when CMA hugetlb pages are freed, making multiples of 1G available as buddy managed CMA pageblocks. That is a good time to check if movable allocations fron non-CMA pageblocks should be moved to CMA pageblocks to give the kernel more breathing space. Do this by calling balance_node_cma on either the hugetlb CMA area for the node that just had its number of hugetlb pages reduced, or for all hugetlb CMA areas if the reduction was not node-specific. To have the CMA balancing code act on the hugetlb CMA areas, set the CMA_BALANCE flag when creating them. Signed-off-by: Frank van der Linden --- mm/hugetlb.c | 14 ++++++++------ mm/hugetlb_cma.c | 16 ++++++++++++++++ mm/hugetlb_cma.h | 5 +++++ 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eed59cfb5d21..611655876f60 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3971,12 +3971,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, list_add(&folio->lru, &page_list); } - /* free the pages after dropping lock */ - spin_unlock_irq(&hugetlb_lock); - update_and_free_pages_bulk(h, &page_list); - flush_free_hpage_work(h); - spin_lock_irq(&hugetlb_lock); - + if (!list_empty(&page_list)) { + /* free the pages after dropping lock */ + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + flush_free_hpage_work(h); + hugetlb_cma_balance(nid); + spin_lock_irq(&hugetlb_lock); + } while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index 71d0e9a048d4..c0396d35b5bf 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -276,3 +276,19 @@ bool __init hugetlb_early_cma(struct hstate *h) return hstate_is_gigantic(h) && hugetlb_cma_only; } + +void hugetlb_cma_balance(int nid) +{ + int node; + + if (nid != NUMA_NO_NODE) { + if (hugetlb_cma[nid]) + balance_node_cma(nid, hugetlb_cma[nid]); + } else { + for_each_online_node(node) { + if (hugetlb_cma[node]) + balance_node_cma(node, + hugetlb_cma[node]); + } + } +} diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h index f7d7fb9880a2..2f2a35b56d8a 100644 --- a/mm/hugetlb_cma.h +++ b/mm/hugetlb_cma.h @@ -13,6 +13,7 @@ bool hugetlb_cma_exclusive_alloc(void); unsigned long hugetlb_cma_total_size(void); void hugetlb_cma_validate_params(void); bool hugetlb_early_cma(struct hstate *h); +void hugetlb_cma_balance(int nid); #else static inline void hugetlb_cma_free_folio(struct folio *folio) { @@ -53,5 +54,9 @@ static inline bool hugetlb_early_cma(struct hstate *h) { return false; } + +static inline void hugetlb_cma_balance(int nid) +{ +} #endif #endif -- 2.51.0.384.g4c02a37b29-goog To keep things consistent, rebalance CMA when changing the cma_first_limit sysctl. 
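For example, assuming the page_alloc sysctl table keeps its usual place under /proc/sys/vm, lowering the limit at runtime with "echo 25 > /proc/sys/vm/cma_first_limit" will now not only make the allocator prefer CMA pageblocks sooner, but also immediately migrate movable pages into CMA pageblocks on every node with memory, until the new target is reached or no more suitable pages are found.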
Signed-off-by: Frank van der Linden --- mm/page_alloc.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc59aaa63ae6..da1cab63995c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6640,6 +6640,24 @@ static int percpu_pagelist_high_fraction_sysctl_handler(const struct ctl_table * return ret; } +#ifdef CONFIG_CMA +static int cma_first_limit_sysctl_handler(const struct ctl_table *table, + int write, void *buffer, size_t *length, + loff_t *ppos) +{ + int ret, nid; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret || !write) + return ret; + + for_each_node_state(nid, N_MEMORY) + balance_node_cma(nid, NULL); + + return 0; +} +#endif + static const struct ctl_table page_alloc_sysctl_table[] = { { .procname = "min_free_kbytes", @@ -6723,7 +6741,7 @@ static const struct ctl_table page_alloc_sysctl_table[] = { .data = &cma_first_limit, .maxlen = sizeof(cma_first_limit), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = cma_first_limit_sysctl_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_HUNDRED, }, -- 2.51.0.384.g4c02a37b29-goog Add VM counters that record the number of migration success / failures during CMA rebalancing. This is similar to other migrate counters. Signed-off-by: Frank van der Linden --- include/linux/vm_event_item.h | 3 +++ mm/migrate.c | 8 ++++++++ mm/vmstat.c | 2 ++ 3 files changed, 13 insertions(+) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 9e15a088ba38..1711ff85a02f 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -87,6 +87,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, CMA_ALLOC_SUCCESS, CMA_ALLOC_FAIL, #endif + CMA_BALANCE_MIGRATE_SUCCESS, + CMA_BALANCE_MIGRATE_FAIL, + UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ UNEVICTABLE_PGRESCUED, /* rescued from noreclaim list */ diff --git a/mm/migrate.c b/mm/migrate.c index 9e5ef39ce73a..63d771daa3bc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2144,6 +2144,14 @@ int migrate_pages(struct list_head *from, new_folio_t get_new_folio, count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded); count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed); count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split); + + if (reason == MR_CMA_BALANCE) { + count_vm_events(CMA_BALANCE_MIGRATE_SUCCESS, + stats.nr_succeeded); + count_vm_events(CMA_BALANCE_MIGRATE_FAIL, + stats.nr_failed_pages); + } + trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages, stats.nr_thp_succeeded, stats.nr_thp_failed, stats.nr_thp_split, stats.nr_split, mode, diff --git a/mm/vmstat.c b/mm/vmstat.c index 71cd1ceba191..af811328db09 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1392,6 +1392,8 @@ const char * const vmstat_text[] = { #ifdef CONFIG_CMA [I(CMA_ALLOC_SUCCESS)] = "cma_alloc_success", [I(CMA_ALLOC_FAIL)] = "cma_alloc_fail", + [I(CMA_BALANCE_MIGRATE_SUCCESS)] = "cma_balance_migrate_success", + [I(CMA_BALANCE_MIGRATE_FAIL)] = "cma_balance_migrate_fail", #endif [I(UNEVICTABLE_PGCULLED)] = "unevictable_pgs_culled", [I(UNEVICTABLE_PGSCANNED)] = "unevictable_pgs_scanned", -- 2.51.0.384.g4c02a37b29-goog