When invoking move_pfn_range_to_zone or remove_pfn_range_from_zone, it will update the zone->contiguous by checking the new zone's pfn range from the beginning to the end, regardless of the previous state of the old zone. When the zone's pfn range is large, the cost of traversing the pfn range to update the zone->contiguous could be significant. Add a new zone member, pages_with_online_memmap, which counts pages within the zone that have an online memmap. It includes present pages and memory holes that have a memmap. When spanned_pages == pages_with_online_memmap, pfn_to_page() can be performed without further checks on any pfn within the zone span. The following test cases of memory hotplug for a VM [1], tested in the environment [2], show that this optimization can significantly reduce the memory hotplug time [3]. +----------------+------+---------------+--------------+----------------+ | | Size | Time (before) | Time (after) | Time Reduction | | +------+---------------+--------------+----------------+ | Plug Memory | 256G | 10s | 3s | 70% | | +------+---------------+--------------+----------------+ | | 512G | 36s | 7s | 81% | +----------------+------+---------------+--------------+----------------+ +----------------+------+---------------+--------------+----------------+ | | Size | Time (before) | Time (after) | Time Reduction | | +------+---------------+--------------+----------------+ | Unplug Memory | 256G | 11s | 4s | 64% | | +------+---------------+--------------+----------------+ | | 512G | 36s | 9s | 75% | +----------------+------+---------------+--------------+----------------+ [1] Qemu commands to hotplug 256G/512G memory for a VM: object_add memory-backend-ram,id=hotmem0,size=256G/512G,share=on device_add virtio-mem-pci,id=vmem1,memdev=hotmem0,bus=port1 qom-set vmem1 requested-size 256G/512G (Plug Memory) qom-set vmem1 requested-size 0G (Unplug Memory) [2] Hardware : Intel Icelake server Guest Kernel : v7.0-rc4 Qemu : v9.0.0 Launch VM : qemu-system-x86_64 -accel kvm 
-cpu host \ -drive file=./Centos10_cloud.qcow2,format=qcow2,if=virtio \ -drive file=./seed.img,format=raw,if=virtio \ -smp 3,cores=3,threads=1,sockets=1,maxcpus=3 \ -m 2G,slots=10,maxmem=2052472M \ -device pcie-root-port,id=port1,bus=pcie.0,slot=1,multifunction=on \ -device pcie-root-port,id=port2,bus=pcie.0,slot=2 \ -nographic -machine q35 \ -nic user,hostfwd=tcp::3000-:22 Guest kernel auto-onlines newly added memory blocks: echo online > /sys/devices/system/memory/auto_online_blocks [3] The time from typing the QEMU commands in [1] to when the output of 'grep MemTotal /proc/meminfo' on Guest reflects that all hotplugged memory is recognized. Reported-by: Nanhai Zou Reported-by: Chen Zhang Tested-by: Yuan Liu Reviewed-by: Tim Chen Reviewed-by: Qiuxu Zhuo Reviewed-by: Yu C Chen Reviewed-by: Pan Deng Reviewed-by: Nanhai Zou Reviewed-by: Yuan Liu Co-developed-by: Tianyou Li Signed-off-by: Tianyou Li Signed-off-by: Yuan Liu --- Documentation/mm/physical_memory.rst | 6 +++++ include/linux/mmzone.h | 22 ++++++++++++++- mm/internal.h | 10 +++---- mm/memory_hotplug.c | 21 +++++---------- mm/mm_init.c | 40 +++++++++------------------- 5 files changed, 50 insertions(+), 49 deletions(-) diff --git a/Documentation/mm/physical_memory.rst b/Documentation/mm/physical_memory.rst index b76183545e5b..d324da29ac11 100644 --- a/Documentation/mm/physical_memory.rst +++ b/Documentation/mm/physical_memory.rst @@ -483,6 +483,12 @@ General ``present_pages`` should use ``get_online_mems()`` to get a stable value. It is initialized by ``calculate_node_totalpages()``. +``pages_with_online_memmap`` + The pages_with_online_memmap is pages within the zone that have an online + memmap. It includes present pages and memory holes that have a memmap. When + spanned_pages == pages_with_online_memmap, pfn_to_page() can be performed + without further checks on any pfn within the zone span. 
+ ``present_early_pages`` The present pages existing within the zone located on memory available since early boot, excluding hotplugged memory. Defined only when diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3e51190a55e4..c7a136ce55c7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -943,6 +943,11 @@ struct zone { * cma pages is present pages that are assigned for CMA use * (MIGRATE_CMA). * + * pages_with_online_memmap is pages within the zone that have an online + * memmap. It includes present pages and memory holes that have a memmap. + * When spanned_pages == pages_with_online_memmap, pfn_to_page() can be + * performed without further checks on any pfn within the zone span. + * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used @@ -967,6 +972,7 @@ struct zone { atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; + unsigned long pages_with_online_memmap; #if defined(CONFIG_MEMORY_HOTPLUG) unsigned long present_early_pages; #endif @@ -1051,7 +1057,6 @@ struct zone { bool compact_blockskip_flush; #endif - bool contiguous; CACHELINE_PADDING(_pad3_); /* Zone statistics */ @@ -1124,6 +1129,21 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } +/** + * zone_is_contiguous - test whether a zone is contiguous + * @zone: the zone to test. + * + * In a contiguous zone, it is valid to call pfn_to_page() on any pfn in the + * spanned zone without requiring pfn_valid() or pfn_to_online_page() checks. + * + * Returns: true if contiguous, otherwise false. 
+ */ +static inline bool zone_is_contiguous(const struct zone *zone) +{ + return READ_ONCE(zone->spanned_pages) == + READ_ONCE(zone->pages_with_online_memmap); +} + static inline bool zone_is_initialized(const struct zone *zone) { return zone->initialized; diff --git a/mm/internal.h b/mm/internal.h index cb0af847d7d9..7c4c8ab68bde 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -793,21 +793,17 @@ extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone) { - if (zone->contiguous) + if (zone_is_contiguous(zone) && zone_spans_pfn(zone, start_pfn)) { + VM_BUG_ON(end_pfn > zone_end_pfn(zone)); return pfn_to_page(start_pfn); + } return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); } -void set_zone_contiguous(struct zone *zone); bool pfn_range_intersects_zones(int nid, unsigned long start_pfn, unsigned long nr_pages); -static inline void clear_zone_contiguous(struct zone *zone) -{ - zone->contiguous = false; -} - extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index bc805029da51..2ba7a394a64b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -492,11 +492,11 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, pfn = find_smallest_section_pfn(nid, zone, end_pfn, zone_end_pfn(zone)); if (pfn) { - zone->spanned_pages = zone_end_pfn(zone) - pfn; + WRITE_ONCE(zone->spanned_pages, zone_end_pfn(zone) - pfn); zone->zone_start_pfn = pfn; } else { zone->zone_start_pfn = 0; - zone->spanned_pages = 0; + WRITE_ONCE(zone->spanned_pages, 0); } } else if (zone_end_pfn(zone) == end_pfn) { /* @@ -508,10 +508,10 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn, start_pfn); if 
(pfn) - zone->spanned_pages = pfn - zone->zone_start_pfn + 1; + WRITE_ONCE(zone->spanned_pages, pfn - zone->zone_start_pfn + 1); else { zone->zone_start_pfn = 0; - zone->spanned_pages = 0; + WRITE_ONCE(zone->spanned_pages, 0); } } } @@ -565,18 +565,13 @@ void remove_pfn_range_from_zone(struct zone *zone, /* * Zone shrinking code cannot properly deal with ZONE_DEVICE. So - * we will not try to shrink the zones - which is okay as - * set_zone_contiguous() cannot deal with ZONE_DEVICE either way. + * we will not try to shrink the zones. */ if (zone_is_zone_device(zone)) return; - clear_zone_contiguous(zone); - shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); update_pgdat_span(pgdat); - - set_zone_contiguous(zone); } /** @@ -753,8 +748,6 @@ void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; - clear_zone_contiguous(zone); - if (zone_is_empty(zone)) init_currently_empty_zone(zone, start_pfn, nr_pages); resize_zone_range(zone, start_pfn, nr_pages); @@ -782,8 +775,6 @@ void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0, MEMINIT_HOTPLUG, altmap, migratetype, isolate_pageblock); - - set_zone_contiguous(zone); } struct auto_movable_stats { @@ -1079,6 +1070,8 @@ void adjust_present_page_count(struct page *page, struct memory_group *group, if (early_section(__pfn_to_section(page_to_pfn(page)))) zone->present_early_pages += nr_pages; zone->present_pages += nr_pages; + WRITE_ONCE(zone->pages_with_online_memmap, + READ_ONCE(zone->pages_with_online_memmap) + nr_pages); zone->zone_pgdat->node_present_pages += nr_pages; if (group && movable) diff --git a/mm/mm_init.c b/mm/mm_init.c index df34797691bd..96690e550024 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -946,6 +946,7 @@ static void __init memmap_init_zone_range(struct zone *zone, unsigned long zone_start_pfn = zone->zone_start_pfn; unsigned long 
zone_end_pfn = zone_start_pfn + zone->spanned_pages; int nid = zone_to_nid(zone), zone_id = zone_idx(zone); + unsigned long zone_hole_start, zone_hole_end; start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); @@ -957,8 +958,19 @@ static void __init memmap_init_zone_range(struct zone *zone, zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE, false); - if (*hole_pfn < start_pfn) + WRITE_ONCE(zone->pages_with_online_memmap, + READ_ONCE(zone->pages_with_online_memmap) + + (end_pfn - start_pfn)); + + if (*hole_pfn < start_pfn) { init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); + zone_hole_start = clamp(*hole_pfn, zone_start_pfn, zone_end_pfn); + zone_hole_end = clamp(start_pfn, zone_start_pfn, zone_end_pfn); + if (zone_hole_start < zone_hole_end) + WRITE_ONCE(zone->pages_with_online_memmap, + READ_ONCE(zone->pages_with_online_memmap) + + (zone_hole_end - zone_hole_start)); + } *hole_pfn = end_pfn; } @@ -2261,28 +2273,6 @@ void __init init_cma_pageblock(struct page *page) } #endif -void set_zone_contiguous(struct zone *zone) -{ - unsigned long block_start_pfn = zone->zone_start_pfn; - unsigned long block_end_pfn; - - block_end_pfn = pageblock_end_pfn(block_start_pfn); - for (; block_start_pfn < zone_end_pfn(zone); - block_start_pfn = block_end_pfn, - block_end_pfn += pageblock_nr_pages) { - - block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); - - if (!__pageblock_pfn_to_page(block_start_pfn, - block_end_pfn, zone)) - return; - cond_resched(); - } - - /* We confirm that there is no hole */ - zone->contiguous = true; -} - /* * Check if a PFN range intersects multiple zones on one or more * NUMA nodes. 
Specify the @nid argument if it is known that this @@ -2311,7 +2301,6 @@ bool pfn_range_intersects_zones(int nid, unsigned long start_pfn, static void __init mem_init_print_info(void); void __init page_alloc_init_late(void) { - struct zone *zone; int nid; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -2345,9 +2334,6 @@ void __init page_alloc_init_late(void) for_each_node_state(nid, N_MEMORY) shuffle_free_memory(NODE_DATA(nid)); - for_each_populated_zone(zone) - set_zone_contiguous(zone); - /* Initialize page ext after all struct pages are initialized. */ if (deferred_struct_pages) page_ext_init(); -- 2.47.3