Currently, mm_core_init calls kho_memory_init, which calls kho_release_scratch. If DEFERRED is enabled, kho_release_scratch will first initialize the struct pages of kho scratch. This is not needed. We can just let page_alloc_init_late init it. Next, kho_release_scratch will mark scratch as MIGRATE_CMA. If DEFERRED is enabled, this will be overwritten later in deferred_free_pages. To fix this, I removed the whole kho_release_scratch. Marking the pageblocks as MIGRATE_CMA now happens in kho_init, which runs after deferred_free_pages. Signed-off-by: Michal Clapinski --- include/linux/memblock.h | 2 -- kernel/liveupdate/kexec_handover.c | 43 ++++++++---------------------- mm/memblock.c | 22 --------------- 3 files changed, 11 insertions(+), 56 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 221118b5a16e..35d9cf6bbf7a 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -614,11 +614,9 @@ static inline void memtest_report_meminfo(struct seq_file *m) { } #ifdef CONFIG_MEMBLOCK_KHO_SCRATCH void memblock_set_kho_scratch_only(void); void memblock_clear_kho_scratch_only(void); -void memmap_init_kho_scratch_pages(void); #else static inline void memblock_set_kho_scratch_only(void) { } static inline void memblock_clear_kho_scratch_only(void) { } -static inline void memmap_init_kho_scratch_pages(void) {} #endif #endif /* _LINUX_MEMBLOCK_H */ diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index b851b09a8e99..de167bfa2c8d 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1377,11 +1377,6 @@ static __init int kho_init(void) if (err) goto err_free_fdt; - if (fdt) { - kho_in_debugfs_init(&kho_in.dbg, fdt); - return 0; - } - for (int i = 0; i < kho_scratch_cnt; i++) { unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr); unsigned long count = kho_scratch[i].size >> PAGE_SHIFT; @@ -1397,8 +1392,17 @@ static __init int kho_init(void) */ kmemleak_ignore_phys(kho_scratch[i].addr); for (pfn = base_pfn; pfn < base_pfn + count; - pfn += pageblock_nr_pages) - init_cma_reserved_pageblock(pfn_to_page(pfn)); + pfn += pageblock_nr_pages) { + if (fdt) + init_cma_pageblock(pfn_to_page(pfn)); + else + init_cma_reserved_pageblock(pfn_to_page(pfn)); + } + } + + if (fdt) { + kho_in_debugfs_init(&kho_in.dbg, fdt); + return 0; } WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", @@ -1421,35 +1425,10 @@ static __init int kho_init(void) } fs_initcall(kho_init); -static void __init kho_release_scratch(void) -{ - phys_addr_t start, end; - u64 i; - - memmap_init_kho_scratch_pages(); - - /* - * Mark scratch mem as CMA before we return it. That way we - * ensure that no kernel allocations happen on it. That means - * we can reuse it as scratch memory again later. - */ - __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, - MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) { - ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start)); - ulong end_pfn = pageblock_align(PFN_UP(end)); - ulong pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) - init_pageblock_migratetype(pfn_to_page(pfn), - MIGRATE_CMA, false); - } -} - void __init kho_memory_init(void) { if (kho_in.mem_map_phys) { kho_scratch = phys_to_virt(kho_in.scratch_phys); - kho_release_scratch(); kho_mem_deserialize(phys_to_virt(kho_in.mem_map_phys)); } else { kho_reserve_scratch(); diff --git a/mm/memblock.c b/mm/memblock.c index 6cff515d82f4..3eff19124fc0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -959,28 +959,6 @@ __init void memblock_clear_kho_scratch_only(void) { kho_scratch_only = false; } - -__init void memmap_init_kho_scratch_pages(void) -{ - phys_addr_t start, end; - unsigned long pfn; - int nid; - u64 i; - - if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) - return; - - /* - * Initialize struct pages for free scratch memory. - * The struct pages for reserved scratch memory will be set up in - * reserve_bootmem_region() - */ - __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, - MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) { - for (pfn = PFN_UP(start); pfn < PFN_DOWN(end); pfn++) - init_deferred_page(pfn, nid); - } -} #endif /** -- 2.53.0.345.g96ddfc5eaa-goog From: Evangelos Petrongonas When CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, struct page initialization is deferred to parallel kthreads that run later in the boot process. During KHO restoration, deserialize_bitmap() writes metadata for each preserved memory region. However, if the struct page has not been initialized, this write targets uninitialized memory, potentially leading to errors like: BUG: unable to handle page fault for address: ... Fix this by introducing kho_get_preserved_page(), which ensures all struct pages in a preserved region are initialized by calling init_deferred_page() which is a no-op when deferred init is disabled or when the struct page is already initialized. Signed-off-by: Evangelos Petrongonas Co-developed-by: Michal Clapinski Signed-off-by: Michal Clapinski Reviewed-by: Pratyush Yadav (Google) Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) --- I think we can't initialize those struct pages in kho_restore_page. I encountered this stack: page_zone(start_page) __pageblock_pfn_to_page set_zone_contiguous page_alloc_init_late So, at the end of page_alloc_init_late struct pages are expected to be already initialized. set_zone_contiguous() looks at the first and last struct page of each pageblock in each populated zone to figure out if the zone is contiguous. If a kho page lands on a pageblock boundary, this will lead to access of an uninitialized struct page. There is also page_ext_init that invokes pfn_to_nid, which calls page_to_nid for each section-aligned page. There might be other places that do something similar. Therefore, it's a good idea to initialize all struct pages by the end of deferred struct page init. That's why I'm resending Evangelos's patch. I also tried to implement Pratyush's idea, i.e. iterate over zones, then get node from zone. I didn't notice any performance difference even with 8GB of kho. --- kernel/liveupdate/Kconfig | 2 -- kernel/liveupdate/kexec_handover.c | 27 ++++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig index 1a8513f16ef7..c13af38ba23a 100644 --- a/kernel/liveupdate/Kconfig +++ b/kernel/liveupdate/Kconfig @@ -1,12 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only menu "Live Update and Kexec HandOver" - depends on !DEFERRED_STRUCT_PAGE_INIT config KEXEC_HANDOVER bool "kexec handover" depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE - depends on !DEFERRED_STRUCT_PAGE_INIT select MEMBLOCK_KHO_SCRATCH select KEXEC_FILE select LIBFDT diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index de167bfa2c8d..fe9c88fd2541 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -457,6 +457,31 @@ static int kho_mem_serialize(struct kho_out *kho_out) return err; } +/* + * With CONFIG_DEFERRED_STRUCT_PAGE_INIT, struct pages in higher memory regions + * may not be initialized yet at the time KHO deserializes preserved memory. + * KHO uses the struct page to store metadata and a later initialization would + * overwrite it. + * Ensure all the struct pages in the preservation are + * initialized. deserialize_bitmap() marks the reservation as noinit to make + * sure they don't get re-initialized later. + */ +static struct page *__init kho_get_preserved_page(phys_addr_t phys, + unsigned int order) +{ + unsigned long pfn = PHYS_PFN(phys); + int nid; + + if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) + return pfn_to_page(pfn); + + nid = early_pfn_to_nid(pfn); + for (unsigned long i = 0; i < (1UL << order); i++) + init_deferred_page(pfn + i, nid); + + return pfn_to_page(pfn); +} + static void __init deserialize_bitmap(unsigned int order, struct khoser_mem_bitmap_ptr *elm) { @@ -467,7 +492,7 @@ static void __init deserialize_bitmap(unsigned int order, int sz = 1 << (order + PAGE_SHIFT); phys_addr_t phys = elm->phys_start + (bit << (order + PAGE_SHIFT)); - struct page *page = phys_to_page(phys); + struct page *page = kho_get_preserved_page(phys, order); union kho_page_info info; memblock_reserve(phys, sz); -- 2.53.0.345.g96ddfc5eaa-goog