In an ideal world, we wouldn't have to deal with SPARSEMEM without SPARSEMEM_VMEMMAP, but in particular for 32bit, SPARSEMEM_VMEMMAP is considered too costly and consequently not supported. However, if an architecture does support SPARSEMEM with SPARSEMEM_VMEMMAP, let's forbid the user from disabling VMEMMAP, just like we already do for arm64, s390 and x86. So if SPARSEMEM_VMEMMAP is supported, don't allow using SPARSEMEM without SPARSEMEM_VMEMMAP. This implies that the option to not use SPARSEMEM_VMEMMAP will now be gone for loongarch, powerpc, riscv and sparc. All architectures only enable SPARSEMEM_VMEMMAP with 64bit support, so there should not really be a big downside to using the VMEMMAP (quite the contrary). This is a preparation for not supporting (1) folio sizes that exceed a single memory section and (2) CMA allocations of non-contiguous page ranges in SPARSEMEM without SPARSEMEM_VMEMMAP configs, whereby we want to limit the possible impact as much as possible (e.g., gigantic hugetlb page allocations suddenly failing). Acked-by: Zi Yan Acked-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Huacai Chen Cc: WANG Xuerui Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Alexandre Ghiti Cc: "David S. Miller" Cc: Andreas Larsson Signed-off-by: David Hildenbrand --- mm/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 4108bcd967848..330d0e698ef96 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -439,9 +439,8 @@ config SPARSEMEM_VMEMMAP_ENABLE bool config SPARSEMEM_VMEMMAP - bool "Sparse Memory virtual memmap" + def_bool y depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE - default y help SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise pfn_to_page and page_to_pfn operations. This is the most -- 2.50.1 Now handled by the core automatically once SPARSEMEM_VMEMMAP_ENABLE is selected. Reviewed-by: Mike Rapoport (Microsoft) Acked-by: Catalin Marinas Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Will Deacon Signed-off-by: David Hildenbrand --- arch/arm64/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a64..b1d1f2ff2493b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1570,7 +1570,6 @@ source "kernel/Kconfig.hz" config ARCH_SPARSEMEM_ENABLE def_bool y select SPARSEMEM_VMEMMAP_ENABLE - select SPARSEMEM_VMEMMAP config HW_PERF_EVENTS def_bool y -- 2.50.1 Now handled by the core automatically once SPARSEMEM_VMEMMAP_ENABLE is selected. Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Signed-off-by: David Hildenbrand --- arch/s390/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bf680c26a33cf..145ca23c2fff6 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -710,7 +710,6 @@ menu "Memory setup" config ARCH_SPARSEMEM_ENABLE def_bool y select SPARSEMEM_VMEMMAP_ENABLE - select SPARSEMEM_VMEMMAP config ARCH_SPARSEMEM_DEFAULT def_bool y -- 2.50.1 Now handled by the core automatically once SPARSEMEM_VMEMMAP_ENABLE is selected. Reviewed-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R.
Howlett Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Signed-off-by: David Hildenbrand --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100e..e431d1c06fecd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1552,7 +1552,6 @@ config ARCH_SPARSEMEM_ENABLE def_bool y select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 - select SPARSEMEM_VMEMMAP if X86_64 config ARCH_SPARSEMEM_DEFAULT def_bool X86_64 || (NUMA && X86_32) -- 2.50.1 It's no longer user-selectable (and the default was already "y"), so let's just drop it. It was never really relevant to the wireguard selftests either way. Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: "Jason A. Donenfeld" Cc: Shuah Khan Signed-off-by: David Hildenbrand --- tools/testing/selftests/wireguard/qemu/kernel.config | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index 0a5381717e9f4..1149289f4b30f 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -48,7 +48,6 @@ CONFIG_JUMP_LABEL=y CONFIG_FUTEX=y CONFIG_SHMEM=y CONFIG_SLUB=y -CONFIG_SPARSEMEM_VMEMMAP=y CONFIG_SMP=y CONFIG_SCHED_SMT=y CONFIG_SCHED_MC=y -- 2.50.1 Let's reject them early, which in turn makes folio_alloc_gigantic() reject them properly. To avoid converting from order to nr_pages, let's just add MAX_FOLIO_ORDER and calculate MAX_FOLIO_NR_PAGES based on that. While at it, let's just make the order a "const unsigned order". Reviewed-by: Zi Yan Acked-by: SeongJae Park Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: David Hildenbrand --- include/linux/mm.h | 6 ++++-- mm/page_alloc.c | 10 +++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 00c8a54127d37..77737cbf2216a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2055,11 +2055,13 @@ static inline long folio_nr_pages(const struct folio *folio) /* Only hugetlbfs can allocate folios larger than MAX_ORDER */ #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) +#define MAX_FOLIO_ORDER PUD_ORDER #else -#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES +#define MAX_FOLIO_ORDER MAX_PAGE_ORDER #endif +#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) + /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27ea4c7acd158..7e96c69a06ccb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6841,6 +6841,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) int alloc_contig_range_noprof(unsigned long start, unsigned long end, acr_flags_t alloc_flags, gfp_t gfp_mask) { + const unsigned int order = ilog2(end - start); unsigned long outer_start, outer_end; int ret = 0; @@ -6858,6 +6859,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, PB_ISOLATE_MODE_CMA_ALLOC : PB_ISOLATE_MODE_OTHER; + /* + * In contrast to the buddy, we allow for orders here that exceed + * MAX_PAGE_ORDER, so we must manually make sure that we are not + * exceeding the maximum folio order. 
+ */ + if (WARN_ON_ONCE((gfp_mask & __GFP_COMP) && order > MAX_FOLIO_ORDER)) + return -EINVAL; + gfp_mask = current_gfp_context(gfp_mask); if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) return -EINVAL; @@ -6955,7 +6964,6 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, free_contig_range(end, outer_end - end); } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { struct page *head = pfn_to_page(start); - int order = ilog2(end - start); check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); -- 2.50.1 Let's reject unreasonable folio sizes early, where we can still fail. We'll add sanity checks to prep_compound_head()/prep_compound_page() next. Is there a way to configure a system such that unreasonable folio sizes would be possible? That would already be rather questionable. If so, we'd probably want to bail out earlier, where we can avoid a WARN and instead report a proper error message that indicates where something went wrong. Acked-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: David Hildenbrand --- mm/memremap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memremap.c b/mm/memremap.c index b0ce0d8254bd8..a2d4bb88f64b6 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -275,6 +275,9 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) if (WARN_ONCE(!nr_range, "nr_range must be specified\n")) return ERR_PTR(-EINVAL); + if (WARN_ONCE(pgmap->vmemmap_shift > MAX_FOLIO_ORDER, + "requested folio size unsupported\n")) + return ERR_PTR(-EINVAL); switch (pgmap->type) { case MEMORY_DEVICE_PRIVATE: -- 2.50.1 Let's check that no hstate that corresponds to an unreasonable folio size is registered by an architecture. If we were to succeed in registering it, we could later try allocating an unsupported gigantic folio size. Further, let's add a BUILD_BUG_ON() for checking that HUGETLB_PAGE_ORDER is sane at build time. As HUGETLB_PAGE_ORDER is dynamic on powerpc, we have to use a BUILD_BUG_ON_INVALID() to make it compile. No existing kernel configuration should be able to trigger this check: either SPARSEMEM without SPARSEMEM_VMEMMAP cannot be configured or gigantic folios will not exceed a memory section (the case on sparse). Reviewed-by: Zi Yan Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: David Hildenbrand --- mm/hugetlb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1e777cc51ad04..d3542e92a712e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4657,6 +4657,7 @@ static int __init hugetlb_init(void) BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < __NR_HPAGEFLAGS); + BUILD_BUG_ON_INVALID(HUGETLB_PAGE_ORDER > MAX_FOLIO_ORDER); if (!hugepages_supported()) { if (hugetlb_max_hstate || default_hstate_max_huge_pages) @@ -4740,6 +4741,7 @@ void __init hugetlb_add_hstate(unsigned int order) } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); + WARN_ON(order > MAX_FOLIO_ORDER); h = &hstates[hugetlb_max_hstate++]; __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key); h->order = order; -- 2.50.1 Grepping for "prep_compound_page" leaves one clueless as to how devdax gets its compound pages initialized. Let's add a comment that might help finding this open-coded prep_compound_page() initialization more easily.
Further, let's be less smart about the ordering of initialization and just perform the prep_compound_head() call after all tail pages have been initialized, just like prep_compound_page() does. No need for a comment to describe the initialization order: again, just like prep_compound_page(). Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Signed-off-by: David Hildenbrand --- mm/mm_init.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 5c21b3af216b2..df614556741a4 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1091,6 +1091,12 @@ static void __ref memmap_init_compound(struct page *head, unsigned long pfn, end_pfn = head_pfn + nr_pages; unsigned int order = pgmap->vmemmap_shift; + /* + * We have to initialize the pages, including setting up page links. + * prep_compound_page() does not take care of that, so instead we + * open-code prep_compound_page() so we can take care of initializing + * the pages in the same go. + */ __SetPageHead(head); for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); @@ -1098,15 +1104,8 @@ static void __ref memmap_init_compound(struct page *head, __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); prep_compound_tail(head, pfn - head_pfn); set_page_count(page, 0); - - /* - * The first tail page stores important compound page info. - * Call prep_compound_head() after the first tail page has - * been initialized, to not have the data overwritten. - */ - if (pfn == head_pfn + 1) - prep_compound_head(head, order); } + prep_compound_head(head, order); } void __ref memmap_init_zone_device(struct zone *zone, -- 2.50.1 Let's sanity-check in folio_set_order() whether we would be trying to create a folio with an order that would make it exceed MAX_FOLIO_ORDER. This will enable the check whenever a folio/compound page is initialized through prep_compound_head() / prep_compound_page() with CONFIG_DEBUG_VM set. Reviewed-by: Zi Yan Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: David Hildenbrand --- mm/internal.h | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/internal.h b/mm/internal.h index 45da9ff5694f6..9b0129531d004 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -755,6 +755,7 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) { if (WARN_ON_ONCE(!order || !folio_test_large(folio))) return; + VM_WARN_ON_ONCE(order > MAX_FOLIO_ORDER); folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order; #ifdef NR_PAGES_IN_LARGE_FOLIO -- 2.50.1 Let's limit the maximum folio size in problematic kernel configs where the memmap is allocated per memory section (SPARSEMEM without SPARSEMEM_VMEMMAP) to a single memory section. Currently, only a single architecture supports ARCH_HAS_GIGANTIC_PAGE but not SPARSEMEM_VMEMMAP: sh. Fortunately, the biggest hugetlb size sh supports is 64 MiB (HUGETLB_PAGE_SIZE_64MB) and the section size is at least 64 MiB (SECTION_SIZE_BITS == 26), so its use case is not degraded. As folios and memory sections are naturally aligned to their power-of-two size in memory, a single folio consequently can no longer span multiple memory sections on these problematic kernel configs. nth_page() is no longer required when operating within a single compound page / folio. Reviewed-by: Zi Yan Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R.
Howlett Signed-off-by: David Hildenbrand --- include/linux/mm.h | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 77737cbf2216a..2dee79fa2efcf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2053,11 +2053,25 @@ static inline long folio_nr_pages(const struct folio *folio) return folio_large_nr_pages(folio); } -/* Only hugetlbfs can allocate folios larger than MAX_ORDER */ -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -#define MAX_FOLIO_ORDER PUD_ORDER -#else +#if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) +/* + * We don't expect any folios that exceed buddy sizes (and consequently + * memory sections). + */ #define MAX_FOLIO_ORDER MAX_PAGE_ORDER +#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/* + * Only pages within a single memory section are guaranteed to be + * contiguous. By limiting folios to a single memory section, all folio + * pages are guaranteed to be contiguous. + */ +#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT +#else +/* + * There is no real limit on the folio size. We limit them to the maximum we + * currently expect (e.g., hugetlb, dax). + */ +#define MAX_FOLIO_ORDER PUD_ORDER #endif #define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) -- 2.50.1 Now that a single folio/compound page can no longer span memory sections in problematic kernel configurations, we can stop using nth_page() in folio_page() and folio_page_idx(). While at it, turn both macros into static inline functions and add kernel doc for folio_page_idx(). Reviewed-by: Zi Yan Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Signed-off-by: David Hildenbrand --- include/linux/mm.h | 16 ++++++++++++++-- include/linux/page-flags.h | 5 ++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2dee79fa2efcf..f6880e3225c5c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -210,10 +210,8 @@ extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) -#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) #else #define nth_page(page,n) ((page) + (n)) -#define folio_page_idx(folio, p) ((p) - &(folio)->page) #endif /* to align the pointer to the (next) page boundary */ @@ -225,6 +223,20 @@ extern unsigned long sysctl_admin_reserve_kbytes; /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) +/** + * folio_page_idx - Return the number of a page in a folio. + * @folio: The folio. + * @page: The folio page. + * + * This function expects that the page is actually part of the folio. + * The returned number is relative to the start of the folio. + */ +static inline unsigned long folio_page_idx(const struct folio *folio, + const struct page *page) +{ + return page - &folio->page; +} + static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5ee6ffbdbf831..faf17ca211b4f 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -316,7 +316,10 @@ static __always_inline unsigned long _compound_head(const struct page *page) * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. 
*/ -#define folio_page(folio, n) nth_page(&(folio)->page, n) +static inline struct page *folio_page(struct folio *folio, unsigned long n) +{ + return &folio->page + n; +} static __always_inline int PageTail(const struct page *page) { -- 2.50.1 We can now safely iterate over all pages in a folio, so no need for the pfn_to_page(). Also, as we already force the refcount in __init_single_page() to 1 through init_page_count(), we can just set the refcount to 0 and avoid page_ref_freeze() + VM_BUG_ON. Likely, in the future, we would just want to tell __init_single_page() to which value to initialize the refcount. Further, adjust the comments to highlight that we are dealing with an open-coded prep_compound_page() variant, and add another comment explaining why we really need the __init_single_page() only on the tail pages. Note that the current code was likely problematic, but we never ran into it: prep_compound_tail() would have been called with an offset that might exceed a memory section, and prep_compound_tail() would have simply added that offset to the page pointer -- which would not have done the right thing on sparsemem without vmemmap. Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Signed-off-by: David Hildenbrand --- mm/hugetlb.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d3542e92a712e..56e6d2af08434 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3237,17 +3237,18 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, { enum zone_type zone = zone_idx(folio_zone(folio)); int nid = folio_nid(folio); + struct page *page = folio_page(folio, start_page_number); unsigned long head_pfn = folio_pfn(folio); unsigned long pfn, end_pfn = head_pfn + end_page_number; - int ret; - - for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { - struct page *page = pfn_to_page(pfn); + /* + * As we marked all tail pages with memblock_reserved_mark_noinit(), + * we must initialize them ourselves here. + */ + for (pfn = head_pfn + start_page_number; pfn < end_pfn; page++, pfn++) { __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); - ret = page_ref_freeze(page, 1); - VM_BUG_ON(!ret); + set_page_count(page, 0); } } @@ -3257,12 +3258,15 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, { int ret; - /* Prepare folio head */ + /* + * This is an open-coded prep_compound_page() whereby we avoid + * walking pages twice by initializing/preparing+freezing them in the + * same go. + */ __folio_clear_reserved(folio); __folio_set_head(folio); ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); - /* Initialize the necessary tail struct pages */ hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); prep_compound_head((struct page *)folio, huge_page_order(h)); } -- 2.50.1 We're allocating a higher-order page from the buddy. For these pages (that are guaranteed to not exceed a single memory section) there is no need to use nth_page(). Reviewed-by: Lorenzo Stoakes Acked-by: Liam R. 
Howlett Signed-off-by: David Hildenbrand --- mm/percpu-km.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/percpu-km.c b/mm/percpu-km.c index fe31aa19db81a..4efa74a495cb6 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) } for (i = 0; i < nr_pages; i++) - pcpu_set_page_chunk(nth_page(pages, i), chunk); + pcpu_set_page_chunk(pages + i, chunk); chunk->data = pages; chunk->base_addr = page_address(pages); -- 2.50.1 The nth_page() is not really required anymore, so let's remove it. Reviewed-by: Zi Yan Reviewed-by: Lorenzo Stoakes Signed-off-by: David Hildenbrand --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 34d496a2b7de6..c5a46d10afaa0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -217,7 +217,7 @@ static size_t adjust_range_hwpoison(struct folio *folio, size_t offset, break; offset += n; if (offset == PAGE_SIZE) { - page = nth_page(page, 1); + page++; offset = 0; } } -- 2.50.1 Let's cleanup and simplify the function a bit. Reviewed-by: Zi Yan Reviewed-by: Lorenzo Stoakes Signed-off-by: David Hildenbrand --- fs/hugetlbfs/inode.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c5a46d10afaa0..3cfdf4091001f 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -192,37 +192,25 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * Someone wants to read @bytes from a HWPOISON hugetlb @folio from @offset. * Returns the maximum number of bytes one can read without touching the 1st raw * HWPOISON page. - * - * The implementation borrows the iteration logic from copy_page_to_iter*. */ static size_t adjust_range_hwpoison(struct folio *folio, size_t offset, size_t bytes) { - struct page *page; - size_t n = 0; - size_t res = 0; - - /* First page to start the loop. */ - page = folio_page(folio, offset / PAGE_SIZE); - offset %= PAGE_SIZE; - while (1) { - if (is_raw_hwpoison_page_in_hugepage(page)) - break; + struct page *page = folio_page(folio, offset / PAGE_SIZE); + size_t safe_bytes; + + if (is_raw_hwpoison_page_in_hugepage(page)) + return 0; + /* Safe to read the remaining bytes in this page. */ + safe_bytes = PAGE_SIZE - (offset % PAGE_SIZE); + page++; - /* Safe to read n bytes without touching HWPOISON subpage. */ - n = min(bytes, (size_t)PAGE_SIZE - offset); - res += n; - bytes -= n; - if (!bytes || !n) + /* Check each remaining page as long as we are not done yet. */ + for (; safe_bytes < bytes; safe_bytes += PAGE_SIZE, page++) + if (is_raw_hwpoison_page_in_hugepage(page)) break; - offset += n; - if (offset == PAGE_SIZE) { - page++; - offset = 0; - } - } - return res; + return min(safe_bytes, bytes); } /* -- 2.50.1 It's no longer required to use nth_page() within a folio, so let's just drop the nth_page() in folio_walk_start(). Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: David Hildenbrand --- mm/pagewalk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c6753d370ff4e..9e4225e5fcf5c 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -1004,7 +1004,7 @@ struct folio *folio_walk_start(struct folio_walk *fw, found: if (expose_page) /* Note: Offset from the mapped page, not the folio start. 
*/ - fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT); + fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT); else fw->page = NULL; fw->ptl = ptl; -- 2.50.1 nth_page() is no longer required when iterating over pages within a single folio, so let's just drop it when recording subpages. Reviewed-by: Lorenzo Stoakes Signed-off-by: David Hildenbrand --- mm/gup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 8157197a19f77..c10cd969c1a3b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -488,12 +488,11 @@ static int record_subpages(struct page *page, unsigned long sz, unsigned long addr, unsigned long end, struct page **pages) { - struct page *start_page; int nr; - start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT); + page += (addr & (sz - 1)) >> PAGE_SHIFT; for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) - pages[nr] = nth_page(start_page, nr); + pages[nr] = page++; return nr; } @@ -1512,7 +1511,7 @@ static long __get_user_pages(struct mm_struct *mm, } for (j = 0; j < page_increm; j++) { - subpage = nth_page(page, j); + subpage = page + j; pages[i + j] = subpage; flush_anon_page(vma, subpage, start + j * PAGE_SIZE); flush_dcache_page(subpage); -- 2.50.1 We can just cleanup the code by calculating the #refs earlier, so we can just inline what remains of record_subpages(). Calculate the number of references/pages ahead of times, and record them only once all our tests passed. Signed-off-by: David Hildenbrand --- mm/gup.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index c10cd969c1a3b..f0f4d1a68e094 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -484,19 +484,6 @@ static inline void mm_set_has_pinned_flag(struct mm_struct *mm) #ifdef CONFIG_MMU #ifdef CONFIG_HAVE_GUP_FAST -static int record_subpages(struct page *page, unsigned long sz, - unsigned long addr, unsigned long end, - struct page **pages) -{ - int nr; - - page += (addr & (sz - 1)) >> PAGE_SHIFT; - for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) - pages[nr] = page++; - - return nr; -} - /** * try_grab_folio_fast() - Attempt to get or pin a folio in fast path. * @page: pointer to page to be grabbed @@ -2967,8 +2954,8 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (pmd_special(orig)) return 0; - page = pmd_page(orig); - refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr); + refs = (end - addr) >> PAGE_SHIFT; + page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); folio = try_grab_folio_fast(page, refs, flags); if (!folio) @@ -2989,6 +2976,8 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, } *nr += refs; + for (; refs; refs--) + *(pages++) = page++; folio_set_referenced(folio); return 1; } @@ -3007,8 +2996,8 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, if (pud_special(orig)) return 0; - page = pud_page(orig); - refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr); + refs = (end - addr) >> PAGE_SHIFT; + page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); folio = try_grab_folio_fast(page, refs, flags); if (!folio) @@ -3030,6 +3019,8 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, } *nr += refs; + for (; refs; refs--) + *(pages++) = page++; folio_set_referenced(folio); return 1; } -- 2.50.1 Within a folio/compound page, nth_page() is no longer required. 
Given that we call folio_test_partial_kmap()+kmap_local_page(), the code would already be problematic if the pages would span multiple folios. So let's just assume that all src pages belong to a single folio/compound page and can be iterated ordinarily. The dst page is currently always a single page, so we're not actually iterating anything. Reviewed-by: Pavel Begunkov Reviewed-by: Lorenzo Stoakes Cc: Jens Axboe Cc: Pavel Begunkov Signed-off-by: David Hildenbrand --- io_uring/zcrx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index e5ff49f3425e0..18c12f4b56b6c 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -975,9 +975,9 @@ static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page, if (folio_test_partial_kmap(page_folio(dst_page)) || folio_test_partial_kmap(page_folio(src_page))) { - dst_page = nth_page(dst_page, dst_offset / PAGE_SIZE); + dst_page += dst_offset / PAGE_SIZE; dst_offset = offset_in_page(dst_offset); - src_page = nth_page(src_page, src_offset / PAGE_SIZE); + src_page += src_offset / PAGE_SIZE; src_offset = offset_in_page(src_offset); n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset); n = min(n, len); -- 2.50.1 Let's make it clearer that we are operating within a single folio by providing both the folio and the page. This implies that for flush_dcache_folio() we'll now avoid one more page->folio lookup, and that we can safely drop the "nth_page" usage. While at it, drop the "extern" from the function declaration. Cc: Thomas Bogendoerfer Signed-off-by: David Hildenbrand --- arch/mips/include/asm/cacheflush.h | 11 +++++++---- arch/mips/mm/cache.c | 8 ++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index 5d283ef89d90d..5099c1b65a584 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -50,13 +50,14 @@ extern void (*flush_cache_mm)(struct mm_struct *mm); extern void (*flush_cache_range)(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn); -extern void __flush_dcache_pages(struct page *page, unsigned int nr); +void __flush_dcache_folio_pages(struct folio *folio, struct page *page, unsigned int nr); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 static inline void flush_dcache_folio(struct folio *folio) { if (cpu_has_dc_aliases) - __flush_dcache_pages(&folio->page, folio_nr_pages(folio)); + __flush_dcache_folio_pages(folio, folio_page(folio, 0), + folio_nr_pages(folio)); else if (!cpu_has_ic_fills_f_dc) folio_set_dcache_dirty(folio); } @@ -64,10 +65,12 @@ static inline void flush_dcache_folio(struct folio *folio) static inline void flush_dcache_page(struct page *page) { + struct folio *folio = page_folio(page); + if (cpu_has_dc_aliases) - __flush_dcache_pages(page, 1); + __flush_dcache_folio_pages(folio, page, 1); else if (!cpu_has_ic_fills_f_dc) - folio_set_dcache_dirty(page_folio(page)); + folio_set_dcache_dirty(folio); } #define flush_dcache_mmap_lock(mapping) do { } while (0) diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index bf9a37c60e9f0..e3b4224c9a406 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -99,9 +99,9 @@ SYSCALL_DEFINE3(cacheflush, unsigned long, addr, unsigned long, bytes, return 0; } -void __flush_dcache_pages(struct page *page, unsigned int nr) +void __flush_dcache_folio_pages(struct folio *folio, struct 
page *page, + unsigned int nr) { - struct folio *folio = page_folio(page); struct address_space *mapping = folio_flush_mapping(folio); unsigned long addr; unsigned int i; @@ -117,12 +117,12 @@ void __flush_dcache_pages(struct page *page, unsigned int nr) * get faulted into the tlb (and thus flushed) anyways. */ for (i = 0; i < nr; i++) { - addr = (unsigned long)kmap_local_page(nth_page(page, i)); + addr = (unsigned long)kmap_local_page(page + i); flush_data_cache_page(addr); kunmap_local((void *)addr); } } -EXPORT_SYMBOL(__flush_dcache_pages); +EXPORT_SYMBOL(__flush_dcache_folio_pages); void __flush_anon_page(struct page *page, unsigned long vmaddr) { -- 2.50.1 Let's disallow handing out PFN ranges with non-contiguous pages, so we can remove the nth-page usage in __cma_alloc(), and so any callers don't have to worry about that either when wanting to blindly iterate pages. This is really only a problem in configs with SPARSEMEM but without SPARSEMEM_VMEMMAP, and only when we would cross memory sections in some cases. Will this cause harm? Probably not, because it's mostly 32bit that does not support SPARSEMEM_VMEMMAP. If this ever becomes a problem we could look into allocating the memmap for the memory sections spanned by a single CMA region in one go from memblock. Reviewed-by: Alexandru Elisei Reviewed-by: Lorenzo Stoakes Signed-off-by: David Hildenbrand --- include/linux/mm.h | 6 ++++++ mm/cma.c | 39 ++++++++++++++++++++++++--------------- mm/util.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f6880e3225c5c..2ca1eb2db63ec 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -209,9 +209,15 @@ extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +bool page_range_contiguous(const struct page *page, unsigned long nr_pages); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #else #define nth_page(page,n) ((page) + (n)) +static inline bool page_range_contiguous(const struct page *page, + unsigned long nr_pages) +{ + return true; +} #endif /* to align the pointer to the (next) page boundary */ diff --git a/mm/cma.c b/mm/cma.c index e56ec64d0567e..813e6dc7b0954 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -780,10 +780,8 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, unsigned long count, unsigned int align, struct page **pagep, gfp_t gfp) { - unsigned long mask, offset; - unsigned long pfn = -1; - unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; + unsigned long start, pfn, mask, offset; int ret = -EBUSY; struct page *page = NULL; @@ -795,7 +793,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, if (bitmap_count > bitmap_maxno) goto out; - for (;;) { + for (start = 0; ; start = bitmap_no + mask + 1) { spin_lock_irq(&cma->lock); /* * If the request is larger than the available number @@ -812,6 +810,22 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, spin_unlock_irq(&cma->lock); break; } + + pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); + page = pfn_to_page(pfn); + + /* + * Do not hand out page ranges that are not contiguous, so + * callers can just iterate the pages without having to worry + * about these corner cases. 
+ */ + if (!page_range_contiguous(page, count)) { + spin_unlock_irq(&cma->lock); + pr_warn_ratelimited("%s: %s: skipping incompatible area [0x%lx-0x%lx]", + __func__, cma->name, pfn, pfn + count - 1); + continue; + } + bitmap_set(cmr->bitmap, bitmap_no, bitmap_count); cma->available_count -= count; /* @@ -821,29 +835,24 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, */ spin_unlock_irq(&cma->lock); - pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma->alloc_mutex); ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); mutex_unlock(&cma->alloc_mutex); - if (ret == 0) { - page = pfn_to_page(pfn); + if (!ret) break; - } cma_clear_bitmap(cma, cmr, pfn, count); if (ret != -EBUSY) break; pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n", - __func__, pfn, pfn_to_page(pfn)); + __func__, pfn, page); - trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), - count, align); - /* try again with a bit different memory target */ - start = bitmap_no + mask + 1; + trace_cma_alloc_busy_retry(cma->name, pfn, page, count, align); } out: - *pagep = page; + if (!ret) + *pagep = page; return ret; } @@ -882,7 +891,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, */ if (page) { for (i = 0; i < count; i++) - page_kasan_tag_reset(nth_page(page, i)); + page_kasan_tag_reset(page + i); } if (ret && !(gfp & __GFP_NOWARN)) { diff --git a/mm/util.c b/mm/util.c index d235b74f7aff7..fbdb73aaf35fe 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1280,4 +1280,39 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, { return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0); } + +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/** + * page_range_contiguous - test whether the page range is contiguous + * @page: the start of the page range. + * @nr_pages: the number of pages in the range. + * + * Test whether the page range is contiguous, such that they can be iterated + * naively, corresponding to iterating a contiguous PFN range. + * + * This function should primarily only be used for debug checks, or when + * working with page ranges that are not naturally contiguous (e.g., pages + * within a folio are). + * + * Returns true if contiguous, otherwise false. + */ +bool page_range_contiguous(const struct page *page, unsigned long nr_pages) +{ + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; + + /* + * The memmap is allocated per memory section, so no need to check + * within the first section. However, we need to check each other + * spanned memory section once, making sure the first page in a + * section could similarly be reached by just iterating pages. + */ + for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION); + pfn < end_pfn; pfn += PAGES_PER_SECTION) + if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn))) + return false; + return true; +} +#endif #endif /* CONFIG_MMU */ -- 2.50.1 dma_common_contiguous_remap() is used to remap an "allocated contiguous region". Within a single allocation, there is no need to use nth_page() anymore. Neither the buddy, nor hugetlb, nor CMA will hand out problematic page ranges. 
Acked-by: Marek Szyprowski Reviewed-by: Lorenzo Stoakes Cc: Robin Murphy Signed-off-by: David Hildenbrand --- kernel/dma/remap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 9e2afad1c6152..b7c1c0c92d0c8 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -49,7 +49,7 @@ void *dma_common_contiguous_remap(struct page *page, size_t size, if (!pages) return NULL; for (i = 0; i < count; i++) - pages[i] = nth_page(page, i); + pages[i] = page++; vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); kvfree(pages); -- 2.50.1 The expectation is that there is currently no user that would pass in non-contiguous page ranges: no allocator, not even VMA, will hand these out. The only problematic part would be if someone provided a range obtained directly from memblock, or manually merged problematic ranges. If we find such cases, we should fix them to create separate SG entries. Let's check in sg_set_page() that this is really the case. No need to check in sg_set_folio(), as pages in a folio are guaranteed to be contiguous. As sg_set_page() gets inlined into modules, we have to export the page_range_contiguous() helper -- use EXPORT_SYMBOL, as there is nothing special about this helper that would warrant restricting it to GPL-only modules. We can now drop the nth_page() usage in sg_page_iter_page(). Acked-by: Marek Szyprowski Reviewed-by: Lorenzo Stoakes Signed-off-by: David Hildenbrand --- include/linux/scatterlist.h | 3 ++- mm/util.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 6f8a4965f9b98..29f6ceb98d74b 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -158,6 +158,7 @@ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) static inline void sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, unsigned int offset) { + VM_WARN_ON_ONCE(!page_range_contiguous(page, ALIGN(len + offset, PAGE_SIZE) / PAGE_SIZE)); sg_assign_page(sg, page); sg->offset = offset; sg->length = len; @@ -600,7 +601,7 @@ void __sg_page_iter_start(struct sg_page_iter *piter, */ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) { - return nth_page(sg_page(piter->sg), piter->sg_pgoffset); + return sg_page(piter->sg) + piter->sg_pgoffset; } /** diff --git a/mm/util.c b/mm/util.c index fbdb73aaf35fe..bb4b47cd67091 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1314,5 +1314,6 @@ bool page_range_contiguous(const struct page *page, unsigned long nr_pages) return false; return true; } +EXPORT_SYMBOL(page_range_contiguous); #endif #endif /* CONFIG_MMU */ -- 2.50.1 It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage.
Acked-by: Damien Le Moal Reviewed-by: Lorenzo Stoakes Cc: Niklas Cassel Signed-off-by: David Hildenbrand --- drivers/ata/libata-sff.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index 7fc407255eb46..1e2a2c33cdc80 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -614,7 +614,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) offset = qc->cursg->offset + qc->cursg_ofs; /* get the current page and offset */ - page = nth_page(page, (offset >> PAGE_SHIFT)); + page += offset >> PAGE_SHIFT; offset %= PAGE_SIZE; /* don't overrun current sg */ @@ -631,7 +631,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) unsigned int split_len = PAGE_SIZE - offset; ata_pio_xfer(qc, page, offset, split_len); - ata_pio_xfer(qc, nth_page(page, 1), 0, count - split_len); + ata_pio_xfer(qc, page + 1, 0, count - split_len); } else { ata_pio_xfer(qc, page, offset, count); } @@ -751,7 +751,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes) offset = sg->offset + qc->cursg_ofs; /* get the current page and offset */ - page = nth_page(page, (offset >> PAGE_SHIFT)); + page += offset >> PAGE_SHIFT; offset %= PAGE_SIZE; /* don't overrun current sg */ -- 2.50.1 It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. Reviewed-by: Lorenzo Stoakes Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: Tvrtko Ursulin Cc: David Airlie Cc: Simona Vetter Signed-off-by: David Hildenbrand --- drivers/gpu/drm/i915/gem/i915_gem_pages.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c index c16a57160b262..031d7acc16142 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c @@ -779,7 +779,7 @@ __i915_gem_object_get_page(struct drm_i915_gem_object *obj, pgoff_t n) GEM_BUG_ON(!i915_gem_object_has_struct_page(obj)); sg = i915_gem_object_get_sg(obj, n, &offset); - return nth_page(sg_page(sg), offset); + return sg_page(sg) + offset; } /* Like i915_gem_object_get_page(), but mark the returned page dirty */ -- 2.50.1 It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. Acked-by: Ulf Hansson Reviewed-by: Lorenzo Stoakes Cc: Maxim Levitsky Cc: Alex Dubov Signed-off-by: David Hildenbrand --- drivers/memstick/core/mspro_block.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index c9853d887d282..d3f160dc0da4c 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -560,8 +560,7 @@ static int h_mspro_block_transfer_data(struct memstick_dev *card, t_offset += msb->current_page * msb->page_size; sg_set_page(&t_sg, - nth_page(sg_page(&(msb->req_sg[msb->current_seg])), - t_offset >> PAGE_SHIFT), + sg_page(&(msb->req_sg[msb->current_seg])) + (t_offset >> PAGE_SHIFT), msb->page_size, offset_in_page(t_offset)); memstick_init_req_sg(*mrq, msb->data_dir == READ -- 2.50.1 It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. 
Acked-by: Ulf Hansson Reviewed-by: Lorenzo Stoakes Cc: Maxim Levitsky Cc: Alex Dubov Signed-off-by: David Hildenbrand --- drivers/memstick/host/jmb38x_ms.c | 3 +-- drivers/memstick/host/tifm_ms.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c index cddddb3a5a27f..79e66e30417c1 100644 --- a/drivers/memstick/host/jmb38x_ms.c +++ b/drivers/memstick/host/jmb38x_ms.c @@ -317,8 +317,7 @@ static int jmb38x_ms_transfer_data(struct jmb38x_ms_host *host) unsigned int p_off; if (host->req->long_data) { - pg = nth_page(sg_page(&host->req->sg), - off >> PAGE_SHIFT); + pg = sg_page(&host->req->sg) + (off >> PAGE_SHIFT); p_off = offset_in_page(off); p_cnt = PAGE_SIZE - p_off; p_cnt = min(p_cnt, length); diff --git a/drivers/memstick/host/tifm_ms.c b/drivers/memstick/host/tifm_ms.c index db7f3a088fb09..0b6a90661eee5 100644 --- a/drivers/memstick/host/tifm_ms.c +++ b/drivers/memstick/host/tifm_ms.c @@ -201,8 +201,7 @@ static unsigned int tifm_ms_transfer_data(struct tifm_ms *host) unsigned int p_off; if (host->req->long_data) { - pg = nth_page(sg_page(&host->req->sg), - off >> PAGE_SHIFT); + pg = sg_page(&host->req->sg) + (off >> PAGE_SHIFT); p_off = offset_in_page(off); p_cnt = PAGE_SIZE - p_off; p_cnt = min(p_cnt, length); -- 2.50.1 It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. Acked-by: Ulf Hansson Reviewed-by: Lorenzo Stoakes Cc: Alex Dubov Cc: Jesper Nilsson Cc: Lars Persson Signed-off-by: David Hildenbrand --- drivers/mmc/host/tifm_sd.c | 4 ++-- drivers/mmc/host/usdhi6rol0.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/tifm_sd.c b/drivers/mmc/host/tifm_sd.c index ac636efd911d3..2cd69c9e9571b 100644 --- a/drivers/mmc/host/tifm_sd.c +++ b/drivers/mmc/host/tifm_sd.c @@ -191,7 +191,7 @@ static void tifm_sd_transfer_data(struct tifm_sd *host) } off = sg[host->sg_pos].offset + host->block_pos; - pg = nth_page(sg_page(&sg[host->sg_pos]), off >> PAGE_SHIFT); + pg = sg_page(&sg[host->sg_pos]) + (off >> PAGE_SHIFT); p_off = offset_in_page(off); p_cnt = PAGE_SIZE - p_off; p_cnt = min(p_cnt, cnt); @@ -240,7 +240,7 @@ static void tifm_sd_bounce_block(struct tifm_sd *host, struct mmc_data *r_data) } off = sg[host->sg_pos].offset + host->block_pos; - pg = nth_page(sg_page(&sg[host->sg_pos]), off >> PAGE_SHIFT); + pg = sg_page(&sg[host->sg_pos]) + (off >> PAGE_SHIFT); p_off = offset_in_page(off); p_cnt = PAGE_SIZE - p_off; p_cnt = min(p_cnt, cnt); diff --git a/drivers/mmc/host/usdhi6rol0.c b/drivers/mmc/host/usdhi6rol0.c index 85b49c07918b3..3bccf800339ba 100644 --- a/drivers/mmc/host/usdhi6rol0.c +++ b/drivers/mmc/host/usdhi6rol0.c @@ -323,7 +323,7 @@ static void usdhi6_blk_bounce(struct usdhi6_host *host, host->head_pg.page = host->pg.page; host->head_pg.mapped = host->pg.mapped; - host->pg.page = nth_page(host->pg.page, 1); + host->pg.page = host->pg.page + 1; host->pg.mapped = kmap(host->pg.page); host->blk_page = host->bounce_buf; @@ -503,7 +503,7 @@ static void usdhi6_sg_advance(struct usdhi6_host *host) /* We cannot get here after crossing a page border */ /* Next page in the same SG */ - host->pg.page = nth_page(sg_page(host->sg), host->page_idx); + host->pg.page = sg_page(host->sg) + host->page_idx; host->pg.mapped = kmap(host->pg.page); host->blk_page = host->pg.mapped; -- 2.50.1 It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() 
usage. Reviewed-by: Bart Van Assche Reviewed-by: Lorenzo Stoakes Reviewed-by: Martin K. Petersen Cc: "James E.J. Bottomley" Signed-off-by: David Hildenbrand --- drivers/scsi/scsi_lib.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 0c65ecfedfbd6..d7e42293b8645 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -3148,8 +3148,7 @@ void *scsi_kmap_atomic_sg(struct scatterlist *sgl, int sg_count, /* Offset starting from the beginning of first page in this sg-entry */ *offset = *offset - len_complete + sg->offset; - /* Assumption: contiguous pages can be accessed as "page + i" */ - page = nth_page(sg_page(sg), (*offset >> PAGE_SHIFT)); + page = sg_page(sg) + (*offset >> PAGE_SHIFT); *offset &= ~PAGE_MASK; /* Bytes in this sg-entry from *offset to the end of the page */ -- 2.50.1 It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. Reviewed-by: Bart Van Assche Reviewed-by: Lorenzo Stoakes Reviewed-by: Martin K. Petersen Cc: Doug Gilbert Cc: "James E.J. Bottomley" Signed-off-by: David Hildenbrand --- drivers/scsi/sg.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 3c02a5f7b5f39..4c62c597c7be9 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1235,8 +1235,7 @@ sg_vma_fault(struct vm_fault *vmf) len = vma->vm_end - sa; len = (len < length) ? len : length; if (offset < len) { - struct page *page = nth_page(rsv_schp->pages[k], - offset >> PAGE_SHIFT); + struct page *page = rsv_schp->pages[k] + (offset >> PAGE_SHIFT); get_page(page); /* increment page count */ vmf->page = page; return 0; /* success */ -- 2.50.1 It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. Reviewed-by: Lorenzo Stoakes Reviewed-by: Alex Williamson Reviewed-by: Brett Creeley Cc: Jason Gunthorpe Cc: Yishai Hadas Cc: Shameer Kolothum Cc: Kevin Tian Signed-off-by: David Hildenbrand --- drivers/vfio/pci/pds/lm.c | 3 +-- drivers/vfio/pci/virtio/migrate.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c index f2673d395236a..4d70c833fa32e 100644 --- a/drivers/vfio/pci/pds/lm.c +++ b/drivers/vfio/pci/pds/lm.c @@ -151,8 +151,7 @@ static struct page *pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file, lm_file->last_offset_sg = sg; lm_file->sg_last_entry += i; lm_file->last_offset = cur_offset; - return nth_page(sg_page(sg), - (offset - cur_offset) / PAGE_SIZE); + return sg_page(sg) + (offset - cur_offset) / PAGE_SIZE; } cur_offset += sg->length; } diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c index ba92bb4e9af94..7dd0ac866461d 100644 --- a/drivers/vfio/pci/virtio/migrate.c +++ b/drivers/vfio/pci/virtio/migrate.c @@ -53,8 +53,7 @@ virtiovf_get_migration_page(struct virtiovf_data_buffer *buf, buf->last_offset_sg = sg; buf->sg_last_entry += i; buf->last_offset = cur_offset; - return nth_page(sg_page(sg), - (offset - cur_offset) / PAGE_SIZE); + return sg_page(sg) + (offset - cur_offset) / PAGE_SIZE; } cur_offset += sg->length; } -- 2.50.1 It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. Reviewed-by: Lorenzo Stoakes Acked-by: Herbert Xu Cc: "David S. 
Miller" Signed-off-by: David Hildenbrand --- crypto/ahash.c | 4 ++-- crypto/scompress.c | 8 ++++---- include/crypto/scatterwalk.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/crypto/ahash.c b/crypto/ahash.c index a227793d2c5b5..dfb4f5476428f 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -88,7 +88,7 @@ static int hash_walk_new_entry(struct crypto_hash_walk *walk) sg = walk->sg; walk->offset = sg->offset; - walk->pg = nth_page(sg_page(walk->sg), (walk->offset >> PAGE_SHIFT)); + walk->pg = sg_page(walk->sg) + (walk->offset >> PAGE_SHIFT); walk->offset = offset_in_page(walk->offset); walk->entrylen = sg->length; @@ -226,7 +226,7 @@ int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc) if (!IS_ENABLED(CONFIG_HIGHMEM)) return crypto_shash_digest(desc, data, nbytes, req->result); - page = nth_page(page, offset >> PAGE_SHIFT); + page += offset >> PAGE_SHIFT; offset = offset_in_page(offset); if (nbytes > (unsigned int)PAGE_SIZE - offset) diff --git a/crypto/scompress.c b/crypto/scompress.c index c651e7f2197a9..1a7ed8ae65b07 100644 --- a/crypto/scompress.c +++ b/crypto/scompress.c @@ -198,7 +198,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) } else return -ENOSYS; - dpage = nth_page(dpage, doff / PAGE_SIZE); + dpage += doff / PAGE_SIZE; doff = offset_in_page(doff); n = (dlen - 1) / PAGE_SIZE; @@ -220,12 +220,12 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) } else break; - spage = nth_page(spage, soff / PAGE_SIZE); + spage = spage + soff / PAGE_SIZE; soff = offset_in_page(soff); n = (slen - 1) / PAGE_SIZE; n += (offset_in_page(slen - 1) + soff) / PAGE_SIZE; - if (PageHighMem(nth_page(spage, n)) && + if (PageHighMem(spage + n) && size_add(soff, slen) > PAGE_SIZE) break; src = kmap_local_page(spage) + soff; @@ -270,7 +270,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) if (dlen <= PAGE_SIZE) break; dlen -= PAGE_SIZE; - dpage = nth_page(dpage, 1); + dpage++; } } diff --git a/include/crypto/scatterwalk.h b/include/crypto/scatterwalk.h index 15ab743f68c8f..83d14376ff2bc 100644 --- a/include/crypto/scatterwalk.h +++ b/include/crypto/scatterwalk.h @@ -159,7 +159,7 @@ static inline void scatterwalk_map(struct scatter_walk *walk) if (IS_ENABLED(CONFIG_HIGHMEM)) { struct page *page; - page = nth_page(base_page, offset >> PAGE_SHIFT); + page = base_page + (offset >> PAGE_SHIFT); offset = offset_in_page(offset); addr = kmap_local_page(page) + offset; } else { @@ -259,7 +259,7 @@ static inline void scatterwalk_done_dst(struct scatter_walk *walk, end += (offset_in_page(offset) + offset_in_page(nbytes) + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = start; i < end; i++) - flush_dcache_page(nth_page(base_page, i)); + flush_dcache_page(base_page + i); } scatterwalk_advance(walk, nbytes); } -- 2.50.1 There is the concern that unpin_user_page_range_dirty_lock() might do some weird merging of PFN ranges -- either now or in the future -- such that PFN range is contiguous but the page range might not be. Let's sanity-check for that and drop the nth_page() usage. 
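To illustrate the concern with a minimal sketch (illustration only, not part of the patch; the helper name below is made up): without SPARSEMEM_VMEMMAP the memmap is allocated per memory section, so a range that is contiguous in PFN space can still be discontiguous in terms of struct page pointers once it crosses a section boundary. Spelled out naively, the property asserted by the new VM_WARN_ON_ONCE() is:

	static bool pfn_range_has_contiguous_memmap(unsigned long start_pfn,
						    unsigned long npages)
	{
		struct page *start = pfn_to_page(start_pfn);
		unsigned long i;

		/* Can only break across section boundaries without SPARSEMEM_VMEMMAP. */
		for (i = 0; i < npages; i++)
			if (pfn_to_page(start_pfn + i) != start + i)
				return false;
		return true;
	}

page_range_contiguous() performs the same check more cheaply by only probing memory section boundaries.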
Reviewed-by: Lorenzo Stoakes Signed-off-by: David Hildenbrand --- mm/gup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index f0f4d1a68e094..010fe56f6e132 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -237,7 +237,7 @@ void folio_add_pin(struct folio *folio) static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { - struct page *next = nth_page(start, i); + struct page *next = start + i; struct folio *folio = page_folio(next); unsigned int nr = 1; @@ -342,6 +342,10 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock); * "gup-pinned page range" refers to a range of pages that has had one of the * pin_user_pages() variants called on that page. * + * The page range must be truly physically contiguous: the page range + * corresponds to a contiguous PFN range and all pages can be iterated + * naturally. + * * For the page ranges defined by [page .. page+npages], make that range (or * its head pages, if a compound page) dirty, if @make_dirty is true, and if the * page range was previously listed as clean. @@ -359,6 +363,8 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, struct folio *folio; unsigned int nr; + VM_WARN_ON_ONCE(!page_range_contiguous(page, npages)); + for (i = 0; i < npages; i += nr) { folio = gup_folio_range_next(page, npages, i, &nr); if (make_dirty && !folio_test_dirty(folio)) { -- 2.50.1 We want to get rid of nth_page(), and kfence init code is the last user. Unfortunately, we might actually walk a PFN range where the pages are not contiguous, because we might be allocating an area from memblock that could span memory sections in problematic kernel configs (SPARSEMEM without SPARSEMEM_VMEMMAP). We could check whether the page range is contiguous using page_range_contiguous() and fail kfence init, or make kfence incompatible with these problematic kernel configs. Let's keep it simple and just use pfn_to_page() by iterating PFNs. Reviewed-by: Marco Elver Reviewed-by: Lorenzo Stoakes Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: David Hildenbrand --- mm/kfence/core.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 0ed3be100963a..727c20c94ac59 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -594,15 +594,14 @@ static void rcu_guarded_free(struct rcu_head *h) */ static unsigned long kfence_init_pool(void) { - unsigned long addr; - struct page *pages; + unsigned long addr, start_pfn; int i; if (!arch_kfence_init_pool()) return (unsigned long)__kfence_pool; addr = (unsigned long)__kfence_pool; - pages = virt_to_page(__kfence_pool); + start_pfn = PHYS_PFN(virt_to_phys(__kfence_pool)); /* * Set up object pages: they must have PGTY_slab set to avoid freeing * these as real pages. * * We also want to avoid inserting kfence_free() in the kfree() * fast-path in SLUB, and therefore need to ensure kfree() correctly * enters __slab_free() slow-path.
*/ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab = page_slab(nth_page(pages, i)); + struct slab *slab; if (!i || (i % 2)) continue; + slab = page_slab(pfn_to_page(start_pfn + i)); __folio_set_slab(slab_folio(slab)); #ifdef CONFIG_MEMCG slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | @@ -665,10 +665,12 @@ static unsigned long kfence_init_pool(void) reset_slab: for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab = page_slab(nth_page(pages, i)); + struct slab *slab; if (!i || (i % 2)) continue; + + slab = page_slab(pfn_to_page(start_pfn + i)); #ifdef CONFIG_MEMCG slab->obj_exts = 0; #endif -- 2.50.1 Ever since commit 858c708d9efb ("block: move the bi_size update out of __bio_try_merge_page"), page_is_mergeable() no longer exists, and the logic in bvec_try_merge_page() is now a simple page pointer comparison. Reviewed-by: Lorenzo Stoakes Signed-off-by: David Hildenbrand --- include/linux/bvec.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 0a80e1f9aa201..3fc0efa0825b1 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -22,11 +22,8 @@ struct page; * @bv_len: Number of bytes in the address range. * @bv_offset: Start of the address range relative to the start of @bv_page. * - * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len: - * - * nth_page(@bv_page, n) == @bv_page + n - * - * This holds because page_is_mergeable() checks the above property. + * All pages within a bio_vec starting from @bv_page are contiguous and + * can simply be iterated (see bvec_advance()). */ struct bio_vec { struct page *bv_page; -- 2.50.1 Now that all users are gone, let's remove it. Reviewed-by: Lorenzo Stoakes Signed-off-by: David Hildenbrand --- include/linux/mm.h | 2 -- tools/testing/scatterlist/linux/mm.h | 1 - 2 files changed, 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ca1eb2db63ec..b26ca8b2162d9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -210,9 +210,7 @@ extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) bool page_range_contiguous(const struct page *page, unsigned long nr_pages); -#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #else -#define nth_page(page,n) ((page) + (n)) static inline bool page_range_contiguous(const struct page *page, unsigned long nr_pages) { diff --git a/tools/testing/scatterlist/linux/mm.h b/tools/testing/scatterlist/linux/mm.h index 5bd9e6e806254..121ae78d6e885 100644 --- a/tools/testing/scatterlist/linux/mm.h +++ b/tools/testing/scatterlist/linux/mm.h @@ -51,7 +51,6 @@ static inline unsigned long page_to_phys(struct page *page) #define page_to_pfn(page) ((unsigned long)(page) / PAGE_SIZE) #define pfn_to_page(pfn) (void *)((pfn) * PAGE_SIZE) -#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define __min(t1, t2, min1, min2, x, y) ({ \ t1 min1 = (x); \ -- 2.50.1
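As a closing illustration (a minimal sketch, not code from any patch above; the function name is made up): taken together, the series guarantees that all pages of a single folio are contiguous in the memmap on every kernel configuration, so plain pointer arithmetic is valid everywhere nth_page() used to be needed:

	/* Sketch: iterate all pages of a folio without nth_page(). */
	static void assert_folio_pages_contiguous(struct folio *folio)
	{
		struct page *page = folio_page(folio, 0);
		long i, nr = folio_nr_pages(folio);

		for (i = 0; i < nr; i++, page++)
			VM_WARN_ON_ONCE(page != folio_page(folio, i));
	}

Page ranges that do not come from a folio, buddy, hugetlb, or CMA allocation can still be validated explicitly with page_range_contiguous() before being iterated.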