From: "Barry Song (Xiaomi)" For sizes aligned to CONT_PTE_SIZE and smaller than PMD_SIZE, we can handle CONT_PTE_SIZE groups together. Signed-off-by: Barry Song (Xiaomi) Signed-off-by: Wen Jiang Tested-by: Xueyuan Chen --- arch/arm64/mm/hugetlbpage.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index a42c05cf56408..c4d8b226126cb 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -110,6 +110,12 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize) contig_ptes = CONT_PTES; break; default: + if (size > 0 && size < PMD_SIZE && + IS_ALIGNED(size, CONT_PTE_SIZE)) { + contig_ptes = size >> PAGE_SHIFT; + *pgsize = PAGE_SIZE; + break; + } WARN_ON(!__hugetlb_valid_size(size)); } @@ -359,6 +365,10 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) case CONT_PTE_SIZE: return pte_mkcont(entry); default: + if (pagesize > 0 && pagesize < PMD_SIZE && + IS_ALIGNED(pagesize, CONT_PTE_SIZE)) + return pte_mkcont(entry); + break; } pr_warn("%s: unrecognized huge page size 0x%lx\n", -- 2.34.1 From: "Barry Song (Xiaomi)" Allow arch_vmap_pte_range_map_size to batch across multiple CONT_PTE blocks, reducing both PTE setup and TLB flush iterations. 
Signed-off-by: Barry Song (Xiaomi) Signed-off-by: Wen Jiang Tested-by: Xueyuan Chen --- arch/arm64/include/asm/vmalloc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index 4ec1acd3c1b34..787fd17b48e2c 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -23,6 +23,8 @@ static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, unsigned long end, u64 pfn, unsigned int max_page_shift) { + unsigned long size; + /* * If the block is at least CONT_PTE_SIZE in size, and is naturally * aligned in both virtual and physical space, then we can pte-map the @@ -40,7 +42,9 @@ static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, if (!IS_ALIGNED(PFN_PHYS(pfn), CONT_PTE_SIZE)) return PAGE_SIZE; - return CONT_PTE_SIZE; + size = min3(end - addr, 1UL << max_page_shift, PMD_SIZE >> 1); + size = 1UL << __fls(size); + return size; } #define arch_vmap_pte_range_unmap_size arch_vmap_pte_range_unmap_size -- 2.34.1 Extract the common PTE mapping logic from vmap_pte_range() into a shared helper vmap_set_ptes(). This handles both CONT_PTE and regular PTE mappings in a single function, preparing for the next patch which will extend vmap_pages_pte_range() to also use this helper. The #ifdef CONFIG_HUGETLB_PAGE guard is moved inside vmap_set_ptes(), so callers no longer need to handle the conditional compilation. No functional change. Signed-off-by: Wen Jiang Tested-by: Xueyuan Chen --- mm/vmalloc.c | 44 +++++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2c2f74a07f396..6660f240d27c9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -91,6 +91,35 @@ struct vfree_deferred { static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); /*** Page table manipulation functions ***/ + +/* + * Set PTE mappings for the given PFN. 
Try CONT_PTE mappings first when + * supported, otherwise fall back to PAGE_SIZE mappings. + * + * Return: mapping size. + */ +static __always_inline unsigned long vmap_set_ptes(pte_t *pte, + unsigned long addr, unsigned long end, u64 pfn, + pgprot_t prot, unsigned int max_page_shift) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (max_page_shift > PAGE_SHIFT) { + unsigned long size; + + size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); + if (size != PAGE_SIZE) { + pte_t entry = pfn_pte(pfn, prot); + + entry = arch_make_huge_pte(entry, ilog2(size), 0); + set_huge_pte_at(&init_mm, addr, pte, entry, size); + return size; + } + } +#endif + set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); + return PAGE_SIZE; +} + static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, unsigned int max_page_shift, pgtbl_mod_mask *mask) @@ -119,19 +148,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, BUG(); } -#ifdef CONFIG_HUGETLB_PAGE - size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); - if (size != PAGE_SIZE) { - pte_t entry = pfn_pte(pfn, prot); - - entry = arch_make_huge_pte(entry, ilog2(size), 0); - set_huge_pte_at(&init_mm, addr, pte, entry, size); - pfn += PFN_DOWN(size); - continue; - } -#endif - set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); - pfn++; + size = vmap_set_ptes(pte, addr, end, pfn, prot, max_page_shift); + pfn += PFN_DOWN(size); } while (pte += PFN_DOWN(size), addr += size, addr != end); lazy_mmu_mode_disable(); -- 2.34.1 From: "Barry Song (Xiaomi)" vmap_pages_range_noflush_walk() (formerly vmap_small_pages_range_noflush()) provides a clean interface by taking struct page **pages and mapping them via direct PTE iteration. This avoids the page table rewalk seen when using vmap_range_noflush() for page_shift values other than PAGE_SHIFT. Extend it to support larger page_shift values, and add PMD- and contiguous-PTE mappings as well. 
Rename it to vmap_pages_range_noflush_walk() since it now handles more than just small pages. For vmalloc() allocations with VM_ALLOW_HUGE_VMAP, we no longer need to iterate over pages one by one via vmap_range_noflush(), which would otherwise lead to page table rewalk. The code is now unified with the PAGE_SHIFT case by simply calling vmap_pages_range_noflush_walk(). Signed-off-by: Barry Song (Xiaomi) Signed-off-by: Wen Jiang Tested-by: Xueyuan Chen --- mm/vmalloc.c | 81 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6660f240d27c9..253e017130e09 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -127,7 +127,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; u64 pfn; struct page *page; - unsigned long size = PAGE_SIZE; + unsigned long size; + unsigned int steps; if (WARN_ON_ONCE(!PAGE_ALIGNED(end - addr))) return -EINVAL; @@ -149,8 +150,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } size = vmap_set_ptes(pte, addr, end, pfn, prot, max_page_shift); - pfn += PFN_DOWN(size); - } while (pte += PFN_DOWN(size), addr += size, addr != end); + steps = PFN_DOWN(size); + } while (pte += steps, pfn += steps, addr += size, addr != end); lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; @@ -542,8 +543,10 @@ void vunmap_range(unsigned long addr, unsigned long end) static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, - pgtbl_mod_mask *mask) + pgtbl_mod_mask *mask, unsigned int shift) { + unsigned long pfn, size; + unsigned int steps; int err = 0; pte_t *pte; @@ -574,9 +577,10 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, break; } - set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); - (*nr)++; - } while (pte++, addr += PAGE_SIZE, addr != end); + pfn = page_to_pfn(page); + size = vmap_set_ptes(pte, addr, 
end, pfn, prot, shift); + steps = PFN_DOWN(size); + } while (pte += steps, *nr += steps, addr += size, addr != end); lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; @@ -586,7 +590,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, - pgtbl_mod_mask *mask) + pgtbl_mod_mask *mask, unsigned int shift) { pmd_t *pmd; unsigned long next; @@ -596,7 +600,27 @@ static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask)) + + if (shift >= PMD_SHIFT) { + struct page *page = pages[*nr]; + phys_addr_t phys_addr; + + if (WARN_ON(!page)) + return -ENOMEM; + if (WARN_ON(!pfn_valid(page_to_pfn(page)))) + return -EINVAL; + + phys_addr = page_to_phys(page); + + if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, + shift)) { + *mask |= PGTBL_PMD_MODIFIED; + *nr += 1 << (PMD_SHIFT - PAGE_SHIFT); + continue; + } + } + + if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask, shift)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; @@ -604,7 +628,7 @@ static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, - pgtbl_mod_mask *mask) + pgtbl_mod_mask *mask, unsigned int shift) { pud_t *pud; unsigned long next; @@ -614,7 +638,7 @@ static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask, shift)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; @@ -622,7 +646,7 @@ static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, static int 
vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, - pgtbl_mod_mask *mask) + pgtbl_mod_mask *mask, unsigned int shift) { p4d_t *p4d; unsigned long next; @@ -632,14 +656,18 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask, shift)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } -static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages) +/* + * It can take an array of pages which are not all contiguous, but it + * may have contiguous chunks, as hinted by @shift. + */ +static int vmap_pages_range_noflush_walk(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int shift) { unsigned long start = addr; pgd_t *pgd; @@ -654,7 +682,7 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; - err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); + err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask, shift); if (err) break; } while (pgd++, addr = next, addr != end); @@ -677,27 +705,12 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { - unsigned int i, nr = (end - addr) >> PAGE_SHIFT; - WARN_ON(page_shift < PAGE_SHIFT); - if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || - page_shift == PAGE_SHIFT) - return vmap_small_pages_range_noflush(addr, end, prot, pages); + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC)) + page_shift = PAGE_SHIFT; - for (i = 0; i < nr; i += 1U << (page_shift - 
PAGE_SHIFT)) { - int err; - - err = vmap_range_noflush(addr, addr + (1UL << page_shift), - page_to_phys(pages[i]), prot, - page_shift); - if (err) - return err; - - addr += 1UL << page_shift; - } - - return 0; + return vmap_pages_range_noflush_walk(addr, end, prot, pages, page_shift); } int vmap_pages_range_noflush(unsigned long addr, unsigned long end, -- 2.34.1 From: "Barry Song (Xiaomi)" In many cases, the pages passed to vmap() may include high-order pages. For example, the system heap often allocates pages in descending order: order 8, then 4, then 0. Currently, vmap() iterates over every page individually—even pages inside a high-order block are handled one by one. This patch detects physically contiguous pages (regardless of whether they are compound or non-compound) by scanning with num_pages_contiguous(), and maps them as a single contiguous block whenever possible. The mapping order is the minimum of the order of the contiguous page count and the pfn alignment, allowing graceful degradation when the pfn alignment is smaller than the contiguous range. Pages with the same page_shift are coalesced and mapped via vmap_pages_range_noflush_walk() to avoid a page table rewalk. As users typically allocate memory in descending order (e.g. 8 → 4 → 0), once an order-0 page is encountered, we stop scanning for contiguous pages since subsequent pages are likely order-0 as well. 
Signed-off-by: Barry Song (Xiaomi) Co-developed-by: Dev Jain Signed-off-by: Dev Jain Signed-off-by: Wen Jiang Tested-by: Xueyuan Chen --- mm/vmalloc.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 253e017130e09..fffb885cb2158 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3545,6 +3545,89 @@ void vunmap(const void *addr) } EXPORT_SYMBOL(vunmap); +static inline unsigned int vm_shift(pgprot_t prot, unsigned long size) +{ + if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE) + return PMD_SHIFT; + + return arch_vmap_pte_supported_shift(size); +} + +static inline int get_vmap_batch_order(struct page **pages, + pgprot_t prot, unsigned int max_steps, unsigned int idx) +{ + unsigned int nr_contig; + int order; + + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) + return 0; + + nr_contig = num_pages_contiguous(&pages[idx], max_steps); + if (nr_contig < 2) + return 0; + + order = ilog2(nr_contig); + + /* Limit order by pfn alignment */ + order = min_t(int, order, __ffs(page_to_pfn(pages[idx]))); + + if (vm_shift(prot, PAGE_SIZE << order) == PAGE_SHIFT) + return 0; + + return order; +} + +static int vmap_batched(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) +{ + unsigned int count = (end - addr) >> PAGE_SHIFT; + unsigned int prev_shift = 0, idx = 0; + unsigned long start = addr, map_addr = addr; + int err; + + err = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, + PAGE_SHIFT, GFP_KERNEL); + if (err) + goto out; + + for (unsigned int i = 0; i < count; ) { + unsigned int shift = PAGE_SHIFT + + get_vmap_batch_order(pages, prot, count - i, i); + + if (!i) + prev_shift = shift; + + if (shift != prev_shift) { + err = vmap_pages_range_noflush_walk(map_addr, addr, + prot, pages + idx, prev_shift); + if (err) + goto out; + prev_shift = shift; + map_addr = addr; + idx = i; + } + + /* + * Once small pages are encountered, the remaining pages 
+ * are likely small as well. + */ + if (shift == PAGE_SHIFT) + break; + + addr += 1UL << shift; + i += 1U << (shift - PAGE_SHIFT); + } + + /* Remaining */ + if (map_addr < end) + err = vmap_pages_range_noflush_walk(map_addr, end, + prot, pages + idx, prev_shift); + +out: + flush_cache_vmap(start, end); + return err; +} + /** * vmap - map an array of pages into virtually contiguous space * @pages: array of page pointers @@ -3588,8 +3671,8 @@ void *vmap(struct page **pages, unsigned int count, return NULL; addr = (unsigned long)area->addr; - if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), - pages, PAGE_SHIFT) < 0) { + if (vmap_batched(addr, addr + size, pgprot_nx(prot), + pages) < 0) { vunmap(area->addr); return NULL; } -- 2.34.1 From: "Barry Song (Xiaomi)" Try to align the vmap virtual address to PMD_SIZE or a larger PTE mapping size hinted by the architecture, so contiguous pages can be batch-mapped when setting PMD or PTE entries. Add __get_vm_area_node_aligned_caller() as a wrapper over __get_vm_area_node() to simplify repeated calls with fixed arguments. 
Signed-off-by: Barry Song (Xiaomi) Signed-off-by: Wen Jiang Tested-by: Xueyuan Chen --- mm/vmalloc.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fffb885cb2158..bc9fa93e2bdc6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3628,6 +3628,41 @@ static int vmap_batched(unsigned long addr, unsigned long end, return err; } +static struct vm_struct *__get_vm_area_node_aligned_caller(unsigned long size, + unsigned long align, unsigned long flags, const void *caller) +{ + return __get_vm_area_node(size, align, PAGE_SHIFT, flags, + VMALLOC_START, VMALLOC_END, + NUMA_NO_NODE, GFP_KERNEL, caller); +} + +static struct vm_struct *vmap_get_aligned_vm_area(unsigned long size, + unsigned long flags, const void *caller) +{ + struct vm_struct *vm_area; + unsigned int shift; + + /* Try PMD alignment for large sizes */ + if (size >= PMD_SIZE) { + vm_area = __get_vm_area_node_aligned_caller(size, PMD_SIZE, + flags, caller); + if (vm_area) + return vm_area; + } + + /* Try CONT_PTE alignment */ + shift = arch_vmap_pte_supported_shift(size); + if (shift > PAGE_SHIFT) { + vm_area = __get_vm_area_node_aligned_caller(size, 1UL << shift, + flags, caller); + if (vm_area) + return vm_area; + } + + /* Fall back to page alignment */ + return __get_vm_area_node_aligned_caller(size, PAGE_SIZE, flags, caller); +} + /** * vmap - map an array of pages into virtually contiguous space * @pages: array of page pointers @@ -3666,7 +3701,7 @@ void *vmap(struct page **pages, unsigned int count, return NULL; size = (unsigned long)count << PAGE_SHIFT; - area = get_vm_area_caller(size, flags, __builtin_return_address(0)); + area = vmap_get_aligned_vm_area(size, flags, __builtin_return_address(0)); if (!area) return NULL; -- 2.34.1