Currently, PTE batching requires folio access, and the maximum batch size is
limited to the PFNs contained within that folio. However, in certain cases
(such as mremap_folio_pte_batch() and mincore_pte_range()), accessing the
folio is unnecessary and expensive.

For scenarios that do not require folio access, this patch introduces
can_pte_batch_count(). As long as the physical addresses are contiguous and
the relevant PTE attribute bits are identical, we can now batch more page
table entries at once, no longer limited to entries mapped within a single
folio, while also avoiding the folio access entirely.

Signed-off-by: Zhang Qilong
---
 mm/internal.h | 76 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 58 insertions(+), 18 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 1561fc2ff5b8..92034ca9092d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -233,61 +233,62 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
 		pte = pte_wrprotect(pte);
 	return pte_mkold(pte);
 }
 
 /**
- * folio_pte_batch_flags - detect a PTE batch for a large folio
- * @folio: The large folio to detect a PTE batch for.
+ * can_pte_batch_count - detect a PTE batch in the range [ptep, ptep + max_nr)
  * @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL.
  * @ptep: Page table pointer for the first entry.
  * @ptentp: Pointer to a COPY of the first page table entry whose flags this
  *	    function updates based on @flags if appropriate.
  * @max_nr: The maximum number of table entries to consider.
  * @flags: Flags to modify the PTE batch semantics.
  *
- * Detect a PTE batch: consecutive (present) PTEs that map consecutive
- * pages of the same large folio in a single VMA and a single page table.
+ * This interface is designed for cases that do not require folio access.
+ * If the folio itself is needed, call folio_pte_batch_flags() instead.
+ *
+ * Detect a PTE batch: consecutive (present) PTEs that map consecutive pages
+ * in a single VMA and a single page table.
  *
  * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
  * the accessed bit, writable bit, dirty bit (unless FPB_RESPECT_DIRTY is set)
  * and soft-dirty bit (unless FPB_RESPECT_SOFT_DIRTY is set).
  *
- * @ptep must map any page of the folio. max_nr must be at least one and
+ * @ptep points to the first entry of the range. max_nr must be at least one and
  * must be limited by the caller so scanning cannot exceed a single VMA and
  * a single page table.
  *
  * Depending on the FPB_MERGE_* flags, the pte stored at @ptentp will
  * be updated: it's crucial that a pointer to a COPY of the first
  * page table entry, obtained through ptep_get(), is provided as @ptentp.
  *
- * This function will be inlined to optimize based on the input parameters;
- * consider using folio_pte_batch() instead if applicable.
+ * In contrast to folio_pte_batch_flags(), which only batches PTEs mapped
+ * within a single folio, can_pte_batch_count() can batch PTEs that map
+ * consecutive folios. If no FPB_RESPECT_* flag is set, the accessed,
+ * writable and dirty bits are ignored. If such a flag is set, the respected
+ * bit is compared in pte_same(): when it differs from the expected PTE
+ * advanced by pte_batch_hint(), pte_same() returns false and the batch
+ * stops. This keeps batching across multiple folios correct.
+ *
+ * This function will be inlined to optimize based on the input parameters.
 *
 * Return: the number of table entries in the batch.
  */
-static inline unsigned int folio_pte_batch_flags(struct folio *folio,
-		struct vm_area_struct *vma, pte_t *ptep, pte_t *ptentp,
-		unsigned int max_nr, fpb_t flags)
+static inline unsigned int can_pte_batch_count(struct vm_area_struct *vma,
+		pte_t *ptep, pte_t *ptentp, unsigned int max_nr, fpb_t flags)
 {
 	bool any_writable = false, any_young = false, any_dirty = false;
 	pte_t expected_pte, pte = *ptentp;
 	unsigned int nr, cur_nr;
 
-	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
-	VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
-	VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
+	VM_WARN_ON(!pte_present(pte));
 
 	/*
 	 * Ensure this is a pointer to a copy not a pointer into a page table.
 	 * If this is a stack value, it won't be a valid virtual address, but
 	 * that's fine because it also cannot be pointing into the page table.
 	 */
 	VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));
-
-	/* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
-	max_nr = min_t(unsigned long, max_nr,
-		       folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
-
 	nr = pte_batch_hint(ptep, pte);
 	expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
 	ptep = ptep + nr;
 
 	while (nr < max_nr) {
@@ -317,10 +318,49 @@ static inline unsigned int folio_pte_batch_flags(struct folio *folio,
 		*ptentp = pte_mkdirty(*ptentp);
 	return min(nr, max_nr);
 }
 
+/**
+ * folio_pte_batch_flags - detect a PTE batch for a large folio
+ * @folio: The large folio to detect a PTE batch for.
+ * @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL.
+ * @ptep: Page table pointer for the first entry.
+ * @ptentp: Pointer to a COPY of the first page table entry whose flags this
+ *	    function updates based on @flags if appropriate.
+ * @max_nr: The maximum number of table entries to consider.
+ * @flags: Flags to modify the PTE batch semantics.
+ *
+ * Detect a PTE batch: consecutive (present) PTEs that map consecutive
+ * pages of the same large folio and have the same PTE bits set, excluding
+ * the PFN, the accessed bit, writable bit, dirty bit (unless FPB_RESPECT_DIRTY
+ * is set) and soft-dirty bit (unless FPB_RESPECT_SOFT_DIRTY is set).
+ *
+ * @ptep must map any page of the folio.
+ *
+ * This function will be inlined to optimize based on the input parameters;
+ * consider using folio_pte_batch() instead if applicable.
+ *
+ * Return: the number of table entries in the batch.
+ */
+static inline unsigned int folio_pte_batch_flags(struct folio *folio,
+		struct vm_area_struct *vma, pte_t *ptep, pte_t *ptentp,
+		unsigned int max_nr, fpb_t flags)
+{
+	pte_t pte = *ptentp;
+
+	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
+	VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
+	VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
+
+	/* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
+	max_nr = min_t(unsigned long, max_nr,
+		       folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
+
+	return can_pte_batch_count(vma, ptep, ptentp, max_nr, flags);
+}
+
 unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
 		unsigned int max_nr);
 
 /**
  * pte_move_swp_offset - Move the swap entry offset field of a swap pte
-- 
2.43.0

In the current mincore_pte_range(), when pte_batch_hint() returns a single
PTE, nothing is batched, which is not efficient. Just call the newly added
can_pte_batch_count() instead.

On an ARM64 QEMU guest with 8 CPUs and 32G of memory, a simple test demo
does the following (a sketch of the test program appears after the series):
1. mmap 1G of anonymous memory
2. write the 1G region in 4k steps
3. mincore() the mmapped 1G region
4. record the time consumed by the mincore() call

Tested the following cases:
- 4k: all hugepage settings disabled.
- 64k mTHP: only the 64k hugepage setting enabled.

Before:
Case      | Consumed time (us)
----------|-------------------
4k        | 7356
64k mTHP  | 3670

Patched:
Case      | Consumed time (us)
----------|-------------------
4k        | 4419
64k mTHP  | 3061

The results demonstrate a significant improvement from the larger PTE
batches. While measurements in a single environment carry some inherent
randomness, the improvement is very likely to hold elsewhere as well.

Signed-off-by: Zhang Qilong
---
 mm/mincore.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/mm/mincore.c b/mm/mincore.c
index 8ec4719370e1..2cc5d276d1cd 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -178,18 +178,14 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		/* We need to do cache lookup too for pte markers */
 		if (pte_none_mostly(pte))
 			__mincore_unmapped_range(addr, addr + PAGE_SIZE,
						 vma, vec);
 		else if (pte_present(pte)) {
-			unsigned int batch = pte_batch_hint(ptep, pte);
-
-			if (batch > 1) {
-				unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
-
-				step = min_t(unsigned int, batch, max_nr);
-			}
+			unsigned int max_nr = (end - addr) >> PAGE_SHIFT;
+			step = can_pte_batch_count(vma, ptep, &pte,
+						   max_nr, 0);
 
 			for (i = 0; i < step; i++)
 				vec[i] = 1;
 		} else { /* pte is a swap entry */
 			*vec = mincore_swap(pte_to_swp_entry(pte), false);
 		}
-- 
2.43.0

In the current mremap_folio_pte_batch():
1) pte_batch_hint() always returns one PTE on non-ARM64 machines, so nothing
   is batched, which is not efficient.
2) It then has to look up the folio just to call folio_pte_batch().

With the newly added can_pte_batch_count(), call it directly instead of
folio_pte_batch(), and rename mremap_folio_pte_batch() to mremap_pte_batch().

Signed-off-by: Zhang Qilong
---
 mm/mremap.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index bd7314898ec5..d11f93f1622f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -169,27 +169,17 @@ static pte_t move_soft_dirty_pte(pte_t pte)
 		pte = pte_swp_mksoft_dirty(pte);
 #endif
 	return pte;
 }
 
-static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr,
+static int mremap_pte_batch(struct vm_area_struct *vma, unsigned long addr,
 		pte_t *ptep, pte_t pte, int max_nr)
 {
-	struct folio *folio;
-
 	if (max_nr == 1)
 		return 1;
 
-	/* Avoid expensive folio lookup if we stand no chance of benefit. */
-	if (pte_batch_hint(ptep, pte) == 1)
-		return 1;
-
-	folio = vm_normal_folio(vma, addr, pte);
-	if (!folio || !folio_test_large(folio))
-		return 1;
-
-	return folio_pte_batch(folio, ptep, pte, max_nr);
+	return can_pte_batch_count(vma, ptep, &pte, max_nr, 0);
 }
 
 static int move_ptes(struct pagetable_move_control *pmc,
 		unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
 {
@@ -278,11 +268,11 @@ static int move_ptes(struct pagetable_move_control *pmc,
 			 * make sure the physical page stays valid until
 			 * the TLB entry for the old mapping has been
 			 * flushed.
 			 */
 			if (pte_present(old_pte)) {
-				nr_ptes = mremap_folio_pte_batch(vma, old_addr, old_ptep,
+				nr_ptes = mremap_pte_batch(vma, old_addr, old_ptep,
 							   old_pte, max_nr_ptes);
 				force_flush = true;
 			}
 			pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes);
 			pte = move_pte(pte, old_addr, new_addr);
-- 
2.43.0
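
For reference, below is a minimal sketch of the user-space test described in
the mincore patch above, reconstructed from the commit message. The original
test program was not posted, so the timing method (CLOCK_MONOTONIC), the
hard-coded 4k step and the error handling are assumptions. It mmaps 1G of
anonymous memory, writes one byte per 4k page, then times a single mincore()
call over the whole range.

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <time.h>

int main(void)
{
	const size_t len = 1UL << 30;	/* 1G of anonymous memory */
	const size_t step = 4096;	/* write in 4k steps */
	unsigned char *vec;
	char *buf;
	struct timespec t0, t1;
	long us;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	/* Fault in every 4k page so all PTEs are present. */
	for (size_t off = 0; off < len; off += step)
		buf[off] = 1;

	vec = malloc(len / step);
	if (!vec)
		return 1;

	/* Time one mincore() call over the whole 1G mapping. */
	clock_gettime(CLOCK_MONOTONIC, &t0);
	if (mincore(buf, len, vec))
		return 1;
	clock_gettime(CLOCK_MONOTONIC, &t1);

	us = (long)(t1.tv_sec - t0.tv_sec) * 1000000L +
	     (t1.tv_nsec - t0.tv_nsec) / 1000L;
	printf("mincore() took %ld us\n", us);

	free(vec);
	munmap(buf, len);
	return 0;
}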