Currently, scan_get_next_rmap_item() walks every page address in a VMA
to locate mergeable pages. This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions.

This patch replaces the per-address lookup with a range walk using
walk_page_range(). The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups.
This problem was previously discussed in [1].

Changes since v1 [2]:
- Use pmd_entry to walk page range
- Use cond_resched inside pmd_entry()
- walk_page_range returns page+folio

[1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
[2] https://lore.kernel.org/linux-mm/20251014055828.124522-1-pedrodemargomes@gmail.com/

Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
 mm/ksm.c | 144 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 94 insertions(+), 50 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 3aed0478fdce..adb0267a1b7d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2455,14 +2455,82 @@ static bool should_skip_rmap_item(struct folio *folio,
 	return true;
 }
 
+struct ksm_walk_private {
+	struct page *page;
+	struct folio *folio;
+	struct vm_area_struct *vma;
+};
+
+static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+
+	if (!vma->anon_vma || !(vma->vm_flags & VM_MERGEABLE)) {
+		ksm_scan.address = vma->vm_end;
+		return 1;
+	}
+	return 0;
+}
+
+static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
+		unsigned long end, struct mm_walk *walk)
+{
+	struct mm_struct *mm = walk->mm;
+	struct vm_area_struct *vma = walk->vma;
+	struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
+	struct folio *folio;
+	pte_t *start_pte, *pte, ptent;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!start_pte) {
+		ksm_scan.address = end;
+		return 0;
+	}
+
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = ptep_get(pte);
+		struct page *page = vm_normal_page(vma, addr, ptent);
+		ksm_scan.address = addr;
+
+		if (ksm_test_exit(mm)) {
+			ret = 1;
+			break;
+		}
+
+		if (!page)
+			continue;
+
+		folio = page_folio(page);
+		if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+			continue;
+
+		ret = 1;
+		folio_get(folio);
+		private->page = page;
+		private->folio = folio;
+		private->vma = vma;
+		break;
+	}
+	pte_unmap_unlock(start_pte, ptl);
+
+	cond_resched();
+	return ret;
+}
+
+struct mm_walk_ops walk_ops = {
+	.pmd_entry = ksm_pmd_entry,
+	.test_walk = ksm_walk_test,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
 static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 {
 	struct mm_struct *mm;
 	struct ksm_mm_slot *mm_slot;
 	struct mm_slot *slot;
-	struct vm_area_struct *vma;
 	struct ksm_rmap_item *rmap_item;
-	struct vma_iterator vmi;
 	int nid;
 
 	if (list_empty(&ksm_mm_head.slot.mm_node))
@@ -2527,64 +2595,40 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 
 	slot = &mm_slot->slot;
 	mm = slot->mm;
-	vma_iter_init(&vmi, mm, ksm_scan.address);
 
 	mmap_read_lock(mm);
 	if (ksm_test_exit(mm))
 		goto no_vmas;
 
-	for_each_vma(vmi, vma) {
-		if (!(vma->vm_flags & VM_MERGEABLE))
-			continue;
-		if (ksm_scan.address < vma->vm_start)
-			ksm_scan.address = vma->vm_start;
-		if (!vma->anon_vma)
-			ksm_scan.address = vma->vm_end;
-
-		while (ksm_scan.address < vma->vm_end) {
-			struct page *tmp_page = NULL;
-			struct folio_walk fw;
-			struct folio *folio;
+get_page:
+	struct ksm_walk_private walk_private = {
+		.page = NULL,
+		.folio = NULL,
+		.vma = NULL
+	};
 
-			if (ksm_test_exit(mm))
-				break;
+	walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
+	if (walk_private.page) {
+		flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
+		flush_dcache_page(walk_private.page);
+		rmap_item = get_next_rmap_item(mm_slot,
+				ksm_scan.rmap_list, ksm_scan.address);
+		if (rmap_item) {
+			ksm_scan.rmap_list =
+					&rmap_item->rmap_list;
 
-			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
-			if (folio) {
-				if (!folio_is_zone_device(folio) &&
-				     folio_test_anon(folio)) {
-					folio_get(folio);
-					tmp_page = fw.page;
-				}
-				folio_walk_end(&fw, vma);
+			ksm_scan.address += PAGE_SIZE;
+			if (should_skip_rmap_item(walk_private.folio, rmap_item)) {
+				folio_put(walk_private.folio);
+				goto get_page;
 			}
 
-			if (tmp_page) {
-				flush_anon_page(vma, tmp_page, ksm_scan.address);
-				flush_dcache_page(tmp_page);
-				rmap_item = get_next_rmap_item(mm_slot,
-					ksm_scan.rmap_list, ksm_scan.address);
-				if (rmap_item) {
-					ksm_scan.rmap_list =
-							&rmap_item->rmap_list;
-
-					if (should_skip_rmap_item(folio, rmap_item)) {
-						folio_put(folio);
-						goto next_page;
-					}
-
-					ksm_scan.address += PAGE_SIZE;
-					*page = tmp_page;
-				} else {
-					folio_put(folio);
-				}
-				mmap_read_unlock(mm);
-				return rmap_item;
-			}
-next_page:
-			ksm_scan.address += PAGE_SIZE;
-			cond_resched();
+			*page = walk_private.page;
+		} else {
+			folio_put(walk_private.folio);
 		}
+		mmap_read_unlock(mm);
+		return rmap_item;
 	}
 
 	if (ksm_test_exit(mm)) {
-- 
2.43.0
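
For reference, the generic page-walk pattern the patch switches to looks roughly
like the sketch below. This is only an illustration of how struct mm_walk_ops,
walk_page_range() and the PTE helpers fit together, not part of the patch: the
example_* identifiers are invented for the sketch, while the real callbacks are
ksm_pmd_entry()/ksm_walk_test() above.

/*
 * Illustrative sketch only: count present PTEs in a range with the
 * mm_walk_ops pattern.  The example_* names are made up; everything
 * else is the regular kernel page-walk API.
 */
#include <linux/mm.h>
#include <linux/pagewalk.h>
#include <linux/sched.h>

struct example_walk_private {
	unsigned long nr_mapped;	/* present PTEs seen so far */
};

/* Called only for populated PMDs; empty PMDs are skipped by the core walker. */
static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct example_walk_private *priv = walk->private;
	pte_t *start_pte, *pte;
	spinlock_t *ptl;

	start_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	for (; addr < next; pte++, addr += PAGE_SIZE)
		if (pte_present(ptep_get(pte)))
			priv->nr_mapped++;
	pte_unmap_unlock(start_pte, ptl);
	cond_resched();
	return 0;	/* a non-zero return would stop the walk early */
}

/* Returning 1 makes the walker skip the whole VMA, as ksm_walk_test() does. */
static int example_test_walk(unsigned long addr, unsigned long next,
			     struct mm_walk *walk)
{
	return (walk->vma->vm_flags & VM_MERGEABLE) ? 0 : 1;
}

static const struct mm_walk_ops example_walk_ops = {
	.pmd_entry	= example_pmd_entry,
	.test_walk	= example_test_walk,
	.walk_lock	= PGWALK_RDLOCK,
};

/* Caller must hold mmap_read_lock(mm), as scan_get_next_rmap_item() does. */
static unsigned long example_count_mapped(struct mm_struct *mm,
					  unsigned long start, unsigned long end)
{
	struct example_walk_private priv = { .nr_mapped = 0 };

	walk_page_range(mm, start, end, &example_walk_ops, &priv);
	return priv.nr_mapped;
}

The point of the pattern is that the core walker never invokes ->pmd_entry for a
pmd_none() entry, so an unmapped hole costs one check per PMD (512 pages on
x86-64) rather than one folio_walk_start() per page address, which is where the
saving described in the changelog comes from.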