Currently, scan_get_next_rmap_item() walks every page address in a VMA
to locate mergeable pages. This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions.

This patch replaces the per-address lookup with a range walk using
walk_page_range(). The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups.

This problem was previously discussed in [1].

[1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/

---
v3:
 - Treat THPs in ksm_pmd_entry
 - Update ksm_scan.address outside walk_page_range
 - Change goto to while loop

v2: https://lore.kernel.org/all/20251014151126.87589-1-pedrodemargomes@gmail.com/
 - Use pmd_entry to walk page range
 - Use cond_resched inside pmd_entry()
 - walk_page_range returns page+folio

v1: https://lore.kernel.org/all/20251014055828.124522-1-pedrodemargomes@gmail.com/

Reported-by: craftfever
Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
Suggested-by: David Hildenbrand
Signed-off-by: Pedro Demarchi Gomes
---
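Note for reviewers, not part of the patch: below is a rough sketch of how
the walk result is meant to be consumed, based on my reading of the
pagewalk API (.test_walk returning 1 makes walk_page_range() skip the
whole VMA, and a non-zero return from .pmd_entry aborts the walk). The
helper name ksm_find_next_page() is made up purely for illustration:

/* Illustration only -- not part of mm/ksm.c. */
static struct page *ksm_find_next_page(struct mm_struct *mm, unsigned long start)
{
	struct ksm_walk_private priv = {
		.address = start,	/* .page and .folio start out NULL */
	};

	/*
	 * The caller already holds mmap_read_lock(mm), which is what
	 * .walk_lock = PGWALK_RDLOCK in walk_ops expects.  ksm_walk_test()
	 * skips !VM_MERGEABLE and !anon_vma VMAs wholesale, and
	 * ksm_pmd_entry() only runs for populated page tables, so unmapped
	 * holes are never visited page by page.
	 */
	walk_page_range(mm, start, -1, &walk_ops, &priv);

	/*
	 * ksm_pmd_entry() stops the walk by returning 1, either because it
	 * found a candidate page (priv.page/priv.folio set, with a folio
	 * reference taken) or because the mm is exiting; priv.address is
	 * where the next scan should resume.
	 */
	return priv.page;
}
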
 mm/ksm.c | 185 ++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 135 insertions(+), 50 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 3aed0478fdce..403e4f102f07 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2455,14 +2455,119 @@ static bool should_skip_rmap_item(struct folio *folio,
 	return true;
 }
 
+struct ksm_walk_private {
+	struct page *page;
+	struct folio *folio;
+	struct vm_area_struct *vma;
+	unsigned long address;
+};
+
+static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	struct ksm_walk_private *private;
+
+	if (!(vma->vm_flags & VM_MERGEABLE))
+		return 1;
+
+	private = (struct ksm_walk_private *) walk->private;
+	private->address = vma->vm_end;
+
+	if (!vma->anon_vma)
+		return 1;
+
+	return 0;
+}
+
+static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
+		unsigned long end, struct mm_walk *walk)
+{
+	struct mm_struct *mm = walk->mm;
+	struct vm_area_struct *vma = walk->vma;
+	struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
+	struct folio *folio;
+	pte_t *start_pte, *pte, ptent;
+	pmd_t pmde;
+	struct page *page;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	if (ksm_test_exit(mm))
+		return 1;
+
+	ptl = pmd_lock(mm, pmd);
+	pmde = pmdp_get(pmd);
+
+	if (!pmd_present(pmde))
+		goto pmd_out;
+
+	if (!pmd_trans_huge(pmde))
+		goto pte_table;
+
+	page = vm_normal_page_pmd(vma, addr, pmde);
+
+	if (!page)
+		goto pmd_out;
+
+	folio = page_folio(page);
+	if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+		goto pmd_out;
+
+	ret = 1;
+	folio_get(folio);
+	private->page = page + ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
+	private->folio = folio;
+	private->vma = vma;
+	private->address = addr;
+pmd_out:
+	spin_unlock(ptl);
+	return ret;
+
+pte_table:
+	spin_unlock(ptl);
+
+	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!start_pte)
+		return 0;
+
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = ptep_get(pte);
+		page = vm_normal_page(vma, addr, ptent);
+
+		if (!page)
+			continue;
+
+		folio = page_folio(page);
+		if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+			continue;
+
+		ret = 1;
+		folio_get(folio);
+		private->page = page;
+		private->folio = folio;
+		private->vma = vma;
+		private->address = addr;
+		break;
+	}
+	pte_unmap_unlock(start_pte, ptl);
+
+	cond_resched();
+	return ret;
+}
+
+struct mm_walk_ops walk_ops = {
+	.pmd_entry = ksm_pmd_entry,
+	.test_walk = ksm_walk_test,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
 static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 {
 	struct mm_struct *mm;
 	struct ksm_mm_slot *mm_slot;
 	struct mm_slot *slot;
-	struct vm_area_struct *vma;
 	struct ksm_rmap_item *rmap_item;
-	struct vma_iterator vmi;
+	struct ksm_walk_private walk_private;
 	int nid;
 
 	if (list_empty(&ksm_mm_head.slot.mm_node))
@@ -2527,64 +2632,44 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 
 	slot = &mm_slot->slot;
 	mm = slot->mm;
-	vma_iter_init(&vmi, mm, ksm_scan.address);
 
 	mmap_read_lock(mm);
 	if (ksm_test_exit(mm))
 		goto no_vmas;
 
-	for_each_vma(vmi, vma) {
-		if (!(vma->vm_flags & VM_MERGEABLE))
-			continue;
-		if (ksm_scan.address < vma->vm_start)
-			ksm_scan.address = vma->vm_start;
-		if (!vma->anon_vma)
-			ksm_scan.address = vma->vm_end;
-
-		while (ksm_scan.address < vma->vm_end) {
-			struct page *tmp_page = NULL;
-			struct folio_walk fw;
-			struct folio *folio;
+	while (true) {
+		struct folio *folio;
 
-			if (ksm_test_exit(mm))
-				break;
+		walk_private.page = NULL;
+		walk_private.folio = NULL;
+		walk_private.address = ksm_scan.address;
 
-			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
-			if (folio) {
-				if (!folio_is_zone_device(folio) &&
-				     folio_test_anon(folio)) {
-					folio_get(folio);
-					tmp_page = fw.page;
-				}
-				folio_walk_end(&fw, vma);
-			}
+		walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
+		ksm_scan.address = walk_private.address;
+		if (!walk_private.page)
+			break;
+
+		folio = walk_private.folio;
+		flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
+		flush_dcache_page(walk_private.page);
+		rmap_item = get_next_rmap_item(mm_slot,
+				ksm_scan.rmap_list, ksm_scan.address);
+		if (rmap_item) {
+			ksm_scan.rmap_list =
+					&rmap_item->rmap_list;
 
-			if (tmp_page) {
-				flush_anon_page(vma, tmp_page, ksm_scan.address);
-				flush_dcache_page(tmp_page);
-				rmap_item = get_next_rmap_item(mm_slot,
-					ksm_scan.rmap_list, ksm_scan.address);
-				if (rmap_item) {
-					ksm_scan.rmap_list =
-							&rmap_item->rmap_list;
-
-					if (should_skip_rmap_item(folio, rmap_item)) {
-						folio_put(folio);
-						goto next_page;
-					}
-
-					ksm_scan.address += PAGE_SIZE;
-					*page = tmp_page;
-				} else {
-					folio_put(folio);
-				}
-				mmap_read_unlock(mm);
-				return rmap_item;
-			}
-next_page:
 			ksm_scan.address += PAGE_SIZE;
-			cond_resched();
+			if (should_skip_rmap_item(folio, rmap_item)) {
+				folio_put(folio);
+				continue;
+			}
+
+			*page = walk_private.page;
+		} else {
+			folio_put(folio);
 		}
+		mmap_read_unlock(mm);
+		return rmap_item;
 	}
 
 	if (ksm_test_exit(mm)) {
-- 
2.39.5