swapin_walk_pmd_entry() walks PTEs and skips non-present PMDs, so
MADV_WILLNEED is a no-op on a PMD swap entry.

Handle PMD swap entries under pmd_trans_huge_lock(). If the covered
swap-cache range already has a PMD-sized folio, there is nothing left to
prefetch. If the range has split cache state, or any covered slot
currently has a zswap entry, split the PMD swap entry and ask the walker
to retry so the PTE path can handle the individual slots.

Otherwise pin the swap device and read the folio in at PMD order via
swapin_sync(BIT(HPAGE_PMD_ORDER)). This keeps the subsequent fault on the
do_huge_pmd_swap_page() path and avoids order-0 readahead needlessly
splitting the PMD swap entry. If PMD-order swapin races with per-slot
swap-cache population after dropping the PMD lock, split and retry
through the PTE path instead.

Signed-off-by: Usama Arif
---
 mm/madvise.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/mm/madvise.c b/mm/madvise.c
index 0d6aa0608f70..78a08039e173 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include
 
 #include
 
@@ -193,6 +194,79 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 	spinlock_t *ptl;
 	unsigned long addr;
 
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (ptl) {
+		pmd_t pmdval = *pmd;
+
+		if (pmd_is_swap_entry(pmdval)) {
+			softleaf_t entry = softleaf_from_pmd(pmdval);
+			struct vm_fault vmf = {
+				.vma = vma,
+				.address = start,
+				.real_address = start,
+				.pmd = pmd,
+			};
+			struct swap_info_struct *si;
+			struct folio *folio;
+			enum swap_pmd_cache cache_state;
+			bool split = false;
+
+			cache_state = swap_pmd_cache_lookup(entry, &folio);
+			if (cache_state == SWAP_PMD_CACHE_HUGE) {
+				folio_put(folio);
+				spin_unlock(ptl);
+				goto ret;
+			}
+			if (cache_state == SWAP_PMD_CACHE_SPLIT ||
+			    zswap_range_has_entry(entry, HPAGE_PMD_NR)) {
+				spin_unlock(ptl);
+				__split_huge_pmd(vma, pmd, start, false);
+				walk->action = ACTION_AGAIN;
+				goto ret;
+			}
+
+			/*
+			 * Pin the swap device under the PMD lock so the
+			 * PMD-swap-entry observation keeps the entry valid for
+			 * swapin_sync().
+			 */
+			si = get_swap_device(entry);
+			spin_unlock(ptl);
+			if (!si)
+				goto ret;
+
+			folio = swapin_sync(entry, GFP_HIGHUSER_MOVABLE,
+					    BIT(HPAGE_PMD_ORDER), &vmf,
+					    NULL, 0);
+			/*
+			 * The empty-cache observation was made under the PMD
+			 * lock, but swap cache can change after dropping it. If
+			 * PMD-order swapin lost a race to per-slot cache state,
+			 * retry through the PTE path.
+			 */
+			if (IS_ERR(folio)) {
+				if (PTR_ERR(folio) == -EBUSY)
+					split = true;
+			} else if (folio) {
+				if (folio_nr_pages(folio) != HPAGE_PMD_NR)
+					split = true;
+				else if (!folio_test_locked(folio) &&
+					 !folio_test_uptodate(folio) &&
+					 zswap_range_has_entry(entry,
+							       HPAGE_PMD_NR))
+					split = true;
+				folio_put(folio);
+			}
+			put_swap_device(si);
+			if (split) {
+				__split_huge_pmd(vma, pmd, start, false);
+				walk->action = ACTION_AGAIN;
+			}
+			goto ret;
+		}
+		spin_unlock(ptl);
+	}
+
 	for (addr = start; addr < end; addr += PAGE_SIZE) {
 		pte_t pte;
 		softleaf_t entry;
@@ -221,6 +295,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 	if (ptep)
 		pte_unmap_unlock(ptep, ptl);
 	swap_read_unplug(splug);
+ret:
 	cond_resched();
 
 	return 0;
-- 
2.53.0-Meta