alloc_swap_folio() has been falling back to order-0 in the anonymous
synchronous swapin path whenever zswap was ever enabled, because a large
folio range could contain a mixture of zswap and non-zswap entries and
zswap_load() could not handle large folios. zswap_load() can now load a
range that is fully present in zswap, and zswap_entry_batch() can
identify mixed zswap ranges.

Use that check alongside the existing zeromap and swapcache checks when
selecting a large folio for anonymous swapin, and recheck before
inserting a large folio into the swap cache while holding the swap
cluster lock. With mixed zswap ranges rejected and the insertion-race
fallback in place, remove the blanket zswap_never_enabled() fallback
from the anonymous swapin path so all-zswap and all-disk anonymous
ranges can use mTHP swapin.

Shmem keeps its existing zswap fallback and is outside this RFC.

Signed-off-by: fujunjie
---
 mm/memory.c     | 21 ++++++---------------
 mm/swap_state.c | 23 +++++++++++++++--------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 84e3b77b8293..0be249108de1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,6 +78,7 @@
 #include
 #include
 #include
+#include
 #include
@@ -4635,13 +4636,11 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
 	if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
 		return false;
 
-	/*
-	 * swap_read_folio() can't handle the case a large folio is hybridly
-	 * from different backends. And they are likely corner cases. Similar
-	 * things might be added once zswap support large folios.
-	 */
+	/* swap_read_folio() can't handle hybrid backend large folios. */
 	if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
 		return false;
+	if (unlikely(zswap_entry_batch(entry, nr_pages, NULL) != nr_pages))
+		return false;
 	if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
 		return false;
 
@@ -4690,14 +4689,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	if (unlikely(userfaultfd_armed(vma)))
 		goto fallback;
 
-	/*
-	 * A large swapped out folio could be partially or fully in zswap. We
-	 * lack handling for such cases, so fallback to swapping in order-0
-	 * folio.
-	 */
-	if (!zswap_never_enabled())
-		goto fallback;
-
 	entry = softleaf_from_pte(vmf->orig_pte);
 	/*
 	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
@@ -4772,8 +4763,8 @@ static struct folio *swapin_synchronous_folio(swp_entry_t entry,
 	order = folio_order(folio);
 
 	/*
-	 * folio is charged, so swapin can only fail due to raced swapin and
-	 * return NULL.
+	 * folio is charged, so NULL means the large folio could not be
+	 * inserted and needs order-0 fallback.
 	 */
 	swapcache = swapin_folio(entry, folio);
 	if (swapcache == folio)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1415a5c54a43..4e58fad5e5f0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 #include "internal.h"
 #include "swap_table.h"
 #include "swap.h"
@@ -207,6 +208,11 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
 		if (swp_tb_is_shadow(old_tb))
 			shadow = swp_tb_to_shadow(old_tb);
 	} while (++ci_off < ci_end);
+	if (unlikely(folio_test_large(folio) &&
+		     zswap_entry_batch(entry, nr_pages, NULL) != nr_pages)) {
+		err = -EAGAIN;
+		goto failed;
+	}
 	__swap_cache_add_folio(ci, folio, entry);
 	swap_cluster_unlock(ci);
 	if (shadowp)
@@ -460,7 +466,8 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
  *
  * Context: Caller must protect the swap device with reference count or locks.
  * Return: Returns the folio being added on success. Returns the existing folio
- * if @entry is already cached. Returns NULL if raced with swapin or swapoff.
+ * if @entry is already cached. Returns NULL if raced with swapin or swapoff,
+ * or if a large folio fails a backend recheck before insertion.
  */
 static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
 						  struct folio *folio,
@@ -483,10 +490,10 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
 
 	/*
 	 * Large order allocation needs special handling on
-	 * race: if a smaller folio exists in cache, swapin needs
-	 * to fallback to order 0, and doing a swap cache lookup
-	 * might return a folio that is irrelevant to the faulting
-	 * entry because @entry is aligned down. Just return NULL.
+	 * race or backend recheck failure: swapin needs to fall back
+	 * to order 0, and doing a swap cache lookup might return a
+	 * folio that is irrelevant to the faulting entry because
+	 * @entry is aligned down. Just return NULL.
 	 */
 	if (ret != -EEXIST || folio_test_large(folio))
 		goto failed;
@@ -567,9 +574,9 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
  * with the folio size.
  *
  * Return: returns pointer to @folio on success. If folio is a large folio
- * and this raced with another swapin, NULL will be returned to allow fallback
- * to order 0. Else, if another folio was already added to the swap cache,
- * return that swap cache folio instead.
+ * and it raced with another swapin or failed a backend recheck, NULL will be
+ * returned to allow fallback to order 0. Else, if another folio was already
+ * added to the swap cache, return that swap cache folio instead.
 */
 struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
 {
-- 
2.34.1
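
A caller-side sketch of the swapin_folio() contract this change relies on,
for illustration only (it is not part of the diff above): swapin_folio()
and folio_put() are the kernel interfaces used in the patch, while
swapin_order0() is a hypothetical stand-in for the existing order-0
fallback path, and reading the folio contents is omitted.

/*
 * Illustrative sketch, not part of this patch: how a synchronous swapin
 * caller handles swapin_folio() returning NULL, which now covers both a
 * lost swapin race and a failed zswap recheck taken under the swap
 * cluster lock.  swapin_order0() is a hypothetical stand-in for the
 * existing order-0 path.
 */
static struct folio *swapin_large_or_fallback(swp_entry_t entry,
					      struct folio *large)
{
	struct folio *swapcache;

	/* Try to insert the already-charged large folio into the swap cache. */
	swapcache = swapin_folio(entry, large);
	if (!swapcache) {
		/*
		 * Raced with another swapin, or the entry range is no longer
		 * uniformly backed by zswap: drop the large folio and fall
		 * back to order 0.
		 */
		folio_put(large);
		return swapin_order0(entry);
	}
	if (swapcache != large) {
		/* Another folio is already cached for @entry; drop ours. */
		folio_put(large);
	}
	return swapcache;
}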