alloc_swap_folio() has been falling back to order-0 in the anonymous
synchronous swapin path whenever zswap was ever enabled, because a large
folio range could contain a mixture of zswap and non-zswap entries and
zswap_load() could not handle large folios. zswap_load() can now load a
range that is fully present in zswap, and zswap_entry_batch() can
identify mixed zswap ranges.

Use that check alongside the existing zeromap and swapcache checks when
selecting a large folio for anonymous swapin, and recheck before
inserting a large folio into the swap cache while holding the swap
cluster lock. With mixed zswap ranges rejected and the insertion-race
fallback in place, remove the blanket zswap_never_enabled() fallback
from the anonymous swapin path so all-zswap and all-disk anonymous
ranges can use mTHP swapin.

Shmem keeps its existing zswap fallback and is outside this RFC.

Signed-off-by: fujunjie
---
 mm/memory.c     | 21 ++++++---------------
 mm/swap_state.c | 23 +++++++++++++++--------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 84e3b77b8293..0be249108de1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,6 +78,7 @@
 #include
 #include
 #include
+#include
 #include
@@ -4635,13 +4636,11 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
 	if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
 		return false;
 
-	/*
-	 * swap_read_folio() can't handle the case a large folio is hybridly
-	 * from different backends. And they are likely corner cases. Similar
-	 * things might be added once zswap support large folios.
-	 */
+	/* swap_read_folio() can't handle hybrid backend large folios. */
 	if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
 		return false;
+	if (unlikely(zswap_entry_batch(entry, nr_pages, NULL) != nr_pages))
+		return false;
 	if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
 		return false;
 
@@ -4690,14 +4689,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	if (unlikely(userfaultfd_armed(vma)))
 		goto fallback;
 
-	/*
-	 * A large swapped out folio could be partially or fully in zswap. We
-	 * lack handling for such cases, so fallback to swapping in order-0
-	 * folio.
-	 */
-	if (!zswap_never_enabled())
-		goto fallback;
-
 	entry = softleaf_from_pte(vmf->orig_pte);
 	/*
 	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
@@ -4772,8 +4763,8 @@ static struct folio *swapin_synchronous_folio(swp_entry_t entry,
 	order = folio_order(folio);
 
 	/*
-	 * folio is charged, so swapin can only fail due to raced swapin and
-	 * return NULL.
+	 * folio is charged, so NULL means the large folio could not be
+	 * inserted and needs order-0 fallback.
 	 */
 	swapcache = swapin_folio(entry, folio);
 	if (swapcache == folio)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1415a5c54a43..4e58fad5e5f0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 #include "internal.h"
 #include "swap_table.h"
 #include "swap.h"
@@ -207,6 +208,11 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
 		if (swp_tb_is_shadow(old_tb))
 			shadow = swp_tb_to_shadow(old_tb);
 	} while (++ci_off < ci_end);
+	if (unlikely(folio_test_large(folio) &&
+		     zswap_entry_batch(entry, nr_pages, NULL) != nr_pages)) {
+		err = -EAGAIN;
+		goto failed;
+	}
 	__swap_cache_add_folio(ci, folio, entry);
 	swap_cluster_unlock(ci);
 	if (shadowp)
@@ -460,7 +466,8 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
  *
  * Context: Caller must protect the swap device with reference count or locks.
  * Return: Returns the folio being added on success. Returns the existing folio
- * if @entry is already cached. Returns NULL if raced with swapin or swapoff.
+ * if @entry is already cached. Returns NULL if raced with swapin or swapoff,
+ * or if a large folio fails a backend recheck before insertion.
  */
 static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
 						  struct folio *folio,
@@ -483,10 +490,10 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
 
 	/*
 	 * Large order allocation needs special handling on
-	 * race: if a smaller folio exists in cache, swapin needs
-	 * to fallback to order 0, and doing a swap cache lookup
-	 * might return a folio that is irrelevant to the faulting
-	 * entry because @entry is aligned down. Just return NULL.
+	 * race or backend recheck failure: swapin needs to fall back
+	 * to order 0, and doing a swap cache lookup might return a
+	 * folio that is irrelevant to the faulting entry because
+	 * @entry is aligned down. Just return NULL.
 	 */
 	if (ret != -EEXIST || folio_test_large(folio))
 		goto failed;
@@ -567,9 +574,9 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
  * with the folio size.
  *
  * Return: returns pointer to @folio on success. If folio is a large folio
- * and this raced with another swapin, NULL will be returned to allow fallback
- * to order 0. Else, if another folio was already added to the swap cache,
- * return that swap cache folio instead.
+ * and it raced with another swapin or failed a backend recheck, NULL will be
+ * returned to allow fallback to order 0. Else, if another folio was already
+ * added to the swap cache, return that swap cache folio instead.
 */
 struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
 {
-- 
2.34.1
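
A caller-side sketch of the swapin_folio() contract this change relies on,
for illustration only (it is not part of the diff above): swapin_folio()
and folio_put() are the kernel interfaces used in the patch, while
swapin_order0() is a hypothetical stand-in for the existing order-0
fallback path, and reading the folio contents is omitted.

/*
 * Illustrative sketch, not part of this patch: how a synchronous swapin
 * caller handles swapin_folio() returning NULL, which now covers both a
 * lost swapin race and a failed zswap recheck taken under the swap
 * cluster lock.  swapin_order0() is a hypothetical stand-in for the
 * existing order-0 path.
 */
static struct folio *swapin_large_or_fallback(swp_entry_t entry,
					      struct folio *large)
{
	struct folio *swapcache;

	/* Try to insert the already-charged large folio into the swap cache. */
	swapcache = swapin_folio(entry, large);
	if (!swapcache) {
		/*
		 * Raced with another swapin, or the entry range is no longer
		 * uniformly backed by zswap: drop the large folio and fall
		 * back to order 0.
		 */
		folio_put(large);
		return swapin_order0(entry);
	}
	if (swapcache != large) {
		/* Another folio is already cached for @entry; drop ours. */
		folio_put(large);
	}
	return swapcache;
}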