zswap_store() already stores every base page of a large folio as a
separate zswap entry, and unwinds the whole folio if any page fails to
store. The load side, however, still rejects any large folio, which
forces the swapin path to avoid mTHP swapin entirely once zswap has
ever been enabled.

Use zswap_entry_batch() to distinguish three cases:

 - the whole range is absent from zswap: fall through to the disk
   backend (-ENOENT);
 - the whole range is present in zswap: decompress it one base page
   at a time;
 - the range is mixed: treat it as an invalid large-folio backend
   selection (-EINVAL).

Once all entries have decompressed successfully, mark the folio
uptodate and dirty, count the mTHP swpin stat once for the folio and
one ZSWPIN event per base page, and invalidate each zswap entry, since
the swapcache folio becomes the authoritative copy.

Signed-off-by: fujunjie
---
 Documentation/admin-guide/mm/transhuge.rst |  4 +-
 mm/zswap.c                                 | 65 ++++++++++++++--------
 2 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 5fbc3d89bb07..05456906aff6 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -644,8 +644,8 @@ zswpout
 	piece without splitting.
 
 swpin
-	is incremented every time a huge page is swapped in from a non-zswap
-	swap device in one piece.
+	is incremented every time a huge page is swapped in from swap I/O or
+	zswap in one piece.
 
 swpin_fallback
 	is incremented if swapin fails to allocate or charge a huge page
diff --git a/mm/zswap.c b/mm/zswap.c
index 27c14b8edd15..863ca1e896ed 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -28,6 +28,7 @@
 #include <linux/zpool.h>
 #include <crypto/acompress.h>
 #include <linux/zswap.h>
+#include <linux/huge_mm.h>
 #include <linux/mm_types.h>
 #include <linux/page-flags.h>
 #include <linux/swapops.h>
@@ -1614,20 +1615,23 @@ bool zswap_store(struct folio *folio)
  * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page()
  * will SIGBUS).
  *
- * -EINVAL: if the swapped out content was in zswap, but the page belongs
- * to a large folio, which is not supported by zswap. The folio is unlocked,
- * but NOT marked up-to-date, so that an IO error is emitted (e.g.
- * do_swap_page() will SIGBUS).
+ * -EINVAL: if the folio spans a mix of zswap and non-zswap entries. The
+ * folio is unlocked, but NOT marked up-to-date, so that an IO error is
+ * emitted (e.g. do_swap_page() will SIGBUS). Large folio swapin should
+ * reject such ranges before calling zswap_load().
  *
- * -ENOENT: if the swapped out content was not in zswap. The folio remains
+ * -ENOENT: if the swapped out content was not in zswap. For a large folio,
+ * this means the whole folio range was not in zswap. The folio remains
  * locked on return.
  */
 int zswap_load(struct folio *folio)
 {
 	swp_entry_t swp = folio->swap;
 	pgoff_t offset = swp_offset(swp);
-	struct xarray *tree = swap_zswap_tree(swp);
 	struct zswap_entry *entry;
+	int nr_pages = folio_nr_pages(folio);
+	bool is_zswap;
+	int index;
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
 	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1635,30 +1639,36 @@ int zswap_load(struct folio *folio)
 	if (zswap_never_enabled())
 		return -ENOENT;
 
-	/*
-	 * Large folios should not be swapped in while zswap is being used, as
-	 * they are not properly handled. Zswap does not properly load large
-	 * folios, and a large folio may only be partially in zswap.
-	 */
-	if (WARN_ON_ONCE(folio_test_large(folio))) {
+	if (zswap_entry_batch(swp, nr_pages, &is_zswap) != nr_pages) {
+		WARN_ON_ONCE(folio_test_large(folio));
 		folio_unlock(folio);
 		return -EINVAL;
 	}
 
-	entry = xa_load(tree, offset);
-	if (!entry)
+	if (!is_zswap)
 		return -ENOENT;
 
-	if (!zswap_decompress(entry, folio, 0)) {
-		folio_unlock(folio);
-		return -EIO;
+	for (index = 0; index < nr_pages; index++) {
+		swp_entry_t entry_swp = swp_entry(swp_type(swp),
+						  offset + index);
+		struct xarray *tree = swap_zswap_tree(entry_swp);
+
+		entry = xa_load(tree, offset + index);
+		if (WARN_ON_ONCE(!entry)) {
+			folio_unlock(folio);
+			return -EINVAL;
+		}
+
+		if (!zswap_decompress(entry, folio, index)) {
+			folio_unlock(folio);
+			return -EIO;
+		}
 	}
 
 	folio_mark_uptodate(folio);
 
-	count_vm_event(ZSWPIN);
-	if (entry->objcg)
-		count_objcg_events(entry->objcg, ZSWPIN, 1);
+	count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
+	count_vm_events(ZSWPIN, nr_pages);
 
 	/*
 	 * We are reading into the swapcache, invalidate zswap entry.
@@ -1668,8 +1678,19 @@ int zswap_load(struct folio *folio)
 	 * compression work.
 	 */
 	folio_mark_dirty(folio);
-	xa_erase(tree, offset);
-	zswap_entry_free(entry);
+
+	for (index = 0; index < nr_pages; index++) {
+		swp_entry_t entry_swp = swp_entry(swp_type(swp),
+						  offset + index);
+		struct xarray *tree = swap_zswap_tree(entry_swp);
+
+		entry = xa_erase(tree, offset + index);
+		if (WARN_ON_ONCE(!entry))
+			continue;
+		if (entry->objcg)
+			count_objcg_events(entry->objcg, ZSWPIN, 1);
+		zswap_entry_free(entry);
+	}
 
 	folio_unlock(folio);
 	return 0;
-- 
2.34.1
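
Note for reviewers: zswap_entry_batch() is not defined in this patch,
so it presumably comes from an earlier patch in the series. For
context, here is a minimal sketch of the semantics the call site above
relies on; the body below is an assumption inferred from the caller,
not the series' actual implementation:

/*
 * Sketch only (assumed semantics, inferred from the call site in
 * zswap_load()): scan the @nr_pages swap slots starting at @swp and
 * set *@is_zswap to whether the first slot has a zswap entry. The
 * return value is the length of the leading run of slots that match
 * the first one, so a return value equal to @nr_pages means the range
 * is uniformly present or uniformly absent, and anything smaller
 * means the range mixes zswap and non-zswap entries.
 */
static int zswap_entry_batch(swp_entry_t swp, int nr_pages, bool *is_zswap)
{
	pgoff_t offset = swp_offset(swp);
	int i;

	/* Classify the first slot; xa_load() takes the RCU lock itself. */
	*is_zswap = !!xa_load(swap_zswap_tree(swp), offset);

	/* Count how many of the following slots agree with the first. */
	for (i = 1; i < nr_pages; i++) {
		swp_entry_t e = swp_entry(swp_type(swp), offset + i);

		if (!!xa_load(swap_zswap_tree(e), offset + i) != *is_zswap)
			break;
	}

	return i;
}

Under these assumed semantics, returning the length of the uniform
leading run (rather than a tristate) lets the caller detect a mixed
range with a single comparison against nr_pages, while *is_zswap
disambiguates the all-present and all-absent cases.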