Remove the zswap tree and manage zswap entries directly through the virtual swap descriptor. This re-partitions the zswap pool (by virtual swap cluster): each zswap entry is now looked up and updated under its vswap cluster's lock instead of through the single global zswap xarray, which eliminates zswap tree lock contention. Signed-off-by: Nhat Pham --- include/linux/zswap.h | 6 +++ mm/vswap.c | 100 ++++++++++++++++++++++++++++++++++++++++++ mm/zswap.c | 56 ++++------------------- 3 files changed, 114 insertions(+), 48 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 1a04caf283dc..7eb3ce7e124f 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -6,6 +6,7 @@ #include struct lruvec; +struct zswap_entry; extern atomic_long_t zswap_stored_pages; @@ -33,6 +34,11 @@ void zswap_lruvec_state_init(struct lruvec *lruvec); void zswap_folio_swapin(struct folio *folio); bool zswap_is_enabled(void); bool zswap_never_enabled(void); +void *zswap_entry_store(swp_entry_t swpentry, struct zswap_entry *entry); +void *zswap_entry_load(swp_entry_t swpentry); +void *zswap_entry_erase(swp_entry_t swpentry); +bool zswap_empty(swp_entry_t swpentry); + #else struct zswap_lruvec_state {}; diff --git a/mm/vswap.c b/mm/vswap.c index 3be42c45a1bb..fad1fd86e0f5 100644 --- a/mm/vswap.c +++ b/mm/vswap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "swap.h" #include "swap_table.h" @@ -38,11 +39,13 @@ * Swap descriptor - metadata of a swapped out page. * * @slot: The handle to the physical swap slot backing this page. + * @zswap_entry: The zswap entry associated with this swap slot. * @swap_cache: The folio in swap cache. * @shadow: The shadow entry. 
*/ struct swp_desc { swp_slot_t slot; + struct zswap_entry *zswap_entry; union { struct folio *swap_cache; void *shadow; @@ -241,6 +244,7 @@ static void __vswap_alloc_from_cluster(struct vswap_cluster *cluster, int start, for (i = 0; i < nr; i++) { desc = &cluster->descriptors[start + i]; desc->slot.val = 0; + desc->zswap_entry = NULL; desc->swap_cache = folio; } cluster->count += nr; @@ -1034,6 +1038,102 @@ void __swap_cache_replace_folio(struct folio *old, struct folio *new) rcu_read_unlock(); } +#ifdef CONFIG_ZSWAP +/** + * zswap_entry_store - store a zswap entry for a swap entry + * @swpentry: the swap entry + * @entry: the zswap entry to store + * + * Stores a zswap entry in the swap descriptor for the given swap entry. + * The cluster is locked during the store operation. + * + * Return: the old zswap entry if one existed, NULL otherwise + */ +void *zswap_entry_store(swp_entry_t swpentry, struct zswap_entry *entry) +{ + struct vswap_cluster *cluster = NULL; + struct swp_desc *desc; + void *old; + + rcu_read_lock(); + desc = vswap_iter(&cluster, swpentry.val); + if (!desc) { + rcu_read_unlock(); + return NULL; + } + + old = desc->zswap_entry; + desc->zswap_entry = entry; + spin_unlock(&cluster->lock); + rcu_read_unlock(); + + return old; +} + +/** + * zswap_entry_load - load a zswap entry for a swap entry + * @swpentry: the swap entry + * + * Loads the zswap entry from the swap descriptor for the given swap entry. 
+ * + * Return: the zswap entry if one exists, NULL otherwise + */ +void *zswap_entry_load(swp_entry_t swpentry) +{ + struct vswap_cluster *cluster = NULL; + struct swp_desc *desc; + void *zswap_entry; + + rcu_read_lock(); + desc = vswap_iter(&cluster, swpentry.val); + if (!desc) { + rcu_read_unlock(); + return NULL; + } + + zswap_entry = desc->zswap_entry; + spin_unlock(&cluster->lock); + rcu_read_unlock(); + + return zswap_entry; +} + +/** + * zswap_entry_erase - erase a zswap entry for a swap entry + * @swpentry: the swap entry + * + * Erases the zswap entry from the swap descriptor for the given swap entry. + * The cluster is locked during the erase operation. + * + * Return: the zswap entry that was erased, NULL if none existed + */ +void *zswap_entry_erase(swp_entry_t swpentry) +{ + struct vswap_cluster *cluster = NULL; + struct swp_desc *desc; + void *old; + + rcu_read_lock(); + desc = vswap_iter(&cluster, swpentry.val); + if (!desc) { + rcu_read_unlock(); + return NULL; + } + + old = desc->zswap_entry; + desc->zswap_entry = NULL; + spin_unlock(&cluster->lock); + rcu_read_unlock(); + + return old; +} + +bool zswap_empty(swp_entry_t swpentry) +{ + return xa_empty(&vswap_cluster_map); +} +#endif /* CONFIG_ZSWAP */ + int vswap_init(void) { int i; diff --git a/mm/zswap.c b/mm/zswap.c index f7313261673f..18725d9b1194 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -145,10 +145,10 @@ struct crypto_acomp_ctx { }; /* - * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock. - * The only case where lru_lock is not acquired while holding tree.lock is - * when a zswap_entry is taken off the lru for writeback, in that case it - * needs to be verified that it's still valid in the tree. + * The lock ordering is the vswap cluster lock -> zswap_pool.lru_lock. 
+ * The only case where lru_lock is not acquired while holding the vswap + * cluster lock is when a zswap_entry is taken off the lru for writeback; + * in that case it needs to be verified that it's still valid in vswap. */ struct zswap_pool { struct zs_pool *zs_pool; @@ -223,37 +223,6 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ -static DEFINE_XARRAY(zswap_tree); - -#define zswap_tree_index(entry) (entry.val) - -static inline void *zswap_entry_store(swp_entry_t swpentry, - struct zswap_entry *entry) -{ - pgoff_t offset = zswap_tree_index(swpentry); - - return xa_store(&zswap_tree, offset, entry, GFP_KERNEL); -} - -static inline void *zswap_entry_load(swp_entry_t swpentry) -{ - pgoff_t offset = zswap_tree_index(swpentry); - - return xa_load(&zswap_tree, offset); -} - -static inline void *zswap_entry_erase(swp_entry_t swpentry) -{ - pgoff_t offset = zswap_tree_index(swpentry); - - return xa_erase(&zswap_tree, offset); -} - -static inline bool zswap_empty(swp_entry_t swpentry) -{ - return xa_empty(&zswap_tree); -} - #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s\n", msg, (p)->tfm_name) @@ -1168,7 +1137,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o /* * Once the lru lock is dropped, the entry might get freed. The * swpentry is copied to the stack, and entry isn't deref'd again - * until the entry is verified to still be alive in the tree. + * until the entry is verified to still be alive in vswap. 
*/ swpentry = entry->swpentry; @@ -1445,13 +1414,6 @@ static bool zswap_store_page(struct page *page, goto compress_failed; old = zswap_entry_store(page_swpentry, entry); - if (xa_is_err(old)) { - int err = xa_err(old); - - WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err); - zswap_reject_alloc_fail++; - goto store_failed; - } /* * We may have had an existing entry that became stale when @@ -1462,11 +1424,11 @@ static bool zswap_store_page(struct page *page, zswap_entry_free(old); /* - * The entry is successfully compressed and stored in the tree, there is + * The entry is successfully compressed and stored in vswap, there is * no further possibility of failure. Grab refs to the pool and objcg, * charge zswap memory, and increment zswap_stored_pages. * The opposite actions will be performed by zswap_entry_free() - * when the entry is removed from the tree. + * when the entry is removed from vswap. */ zswap_pool_get(pool); if (objcg) { @@ -1478,7 +1440,7 @@ static bool zswap_store_page(struct page *page, atomic_long_inc(&zswap_stored_incompressible_pages); /* - * We finish initializing the entry while it's already in xarray. + * We finish initializing the entry while it's already in vswap. * This is safe because: * * 1. Concurrent stores and invalidations are excluded by folio lock. @@ -1498,8 +1460,6 @@ static bool zswap_store_page(struct page *page, return true; -store_failed: - zs_free(pool->zs_pool, entry->handle); compress_failed: zswap_entry_cache_free(entry); return false; -- 2.52.0