From: Kairui Song

Now that large order allocation is supported in the swap cache, make
both anon and shmem use it instead of each implementing its own
method for doing so.

Signed-off-by: Kairui Song
---
 mm/memory.c     |  77 +++++---------------------
 mm/shmem.c      |  94 ++++++++------------------------
 mm/swap.h       |  30 ++---------
 mm/swap_state.c | 163 ++++++++++++--------------------------------------------
 mm/swapfile.c   |   3 +-
 5 files changed, 76 insertions(+), 291 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 21bf2517fbce..e58f976508b3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4520,26 +4520,6 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 	return VM_FAULT_SIGBUS;
 }
 
-static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	struct folio *folio;
-	softleaf_t entry;
-
-	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
-	if (!folio)
-		return NULL;
-
-	entry = softleaf_from_pte(vmf->orig_pte);
-	if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
-					   GFP_KERNEL, entry)) {
-		folio_put(folio);
-		return NULL;
-	}
-
-	return folio;
-}
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
  * Check if the PTEs within a range are contiguous swap entries
@@ -4569,8 +4549,6 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
 	 */
 	if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
 		return false;
-	if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
-		return false;
 
 	return true;
 }
@@ -4598,16 +4576,14 @@ static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
 	return orders;
 }
 
-static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	unsigned long orders;
-	struct folio *folio;
 	unsigned long addr;
 	softleaf_t entry;
 	spinlock_t *ptl;
 	pte_t *pte;
-	gfp_t gfp;
 	int order;
 
 	/*
@@ -4615,7 +4591,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	 * maintain the uffd semantics.
 	 */
 	if (unlikely(userfaultfd_armed(vma)))
-		goto fallback;
+		return 0;
 
 	/*
 	 * A large swapped out folio could be partially or fully in zswap. We
@@ -4623,7 +4599,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	 * folio.
 	 */
 	if (!zswap_never_enabled())
-		goto fallback;
+		return 0;
 
 	entry = softleaf_from_pte(vmf->orig_pte);
 	/*
@@ -4637,12 +4613,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 					  vmf->address, orders);
 
 	if (!orders)
-		goto fallback;
+		return 0;
 
 	pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
 				  vmf->address & PMD_MASK, &ptl);
 	if (unlikely(!pte))
-		goto fallback;
+		return 0;
 
 	/*
 	 * For do_swap_page, find the highest order where the aligned range is
@@ -4658,29 +4634,12 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 
 	pte_unmap_unlock(pte, ptl);
 
-	/* Try allocating the highest of the remaining orders. */
-	gfp = vma_thp_gfp_mask(vma);
-	while (orders) {
-		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
-		folio = vma_alloc_folio(gfp, order, vma, addr);
-		if (folio) {
-			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
-							    gfp, entry))
-				return folio;
-			count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
-			folio_put(folio);
-		}
-		count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
-		order = next_order(&orders, order);
-	}
-
-fallback:
-	return __alloc_swap_folio(vmf);
+	return orders;
 }
 
 #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+static unsigned long thp_swapin_suitable_orders(struct vm_fault *vmf)
 {
-	return __alloc_swap_folio(vmf);
+	return 0;
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
@@ -4785,21 +4744,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (folio)
 		swap_update_readahead(folio, vma, vmf->address);
 	if (!folio) {
-		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
-			folio = alloc_swap_folio(vmf);
-			if (folio) {
-				/*
-				 * folio is charged, so swapin can only fail due
-				 * to raced swapin and return NULL.
-				 */
-				swapcache = swapin_folio(entry, folio);
-				if (swapcache != folio)
-					folio_put(folio);
-				folio = swapcache;
-			}
-		} else {
+		/* Swapin bypasses readahead for SWP_SYNCHRONOUS_IO devices */
+		if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
+			folio = swapin_entry(entry, GFP_HIGHUSER_MOVABLE,
+					     thp_swapin_suitable_orders(vmf),
+					     vmf, NULL, 0);
+		else
 			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
-		}
 
 		if (!folio) {
 			/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 9f054b5aae8e..0a19ac82ec77 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -159,7 +159,7 @@ static unsigned long shmem_default_max_inodes(void)
 
 static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 			      struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
-			      struct vm_area_struct *vma, vm_fault_t *fault_type);
+			      struct vm_fault *vmf, vm_fault_t *fault_type);
 
 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
 {
@@ -2014,68 +2014,24 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
 }
 
 static struct folio *shmem_swap_alloc_folio(struct inode *inode,
-		struct vm_area_struct *vma, pgoff_t index,
+		struct vm_fault *vmf, pgoff_t index,
 		swp_entry_t entry, int order, gfp_t gfp)
 {
+	pgoff_t ilx;
+	struct folio *folio;
+	struct mempolicy *mpol;
+	unsigned long orders = BIT(order);
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	struct folio *new, *swapcache;
-	int nr_pages = 1 << order;
-	gfp_t alloc_gfp = gfp;
-
-	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
-		if (WARN_ON_ONCE(order))
-			return ERR_PTR(-EINVAL);
-	} else if (order) {
-		/*
-		 * If uffd is active for the vma, we need per-page fault
-		 * fidelity to maintain the uffd semantics, then fallback
-		 * to swapin order-0 folio, as well as for zswap case.
-		 * Any existing sub folio in the swap cache also blocks
-		 * mTHP swapin.
-		 */
-		if ((vma && unlikely(userfaultfd_armed(vma))) ||
-		    !zswap_never_enabled() ||
-		    non_swapcache_batch(entry, nr_pages) != nr_pages)
-			goto fallback;
-		alloc_gfp = thp_limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
-	}
-retry:
-	new = shmem_alloc_folio(alloc_gfp, order, info, index);
-	if (!new) {
-		new = ERR_PTR(-ENOMEM);
-		goto fallback;
-	}
 
+	if ((vmf && unlikely(userfaultfd_armed(vmf->vma))) ||
+	    !zswap_never_enabled())
+		orders = 0;
 
-	if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
-					   alloc_gfp, entry)) {
-		folio_put(new);
-		new = ERR_PTR(-ENOMEM);
-		goto fallback;
-	}
+	mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
+	folio = swapin_entry(entry, gfp, orders, vmf, mpol, ilx);
+	mpol_cond_put(mpol);
 
-	swapcache = swapin_folio(entry, new);
-	if (swapcache != new) {
-		folio_put(new);
-		if (!swapcache) {
-			/*
-			 * The new folio is charged already, swapin can
-			 * only fail due to another raced swapin.
-			 */
-			new = ERR_PTR(-EEXIST);
-			goto fallback;
-		}
-	}
-	return swapcache;
-fallback:
-	/* Order 0 swapin failed, nothing to fallback to, abort */
-	if (!order)
-		return new;
-	entry.val += index - round_down(index, nr_pages);
-	alloc_gfp = gfp;
-	nr_pages = 1;
-	order = 0;
-	goto retry;
+	return folio;
 }
 
 /*
@@ -2262,11 +2218,12 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
  */
 static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 			      struct folio **foliop, enum sgp_type sgp,
-			      gfp_t gfp, struct vm_area_struct *vma,
+			      gfp_t gfp, struct vm_fault *vmf,
 			      vm_fault_t *fault_type)
 {
 	struct address_space *mapping = inode->i_mapping;
-	struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
+	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+	struct mm_struct *fault_mm = vmf ? vmf->vma->vm_mm : NULL;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	swp_entry_t swap;
 	softleaf_t index_entry;
@@ -2307,20 +2264,15 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	if (!folio) {
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
 			/* Direct swapin skipping swap cache & readahead */
-			folio = shmem_swap_alloc_folio(inode, vma, index,
-						       index_entry, order, gfp);
-			if (IS_ERR(folio)) {
-				error = PTR_ERR(folio);
-				folio = NULL;
-				goto failed;
-			}
+			folio = shmem_swap_alloc_folio(inode, vmf, index,
+						       swap, order, gfp);
 		} else {
 			/* Cached swapin only supports order 0 folio */
 			folio = shmem_swapin_cluster(swap, gfp, info, index);
-			if (!folio) {
-				error = -ENOMEM;
-				goto failed;
-			}
+		}
+		if (!folio) {
+			error = -ENOMEM;
+			goto failed;
 		}
 		if (fault_type) {
 			*fault_type |= VM_FAULT_MAJOR;
@@ -2468,7 +2420,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 
 	if (xa_is_value(folio)) {
 		error = shmem_swapin_folio(inode, index, &folio,
-					   sgp, gfp, vma, fault_type);
+					   sgp, gfp, vmf, fault_type);
 		if (error == -EEXIST)
 			goto repeat;
 
diff --git a/mm/swap.h b/mm/swap.h
index 6774af10a943..80c2f1bf7a57 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -300,7 +300,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 		struct mempolicy *mpol, pgoff_t ilx);
 struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
 		struct vm_fault *vmf);
-struct folio *swapin_folio(swp_entry_t entry, struct folio *folio);
+struct folio *swapin_entry(swp_entry_t entry, gfp_t flag, unsigned long orders,
+		struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx);
 void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
 		unsigned long addr);
 
@@ -334,24 +335,6 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
 	return find_next_bit(sis->zeromap, end, start) - start;
 }
 
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
-	int i;
-
-	/*
-	 * While allocating a large folio and doing mTHP swapin, we need to
-	 * ensure all entries are not cached, otherwise, the mTHP folio will
-	 * be in conflict with the folio in swap cache.
-	 */
-	for (i = 0; i < max_nr; i++) {
-		if (swap_cache_has_folio(entry))
-			return i;
-		entry.val++;
-	}
-
-	return i;
-}
-
 #else /* CONFIG_SWAP */
 struct swap_iocb;
 static inline struct swap_cluster_info *swap_cluster_lock(
@@ -433,7 +416,9 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
 	return NULL;
 }
 
-static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
+static inline struct folio *swapin_entry(
+		swp_entry_t entry, gfp_t flag, unsigned long orders,
+		struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx)
 {
 	return NULL;
 }
@@ -493,10 +478,5 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
 {
 	return 0;
 }
-
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
-	return 0;
-}
 #endif /* CONFIG_SWAP */
 #endif /* _MM_SWAP_H */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e32b06a1f229..0a2a4e084cf2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -199,43 +199,6 @@ void __swap_cache_add_folio(struct swap_cluster_info *ci,
 	lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
 }
 
-/**
- * swap_cache_add_folio - Add a folio into the swap cache.
- * @folio: The folio to be added.
- * @entry: The swap entry corresponding to the folio.
- * @gfp: gfp_mask for XArray node allocation.
- * @shadowp: If a shadow is found, return the shadow.
- *
- * Context: Caller must ensure @entry is valid and protect the swap device
- * with reference count or locks.
- */
-static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
-				void **shadowp)
-{
-	int err;
-	void *shadow = NULL;
-	unsigned int ci_off;
-	struct swap_info_struct *si;
-	struct swap_cluster_info *ci;
-	unsigned long nr_pages = folio_nr_pages(folio);
-
-	si = __swap_entry_to_info(entry);
-	ci = swap_cluster_lock(si, swp_offset(entry));
-	ci_off = swp_cluster_offset(entry);
-	err = __swap_cache_check_batch(ci, ci_off, ci_off, nr_pages, &shadow);
-	if (err) {
-		swap_cluster_unlock(ci);
-		return err;
-	}
-
-	__swap_cache_add_folio(ci, folio, entry);
-	swap_cluster_unlock(ci);
-	if (shadowp)
-		*shadowp = shadow;
-
-	return 0;
-}
-
 static struct folio *__swap_cache_alloc(struct swap_cluster_info *ci,
 					swp_entry_t targ_entry, gfp_t gfp,
 					unsigned int order, struct vm_fault *vmf,
@@ -328,30 +291,28 @@ struct folio *swap_cache_alloc_folio(swp_entry_t targ_entry, gfp_t gfp_mask,
 				     unsigned long orders, struct vm_fault *vmf,
 				     struct mempolicy *mpol, pgoff_t ilx)
 {
-	int order;
+	int order, err;
 	struct folio *folio;
 	struct swap_cluster_info *ci;
 
+	/* Always allow order 0 so swapin won't fail under memory pressure. */
+	order = orders ? highest_order(orders |= BIT(0)) : 0;
 	ci = __swap_entry_to_cluster(targ_entry);
-	order = orders ? highest_order(orders) : 0;
 	for (;;) {
 		folio = __swap_cache_alloc(ci, targ_entry, gfp_mask,
 					   order, vmf, mpol, ilx);
 		if (!IS_ERR(folio))
 			return folio;
-		if (PTR_ERR(folio) == -EAGAIN)
+		err = PTR_ERR(folio);
+		if (err == -EAGAIN)
 			continue;
-		/* Only -EBUSY means we should fallback and retry. */
-		if (PTR_ERR(folio) != -EBUSY)
-			return folio;
+		if (!order || (err != -EBUSY && err != -ENOMEM))
+			break;
 		count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
 		order = next_order(&orders, order);
-		if (!orders)
-			break;
 	}
-	/* Should never reach here, order 0 should not fail with -EBUSY. */
-	WARN_ON_ONCE(1);
-	return ERR_PTR(-EINVAL);
+
+	return ERR_PTR(err);
 }
 
 /**
@@ -584,51 +545,6 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
 	}
 }
 
-/**
- * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache.
- * @entry: swap entry to be bound to the folio.
- * @folio: folio to be added.
- * @gfp: memory allocation flags for charge, can be 0 if @charged if true.
- * @charged: if the folio is already charged.
- *
- * Update the swap_map and add folio as swap cache, typically before swapin.
- * All swap slots covered by the folio must have a non-zero swap count.
- *
- * Context: Caller must protect the swap device with reference count or locks.
- * Return: 0 if success, error code if failed.
- */
-static int __swap_cache_prepare_and_add(swp_entry_t entry,
-					struct folio *folio,
-					gfp_t gfp, bool charged)
-{
-	void *shadow;
-	int ret;
-
-	__folio_set_locked(folio);
-	__folio_set_swapbacked(folio);
-	ret = swap_cache_add_folio(folio, entry, &shadow);
-	if (ret)
-		goto failed;
-
-	if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) {
-		swap_cache_del_folio(folio);
-		ret = -ENOMEM;
-		goto failed;
-	}
-
-	memcg1_swapin(entry, folio_nr_pages(folio));
-	if (shadow)
-		workingset_refault(folio, shadow);
-
-	/* Caller will initiate read into locked folio */
-	folio_add_lru(folio);
-	return 0;
-
-failed:
-	folio_unlock(folio);
-	return ret;
-}
-
 static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
 		struct mempolicy *mpol, pgoff_t ilx,
 		struct swap_iocb **plug, bool readahead)
@@ -649,7 +565,6 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
 		folio = swap_cache_get_folio(entry);
 		if (folio)
 			return folio;
-
 		folio = swap_cache_alloc_folio(entry, gfp, 0, NULL, mpol, ilx);
 	} while (PTR_ERR(folio) == -EEXIST);
 
@@ -666,49 +581,37 @@ static struct folio *swap_cache_read_folio(swp_entry_t entry, gfp_t gfp,
 }
 
 /**
- * swapin_folio - swap-in one or multiple entries skipping readahead.
- * @entry: starting swap entry to swap in
- * @folio: a new allocated and charged folio
+ * swapin_entry - swap-in one or multiple entries skipping readahead.
+ * @entry: swap entry indicating the target slot
+ * @gfp: memory allocation flags
+ * @orders: allowed folio orders (bitmask) for the allocation
+ * @vmf: fault information
+ * @mpol: NUMA memory allocation policy to be applied
+ * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
  *
- * Reads @entry into @folio, @folio will be added to the swap cache.
- * If @folio is a large folio, the @entry will be rounded down to align
- * with the folio size.
+ * Allocate a folio suitable for @orders and read @entry into it, or return
+ * the existing folio in the swap cache for @entry. The IO is initiated here
+ * if needed. @entry may be rounded down if @orders allows a large allocation.
  *
- * Return: returns pointer to @folio on success. If folio is a large folio
- * and this raced with another swapin, NULL will be returned to allow fallback
- * to order 0. Else, if another folio was already added to the swap cache,
- * return that swap cache folio instead.
+ * Context: Caller must ensure @entry is valid and pin the swap device with a
+ * reference count.
+ * Return: The folio on success, or NULL on failure.
 */
-struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
+struct folio *swapin_entry(swp_entry_t entry, gfp_t gfp, unsigned long orders,
+		struct vm_fault *vmf, struct mempolicy *mpol, pgoff_t ilx)
 {
-	int ret;
-	struct folio *swapcache;
-	pgoff_t offset = swp_offset(entry);
-	unsigned long nr_pages = folio_nr_pages(folio);
-
-	entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
-	for (;;) {
-		ret = __swap_cache_prepare_and_add(entry, folio, 0, true);
-		if (!ret) {
-			swap_read_folio(folio, NULL);
-			break;
-		}
+	struct folio *folio;
 
-		/*
-		 * Large order allocation needs special handling on
-		 * race: if a smaller folio exists in cache, swapin needs
-		 * to fallback to order 0, and doing a swap cache lookup
-		 * might return a folio that is irrelevant to the faulting
-		 * entry because @entry is aligned down. Just return NULL.
-		 */
-		if (ret != -EEXIST || nr_pages > 1)
-			return NULL;
+	do {
+		folio = swap_cache_get_folio(entry);
+		if (folio)
+			return folio;
+		folio = swap_cache_alloc_folio(entry, gfp, orders, vmf, mpol, ilx);
+	} while (PTR_ERR(folio) == -EEXIST);
 
-		swapcache = swap_cache_get_folio(entry);
-		if (swapcache)
-			return swapcache;
-	}
+	if (IS_ERR(folio))
+		return NULL;
+	swap_read_folio(folio, NULL);
 	return folio;
 }
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 06b37efad2bd..7e7614a5181a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1833,8 +1833,7 @@ void folio_put_swap(struct folio *folio, struct page *subpage)
  *   do_swap_page()
  *     ...                                  swapoff+swapon
  *     swap_cache_alloc_folio()
- *       swap_cache_add_folio()
- *         // check swap_map
+ *       // check swap_map
  *     // verify PTE not changed
  *
  * In __swap_duplicate(), the swap_map need to be checked before
-- 
2.53.0