Teach copy_huge_pmd()/copy_huge_non_present_pmd() about swap entries, mirroring copy_nonpresent_pte(). swap_dup_entry_direct() gains a nr parameter (and is renamed to swap_dup_entries_direct()) so it can duplicate a contiguous range of swap slots in one call, matching the existing swap_put_entries_direct(entry, nr) API. Existing callers pass 1. copy_huge_non_present_pmd() "copies" PMD swap entries during fork instead of splitting, preserving the THP. This mirrors copy_nonpresent_pte() which duplicates the swap slot refcount, clears the exclusive bit on the source, and adds the destination mm to mmlist. If swap_dup_entries_direct() fails (GFP_ATOMIC table alloc), copy_huge_pmd() retries after swap_retry_table_alloc() with GFP_KERNEL, matching the PTE retry in copy_pte_range(). The PMD is stable across the retry because dup_mmap() holds write mmap_lock on both mm_structs. Signed-off-by: Usama Arif --- include/linux/swap.h | 4 ++-- mm/huge_memory.c | 53 ++++++++++++++++++++++++++++++++++++++------ mm/memory.c | 2 +- mm/swapfile.c | 7 +++--- 4 files changed, 53 insertions(+), 13 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 8d19be675baf..0b1db19e6ae3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -451,7 +451,7 @@ sector_t swap_folio_sector(struct folio *folio); * All entries must be allocated by folio_alloc_swap(). And they must have * a swap count > 1. See comments of folio_*_swap helpers for more info. 
*/ -int swap_dup_entry_direct(swp_entry_t entry); +int swap_dup_entries_direct(swp_entry_t entry, int nr); void swap_put_entries_direct(swp_entry_t entry, int nr); /* @@ -495,7 +495,7 @@ static inline void free_swap_cache(struct folio *folio) { } -static inline int swap_dup_entry_direct(swp_entry_t ent) +static inline int swap_dup_entries_direct(swp_entry_t ent, int nr) { return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 201193ce0373..69e4e09ac1f6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1805,7 +1805,7 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr, return false; } -static void copy_huge_non_present_pmd( +static int copy_huge_non_present_pmd( struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, @@ -1851,14 +1851,35 @@ static void copy_huge_non_present_pmd( */ folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page, dst_vma, src_vma); + } else if (softleaf_is_swap(entry)) { + int err; + + /* + * PMD swap entry: duplicate swap references and clear + * exclusive on source, matching copy_nonpresent_pte(). 
+ */ + err = swap_dup_entries_direct(entry, HPAGE_PMD_NR); + if (err < 0) + return err; + + mm_prepare_for_swap_entries(dst_mm); + + if (pmd_swp_exclusive(pmd)) { + pmd = pmd_swp_clear_exclusive(pmd); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } } - add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + if (softleaf_is_swap(entry)) + add_mm_counter(dst_mm, MM_SWAPENTS, HPAGE_PMD_NR); + else + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); if (!userfaultfd_wp(dst_vma)) pmd = pmd_swp_clear_uffd_wp(pmd); set_pmd_at(dst_mm, addr, dst_pmd, pmd); + return 0; } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -1899,6 +1920,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(!pgtable)) goto out; +retry: dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -1906,11 +1928,28 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; - if (unlikely(thp_migration_supported() && - pmd_is_valid_softleaf(pmd))) { - copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, - dst_vma, src_vma, pmd, pgtable); - ret = 0; + if (unlikely(pmd_is_valid_softleaf(pmd))) { + ret = copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, + addr, dst_vma, src_vma, pmd, + pgtable); + if (ret) { + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + /* + * For PMD swap entries, -ENOMEM means the per-cluster + * swap-extend table couldn't be GFP_ATOMIC-allocated. + * Try the GFP_KERNEL fallback once before giving up. 
+ */ + if (ret == -ENOMEM) { + softleaf_t entry = softleaf_from_pmd(pmd); + + if (softleaf_is_swap(entry) && + !swap_retry_table_alloc(entry, GFP_KERNEL)) + goto retry; + } + pte_free(dst_mm, pgtable); + goto out; + } goto out_unlock; } diff --git a/mm/memory.c b/mm/memory.c index 6637c5b13c9b..e0819a562187 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -950,7 +950,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct page *page; if (likely(softleaf_is_swap(entry))) { - if (swap_dup_entry_direct(entry) < 0) + if (swap_dup_entries_direct(entry, 1) < 0) return -EIO; mm_prepare_for_swap_entries(dst_mm); diff --git a/mm/swapfile.c b/mm/swapfile.c index 5a69716b2052..0695dbd1a8b1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3899,8 +3899,9 @@ void si_swapinfo(struct sysinfo *val) } /* - * swap_dup_entry_direct() - Increase reference count of a swap entry by one. + * swap_dup_entries_direct() - Increase reference count of swap entries by one. * @entry: first swap entry from which we want to increase the refcount. + * @nr: number of contiguous swap entries to duplicate. * * Returns 0 for success, or -ENOMEM if the extend table is required * but could not be atomically allocated. Returns -EINVAL if the swap @@ -3912,7 +3913,7 @@ void si_swapinfo(struct sysinfo *val) * Also the swap entry must have a count >= 1. Otherwise folio_dup_swap should * be used. */ -int swap_dup_entry_direct(swp_entry_t entry) +int swap_dup_entries_direct(swp_entry_t entry, int nr) { struct swap_info_struct *si; @@ -3929,7 +3930,7 @@ int swap_dup_entry_direct(swp_entry_t entry) */ VM_WARN_ON_ONCE(!swap_entry_swapped(si, entry)); - return swap_dup_entries_cluster(si, swp_offset(entry), 1); + return swap_dup_entries_cluster(si, swp_offset(entry), nr); } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -- 2.53.0-Meta