Since rmap_walk() is now always called on locked anon folios, we don't
have to serialize on the anon_vma lock when updating folio->mapping.
This helps avoid contention on the src anon_vma when multiple threads
are simultaneously moving distinct pages from the same src vma.

CC: David Hildenbrand
CC: Lorenzo Stoakes
CC: Peter Xu
CC: Suren Baghdasaryan
CC: Barry Song
Signed-off-by: Lokesh Gidra
---
 mm/huge_memory.c | 22 +----------------
 mm/userfaultfd.c | 62 +++++++++---------------------------------
 2 files changed, 12 insertions(+), 72 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 26cedfcd7418..5cd3957f92d4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2533,7 +2533,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
 	pmd_t _dst_pmd, src_pmdval;
 	struct page *src_page;
 	struct folio *src_folio;
-	struct anon_vma *src_anon_vma;
 	spinlock_t *src_ptl, *dst_ptl;
 	pgtable_t src_pgtable;
 	struct mmu_notifier_range range;
@@ -2582,23 +2581,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
 				src_addr + HPAGE_PMD_SIZE);
 	mmu_notifier_invalidate_range_start(&range);
 
-	if (src_folio) {
+	if (src_folio)
 		folio_lock(src_folio);
 
-		/*
-		 * split_huge_page walks the anon_vma chain without the page
-		 * lock. Serialize against it with the anon_vma lock, the page
-		 * lock is not enough.
-		 */
-		src_anon_vma = folio_get_anon_vma(src_folio);
-		if (!src_anon_vma) {
-			err = -EAGAIN;
-			goto unlock_folio;
-		}
-		anon_vma_lock_write(src_anon_vma);
-	} else
-		src_anon_vma = NULL;
-
 	dst_ptl = pmd_lockptr(mm, dst_pmd);
 	double_pt_lock(src_ptl, dst_ptl);
 	if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
@@ -2643,11 +2628,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
 	pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
 unlock_ptls:
 	double_pt_unlock(src_ptl, dst_ptl);
-	if (src_anon_vma) {
-		anon_vma_unlock_write(src_anon_vma);
-		put_anon_vma(src_anon_vma);
-	}
-unlock_folio:
 	/* unblock rmap walks */
 	if (src_folio)
 		folio_unlock(src_folio);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 50aaa8dcd24c..1a36760a36c7 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1035,8 +1035,7 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
  */
 static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
						  unsigned long src_addr,
-						 pte_t *src_pte, pte_t *dst_pte,
-						 struct anon_vma *src_anon_vma)
+						 pte_t *src_pte, pte_t *dst_pte)
 {
 	pte_t orig_dst_pte, orig_src_pte;
 	struct folio *folio;
@@ -1052,8 +1051,7 @@ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
 	folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
 	if (!folio || !folio_trylock(folio))
 		return NULL;
-	if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) ||
-	    folio_anon_vma(folio) != src_anon_vma) {
+	if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
 		folio_unlock(folio);
 		return NULL;
 	}
@@ -1061,9 +1059,8 @@ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
 }
 
 /*
- * Moves src folios to dst in a batch as long as they share the same
- * anon_vma as the first folio, are not large, and can successfully
- * take the lock via folio_trylock().
+ * Moves src folios to dst in a batch as long as they are not large, and can
+ * successfully take the lock via folio_trylock().
  */
 static long move_present_ptes(struct mm_struct *mm,
			       struct vm_area_struct *dst_vma,
@@ -1073,8 +1070,7 @@ static long move_present_ptes(struct mm_struct *mm,
			       pte_t orig_dst_pte, pte_t orig_src_pte,
			       pmd_t *dst_pmd, pmd_t dst_pmdval,
			       spinlock_t *dst_ptl, spinlock_t *src_ptl,
-			      struct folio **first_src_folio, unsigned long len,
-			      struct anon_vma *src_anon_vma)
+			      struct folio **first_src_folio, unsigned long len)
 {
 	int err = 0;
 	struct folio *src_folio = *first_src_folio;
@@ -1132,8 +1128,8 @@ static long move_present_ptes(struct mm_struct *mm,
 		src_pte++;
 
 		folio_unlock(src_folio);
-		src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte,
-							dst_pte, src_anon_vma);
+		src_folio = check_ptes_for_batched_move(src_vma, src_addr,
+							src_pte, dst_pte);
 		if (!src_folio)
 			break;
 	}
@@ -1263,7 +1259,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd
 	pmd_t dummy_pmdval;
 	pmd_t dst_pmdval;
 	struct folio *src_folio = NULL;
-	struct anon_vma *src_anon_vma = NULL;
 	struct mmu_notifier_range range;
 	long ret = 0;
 
@@ -1347,9 +1342,9 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd
 		}
 
 		/*
-		 * Pin and lock both source folio and anon_vma. Since we are in
-		 * RCU read section, we can't block, so on contention have to
-		 * unmap the ptes, obtain the lock and retry.
+		 * Pin and lock source folio. Since we are in RCU read section,
+		 * we can't block, so on contention have to unmap the ptes,
+		 * obtain the lock and retry.
 		 */
 		if (!src_folio) {
 			struct folio *folio;
@@ -1423,33 +1418,11 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd
 			goto retry;
 		}
 
-		if (!src_anon_vma) {
-			/*
-			 * folio_referenced walks the anon_vma chain
-			 * without the folio lock. Serialize against it with
-			 * the anon_vma lock, the folio lock is not enough.
-			 */
-			src_anon_vma = folio_get_anon_vma(src_folio);
-			if (!src_anon_vma) {
-				/* page was unmapped from under us */
-				ret = -EAGAIN;
-				goto out;
-			}
-			if (!anon_vma_trylock_write(src_anon_vma)) {
-				pte_unmap(src_pte);
-				pte_unmap(dst_pte);
-				src_pte = dst_pte = NULL;
-				/* now we can block and wait */
-				anon_vma_lock_write(src_anon_vma);
-				goto retry;
-			}
-		}
-
 		ret = move_present_ptes(mm, dst_vma, src_vma, dst_addr,
					 src_addr, dst_pte, src_pte,
					 orig_dst_pte, orig_src_pte, dst_pmd,
					 dst_pmdval, dst_ptl, src_ptl, &src_folio,
-					len, src_anon_vma);
+					len);
 	} else {
 		struct folio *folio = NULL;
 
@@ -1516,10 +1489,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd
 	}
 
 out:
-	if (src_anon_vma) {
-		anon_vma_unlock_write(src_anon_vma);
-		put_anon_vma(src_anon_vma);
-	}
 	if (src_folio) {
 		folio_unlock(src_folio);
 		folio_put(src_folio);
@@ -1793,15 +1762,6 @@ static void uffd_move_unlock(struct vm_area_struct *dst_vma,
  * virtual regions without knowing if there are transparent hugepage
  * in the regions or not, but preventing the risk of having to split
  * the hugepmd during the remap.
- *
- * If there's any rmap walk that is taking the anon_vma locks without
- * first obtaining the folio lock (the only current instance is
- * folio_referenced), they will have to verify if the folio->mapping
- * has changed after taking the anon_vma lock. If it changed they
- * should release the lock and retry obtaining a new anon_vma, because
- * it means the anon_vma was changed by move_pages() before the lock
- * could be obtained. This is the only additional complexity added to
- * the rmap code to provide this anonymous page remapping functionality.
  */
 ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
		    unsigned long src_start, unsigned long len, __u64 mode)
-- 
2.51.0.355.g5224444f11-goog
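
For reference, the contention scenario the commit message describes is several
userspace threads issuing UFFDIO_MOVE in parallel on distinct pages of the same
source VMA. Below is a minimal, illustrative userspace sketch of one such
thread; it is not part of this patch. It assumes a userfaultfd fd that has
already been opened, enabled with UFFD_FEATURE_MOVE via UFFDIO_API, and whose
destination range is registered. The struct move_job and move_one_page() names
are hypothetical; struct uffdio_move and UFFDIO_MOVE are the existing
userfaultfd UAPI.

#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Hypothetical per-thread work item, for illustration only. */
struct move_job {
	int uffd;			/* userfaultfd with UFFD_FEATURE_MOVE enabled */
	unsigned long src;		/* distinct page inside the shared src VMA */
	unsigned long dst;		/* page-aligned address in the registered dst VMA */
	unsigned long page_size;
};

/*
 * Thread body (e.g. started via pthread_create(..., move_one_page, &job)):
 * move one page from src to dst. With this change, threads operating on
 * different pages of the same src VMA no longer serialize on the shared
 * src anon_vma lock inside the kernel.
 */
static void *move_one_page(void *arg)
{
	struct move_job *job = arg;
	struct uffdio_move mv = {
		.src  = job->src,
		.dst  = job->dst,
		.len  = job->page_size,
		.mode = 0,
	};

	if (ioctl(job->uffd, UFFDIO_MOVE, &mv) == -1)
		perror("UFFDIO_MOVE");
	return NULL;
}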