Prior discussion about this can be found at [1]. rmap_walk() requires all folios, except non-KSM anon, to be locked. This implies that when threads update folio->mapping to an anon_vma with different root (currently only done by UFFDIO MOVE), they have to serialize against rmap_walk() with write-lock on the anon_vma, hurting scalability. Furthermore, this necessitates rechecking anon_vma when pinning/locking an anon_vma (like in folio_lock_anon_vma_read()). This can be simplified quite a bit by ensuring that rmap_walk() is always called on locked folios. Among the few callers of rmap_walk() on unlocked anon folios, shrink_active_list()->folio_referenced() is the only performance critical one. shrink_active_list() doesn't act differently depending on what folio_referenced() returns for an anon folio. So returning 1 when it is contended, like in case of other folio types, wouldn't have any negative impact. Furthermore, as David pointed out in the previous discussion [2], this could potentially only affect R/O pages after fork as PG_anon_exclusive is not set. But, such folios are already isolated (prior to calling folio_referenced()) by grabbing a reference and clearing LRU, so do_wp_page()->wp_can_reuse_anon_folio() would not reuse such folios anyways. [1] https://lore.kernel.org/all/CA+EESO4Z6wtX7ZMdDHQRe5jAAS_bQ-POq5+4aDx5jh2DvY6UHg@mail.gmail.com/ [2] https://lore.kernel.org/all/dc92aef8-757f-4432-923e-70d92d13fb37@redhat.com/ CC: David Hildenbrand CC: Lorenzo Stoakes CC: Harry Yoo CC: Peter Xu CC: Suren Baghdasaryan CC: Barry Song CC: SeongJae Park Signed-off-by: Lokesh Gidra --- mm/damon/ops-common.c | 16 ++++------------ mm/page_idle.c | 8 ++------ mm/rmap.c | 40 ++++++++++------------------------------ 3 files changed, 16 insertions(+), 48 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 998c5180a603..f61d6dde13dc 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -162,21 +162,17 @@ void damon_folio_mkold(struct folio *folio) .rmap_one = damon_folio_mkold_one, .anon_lock = folio_lock_anon_vma_read, }; - bool need_lock; if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { folio_set_idle(folio); return; } - need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); - if (need_lock && !folio_trylock(folio)) + if (!folio_trylock(folio)) return; rmap_walk(folio, &rwc); - - if (need_lock) - folio_unlock(folio); + folio_unlock(folio); } @@ -228,7 +224,6 @@ bool damon_folio_young(struct folio *folio) .rmap_one = damon_folio_young_one, .anon_lock = folio_lock_anon_vma_read, }; - bool need_lock; if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { if (folio_test_idle(folio)) @@ -237,14 +232,11 @@ bool damon_folio_young(struct folio *folio) return true; } - need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); - if (need_lock && !folio_trylock(folio)) + if (!folio_trylock(folio)) return false; rmap_walk(folio, &rwc); - - if (need_lock) - folio_unlock(folio); + folio_unlock(folio); return accessed; } diff --git a/mm/page_idle.c b/mm/page_idle.c index a82b340dc204..9bf573d22e87 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -101,19 +101,15 @@ static void page_idle_clear_pte_refs(struct folio *folio) .rmap_one = page_idle_clear_pte_refs_one, .anon_lock = folio_lock_anon_vma_read, }; - bool need_lock; if (!folio_mapped(folio) || !folio_raw_mapping(folio)) return; - need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); - if (need_lock && !folio_trylock(folio)) + if (!folio_trylock(folio)) return; rmap_walk(folio, &rwc); - - if (need_lock) - folio_unlock(folio); + folio_unlock(folio); } static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, diff --git a/mm/rmap.c b/mm/rmap.c index 34333ae3bd80..fc53f31434f4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -489,17 +489,15 @@ void __init anon_vma_init(void) * if there is a mapcount, we can dereference the anon_vma after observing * those. * - * NOTE: the caller should normally hold folio lock when calling this. If - * not, the caller needs to double check the anon_vma didn't change after - * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it - * concurrently without folio lock protection). See folio_lock_anon_vma_read() - * which has already covered that, and comment above remap_pages(). + * NOTE: the caller should hold folio lock when calling this. */ struct anon_vma *folio_get_anon_vma(const struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) @@ -546,7 +544,6 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct anon_vma *root_anon_vma; unsigned long anon_mapping; -retry: rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) @@ -557,17 +554,6 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { - /* - * folio_move_anon_rmap() might have changed the anon_vma as we - * might not hold the folio lock here. - */ - if (unlikely((unsigned long)READ_ONCE(folio->mapping) != - anon_mapping)) { - up_read(&root_anon_vma->rwsem); - rcu_read_unlock(); - goto retry; - } - /* * If the folio is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will @@ -602,18 +588,6 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, rcu_read_unlock(); anon_vma_lock_read(anon_vma); - /* - * folio_move_anon_rmap() might have changed the anon_vma as we might - * not hold the folio lock here. - */ - if (unlikely((unsigned long)READ_ONCE(folio->mapping) != - anon_mapping)) { - anon_vma_unlock_read(anon_vma); - put_anon_vma(anon_vma); - anon_vma = NULL; - goto retry; - } - if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock @@ -1005,7 +979,7 @@ int folio_referenced(struct folio *folio, int is_locked, if (!folio_raw_mapping(folio)) return 0; - if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) { + if (!is_locked) { we_locked = folio_trylock(folio); if (!we_locked) return 1; @@ -2815,6 +2789,12 @@ static void rmap_walk_anon(struct folio *folio, pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; + /* + * The folio lock ensures that folio->mapping can be changed under us to + * an anon_vma with different root, like UFFDIO MOVE. + */ + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + if (locked) { anon_vma = folio_anon_vma(folio); /* anon_vma disappear under us? */ base-commit: b024763926d2726978dff6588b81877d000159c1 -- 2.51.0.355.g5224444f11-goog