Add kdoc comments, describe exactly what these functinos are used for in detail, pointing out importantly that the anon_vma_clone() !dst->anon_vma && src->anon_vma dance is ONLY for fork. Both are confusing functions that will be refactored in a subsequent patch but the first stage is establishing documentation and some invariatns. Add some basic CONFIG_DEBUG_VM asserts that help document expected state, specifically: anon_vma_clone() - mmap write lock held. - We do nothing if src VMA is not faulted. - The destination VMA has no anon_vma_chain yet. - We are always operating on the same active VMA (i.e. vma->anon-vma). - If not forking, must operate on the same mm_struct. unlink_anon_vmas() - mmap lock held (read on unmap downgraded). - That unfaulted VMAs are no-ops. Signed-off-by: Lorenzo Stoakes --- mm/rmap.c | 82 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 18 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index d6799afe1114..0e34c0a69fbc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -257,30 +257,60 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) up_write(&root->rwsem); } -/* - * Attach the anon_vmas from src to dst. - * Returns 0 on success, -ENOMEM on failure. - * - * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(), - * copy_vma() and anon_vma_fork(). The first four want an exact copy of src, - * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to - * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before - * call, we can identify this case by checking (!dst->anon_vma && - * src->anon_vma). - * - * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find - * and reuse existing anon_vma which has no vmas and only one child anon_vma. - * This prevents degradation of anon_vma hierarchy to endless linear chain in - * case of constantly forking task. On the other hand, an anon_vma with more - * than one child isn't reused even if there was no alive vma, thus rmap - * walker has a good chance of avoiding scanning the whole hierarchy when it - * searches where page is mapped. +static void check_anon_vma_clone(struct vm_area_struct *dst, + struct vm_area_struct *src) +{ + /* The write lock must be held. */ + mmap_assert_write_locked(src->vm_mm); + /* If not a fork (implied by dst->anon_vma) then must be on same mm. */ + VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm); + + /* No source anon_vma is a no-op. */ + VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain)); + VM_WARN_ON_ONCE(!src->anon_vma && dst->anon_vma); + /* We are establishing a new anon_vma_chain. */ + VM_WARN_ON_ONCE(!list_empty(&dst->anon_vma_chain)); + /* + * On fork, dst->anon_vma is set NULL (temporarily). Otherwise, anon_vma + * must be the same across dst and src. + */ + VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma); +} + +/** + * anon_vma_clone - Establishes new anon_vma_chain objects in @dst linking to + * all of the anon_vma objects contained within @src anon_vma_chain's. + * @dst: The destination VMA with an empty anon_vma_chain. + * @src: The source VMA we wish to duplicate. + * + * This is the heart of the VMA side of the anon_vma implementation - we invoke + * this function whenever we need to set up a new VMA's anon_vma state. + * + * This is invoked for: + * + * - VMA Merge, but only when @dst is unfaulted and @src is faulted - meaning we + * clone @src into @dst. + * - VMA split. + * - VMA (m)remap. + * - Fork of faulted VMA. + * + * In all cases other than fork this is simply a duplication. Fork additionally + * adds a new active anon_vma. + * + * ONLY in the case of fork do we try to 'reuse' existing anon_vma's in an + * anon_vma hierarchy, reusing anon_vma's which have no VMA associated with them + * but do have a single child. This is to avoid waste of memory when repeatedly + * forking. + * + * Returns: 0 on success, -ENOMEM on failure. */ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; struct anon_vma *root = NULL; + check_anon_vma_clone(dst, src); + list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; @@ -392,11 +422,27 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) return -ENOMEM; } +/** + * unlink_anon_vmas() - remove all links between a VMA and anon_vma's, freeing + * anon_vma_chain objects. + * @vma: The VMA whose links to anon_vma objects is to be severed. + * + * As part of the process anon_vma_chain's are freed, + * anon_vma->num_children,num_active_vmas is updated as required and, if the + * relevant anon_vma references no further VMAs, its reference count is + * decremented. + */ void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; struct anon_vma *root = NULL; + /* Always hold mmap lock, read-lock on unmap possibly. */ + mmap_assert_locked(vma->vm_mm); + + /* Unfaulted is a no-op. */ + VM_WARN_ON_ONCE(!vma->anon_vma && !list_empty(&vma->anon_vma_chain)); + /* * Unlink each anon_vma chained to the VMA. This list is ordered * from newest to oldest, ensuring the root anon_vma gets freed last. -- 2.52.0 For both anon_vma_clone() and unlink_anon_vmas(), if the source VMA or the VMA to be linked are unfaulted (e.g. !vma->anon_vma), then the functions do nothing. Simply exit early in these cases. In the unlink_anon_vmas() case we can also remove a conditional that checks whether vma->anon_vma is set. Signed-off-by: Lorenzo Stoakes --- mm/rmap.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 0e34c0a69fbc..9332d1cbc643 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -309,6 +309,9 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) struct anon_vma_chain *avc, *pavc; struct anon_vma *root = NULL; + if (!src->anon_vma) + return 0; + check_anon_vma_clone(dst, src); list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { @@ -441,7 +444,8 @@ void unlink_anon_vmas(struct vm_area_struct *vma) mmap_assert_locked(vma->vm_mm); /* Unfaulted is a no-op. */ - VM_WARN_ON_ONCE(!vma->anon_vma && !list_empty(&vma->anon_vma_chain)); + if (!vma->anon_vma) + return; /* * Unlink each anon_vma chained to the VMA. This list is ordered @@ -465,15 +469,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_del(&avc->same_vma); anon_vma_chain_free(avc); } - if (vma->anon_vma) { - vma->anon_vma->num_active_vmas--; - /* - * vma would still be needed after unlink, and anon_vma will be prepared - * when handle fault. - */ - vma->anon_vma = NULL; - } + vma->anon_vma->num_active_vmas--; + /* + * vma would still be needed after unlink, and anon_vma will be prepared + * when handle fault. + */ + vma->anon_vma = NULL; unlock_anon_vma_root(root); /* -- 2.52.0 The root anon_vma of all anon_vma's linked to a VMA must by definition be the same - a VMA and all of its descendants/ancestors must exist in the same CoW chain. Commit bb4aa39676f7 ("mm: avoid repeated anon_vma lock/unlock sequences in anon_vma_clone()") introduced paranoid checking of the root anon_vma remaining the same throughout all AVC's in 2011. I think 15 years later we can safely assume that this is always the case. Additionally, since unfaulted VMAs being cloned from or unlinked are no-op's, we can simply lock the anon_vma's associated with this rather than doing any specific dance around this. This removes unnecessary checks and makes it clear that the root anon_vma is shared between all anon_vma's in a given VMA's anon_vma_chain. Signed-off-by: Lorenzo Stoakes --- mm/rmap.c | 48 ++++++++++++------------------------------------ 1 file changed, 12 insertions(+), 36 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 9332d1cbc643..60134a566073 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -231,32 +231,6 @@ int __anon_vma_prepare(struct vm_area_struct *vma) return -ENOMEM; } -/* - * This is a useful helper function for locking the anon_vma root as - * we traverse the vma->anon_vma_chain, looping over anon_vma's that - * have the same vma. - * - * Such anon_vma's should have the same root, so you'd expect to see - * just a single mutex_lock for the whole traversal. - */ -static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) -{ - struct anon_vma *new_root = anon_vma->root; - if (new_root != root) { - if (WARN_ON_ONCE(root)) - up_write(&root->rwsem); - root = new_root; - down_write(&root->rwsem); - } - return root; -} - -static inline void unlock_anon_vma_root(struct anon_vma *root) -{ - if (root) - up_write(&root->rwsem); -} - static void check_anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { @@ -307,26 +281,25 @@ static void check_anon_vma_clone(struct vm_area_struct *dst, int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; - struct anon_vma *root = NULL; if (!src->anon_vma) return 0; check_anon_vma_clone(dst, src); + anon_vma_lock_write(src->anon_vma); list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; avc = anon_vma_chain_alloc(GFP_NOWAIT); if (unlikely(!avc)) { - unlock_anon_vma_root(root); - root = NULL; + anon_vma_unlock_write(src->anon_vma); avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto enomem_failure; + anon_vma_lock_write(src->anon_vma); } anon_vma = pavc->anon_vma; - root = lock_anon_vma_root(root, anon_vma); anon_vma_chain_link(dst, avc, anon_vma); /* @@ -343,7 +316,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) } if (dst->anon_vma) dst->anon_vma->num_active_vmas++; - unlock_anon_vma_root(root); + + anon_vma_unlock_write(src->anon_vma); return 0; enomem_failure: @@ -438,15 +412,17 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; - struct anon_vma *root = NULL; + struct anon_vma *active_anon_vma = vma->anon_vma; /* Always hold mmap lock, read-lock on unmap possibly. */ mmap_assert_locked(vma->vm_mm); /* Unfaulted is a no-op. */ - if (!vma->anon_vma) + if (!active_anon_vma) return; + anon_vma_lock_write(active_anon_vma); + /* * Unlink each anon_vma chained to the VMA. This list is ordered * from newest to oldest, ensuring the root anon_vma gets freed last. @@ -454,7 +430,6 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; - root = lock_anon_vma_root(root, anon_vma); anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* @@ -470,13 +445,14 @@ void unlink_anon_vmas(struct vm_area_struct *vma) anon_vma_chain_free(avc); } - vma->anon_vma->num_active_vmas--; + active_anon_vma->num_active_vmas--; /* * vma would still be needed after unlink, and anon_vma will be prepared * when handle fault. */ vma->anon_vma = NULL; - unlock_anon_vma_root(root); + anon_vma_unlock_write(active_anon_vma); + /* * Iterate the list once more, it now only contains empty and unlinked -- 2.52.0 This function is confusing, we already have the concept of anon_vma merge to adjacent VMA's anon_vma's to increase probability of anon_vma compatibility and therefore VMA merge (see is_mergeable_anon_vma() etc.), as well as anon_vma reuse, along side the usual VMA merge logic. We can remove the anon_vma check as it is redundant - a merge would not have been permitted with removal if the anon_vma's were not the same (and in the case of an unfaulted/faulted merge, we would have already set the unfaulted VMA's anon_vma to vp->remove->anon_vma in dup_anon_vma()). Avoid overloading this term when we're very simply unlinking anon_vma state from a removed VMA upon merge. Signed-off-by: Lorenzo Stoakes --- include/linux/rmap.h | 7 ------- mm/vma.c | 2 +- tools/testing/vma/vma_internal.h | 5 ----- 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index daa92a58585d..832bfc0ccfc6 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -165,13 +165,6 @@ static inline int anon_vma_prepare(struct vm_area_struct *vma) return __anon_vma_prepare(vma); } -static inline void anon_vma_merge(struct vm_area_struct *vma, - struct vm_area_struct *next) -{ - VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma); - unlink_anon_vmas(next); -} - struct anon_vma *folio_get_anon_vma(const struct folio *folio); #ifdef CONFIG_MM_ID diff --git a/mm/vma.c b/mm/vma.c index fc90befd162f..feb4bbd3b259 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -376,7 +376,7 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, fput(vp->file); } if (vp->remove->anon_vma) - anon_vma_merge(vp->vma, vp->remove); + unlink_anon_vmas(vp->remove); mm->map_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 9f0a9f5ed0fe..93e5792306d9 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1265,11 +1265,6 @@ static inline void i_mmap_unlock_write(struct address_space *mapping) { } -static inline void anon_vma_merge(struct vm_area_struct *vma, - struct vm_area_struct *next) -{ -} - static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, -- 2.52.0 The bulk of the anon_vma operations are only used by mm, so formalise this by putting the function prototypes and inlines in mm/internal.h. This allows us to make changes without having to worry about the rest of the kernel. Signed-off-by: Lorenzo Stoakes --- include/linux/rmap.h | 60 -------------------------------------------- mm/internal.h | 58 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 60 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 832bfc0ccfc6..dd764951b03d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -104,68 +104,8 @@ enum ttu_flags { }; #ifdef CONFIG_MMU -static inline void get_anon_vma(struct anon_vma *anon_vma) -{ - atomic_inc(&anon_vma->refcount); -} - -void __put_anon_vma(struct anon_vma *anon_vma); - -static inline void put_anon_vma(struct anon_vma *anon_vma) -{ - if (atomic_dec_and_test(&anon_vma->refcount)) - __put_anon_vma(anon_vma); -} - -static inline void anon_vma_lock_write(struct anon_vma *anon_vma) -{ - down_write(&anon_vma->root->rwsem); -} -static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) -{ - return down_write_trylock(&anon_vma->root->rwsem); -} - -static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) -{ - up_write(&anon_vma->root->rwsem); -} - -static inline void anon_vma_lock_read(struct anon_vma *anon_vma) -{ - down_read(&anon_vma->root->rwsem); -} - -static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) -{ - return down_read_trylock(&anon_vma->root->rwsem); -} - -static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) -{ - up_read(&anon_vma->root->rwsem); -} - - -/* - * anon_vma helper functions. - */ void anon_vma_init(void); /* create anon_vma_cachep */ -int __anon_vma_prepare(struct vm_area_struct *); -void unlink_anon_vmas(struct vm_area_struct *); -int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); -int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); - -static inline int anon_vma_prepare(struct vm_area_struct *vma) -{ - if (likely(vma->anon_vma)) - return 0; - - return __anon_vma_prepare(vma); -} - -struct anon_vma *folio_get_anon_vma(const struct folio *folio); #ifdef CONFIG_MM_ID static __always_inline void folio_lock_large_mapcount(struct folio *folio) diff --git a/mm/internal.h b/mm/internal.h index e430da900430..469d4ef1ccc5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -199,6 +199,64 @@ static inline void vma_close(struct vm_area_struct *vma) #ifdef CONFIG_MMU +static inline void get_anon_vma(struct anon_vma *anon_vma) +{ + atomic_inc(&anon_vma->refcount); +} + +void __put_anon_vma(struct anon_vma *anon_vma); + +static inline void put_anon_vma(struct anon_vma *anon_vma) +{ + if (atomic_dec_and_test(&anon_vma->refcount)) + __put_anon_vma(anon_vma); +} + +static inline void anon_vma_lock_write(struct anon_vma *anon_vma) +{ + down_write(&anon_vma->root->rwsem); +} + +static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) +{ + return down_write_trylock(&anon_vma->root->rwsem); +} + +static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) +{ + up_write(&anon_vma->root->rwsem); +} + +static inline void anon_vma_lock_read(struct anon_vma *anon_vma) +{ + down_read(&anon_vma->root->rwsem); +} + +static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) +{ + return down_read_trylock(&anon_vma->root->rwsem); +} + +static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) +{ + up_read(&anon_vma->root->rwsem); +} + +struct anon_vma *folio_get_anon_vma(const struct folio *folio); + +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src); +int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma); +int __anon_vma_prepare(struct vm_area_struct *vma); +void unlink_anon_vmas(struct vm_area_struct *vma); + +static inline int anon_vma_prepare(struct vm_area_struct *vma) +{ + if (likely(vma->anon_vma)) + return 0; + + return __anon_vma_prepare(vma); +} + /* Flags for folio_pte_batch(). */ typedef int __bitwise fpb_t; -- 2.52.0 This makes it easy to explicitly check for VMA detachment, which is useful for things like asserts. Note that we intentionally do not allow this function to be available should CONFIG_PER_VMA_LOCK be set - this is because vma_assert_attached() and vma_assert_detached() are no-ops if !CONFIG_PER_VMA_LOCK, so there is no correct state for vma_is_attached() to be in if this configuration option is not specified. Therefore users elsewhere must invoke this function only after checking for CONFIG_PER_VMA_LOCK. We rework the assert functions to utilise this. Signed-off-by: Lorenzo Stoakes --- include/linux/mmap_lock.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index d53f72dba7fe..b50416fbba20 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -251,6 +251,11 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) !__is_vma_write_locked(vma, &mm_lock_seq), vma); } +static inline bool vma_is_attached(struct vm_area_struct *vma) +{ + return refcount_read(&vma->vm_refcnt); +} + /* * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these * assertions should be made either under mmap_write_lock or when the object @@ -258,12 +263,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) */ static inline void vma_assert_attached(struct vm_area_struct *vma) { - WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); + WARN_ON_ONCE(!vma_is_attached(vma)); } static inline void vma_assert_detached(struct vm_area_struct *vma) { - WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); + WARN_ON_ONCE(vma_is_attached(vma)); } static inline void vma_mark_attached(struct vm_area_struct *vma) -- 2.52.0 There is no reason to allocate the anon_vma_chain under the anon_vma write lock when cloning - we can in fact assign these to the destination VMA safely as we hold the exclusive mmap lock and therefore preclude anybody else accessing these fields. We only need take the anon_vma write lock when we link rbtree edges from the anon_vma to the newly established AVCs. This also allows us to eliminate the weird GFP_NOWAIT, GFP_KERNEL dance introduced in commit dd34739c03f2 ("mm: avoid anon_vma_chain allocation under anon_vma lock"), further simplifying this logic. This should reduce lock anon_vma contention, and clarifies exactly where the anon_vma lock is required. We cannot adjust __anon_vma_prepare() in the same way as this is only protected by VMA read lock, so we have to perform the allocation here under the anon_vma write lock and page_table_lock (to protect against racing threads), and we wish to retain the lock ordering. Signed-off-by: Lorenzo Stoakes --- mm/rmap.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 60134a566073..de9de6d71c23 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -146,14 +146,13 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } -static void anon_vma_chain_link(struct vm_area_struct *vma, - struct anon_vma_chain *avc, - struct anon_vma *anon_vma) +static void anon_vma_chain_assign(struct vm_area_struct *vma, + struct anon_vma_chain *avc, + struct anon_vma *anon_vma) { avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); } /** @@ -210,7 +209,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma) spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; - anon_vma_chain_link(vma, avc, anon_vma); + anon_vma_chain_assign(vma, avc, anon_vma); + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); anon_vma->num_active_vmas++; allocated = NULL; avc = NULL; @@ -287,20 +287,28 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) check_anon_vma_clone(dst, src); + /* + * Allocate AVCs. We don't need an anon_vma lock for this as we + * are not updating the anon_vma rbtree nor are we changing + * anon_vma statistics. + * + * We hold the mmap write lock so there's no possibliity of + * the unlinked AVC's being observed yet. + */ + list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) { + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) + goto enomem_failure; + + anon_vma_chain_assign(dst, avc, pavc->anon_vma); + } + + /* Now link the anon_vma's back to the newly inserted AVCs. */ anon_vma_lock_write(src->anon_vma); - list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { - struct anon_vma *anon_vma; - - avc = anon_vma_chain_alloc(GFP_NOWAIT); - if (unlikely(!avc)) { - anon_vma_unlock_write(src->anon_vma); - avc = anon_vma_chain_alloc(GFP_KERNEL); - if (!avc) - goto enomem_failure; - anon_vma_lock_write(src->anon_vma); - } - anon_vma = pavc->anon_vma; - anon_vma_chain_link(dst, avc, anon_vma); + list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) { + struct anon_vma *anon_vma = avc->anon_vma; + + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); /* * Reuse existing anon_vma if it has no vma and only one @@ -316,7 +324,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) } if (dst->anon_vma) dst->anon_vma->num_active_vmas++; - anon_vma_unlock_write(src->anon_vma); return 0; @@ -385,8 +392,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; + anon_vma_chain_assign(vma, avc, anon_vma); + /* Now let rmap see it. */ anon_vma_lock_write(anon_vma); - anon_vma_chain_link(vma, avc, anon_vma); + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); anon_vma->parent->num_children++; anon_vma_unlock_write(anon_vma); -- 2.52.0 Specify which operation is being performed to anon_vma_clone(), which allows us to do checks specific to each operation type, as well as to separate out and make clear that the anon_vma reuse logic is absolutely specific to fork only. This opens the door to further refactorings and refinements later as we have more information to work with. Signed-off-by: Lorenzo Stoakes --- mm/internal.h | 11 +++++- mm/rmap.c | 67 ++++++++++++++++++++++---------- mm/vma.c | 6 +-- tools/testing/vma/vma_internal.h | 11 +++++- 4 files changed, 69 insertions(+), 26 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 469d4ef1ccc5..b4d4bca0f9a7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -244,7 +244,16 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) struct anon_vma *folio_get_anon_vma(const struct folio *folio); -int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src); +/* Operations which modify VMAs. */ +enum vma_operation { + VMA_OP_SPLIT, + VMA_OP_MERGE_UNFAULTED, + VMA_OP_REMAP, + VMA_OP_FORK, +}; + +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation); int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma); int __anon_vma_prepare(struct vm_area_struct *vma); void unlink_anon_vmas(struct vm_area_struct *vma); diff --git a/mm/rmap.c b/mm/rmap.c index de9de6d71c23..f08e6bc57379 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -232,12 +232,13 @@ int __anon_vma_prepare(struct vm_area_struct *vma) } static void check_anon_vma_clone(struct vm_area_struct *dst, - struct vm_area_struct *src) + struct vm_area_struct *src, + enum vma_operation operation) { /* The write lock must be held. */ mmap_assert_write_locked(src->vm_mm); - /* If not a fork (implied by dst->anon_vma) then must be on same mm. */ - VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm); + /* If not a fork then must be on same mm. */ + VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm); /* No source anon_vma is a no-op. */ VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain)); @@ -249,6 +250,35 @@ static void check_anon_vma_clone(struct vm_area_struct *dst, * must be the same across dst and src. */ VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma); + + /* For the anon_vma to be compatible, it can only be singular. */ + VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED && + !list_is_singular(&src->anon_vma_chain)); +#ifdef CONFIG_PER_VMA_LOCK + /* Only merging an unfaulted VMA leaves the destination attached. */ + VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED && + vma_is_attached(dst)); +#endif +} + +static void find_reusable_anon_vma(struct vm_area_struct *vma, + struct anon_vma *anon_vma) +{ + /* If already populated, nothing to do.*/ + if (vma->anon_vma) + return; + + /* + * We reuse an anon_vma if any linking VMAs were unmapped and it has + * only a single child at most. + */ + if (anon_vma->num_active_vmas > 0) + return; + if (anon_vma->num_children > 1) + return; + + vma->anon_vma = anon_vma; + anon_vma->num_active_vmas++; } /** @@ -256,6 +286,7 @@ static void check_anon_vma_clone(struct vm_area_struct *dst, * all of the anon_vma objects contained within @src anon_vma_chain's. * @dst: The destination VMA with an empty anon_vma_chain. * @src: The source VMA we wish to duplicate. + * @operation: The type of operation which resulted in the clone. * * This is the heart of the VMA side of the anon_vma implementation - we invoke * this function whenever we need to set up a new VMA's anon_vma state. @@ -278,14 +309,16 @@ static void check_anon_vma_clone(struct vm_area_struct *dst, * * Returns: 0 on success, -ENOMEM on failure. */ -int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation) { struct anon_vma_chain *avc, *pavc; + struct anon_vma *active_anon_vma = src->anon_vma; - if (!src->anon_vma) + if (!active_anon_vma) return 0; - check_anon_vma_clone(dst, src); + check_anon_vma_clone(dst, src, operation); /* * Allocate AVCs. We don't need an anon_vma lock for this as we @@ -309,22 +342,14 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) struct anon_vma *anon_vma = avc->anon_vma; anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); - - /* - * Reuse existing anon_vma if it has no vma and only one - * anon_vma child. - * - * Root anon_vma is never reused: - * it has self-parent reference and at least one child. - */ - if (!dst->anon_vma && src->anon_vma && - anon_vma->num_children < 2 && - anon_vma->num_active_vmas == 0) - dst->anon_vma = anon_vma; + if (operation == VMA_OP_FORK) + find_reusable_anon_vma(dst, anon_vma); } - if (dst->anon_vma) + + if (operation != VMA_OP_FORK) dst->anon_vma->num_active_vmas++; - anon_vma_unlock_write(src->anon_vma); + + anon_vma_unlock_write(active_anon_vma); return 0; enomem_failure: @@ -361,7 +386,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ - error = anon_vma_clone(vma, pvma); + error = anon_vma_clone(vma, pvma, VMA_OP_FORK); if (error) return error; diff --git a/mm/vma.c b/mm/vma.c index feb4bbd3b259..e297c3a94133 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -525,7 +525,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (err) goto out_free_vmi; - err = anon_vma_clone(new, vma); + err = anon_vma_clone(new, vma, VMA_OP_SPLIT); if (err) goto out_free_mpol; @@ -623,7 +623,7 @@ static int dup_anon_vma(struct vm_area_struct *dst, vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; - ret = anon_vma_clone(dst, src); + ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED); if (ret) return ret; @@ -1862,7 +1862,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; - if (anon_vma_clone(new_vma, vma)) + if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP)) goto out_free_mempol; if (new_vma->vm_file) get_file(new_vma->vm_file); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 93e5792306d9..7fa56dcc53a6 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -600,6 +600,14 @@ struct mmap_action { bool hide_from_rmap_until_complete :1; }; +/* Operations which modify VMAs. */ +enum vma_operation { + VMA_OP_SPLIT, + VMA_OP_MERGE_UNFAULTED, + VMA_OP_REMAP, + VMA_OP_FORK, +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -1157,7 +1165,8 @@ static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_stru return 0; } -static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) +static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation) { /* For testing purposes. We indicate that an anon_vma has been cloned. */ if (src->anon_vma != NULL) { -- 2.52.0