Mark VMAs as ANON_VMA_LAZY and defer anon_vma creation until fork, avoiding early allocation when it may not be needed and reducing overhead. During fork(), ANON_VMA_LAZY VMAs are first upgraded to a regular anon_vma in the parent to establish the sharing topology. Child VMAs are created as ANON_VMA_TREE_PARENT and do not allocate anon_vma, avoiding additional fork overhead. Signed-off-by: tao --- mm/internal.h | 9 +++ mm/memory.c | 4 + mm/rmap.c | 209 ++++++++++++++++++++++++++++++++++++++++++++++++-- mm/vma.c | 9 ++- 4 files changed, 222 insertions(+), 9 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 6b703646f66d..0a36eba3f63c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -417,6 +417,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, enum vma_operation operation); int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma); int __anon_vma_prepare(struct vm_area_struct *vma); +/* Called on first anon fault or from anon_vma_prepare(). 
*/ +void vma_prepare_anon_vma_lazy(struct vm_area_struct *vma); void unlink_anon_vmas(struct vm_area_struct *vma); static inline int anon_vma_prepare(struct vm_area_struct *vma) @@ -424,6 +426,13 @@ static inline int anon_vma_prepare(struct vm_area_struct *vma) if (likely(vma->anon_vma)) return 0; +#ifdef CONFIG_ANON_VMA_LAZY + if (anon_vma_lazy_enabled()) { + vma_prepare_anon_vma_lazy(vma); + return 0; + } +#endif + return __anon_vma_prepare(vma); } diff --git a/mm/memory.c b/mm/memory.c index c13b79987b26..8fd3877f69fb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3822,6 +3822,10 @@ vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf) if (likely(vma->anon_vma)) return 0; + if (anon_vma_lazy_enabled()) { + vma_prepare_anon_vma_lazy(vma); + return 0; + } if (vmf->flags & FAULT_FLAG_VMA_LOCK) { if (!mmap_read_trylock(vma->vm_mm)) return VM_FAULT_RETRY; diff --git a/mm/rmap.c b/mm/rmap.c index f70e3cb9812e..d9424f4eb6d0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -240,9 +240,118 @@ static void anon_vma_chain_assign(struct vm_area_struct *vma, list_add(&avc->same_vma, &vma->anon_vma_chain); } +#ifdef CONFIG_ANON_VMA_LAZY +/* Called on first anon fault or from anon_vma_prepare(). */ +void vma_prepare_anon_vma_lazy(struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + + spin_lock(&mm->page_table_lock); + if (!vma->anon_vma) { + vma_get(vma); + vma->anon_vma = (anon_vma_tree_t)( + (unsigned long)vma + ANON_VMA_TREE_VMA); + } + spin_unlock(&mm->page_table_lock); +} + +/* + * Link VMA to its root ANON_VMA_TREE_VMA. Root holds reference to prevent + * premature freeing while folios reference it via folio->mapping. 
+ */ +static bool vma_link_anon_vma_lazy_root(struct vm_area_struct *vma, + struct vm_area_struct *src) +{ + struct mm_struct *mm = src->vm_mm; + struct vm_area_struct *root_vma; + bool ret = false; + + VM_BUG_ON_VMA(vma->vm_mm != src->vm_mm, vma); + /* src may be upgraded concurrently */ + spin_lock(&mm->page_table_lock); + root_vma = anon_vma_tree_vma(src->anon_vma); + if (root_vma) { + vma_get(root_vma); + vma->anon_vma = src->anon_vma; + ret = true; + } else { + vma_set_anon_vma(vma, NULL); + } + spin_unlock(&mm->page_table_lock); + return ret; +} + +/* Link VMA to its ANON_VMA_TREE_PARENT. */ +static void vma_link_anon_vma_lazy_parent(struct vm_area_struct *vma, + struct vm_area_struct *src) +{ + struct anon_vma *parent_anon_vma = vma_anon_vma(src); + + vma_assert_write_locked(src); + VM_BUG_ON_VMA(vma->anon_vma, vma); + VM_BUG_ON_VMA(!parent_anon_vma, src); + + get_anon_vma(parent_anon_vma); + vma->anon_vma = (anon_vma_tree_t)( + (unsigned long)parent_anon_vma + ANON_VMA_TREE_PARENT); +} + +/* Unlink VMA from anon_vma, dropping root/parent reference. 
*/ +static bool vma_unlink_anon_vma_lazy(struct vm_area_struct *vma, + anon_vma_tree_t new_anon_vma_tree) +{ + struct mm_struct *mm = vma->vm_mm; + anon_vma_tree_t anon_tree_mutable = READ_ONCE(vma->anon_vma); + anon_vma_tree_t anon_tree; + bool is_lazy = true; + struct vm_area_struct *root_vma = NULL; + struct anon_vma *parent_anon_vma = NULL; + + VM_BUG_ON_VMA(anon_vma_tree_type(new_anon_vma_tree), vma); + + anon_vma_tree_lock_write(anon_tree_mutable); + spin_lock(&mm->page_table_lock); + anon_tree = vma->anon_vma; + if (anon_vma_tree_is_vma(anon_tree)) { + root_vma = anon_vma_tree_vma(anon_tree); + vma->anon_vma = new_anon_vma_tree; + } else if (anon_vma_tree_is_parent(anon_tree)) { + parent_anon_vma = anon_vma_tree_anon_vma(anon_tree); + vma->anon_vma = new_anon_vma_tree; + } else { + is_lazy = false; + } + spin_unlock(&mm->page_table_lock); + anon_vma_tree_unlock_write(anon_tree_mutable); + if (!is_lazy) + return false; + + /* drop reference after unlock */ + VM_BUG_ON_VMA(!parent_anon_vma && !root_vma, vma); + if (parent_anon_vma) { + /* There must be nodes; it cannot be the last reference. */ + VM_BUG_ON(RB_EMPTY_ROOT(&parent_anon_vma->rb_root.rb_root)); + put_anon_vma(parent_anon_vma); + } + if (root_vma) + vma_put(root_vma); + return is_lazy; +} +#else +static inline bool vma_link_anon_vma_lazy_root(struct vm_area_struct *vma, + struct vm_area_struct *src) { return false; } +static void vma_link_anon_vma_lazy_parent(struct vm_area_struct *vma, + struct vm_area_struct *src) {} +static inline bool vma_unlink_anon_vma_lazy(struct vm_area_struct *vma, + anon_vma_tree_t new_anon_vma_tree) { return false; } +#endif + /** - * __anon_vma_prepare - attach an anon_vma to a memory region + * vma_prepare_anon_vma - attach an anon_vma to a memory region * @vma: the memory region in question + * @upgrade_lazy: true when upgrading a lazy VMA to a regular anon_vma. + * @parent_anon_vma: non-NULL if the VMA is inherited from its parent, + * otherwise NULL. 
* * This makes sure the memory mapping described by 'vma' has * an 'anon_vma' attached to it, so that we can associate the @@ -266,12 +375,14 @@ static void anon_vma_chain_assign(struct vm_area_struct *vma, * to do any locking for the common case of already having * an anon_vma. */ -int __anon_vma_prepare(struct vm_area_struct *vma) +static int vma_prepare_anon_vma(struct vm_area_struct *vma, bool upgrade_lazy, + struct anon_vma *parent_anon_vma) { struct mm_struct *mm = vma->vm_mm; struct anon_vma *anon_vma, *allocated; anon_vma_tree_t anon_tree; struct anon_vma_chain *avc; + bool is_lazy = false; mmap_assert_locked(mm); might_sleep(); @@ -282,19 +393,30 @@ int __anon_vma_prepare(struct vm_area_struct *vma) anon_vma = find_mergeable_anon_vma(vma); allocated = NULL; - if (!anon_vma) { + /* If parent_anon_vma exists, mergeable anon_vma root must match it. */ + if (!anon_vma || + (parent_anon_vma && anon_vma->root != parent_anon_vma->root)) { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) goto out_enomem_free_avc; - anon_vma->num_children++; /* self-parent link for new root */ allocated = anon_vma; + if (parent_anon_vma) { + anon_vma->root = parent_anon_vma->root; + anon_vma->parent = parent_anon_vma; + } } anon_tree = make_anon_vma_tree(anon_vma); + if (upgrade_lazy) + is_lazy = vma_unlink_anon_vma_lazy(vma, anon_tree); anon_vma_tree_lock_write(anon_tree); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); - if (likely(!vma->anon_vma)) { + if (likely(!vma->anon_vma || is_lazy)) { + if (anon_vma->root != anon_vma) + get_anon_vma(anon_vma->root); + if (allocated) + anon_vma->parent->num_children++; vma->anon_vma = anon_tree; anon_vma_chain_assign(vma, avc, anon_vma); anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); @@ -318,6 +440,28 @@ int __anon_vma_prepare(struct vm_area_struct *vma) return -ENOMEM; } +/** + * __anon_vma_prepare - attach an anon_vma to a memory region + * @vma: the memory region in question + * + * 
Wrapper around vma_prepare_anon_vma() for the non-lazy case. + * Called when ANON_VMA_LAZY is disabled. + */ +int __anon_vma_prepare(struct vm_area_struct *vma) +{ + return vma_prepare_anon_vma(vma, false, NULL); +} + +static int vma_upgrade_anon_vma_lazy(struct vm_area_struct *vma) +{ + anon_vma_tree_t vma_tree = vma->anon_vma; + struct anon_vma *parent_anon_vma = NULL; + + if (anon_vma_tree_is_parent(vma_tree)) + parent_anon_vma = anon_vma_tree_anon_vma(vma_tree); + return vma_prepare_anon_vma(vma, true, parent_anon_vma); +} + static void check_anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, enum vma_operation operation) @@ -414,6 +558,20 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, if (!active_anon_tree) return 0; + /* Check ANON_VMA_LAZY first. */ + if (anon_vma_tree_is_vma(active_anon_tree)) { + if (vma_link_anon_vma_lazy_root(dst, src)) + return 0; + } else if (anon_vma_tree_is_parent(active_anon_tree)) { + /* split from tree_parent is rare; promote to regular. */ + int err = vma_upgrade_anon_vma_lazy(src); + + if (err) + return err; + VM_BUG_ON_VMA(vma_is_anon_vma_lazy(src), src); + dst->anon_vma = src->anon_vma; + } + /* * Allocate AVCs. 
We don't need an anon_vma lock for this as we * are not updating the anon_vma rbtree nor are we changing @@ -445,7 +603,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, maybe_reuse_anon_vma(dst, anon_vma); } - if (operation != VMA_OP_FORK) + if (operation != VMA_OP_FORK && vma_anon_vma(dst)) vma_anon_vma(dst)->num_active_vmas++; anon_vma_tree_unlock_write(active_anon_tree); @@ -456,9 +614,38 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, return -ENOMEM; } +static int vma_fork_anon_vma_lazy(struct vm_area_struct *vma, + struct vm_area_struct *pvma) +{ + int error; + + if (vma_is_anon_vma_lazy(pvma)) { + error = vma_upgrade_anon_vma_lazy(pvma); + if (error) + return error; + VM_BUG_ON_VMA(vma_is_anon_vma_lazy(pvma), pvma); + } + + vma_set_anon_vma(vma, NULL); + error = anon_vma_clone(vma, pvma, VMA_OP_FORK); + if (error) + return error; + + if (vma->anon_vma) + return 0; + /* Lazily allocate the child anon_vma. */ + vma_link_anon_vma_lazy_parent(vma, pvma); + return 0; +} + /* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. + * + * For ANON_VMA_LAZY: if the parent VMA is lazy, upgrade it to a regular + * anon_vma before cloning. The child VMA may also be marked lazy when + * ANON_VMA_LAZY is enabled, deferring anon_vma allocation. + * * Returns 0 on success, non-zero on failure. */ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) @@ -472,6 +659,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) if (!pvma->anon_vma) return 0; + if (anon_vma_lazy_enabled()) + return vma_fork_anon_vma_lazy(vma, pvma); + /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ vma_set_anon_vma(vma, NULL); @@ -577,6 +767,10 @@ void unlink_anon_vmas(struct vm_area_struct *vma) return; } + /* Unlink ANON_VMA_LAZY first, then ancestor anon_vma. 
*/ + if (vma_is_anon_vma_lazy(vma)) + vma_unlink_anon_vma_lazy(vma, (anon_vma_tree_t)NULL); + anon_vma_tree_lock_write(active_anon_tree); /* @@ -601,7 +795,8 @@ void unlink_anon_vmas(struct vm_area_struct *vma) anon_vma_chain_free(avc); } - vma_anon_vma(vma)->num_active_vmas--; + if (vma_anon_vma(vma)) + vma_anon_vma(vma)->num_active_vmas--; /* * vma would still be needed after unlink, and anon_vma will be prepared * when handle fault. diff --git a/mm/vma.c b/mm/vma.c index ed15968a5891..0a31ef82a90c 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -1995,6 +1995,8 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * * acceptable for merging, so we can do all of this optimistically. But * we do that READ_ONCE() to make sure that we never re-load the pointer. * + * For upgrading ANON_VMA_LAZY VMAs, follow the same reuse rules as splitting. + * * IOW: that the "list_is_singular()" test on the anon_vma_chain only * matters for the 'stable anon_vma' case (ie the thing we want to avoid * is to return an anon_vma that is "complex" due to having gone through @@ -2005,12 +2007,15 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * * a read lock on the mmap_lock. */ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, + struct vm_area_struct *vma, struct vm_area_struct *a, struct vm_area_struct *b) { if (anon_vma_compatible(a, b)) { struct anon_vma *anon_vma = vma_anon_vma(old); + if (anon_vma && vma_is_anon_vma_lazy(vma)) + return anon_vma; if (anon_vma && list_is_singular(&old->anon_vma_chain)) return anon_vma; } @@ -2034,7 +2039,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) /* Try next first. 
*/ next = vma_iter_load(&vmi); if (next) { - anon_vma = reusable_anon_vma(next, vma, next); + anon_vma = reusable_anon_vma(next, vma, vma, next); if (anon_vma) return anon_vma; } @@ -2044,7 +2049,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) prev = vma_prev(&vmi); /* Try prev next. */ if (prev) - anon_vma = reusable_anon_vma(prev, prev, vma); + anon_vma = reusable_anon_vma(prev, vma, prev, vma); /* * We might reach here with anon_vma == NULL if we can't find -- 2.17.1