Move the trace point later in the function so that it is not skipped in
the event of a failed fork.

Signed-off-by: Liam R. Howlett
---
 mm/mmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 7306253cc3b57..c4c315b480af7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1310,9 +1310,9 @@ void exit_mmap(struct mm_struct *mm)
 
         BUG_ON(count != mm->map_count);
 
-        trace_exit_mmap(mm);
 destroy:
         __mt_destroy(&mm->mm_mt);
+        trace_exit_mmap(mm);
         mmap_write_unlock(mm);
         vm_unacct_memory(nr_accounted);
 }
-- 
2.47.2

Create the new function tear_down_vmas() to remove a range of vmas.
exit_mmap() will be removing all the vmas.

This is necessary for future patches.

No functional changes intended.

Signed-off-by: Liam R. Howlett
---
 mm/mmap.c | 37 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index c4c315b480af7..0995a48b46d59 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1250,6 +1250,29 @@ int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags)
 }
 EXPORT_SYMBOL(vm_brk_flags);
 
+static inline
+unsigned long tear_down_vmas(struct mm_struct *mm, struct vma_iterator *vmi,
+                struct vm_area_struct *vma, unsigned long max)
+{
+        unsigned long nr_accounted = 0;
+        int count = 0;
+
+        mmap_assert_write_locked(mm);
+        vma_iter_set(vmi, vma->vm_end);
+        do {
+                if (vma->vm_flags & VM_ACCOUNT)
+                        nr_accounted += vma_pages(vma);
+                vma_mark_detached(vma);
+                remove_vma(vma);
+                count++;
+                cond_resched();
+                vma = vma_next(vmi);
+        } while (vma && vma->vm_end <= max);
+
+        BUG_ON(count != mm->map_count);
+        return nr_accounted;
+}
+
 /* Release all mmaps. */
 void exit_mmap(struct mm_struct *mm)
 {
@@ -1257,7 +1280,6 @@ void exit_mmap(struct mm_struct *mm)
         struct vm_area_struct *vma;
         unsigned long nr_accounted = 0;
         VMA_ITERATOR(vmi, mm, 0);
-        int count = 0;
 
         /* mm's last user has gone, and its about to be pulled down */
         mmu_notifier_release(mm);
@@ -1297,18 +1319,7 @@ void exit_mmap(struct mm_struct *mm)
          * enabled, without holding any MM locks besides the unreachable
          * mmap_write_lock.
          */
-        vma_iter_set(&vmi, vma->vm_end);
-        do {
-                if (vma->vm_flags & VM_ACCOUNT)
-                        nr_accounted += vma_pages(vma);
-                vma_mark_detached(vma);
-                remove_vma(vma);
-                count++;
-                cond_resched();
-                vma = vma_next(&vmi);
-        } while (vma && likely(!xa_is_zero(vma)));
-
-        BUG_ON(count != mm->map_count);
+        nr_accounted = tear_down_vmas(mm, &vmi, vma, ULONG_MAX);
 
 destroy:
         __mt_destroy(&mm->mm_mt);
-- 
2.47.2

Add a limit to the vma search instead of using the start and end of the
one passed in.

No functional changes intended.

Signed-off-by: Liam R. Howlett
---
 mm/vma.c | 6 ++++--
 mm/vma.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/mm/vma.c b/mm/vma.c
index 3b12c7579831b..fd270345c25d3 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -474,6 +474,7 @@ void remove_vma(struct vm_area_struct *vma)
  * Called with the mm semaphore held.
  */
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
+                unsigned long vma_min, unsigned long vma_max,
                 struct vm_area_struct *prev, struct vm_area_struct *next)
 {
         struct mm_struct *mm = vma->vm_mm;
@@ -481,7 +482,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 
         tlb_gather_mmu(&tlb, mm);
         update_hiwater_rss(mm);
-        unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
+        unmap_vmas(&tlb, mas, vma, vma_min, vma_max, vma_max,
                    /* mm_wr_locked = */ true);
         mas_set(mas, vma->vm_end);
         free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
@@ -2417,7 +2418,8 @@ static int __mmap_new_file_vma(struct mmap_state *map,
 
                 vma_iter_set(vmi, vma->vm_end);
                 /* Undo any partial mapping done by a device driver. */
-                unmap_region(&vmi->mas, vma, map->prev, map->next);
+                unmap_region(&vmi->mas, vma, vma->vm_start, vma->vm_end,
+                             map->prev, map->next);
 
                 return error;
         }
diff --git a/mm/vma.h b/mm/vma.h
index b123a9cdedb0d..336dae295853e 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -281,6 +281,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 void remove_vma(struct vm_area_struct *vma);
 
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
+                unsigned long min, unsigned long max,
                 struct vm_area_struct *prev, struct vm_area_struct *next);
 
 /* We are about to modify the VMA's flags. */
-- 
2.47.2
The ceiling and tree search limit need to be different arguments for the
future change in the failed fork attempt.

No functional changes intended.

Signed-off-by: Liam R. Howlett
---
 mm/internal.h | 4 +++-
 mm/memory.c   | 7 ++++---
 mm/mmap.c     | 2 +-
 mm/vma.c      | 3 ++-
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 45b725c3dc030..f9a278ac76d83 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -444,7 +444,9 @@ void folio_activate(struct folio *folio);
 
 void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                    struct vm_area_struct *start_vma, unsigned long floor,
-                   unsigned long ceiling, bool mm_wr_locked);
+                   unsigned long ceiling, unsigned long tree_max,
+                   bool mm_wr_locked);
+
 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
 
 struct zap_details;
diff --git a/mm/memory.c b/mm/memory.c
index 0ba4f6b718471..3346514562bba 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -371,7 +371,8 @@ void free_pgd_range(struct mmu_gather *tlb,
 
 void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                    struct vm_area_struct *vma, unsigned long floor,
-                   unsigned long ceiling, bool mm_wr_locked)
+                   unsigned long ceiling, unsigned long tree_max,
+                   bool mm_wr_locked)
 {
         struct unlink_vma_file_batch vb;
 
@@ -385,7 +386,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
          * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
          * be 0.  This will underflow and is okay.
          */
-        next = mas_find(mas, ceiling - 1);
+        next = mas_find(mas, tree_max - 1);
         if (unlikely(xa_is_zero(next)))
                 next = NULL;
 
@@ -405,7 +406,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
          */
         while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
                 vma = next;
-                next = mas_find(mas, ceiling - 1);
+                next = mas_find(mas, tree_max - 1);
                 if (unlikely(xa_is_zero(next)))
                         next = NULL;
                 if (mm_wr_locked)
diff --git a/mm/mmap.c b/mm/mmap.c
index 0995a48b46d59..eba2bc81bc749 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1311,7 +1311,7 @@ void exit_mmap(struct mm_struct *mm)
         mt_clear_in_rcu(&mm->mm_mt);
         vma_iter_set(&vmi, vma->vm_end);
         free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
-                      USER_PGTABLES_CEILING, true);
+                      USER_PGTABLES_CEILING, USER_PGTABLES_CEILING, true);
         tlb_finish_mmu(&tlb);
 
         /*
diff --git a/mm/vma.c b/mm/vma.c
index fd270345c25d3..aa75ca8618609 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -486,6 +487,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
                    /* mm_wr_locked = */ true);
         mas_set(mas, vma->vm_end);
         free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+                      next ? next->vm_start : USER_PGTABLES_CEILING,
                       next ? next->vm_start : USER_PGTABLES_CEILING,
                       /* mm_wr_locked = */ true);
         tlb_finish_mmu(&tlb);
@@ -1232,7 +1233,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
         mas_set(mas_detach, 1);
         /* start and end may be different if there is no prev or next vma. */
         free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
-                      vms->unmap_end, mm_wr_locked);
+                      vms->unmap_end, vms->unmap_end, mm_wr_locked);
         tlb_finish_mmu(&tlb);
         vms->clear_ptes = false;
 }
-- 
2.47.2
The unmap_region() calls need to pass through the page table limit for a
future patch.

No functional changes intended.

Signed-off-by: Liam R. Howlett
---
 mm/vma.c | 5 +++--
 mm/vma.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/vma.c b/mm/vma.c
index aa75ca8618609..39f3b55a020b2 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -474,7 +474,7 @@ void remove_vma(struct vm_area_struct *vma)
  * Called with the mm semaphore held.
  */
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
-                unsigned long vma_min, unsigned long vma_max,
+                unsigned long vma_min, unsigned long vma_max, unsigned long pg_max,
                 struct vm_area_struct *prev, struct vm_area_struct *next)
 {
         struct mm_struct *mm = vma->vm_mm;
@@ -487,7 +487,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
         mas_set(mas, vma->vm_end);
         free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
                       next ? next->vm_start : USER_PGTABLES_CEILING,
-                      next ? next->vm_start : USER_PGTABLES_CEILING,
+                      pg_max,
                       /* mm_wr_locked = */ true);
         tlb_finish_mmu(&tlb);
 }
@@ -2420,6 +2420,7 @@ static int __mmap_new_file_vma(struct mmap_state *map,
                 vma_iter_set(vmi, vma->vm_end);
                 /* Undo any partial mapping done by a device driver. */
                 unmap_region(&vmi->mas, vma, vma->vm_start, vma->vm_end,
+                             map->next ? map->next->vm_start : USER_PGTABLES_CEILING,
                              map->prev, map->next);
 
                 return error;
diff --git a/mm/vma.h b/mm/vma.h
index 336dae295853e..ba203c0c1d89d 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -281,7 +281,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 void remove_vma(struct vm_area_struct *vma);
 
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
-                unsigned long min, unsigned long max,
+                unsigned long min, unsigned long max, unsigned long pg_max,
                 struct vm_area_struct *prev, struct vm_area_struct *next);
 
 /* We are about to modify the VMA's flags. */
-- 
2.47.2
When dup_mmap() fails during the vma duplication or setup, don't write
the XA_ZERO entry in the vma tree.  Instead, destroy the tree and free
the new resources, leaving an empty vma tree.

Using XA_ZERO introduced races where the vma could be found between
dup_mmap() dropping all locks and exit_mmap() taking the locks.  The
race can occur because the mm can be reached through the other trees
via successfully copied vmas and other methods such as the swapoff
code.

XA_ZERO was marking the location to stop vma removal and pagetable
freeing.  The newly created arguments to unmap_vmas() and
free_pgtables() serve this function.

Replacing the XA_ZERO entry use with the new argument list also means
the checks for xa_is_zero() are no longer necessary, so these are also
removed.

Signed-off-by: Liam R. Howlett
---
 mm/memory.c |  6 +-----
 mm/mmap.c   | 39 ++++++++++++++++++++++++++++-----------
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 3346514562bba..8cd58f171bae4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -387,8 +387,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
          * be 0.  This will underflow and is okay.
          */
         next = mas_find(mas, tree_max - 1);
-        if (unlikely(xa_is_zero(next)))
-                next = NULL;
 
         /*
          * Hide vma from rmap and truncate_pagecache before freeing
@@ -407,8 +405,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
         while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
                 vma = next;
                 next = mas_find(mas, tree_max - 1);
-                if (unlikely(xa_is_zero(next)))
-                        next = NULL;
                 if (mm_wr_locked)
                         vma_start_write(vma);
                 unlink_anon_vmas(vma);
@@ -1978,7 +1974,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
                                  mm_wr_locked);
                 hugetlb_zap_end(vma, &details);
                 vma = mas_find(mas, tree_end - 1);
-        } while (vma && likely(!xa_is_zero(vma)));
+        } while (vma);
 
         mmu_notifier_invalidate_range_end(&range);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index eba2bc81bc749..5acc0b5f8c14a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1288,7 +1288,7 @@ void exit_mmap(struct mm_struct *mm)
         arch_exit_mmap(mm);
 
         vma = vma_next(&vmi);
-        if (!vma || unlikely(xa_is_zero(vma))) {
+        if (!vma) {
                 /* Can happen if dup_mmap() received an OOM */
                 mmap_read_unlock(mm);
                 mmap_write_lock(mm);
@@ -1858,20 +1858,37 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                 ksm_fork(mm, oldmm);
                 khugepaged_fork(mm, oldmm);
         } else {
+                unsigned long max;
 
                 /*
-                 * The entire maple tree has already been duplicated. If the
-                 * mmap duplication fails, mark the failure point with
-                 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
-                 * stop releasing VMAs that have not been duplicated after this
-                 * point.
+                 * The entire maple tree has already been duplicated, but
+                 * replacing the vmas failed at mpnt (which could be NULL if
+                 * all were allocated but the last vma was not fully set up).
+                 * Use the start address of the failure point to clean up the
+                 * half initialized tree.
                  */
-                if (mpnt) {
-                        mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
-                        mas_store(&vmi.mas, XA_ZERO_ENTRY);
-                        /* Avoid OOM iterating a broken tree */
-                        set_bit(MMF_OOM_SKIP, &mm->flags);
+                if (!mm->map_count) {
+                        /* zero vmas were written to the new tree. */
+                        max = 0;
+                } else if (mpnt) {
+                        /* mid-tree failure */
+                        max = mpnt->vm_start;
+                } else {
+                        /* All vmas were written to the new tree */
+                        max = ULONG_MAX;
                 }
+
+                /* Hide mm from oom killer because the memory is being freed */
+                set_bit(MMF_OOM_SKIP, &mm->flags);
+                if (max) {
+                        vma_iter_set(&vmi, 0);
+                        tmp = vma_next(&vmi);
+                        flush_cache_mm(mm);
+                        unmap_region(&vmi.mas, tmp, 0, max, max, NULL, NULL);
+                        charge = tear_down_vmas(mm, &vmi, tmp, max);
+                        vm_unacct_memory(charge);
+                }
+                __mt_destroy(&mm->mm_mt);
                 /*
                  * The mm_struct is going to exit, but the locks will be dropped
                  * first.  Set the mm_struct as unstable is advisable as it is
-- 
2.47.2
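As a quick illustration of the failure-point selection the last patch adds to
dup_mmap(), here is a minimal userspace sketch (not kernel code, not part of
the series); struct toy_vma and pick_cleanup_limit() are hypothetical names
used only for illustration. In the real error path the chosen max is then fed
into unmap_region() and tear_down_vmas() as shown in the diff above.

/*
 * Toy userspace model of the three cleanup cases in the dup_mmap() error
 * path: nothing copied, failure mid-tree at mpnt, or everything copied
 * (mpnt == NULL).  A return of 0 means there is nothing to unmap.
 */
#include <limits.h>
#include <stdio.h>

struct toy_vma {
        unsigned long vm_start;
        unsigned long vm_end;
};

static unsigned long pick_cleanup_limit(int map_count, const struct toy_vma *mpnt)
{
        if (!map_count)
                return 0;               /* zero vmas were written to the new tree */
        if (mpnt)
                return mpnt->vm_start;  /* mid-tree failure */
        return ULONG_MAX;               /* all vmas were written to the new tree */
}

int main(void)
{
        struct toy_vma failed = { 0x7f0000000000UL, 0x7f0000001000UL };

        printf("no vmas copied:   max = %#lx\n", pick_cleanup_limit(0, &failed));
        printf("mid-tree failure: max = %#lx\n", pick_cleanup_limit(3, &failed));
        printf("all vmas copied:  max = %#lx\n", pick_cleanup_limit(3, NULL));
        return 0;
}

A max of 0 means no vma made it into the new tree, so only __mt_destroy() runs;
a mid-tree failure stops the unmap and teardown below the first vma that was
not successfully duplicated.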