Move the trace point later in the function so that it is not skipped in
the event of a failed fork.

Acked-by: Chris Li
Reviewed-by: Lorenzo Stoakes
Signed-off-by: Liam R. Howlett
---
 mm/mmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 5fd3b80fda1d5..b07b3ec5e28f5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1310,9 +1310,9 @@ void exit_mmap(struct mm_struct *mm)
 
 	BUG_ON(count != mm->map_count);
 
-	trace_exit_mmap(mm);
 destroy:
 	__mt_destroy(&mm->mm_mt);
+	trace_exit_mmap(mm);
 	mmap_write_unlock(mm);
 	vm_unacct_memory(nr_accounted);
 }
-- 
2.47.2

Create the new function tear_down_vmas() to remove a range of vmas.
exit_mmap() will be removing all the vmas.

This is necessary for future patches.

No functional changes intended.

Reviewed-by: Lorenzo Stoakes
Signed-off-by: Liam R. Howlett
---
 mm/mmap.c | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index b07b3ec5e28f5..a290448a53bb2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1250,6 +1250,29 @@ int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags)
 }
 EXPORT_SYMBOL(vm_brk_flags);
 
+static inline
+unsigned long tear_down_vmas(struct mm_struct *mm, struct vma_iterator *vmi,
+		struct vm_area_struct *vma, unsigned long max)
+{
+	unsigned long nr_accounted = 0;
+	int count = 0;
+
+	mmap_assert_write_locked(mm);
+	vma_iter_set(vmi, vma->vm_end);
+	do {
+		if (vma->vm_flags & VM_ACCOUNT)
+			nr_accounted += vma_pages(vma);
+		vma_mark_detached(vma);
+		remove_vma(vma);
+		count++;
+		cond_resched();
+		vma = vma_next(vmi);
+	} while (vma && vma->vm_end <= max);
+
+	WARN_ON_ONCE(count != mm->map_count);
+	return nr_accounted;
+}
+
 /* Release all mmaps. */
 void exit_mmap(struct mm_struct *mm)
 {
@@ -1257,7 +1280,6 @@ void exit_mmap(struct mm_struct *mm)
 	struct vm_area_struct *vma;
 	unsigned long nr_accounted = 0;
 	VMA_ITERATOR(vmi, mm, 0);
-	int count = 0;
 
 	/* mm's last user has gone, and its about to be pulled down */
 	mmu_notifier_release(mm);
@@ -1297,18 +1319,7 @@ void exit_mmap(struct mm_struct *mm)
 	 * enabled, without holding any MM locks besides the unreachable
 	 * mmap_write_lock.
 	 */
-	vma_iter_set(&vmi, vma->vm_end);
-	do {
-		if (vma->vm_flags & VM_ACCOUNT)
-			nr_accounted += vma_pages(vma);
-		vma_mark_detached(vma);
-		remove_vma(vma);
-		count++;
-		cond_resched();
-		vma = vma_next(&vmi);
-	} while (vma && likely(!xa_is_zero(vma)));
-
-	BUG_ON(count != mm->map_count);
+	nr_accounted = tear_down_vmas(mm, &vmi, vma, ULONG_MAX);
 
 destroy:
 	__mt_destroy(&mm->mm_mt);
-- 
2.47.2
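For readers following along: the new helper's contract is that the caller
holds the mmap write lock, the iterator is re-positioned at the first vma's
end, and the number of VM_ACCOUNT pages is returned so the caller can
unaccount them. A minimal sketch of a caller, modelled on the exit_mmap()
usage above (the wrapper name is illustrative, not part of the series):

	/* Sketch: tear down every vma in an mm, return the accounting to undo. */
	static unsigned long tear_down_all(struct mm_struct *mm,
					   struct vma_iterator *vmi)
	{
		struct vm_area_struct *vma = vma_next(vmi);

		if (!vma)
			return 0;

		mmap_assert_write_locked(mm);
		/* ULONG_MAX: no upper bound, walk until the tree runs out of vmas */
		return tear_down_vmas(mm, vmi, vma, ULONG_MAX);
	}
	/* The caller then does: vm_unacct_memory(tear_down_all(mm, &vmi)); */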
Pass an explicit limit (vma_min/vma_max) to the vma search in
unmap_region() instead of using the start and end of the vma passed in.

No functional changes intended.

Signed-off-by: Liam R. Howlett
---
 mm/vma.c | 6 ++++--
 mm/vma.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/mm/vma.c b/mm/vma.c
index abe0da33c8446..a648e0555c873 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -474,6 +474,7 @@ void remove_vma(struct vm_area_struct *vma)
  * Called with the mm semaphore held.
  */
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
+		unsigned long vma_min, unsigned long vma_max,
 		struct vm_area_struct *prev, struct vm_area_struct *next)
 {
 	struct mm_struct *mm = vma->vm_mm;
@@ -481,7 +482,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 
 	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
+	unmap_vmas(&tlb, mas, vma, vma_min, vma_max, vma_max,
 		   /* mm_wr_locked = */ true);
 	mas_set(mas, vma->vm_end);
 	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
@@ -2417,7 +2418,8 @@ static int __mmap_new_file_vma(struct mmap_state *map,
 		vma_iter_set(vmi, vma->vm_end);
 
 		/* Undo any partial mapping done by a device driver. */
-		unmap_region(&vmi->mas, vma, map->prev, map->next);
+		unmap_region(&vmi->mas, vma, vma->vm_start, vma->vm_end,
+			     map->prev, map->next);
 
 		return error;
 	}
diff --git a/mm/vma.h b/mm/vma.h
index 9183fe5490090..a9d0cef684ddb 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -261,6 +261,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 void remove_vma(struct vm_area_struct *vma);
 
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
+		unsigned long min, unsigned long max,
 		struct vm_area_struct *prev, struct vm_area_struct *next);
 
 /* We are about to modify the VMA's flags. */
-- 
2.47.2
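The new parameters only make the existing behaviour explicit: a single-vma
caller passes that vma's own bounds, while a caller covering a span of vmas
would pass the bounds of the whole span. Both calls below are sketches using
the post-patch signature; "first", "span_start" and "span_end" are
illustrative names, not from the patch:

	/* Unmap exactly one vma: the old behaviour, spelled out. */
	unmap_region(&vmi->mas, vma, vma->vm_start, vma->vm_end,
		     map->prev, map->next);

	/* Unmap a span of vmas starting at first (hypothetical caller). */
	unmap_region(&vmi->mas, first, span_start, span_end, prev, next);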
The ceiling and the tree search limit need to be separate arguments for an
upcoming change to the failed-fork cleanup. Add some documentation around
free_pgtables() and its limits to clarify the floor and ceiling use as well
as the new tree_max.

The test code is also updated.

No functional changes intended.

Signed-off-by: Liam R. Howlett
---
 mm/internal.h                    |  4 +++-
 mm/memory.c                      | 28 +++++++++++++++++++++++++---
 mm/mmap.c                        |  2 +-
 mm/vma.c                         |  3 ++-
 tools/testing/vma/vma_internal.h |  3 ++-
 5 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 63e3ec8d63be7..d295252407fee 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -444,7 +444,9 @@ void folio_activate(struct folio *folio);
 
 void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		   struct vm_area_struct *start_vma, unsigned long floor,
-		   unsigned long ceiling, bool mm_wr_locked);
+		   unsigned long ceiling, unsigned long tree_max,
+		   bool mm_wr_locked);
+
 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
 
 struct zap_details;
diff --git a/mm/memory.c b/mm/memory.c
index 3e0404bd57a02..24716b3713f66 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -369,12 +369,34 @@ void free_pgd_range(struct mmu_gather *tlb,
 	} while (pgd++, addr = next, addr != end);
 }
 
+/*
+ * free_pgtables() - Free a range of page tables
+ * @tlb: The mmu gather
+ * @mas: The maple state
+ * @vma: The first vma
+ * @floor: The lowest page table address
+ * @ceiling: The highest page table address
+ * @tree_max: The highest tree search address
+ * @mm_wr_locked: boolean indicating if the mm is write locked
+ *
+ * Note: Floor and ceiling are provided to indicate the absolute range of the
+ * page tables that should be removed. This can differ from the vma mappings on
+ * some archs that may have mappings that need to be removed outside the vmas.
+ * Note that the prev->vm_end and next->vm_start are often used.
+ *
+ * The tree_max differs from the ceiling when a dup_mmap() failed and the tree
+ * has unrelated data to the mm_struct being torn down.
+ */
 void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		   struct vm_area_struct *vma, unsigned long floor,
-		   unsigned long ceiling, bool mm_wr_locked)
+		   unsigned long ceiling, unsigned long tree_max,
+		   bool mm_wr_locked)
 {
 	struct unlink_vma_file_batch vb;
 
+	/* underflow can happen and is fine */
+	WARN_ON_ONCE(tree_max - 1 > ceiling - 1);
+
 	tlb_free_vmas(tlb);
 
 	do {
@@ -385,7 +407,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
 		 * be 0. This will underflow and is okay.
 		 */
-		next = mas_find(mas, ceiling - 1);
+		next = mas_find(mas, tree_max - 1);
 		if (unlikely(xa_is_zero(next)))
 			next = NULL;
 
@@ -405,7 +427,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		 */
 		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
 			vma = next;
-			next = mas_find(mas, ceiling - 1);
+			next = mas_find(mas, tree_max - 1);
 			if (unlikely(xa_is_zero(next)))
 				next = NULL;
 			if (mm_wr_locked)
diff --git a/mm/mmap.c b/mm/mmap.c
index a290448a53bb2..0f4808f135fe6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1311,7 +1311,7 @@ void exit_mmap(struct mm_struct *mm)
 	mt_clear_in_rcu(&mm->mm_mt);
 	vma_iter_set(&vmi, vma->vm_end);
 	free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
-		      USER_PGTABLES_CEILING, true);
+		      USER_PGTABLES_CEILING, USER_PGTABLES_CEILING, true);
 	tlb_finish_mmu(&tlb);
 
 	/*
diff --git a/mm/vma.c b/mm/vma.c
index a648e0555c873..1bae142bbc0f1 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -486,6 +486,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 		   /* mm_wr_locked = */ true);
 	mas_set(mas, vma->vm_end);
 	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+		      next ? next->vm_start : USER_PGTABLES_CEILING,
 		      next ? next->vm_start : USER_PGTABLES_CEILING,
 		      /* mm_wr_locked = */ true);
 	tlb_finish_mmu(&tlb);
@@ -1232,7 +1233,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
 	mas_set(mas_detach, 1);
 	/* start and end may be different if there is no prev or next vma. */
 	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
-		      vms->unmap_end, mm_wr_locked);
+		      vms->unmap_end, vms->unmap_end, mm_wr_locked);
 	tlb_finish_mmu(&tlb);
 	vms->clear_ptes = false;
 }
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 07167446dcf42..823d379e1fac2 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -900,7 +900,8 @@ static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 
 static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 			struct vm_area_struct *vma, unsigned long floor,
-			unsigned long ceiling, bool mm_wr_locked)
+			unsigned long ceiling, unsigned long tree_max,
+			bool mm_wr_locked)
 {
 	(void)tlb;
 	(void)mas;
-- 
2.47.2
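The sanity check relies on unsigned wrap-around so that a ceiling of 0 (a
common USER_PGTABLES_CEILING value) never trips the warning: 0 - 1 wraps to
ULONG_MAX, which no tree_max can exceed. A small userspace model of that
comparison, for illustration only:

	#include <stdio.h>

	int main(void)
	{
		unsigned long ceiling = 0;	/* USER_PGTABLES_CEILING is 0 on many configs */
		unsigned long tree_max = 0x7fffffffff00UL;

		/* Same expression as the WARN_ON_ONCE() added above. */
		if (tree_max - 1 > ceiling - 1)	/* ceiling - 1 wraps to ULONG_MAX */
			printf("would WARN: tree_max above ceiling\n");
		else
			printf("ok: tree_max is within the (wrapped) ceiling\n");
		return 0;
	}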
The unmap_region() calls need to pass through the page table limit for a
future patch.

No functional changes intended.

Reviewed-by: Lorenzo Stoakes
Signed-off-by: Liam R. Howlett
---
 mm/vma.c | 5 +++--
 mm/vma.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/vma.c b/mm/vma.c
index 1bae142bbc0f1..4c850ffd83a4b 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -474,7 +474,7 @@ void remove_vma(struct vm_area_struct *vma)
  * Called with the mm semaphore held.
  */
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
-		unsigned long vma_min, unsigned long vma_max,
+		unsigned long vma_min, unsigned long vma_max, unsigned long pg_max,
 		struct vm_area_struct *prev, struct vm_area_struct *next)
 {
 	struct mm_struct *mm = vma->vm_mm;
@@ -487,7 +487,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 	mas_set(mas, vma->vm_end);
 	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
 		      next ? next->vm_start : USER_PGTABLES_CEILING,
-		      next ? next->vm_start : USER_PGTABLES_CEILING,
+		      pg_max,
 		      /* mm_wr_locked = */ true);
 	tlb_finish_mmu(&tlb);
 }
@@ -2420,6 +2420,7 @@ static int __mmap_new_file_vma(struct mmap_state *map,
 		vma_iter_set(vmi, vma->vm_end);
 		/* Undo any partial mapping done by a device driver. */
 		unmap_region(&vmi->mas, vma, vma->vm_start, vma->vm_end,
+			     map->next ? map->next->vm_start : USER_PGTABLES_CEILING,
 			     map->prev, map->next);
 
 		return error;
diff --git a/mm/vma.h b/mm/vma.h
index a9d0cef684ddb..b0ebc81d5862e 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -261,7 +261,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 void remove_vma(struct vm_area_struct *vma);
 
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
-		unsigned long min, unsigned long max,
+		unsigned long min, unsigned long max, unsigned long pg_max,
 		struct vm_area_struct *prev, struct vm_area_struct *next);
 
 /* We are about to modify the VMA's flags. */
-- 
2.47.2
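The extra argument separates "how far the vma range extends" from "how far
page tables may be freed". For the existing caller the two still coincide,
as the hunk above shows: pg_max is derived from the next vma when one
exists, otherwise from the ceiling. Spelled out as a sketch:

	/* pg_max for a normal caller (same computation as in the hunk above). */
	unsigned long pg_max = map->next ? map->next->vm_start
					 : USER_PGTABLES_CEILING;

	unmap_region(&vmi->mas, vma, vma->vm_start, vma->vm_end, pg_max,
		     map->prev, map->next);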
When dup_mmap() fails during vma duplication or setup, don't write the
XA_ZERO entry in the vma tree. Instead, destroy the tree and free the new
resources, leaving an empty vma tree.

Using XA_ZERO introduced races where the vma could be found between
dup_mmap() dropping all locks and exit_mmap() taking the locks. The race
can occur because the mm can be reached through the other trees via
successfully copied vmas and by other means such as the swapoff code.

XA_ZERO was marking the location to stop vma removal and pagetable
freeing. The newly created arguments to unmap_vmas() and free_pgtables()
now serve this function.

Replacing the XA_ZERO entry use with the new argument list also means the
checks for xa_is_zero() are no longer necessary, so these are also
removed.

Signed-off-by: Liam R. Howlett
---
 mm/memory.c |  6 +-----
 mm/mmap.c   | 42 +++++++++++++++++++++++++++++++-----------
 2 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 24716b3713f66..829cd94950182 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -408,8 +408,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
 		 * be 0. This will underflow and is okay.
 		 */
 		next = mas_find(mas, tree_max - 1);
-		if (unlikely(xa_is_zero(next)))
-			next = NULL;
 
 		/*
 		 * Hide vma from rmap and truncate_pagecache before freeing
@@ -428,8 +426,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
 			vma = next;
 			next = mas_find(mas, tree_max - 1);
-			if (unlikely(xa_is_zero(next)))
-				next = NULL;
 			if (mm_wr_locked)
 				vma_start_write(vma);
 			unlink_anon_vmas(vma);
@@ -2129,7 +2125,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 			      mm_wr_locked);
 		hugetlb_zap_end(vma, &details);
 		vma = mas_find(mas, tree_end - 1);
-	} while (vma && likely(!xa_is_zero(vma)));
+	} while (vma);
 
 	mmu_notifier_invalidate_range_end(&range);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 0f4808f135fe6..aa4770b8d7f1e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1288,7 +1288,7 @@ void exit_mmap(struct mm_struct *mm)
 	arch_exit_mmap(mm);
 
 	vma = vma_next(&vmi);
-	if (!vma || unlikely(xa_is_zero(vma))) {
+	if (!vma) {
 		/* Can happen if dup_mmap() received an OOM */
 		mmap_read_unlock(mm);
 		mmap_write_lock(mm);
@@ -1858,20 +1858,40 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		ksm_fork(mm, oldmm);
 		khugepaged_fork(mm, oldmm);
 	} else {
+		unsigned long max;
 
 		/*
-		 * The entire maple tree has already been duplicated. If the
-		 * mmap duplication fails, mark the failure point with
-		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
-		 * stop releasing VMAs that have not been duplicated after this
-		 * point.
+		 * The entire maple tree has already been duplicated, but
+		 * replacing the vmas failed at mpnt (which could be NULL if
+		 * all were allocated but the last vma was not fully set up).
+		 * Use the start address of the failure point to clean up the
+		 * partially initialized tree.
 		 */
-		if (mpnt) {
-			mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
-			mas_store(&vmi.mas, XA_ZERO_ENTRY);
-			/* Avoid OOM iterating a broken tree */
-			mm_flags_set(MMF_OOM_SKIP, mm);
+		if (!mm->map_count) {
+			/* zero vmas were written to the new tree. */
+			max = 0;
+		} else if (mpnt) {
+			/* partial tree failure */
+			max = mpnt->vm_start;
+		} else {
+			/* All vmas were written to the new tree */
+			max = ULONG_MAX;
 		}
+
+		/* Hide mm from oom killer because the memory is being freed */
+		mm_flags_set(MMF_OOM_SKIP, mm);
+		if (max) {
+			vma_iter_set(&vmi, 0);
+			tmp = vma_next(&vmi);
+			flush_cache_mm(mm);
+			unmap_region(&vmi.mas, /* vma = */ tmp,
+				     /* vma_min = */ 0, /* vma_max = */ max,
+				     /* pg_max = */ max, /* prev = */ NULL,
+				     /* next = */ NULL);
+			charge = tear_down_vmas(mm, &vmi, tmp, max);
+			vm_unacct_memory(charge);
+		}
+		__mt_destroy(&mm->mm_mt);
 		/*
 		 * The mm_struct is going to exit, but the locks will be dropped
 		 * first. Set the mm_struct as unstable is advisable as it is
-- 
2.47.2
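The failure handling boils down to choosing one cut-off address, max, below
which everything the new mm received is unmapped, its page tables freed and
its vmas removed, leaving an empty tree for exit_mmap(). Condensed from the
hunk above:

	/* max is the first address that was NOT successfully duplicated. */
	if (!mm->map_count)
		max = 0;		/* nothing made it into the new tree */
	else if (mpnt)
		max = mpnt->vm_start;	/* duplication stopped at mpnt */
	else
		max = ULONG_MAX;	/* every vma written; the last failed set-up */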
The unmap_region() code uses a number of arguments that could use better
documentation. With the addition of a descriptor for unmapping (called
unmap_desc), the arguments become self-documenting and the descriptions can
live with the declaration.

No functional changes intended.

Signed-off-by: Liam R. Howlett
---
 mm/mmap.c | 12 ++++++++----
 mm/vma.c  | 27 ++++++++++++---------------
 mm/vma.h  | 35 ++++++++++++++++++++++++++++++++---
 3 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index aa4770b8d7f1e..5c9bd3f20e53f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1883,11 +1883,15 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		if (max) {
 			vma_iter_set(&vmi, 0);
 			tmp = vma_next(&vmi);
+			UNMAP_REGION(unmap, &vmi, /* first vma = */ tmp,
+				     /* min vma addr = */ 0,
+				     /* max vma addr = */ max,
+				     /* prev = */ NULL, /* next = */ NULL);
+
+			/* Don't free the pgtables higher than the failure */
+			unmap.tree_max = max;
 			flush_cache_mm(mm);
-			unmap_region(&vmi.mas, /* vma = */ tmp,
-				     /* vma_min = */ 0, /* vma_max = */ max,
-				     /* pg_max = */ max, /* prev = */ NULL,
-				     /* next = */ NULL);
+			unmap_region(&unmap);
 			charge = tear_down_vmas(mm, &vmi, tmp, max);
 			vm_unacct_memory(charge);
 		}
diff --git a/mm/vma.c b/mm/vma.c
index 4c850ffd83a4b..c92384975cbb2 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -473,22 +473,20 @@ void remove_vma(struct vm_area_struct *vma)
 *
 * Called with the mm semaphore held.
 */
-void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
-		unsigned long vma_min, unsigned long vma_max, unsigned long pg_max,
-		struct vm_area_struct *prev, struct vm_area_struct *next)
+void unmap_region(struct unmap_desc *desc)
 {
-	struct mm_struct *mm = vma->vm_mm;
+	struct mm_struct *mm = desc->first->vm_mm;
+	struct ma_state *mas = desc->mas;
 	struct mmu_gather tlb;
 
 	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, mas, vma, vma_min, vma_max, vma_max,
-		   /* mm_wr_locked = */ true);
-	mas_set(mas, vma->vm_end);
-	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
-		      next ? next->vm_start : USER_PGTABLES_CEILING,
-		      pg_max,
-		      /* mm_wr_locked = */ true);
+	unmap_vmas(&tlb, mas, desc->first, desc->vma_min, desc->vma_max,
+		   desc->vma_max, desc->mm_wr_locked);
+	mas_set(mas, desc->tree_reset);
+	free_pgtables(&tlb, mas, desc->first, desc->first_pgaddr,
+		      desc->last_pgaddr, desc->tree_max,
+		      desc->mm_wr_locked);
 	tlb_finish_mmu(&tlb);
 }
 
@@ -2414,15 +2412,14 @@ static int __mmap_new_file_vma(struct mmap_state *map,
 	error = mmap_file(vma->vm_file, vma);
 	if (error) {
+		UNMAP_REGION(unmap, vmi, vma, vma->vm_start, vma->vm_end,
+			     map->prev, map->next);
 		fput(vma->vm_file);
 		vma->vm_file = NULL;
 		vma_iter_set(vmi, vma->vm_end);
 		/* Undo any partial mapping done by a device driver. */
-		unmap_region(&vmi->mas, vma, vma->vm_start, vma->vm_end,
-			     map->next ? map->next->vm_start : USER_PGTABLES_CEILING,
-			     map->prev, map->next);
-
+		unmap_region(&unmap);
 		return error;
 	}
diff --git a/mm/vma.h b/mm/vma.h
index b0ebc81d5862e..4edd5d26ffcfc 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -152,6 +152,37 @@ struct vma_merge_struct {
 };
 
+struct unmap_desc {
+	struct ma_state *mas;		/* the maple state pointing to the first vma */
+	struct vm_area_struct *first;	/* The first vma */
+	unsigned long first_pgaddr;	/* The first pagetable address to free */
+	unsigned long last_pgaddr;	/* The last pagetable address to free */
+	unsigned long vma_min;		/* The min vma address */
+	unsigned long vma_max;		/* The max vma address */
+	unsigned long tree_max;		/* Maximum for the vma tree search */
+	unsigned long tree_reset;	/* Where to reset the vma tree walk */
+	bool mm_wr_locked;		/* If the mmap write lock is held */
+};
+
+#define UNMAP_REGION(name, _vmi, _vma, _vma_min, _vma_max, _prev, _next) \
+	struct unmap_desc name = {					\
+		.mas = &(_vmi)->mas,					\
+		.first = _vma,						\
+		.first_pgaddr = _prev ?					\
+			((struct vm_area_struct *)_prev)->vm_end :	\
+			FIRST_USER_ADDRESS,				\
+		.last_pgaddr = _next ?					\
+			((struct vm_area_struct *)_next)->vm_start :	\
+			USER_PGTABLES_CEILING,				\
+		.vma_min = _vma_min,					\
+		.vma_max = _vma_max,					\
+		.tree_max = _next ?					\
+			((struct vm_area_struct *)_next)->vm_start :	\
+			USER_PGTABLES_CEILING,				\
+		.tree_reset = _vma->vm_end,				\
+		.mm_wr_locked = true,					\
+	}
+
 static inline bool vmg_nomem(struct vma_merge_struct *vmg)
 {
 	return vmg->state == VMA_MERGE_ERROR_NOMEM;
@@ -260,9 +291,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 
 void remove_vma(struct vm_area_struct *vma);
 
-void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
-		unsigned long min, unsigned long max, unsigned long pg_max,
-		struct vm_area_struct *prev, struct vm_area_struct *next);
+void unmap_region(struct unmap_desc *desc);
 
 /* We are about to modify the VMA's flags. */
-- 
2.47.2
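A converted call site then reads as below, mirroring the
__mmap_new_file_vma() hunk; callers with unusual limits can still override
individual fields before the call, which is exactly what dup_mmap() does
with tree_max:

	UNMAP_REGION(unmap, vmi, vma, vma->vm_start, vma->vm_end,
		     map->prev, map->next);
	/* Optional per-caller override, e.g.: unmap.tree_max = max; */
	unmap_region(&unmap);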
vms_clear_ptes() is slightly different from the other callers of
unmap_region() and so had the unmapping open-coded. Using the new
structure, it is now possible to special-case the struct setup instead of
open-coding the function.

exit_mmap() also calls unmap_vmas() with many arguments. Using the new
unmap_all_init() function to set up the unmap descriptor for all vmas makes
this a bit easier to read.

An update to the vma test code is necessary to ensure testing continues to
function.

No functional changes intended.

Signed-off-by: Liam R. Howlett
---
 include/linux/mm.h               |  3 ---
 mm/internal.h                    |  3 +++
 mm/memory.c                      | 24 ++++++++------------
 mm/mmap.c                        |  5 +++-
 mm/vma.c                         | 39 ++++++++++++++++++--------------
 mm/vma.h                         | 14 ++++++++++++
 tools/testing/vma/vma_internal.h | 14 ++++--------
 7 files changed, 56 insertions(+), 46 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 892fe5dbf9de0..23eb59d543390 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2450,9 +2450,6 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
 	zap_page_range_single(vma, vma->vm_start,
 			      vma->vm_end - vma->vm_start, NULL);
 }
-void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *start_vma, unsigned long start,
-		unsigned long end, unsigned long tree_end, bool mm_wr_locked);
 
 struct mmu_notifier_range;
 
diff --git a/mm/internal.h b/mm/internal.h
index d295252407fee..1239944f2830a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -197,6 +197,9 @@ static inline void vma_close(struct vm_area_struct *vma)
 	}
 }
 
+/* unmap_vmas is in mm/memory.c */
+void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
+
 #ifdef CONFIG_MMU
 
 /* Flags for folio_pte_batch(). */
diff --git a/mm/memory.c b/mm/memory.c
index 829cd94950182..8d4d976311037 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2084,12 +2084,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 /**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
- * @mas: the maple state
- * @vma: the starting vma
- * @start_addr: virtual address at which to start unmapping
- * @end_addr: virtual address at which to end unmapping
- * @tree_end: The maximum index to check
- * @mm_wr_locked: lock flag
+ * @unmap: The unmap_desc
 *
 * Unmap all pages in the vma list.
 *
@@ -2102,11 +2097,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
-void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long tree_end,
-		bool mm_wr_locked)
+void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
 {
+	struct vm_area_struct *vma;
 	struct mmu_notifier_range range;
 	struct zap_details details = {
 		.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
@@ -2114,17 +2107,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 		.even_cows = true,
 	};
 
+	vma = unmap->first;
 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
-				start_addr, end_addr);
+				unmap->vma_min, unmap->vma_max);
 	mmu_notifier_invalidate_range_start(&range);
 	do {
-		unsigned long start = start_addr;
-		unsigned long end = end_addr;
+		unsigned long start = unmap->vma_min;
+		unsigned long end = unmap->vma_max;
 		hugetlb_zap_begin(vma, &start, &end);
 		unmap_single_vma(tlb, vma, start, end, &details,
-				 mm_wr_locked);
+				 unmap->mm_wr_locked);
 		hugetlb_zap_end(vma, &details);
-		vma = mas_find(mas, tree_end - 1);
+		vma = mas_find(unmap->mas, unmap->tree_max - 1);
 	} while (vma);
 	mmu_notifier_invalidate_range_end(&range);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 5c9bd3f20e53f..6011f62b0a294 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1280,10 +1280,12 @@ void exit_mmap(struct mm_struct *mm)
 	struct vm_area_struct *vma;
 	unsigned long nr_accounted = 0;
 	VMA_ITERATOR(vmi, mm, 0);
+	struct unmap_desc unmap;
 
 	/* mm's last user has gone, and its about to be pulled down */
 	mmu_notifier_release(mm);
 
+	unmap.mm_wr_locked = false;
 	mmap_read_lock(mm);
 	arch_exit_mmap(mm);
 
@@ -1295,11 +1297,12 @@ void exit_mmap(struct mm_struct *mm)
 		goto destroy;
 	}
 
+	unmap_all_init(&unmap, &vmi, vma);
 	flush_cache_mm(mm);
 	tlb_gather_mmu_fullmm(&tlb, mm);
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
-	unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
+	unmap_vmas(&tlb, &unmap);
 	mmap_read_unlock(mm);
 
 	/*
diff --git a/mm/vma.c b/mm/vma.c
index c92384975cbb2..ad64cd9795ef3 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -481,8 +481,7 @@ void unmap_region(struct unmap_desc *desc)
 
 	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, mas, desc->first, desc->vma_min, desc->vma_max,
-		   desc->vma_max, desc->mm_wr_locked);
+	unmap_vmas(&tlb, desc);
 	mas_set(mas, desc->tree_reset);
 	free_pgtables(&tlb, mas, desc->first, desc->first_pgaddr,
 		      desc->last_pgaddr, desc->tree_max,
@@ -1213,26 +1212,32 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
 static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
 		struct ma_state *mas_detach, bool mm_wr_locked)
 {
-	struct mmu_gather tlb;
+	struct unmap_desc unmap = {
+		.mas = mas_detach,
+		.first = vms->vma,
+		/* start and end may be different if there is no prev or next vma. */
+		.first_pgaddr = vms->unmap_start,
+		.last_pgaddr = vms->unmap_end,
+		.vma_min = vms->start,
+		.vma_max = vms->end,
+		/*
+		 * The tree limits and reset differ from the normal case since it's a
+		 * side-tree
+		 */
+		.tree_reset = 1,
+		.tree_max = vms->vma_count,
+		/*
+		 * We can free page tables without write-locking mmap_lock because VMAs
+		 * were isolated before we downgraded mmap_lock.
+		 */
+		.mm_wr_locked = mm_wr_locked,
+	};
 
 	if (!vms->clear_ptes)	/* Nothing to do */
 		return;
 
-	/*
-	 * We can free page tables without write-locking mmap_lock because VMAs
-	 * were isolated before we downgraded mmap_lock.
-	 */
 	mas_set(mas_detach, 1);
-	tlb_gather_mmu(&tlb, vms->vma->vm_mm);
-	update_hiwater_rss(vms->vma->vm_mm);
-	unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
-		   vms->vma_count, mm_wr_locked);
-
-	mas_set(mas_detach, 1);
-	/* start and end may be different if there is no prev or next vma. */
-	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
-		      vms->unmap_end, vms->unmap_end, mm_wr_locked);
-	tlb_finish_mmu(&tlb);
+	unmap_region(&unmap);
 	vms->clear_ptes = false;
 }
diff --git a/mm/vma.h b/mm/vma.h
index 4edd5d26ffcfc..8b55a0c73d097 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -164,6 +164,20 @@ struct unmap_desc {
 	bool mm_wr_locked;		/* If the mmap write lock is held */
 };
 
+static inline void unmap_all_init(struct unmap_desc *desc,
+		struct vma_iterator *vmi, struct vm_area_struct *vma)
+{
+	desc->mas = &vmi->mas;
+	desc->first = vma;
+	desc->first_pgaddr = FIRST_USER_ADDRESS;
+	desc->last_pgaddr = USER_PGTABLES_CEILING;
+	desc->vma_min = 0;
+	desc->vma_max = ULONG_MAX;
+	desc->tree_max = ULONG_MAX;
+	desc->tree_reset = vma->vm_end;
+	desc->mm_wr_locked = false;
+}
+
 #define UNMAP_REGION(name, _vmi, _vma, _vma_min, _vma_max, _prev, _next) \
 	struct unmap_desc name = {					\
 		.mas = &(_vmi)->mas,					\
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 823d379e1fac2..d73ad4747d40a 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -884,18 +884,12 @@ static inline void update_hiwater_vm(struct mm_struct *)
 {
 }
 
-static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long tree_end,
-		bool mm_wr_locked)
+struct unmap_desc;
+
+static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
 {
 	(void)tlb;
-	(void)mas;
-	(void)vma;
-	(void)start_addr;
-	(void)end_addr;
-	(void)tree_end;
-	(void)mm_wr_locked;
+	(void)unmap;
 }
 
 static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-- 
2.47.2
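After this patch the exit_mmap() unmap phase reduces to filling one
descriptor. A condensed sketch of the flow, assembled from the hunks above
(elisions marked with "..."):

	struct unmap_desc unmap;

	unmap.mm_wr_locked = false;		/* only the read lock is held here */
	mmap_read_lock(mm);
	...
	unmap_all_init(&unmap, &vmi, vma);	/* whole address space: 0..ULONG_MAX */
	unmap_vmas(&tlb, &unmap);
	mmap_read_unlock(mm);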
Pass the unmap_desc through to free_pgtables(), since it already holds
almost everything necessary and is already on the stack.

Update the testing code as necessary.

No functional changes intended.

Signed-off-by: Liam R. Howlett
---
 mm/internal.h                    |  5 +----
 mm/memory.c                      | 21 ++++++++++-----------
 mm/mmap.c                        |  6 +++---
 mm/vma.c                         |  7 ++-----
 tools/testing/vma/vma_internal.h | 11 ++---------
 5 files changed, 18 insertions(+), 32 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 1239944f2830a..f22329967e908 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -445,10 +445,7 @@ bool __folio_end_writeback(struct folio *folio);
 void deactivate_file_folio(struct folio *folio);
 void folio_activate(struct folio *folio);
 
-void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		   struct vm_area_struct *start_vma, unsigned long floor,
-		   unsigned long ceiling, unsigned long tree_max,
-		   bool mm_wr_locked);
+void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc);
 
 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
 
diff --git a/mm/memory.c b/mm/memory.c
index 8d4d976311037..98c5ffd28a109 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -387,15 +387,14 @@ void free_pgd_range(struct mmu_gather *tlb,
 * The tree_max differs from the ceiling when a dup_mmap() failed and the tree
 * has unrelated data to the mm_struct being torn down.
 */
-void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		   struct vm_area_struct *vma, unsigned long floor,
-		   unsigned long ceiling, unsigned long tree_max,
-		   bool mm_wr_locked)
+void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc)
 {
 	struct unlink_vma_file_batch vb;
+	struct ma_state *mas = desc->mas;
+	struct vm_area_struct *vma = desc->first;
 
 	/* underflow can happen and is fine */
-	WARN_ON_ONCE(tree_max - 1 > ceiling - 1);
+	WARN_ON_ONCE(desc->tree_max - 1 > desc->last_pgaddr - 1);
 
 	tlb_free_vmas(tlb);
 
@@ -407,13 +406,13 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
 		 * be 0. This will underflow and is okay.
 		 */
-		next = mas_find(mas, tree_max - 1);
+		next = mas_find(mas, desc->tree_max - 1);
 
 		/*
 		 * Hide vma from rmap and truncate_pagecache before freeing
 		 * pgtables
 		 */
-		if (mm_wr_locked)
+		if (desc->mm_wr_locked)
 			vma_start_write(vma);
 		unlink_anon_vmas(vma);
 
@@ -425,16 +424,16 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		 */
 		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
 			vma = next;
-			next = mas_find(mas, tree_max - 1);
-			if (mm_wr_locked)
+			next = mas_find(mas, desc->tree_max - 1);
+			if (desc->mm_wr_locked)
 				vma_start_write(vma);
 			unlink_anon_vmas(vma);
 			unlink_file_vma_batch_add(&vb, vma);
 		}
 
 		unlink_file_vma_batch_final(&vb);
-		free_pgd_range(tlb, addr, vma->vm_end,
-			       floor, next ? next->vm_start : ceiling);
+		free_pgd_range(tlb, addr, vma->vm_end, desc->first_pgaddr,
+			       next ? next->vm_start : desc->last_pgaddr);
 		vma = next;
 	} while (vma);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 6011f62b0a294..9908481452780 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1311,10 +1311,10 @@ void exit_mmap(struct mm_struct *mm)
 	 */
 	mm_flags_set(MMF_OOM_SKIP, mm);
 	mmap_write_lock(mm);
+	unmap.mm_wr_locked = true;
 	mt_clear_in_rcu(&mm->mm_mt);
-	vma_iter_set(&vmi, vma->vm_end);
-	free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
-		      USER_PGTABLES_CEILING, USER_PGTABLES_CEILING, true);
+	vma_iter_set(&vmi, unmap.tree_reset);
+	free_pgtables(&tlb, &unmap);
 	tlb_finish_mmu(&tlb);
 
 	/*
diff --git a/mm/vma.c b/mm/vma.c
index ad64cd9795ef3..ba155a539d160 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -476,16 +476,13 @@ void remove_vma(struct vm_area_struct *vma)
 void unmap_region(struct unmap_desc *desc)
 {
 	struct mm_struct *mm = desc->first->vm_mm;
-	struct ma_state *mas = desc->mas;
 	struct mmu_gather tlb;
 
 	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, desc);
-	mas_set(mas, desc->tree_reset);
-	free_pgtables(&tlb, mas, desc->first, desc->first_pgaddr,
-		      desc->last_pgaddr, desc->tree_max,
-		      desc->mm_wr_locked);
+	mas_set(desc->mas, desc->tree_reset);
+	free_pgtables(&tlb, desc);
 	tlb_finish_mmu(&tlb);
 }
 
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index d73ad4747d40a..435c5a24480bc 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -892,17 +892,10 @@ static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
 	(void)unmap;
 }
 
-static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *vma, unsigned long floor,
-		unsigned long ceiling, unsigned long tree_max,
-		bool mm_wr_locked)
+static inline void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc)
 {
 	(void)tlb;
-	(void)mas;
-	(void)vma;
-	(void)floor;
-	(void)ceiling;
-	(void)mm_wr_locked;
+	(void)desc;
 }
 
 static inline void mapping_unmap_writable(struct address_space *)
-- 
2.47.2
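Taking the series as a whole, my reading of which descriptor fields each
stage now consumes (a summary, not text from the patches themselves):

	/*
	 * unmap_vmas():    first, vma_min, vma_max, tree_max (via desc->mas),
	 *                  mm_wr_locked
	 * free_pgtables(): first, first_pgaddr, last_pgaddr, tree_max,
	 *                  mm_wr_locked
	 * unmap_region():  tree_reset, used to re-position desc->mas between
	 *                  the two stages
	 */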