include/linux/pgtable.h defines fallbacks for the page table ceiling and
floor inside the CONFIG_MMU section.  Moving the definitions outside of
the CONFIG_MMU section allows them to be used in generic code.

Suggested-by: Lorenzo Stoakes
Suggested-by: SeongJae Park
Signed-off-by: Liam R. Howlett
---
 include/linux/pgtable.h | 39 ++++++++++++++++++++-------------------
 mm/vma_internal.h       |  1 +
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index eb8aacba3698d..557ac918bdca6 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -22,25 +22,6 @@
 #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED
 #endif
 
-/*
- * On almost all architectures and configurations, 0 can be used as the
- * upper ceiling to free_pgtables(): on many architectures it has the same
- * effect as using TASK_SIZE. However, there is one configuration which
- * must impose a more careful limit, to avoid freeing kernel pgtables.
- */
-#ifndef USER_PGTABLES_CEILING
-#define USER_PGTABLES_CEILING	0UL
-#endif
-
-/*
- * This defines the first usable user address. Platforms
- * can override its value with custom FIRST_USER_ADDRESS
- * defined in their respective <asm/pgtable.h>.
- */
-#ifndef FIRST_USER_ADDRESS
-#define FIRST_USER_ADDRESS	0UL
-#endif
-
 /*
  * This defines the generic helper for accessing PMD page
  * table page. Although platforms can still override this
@@ -1660,6 +1641,26 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
 
 #endif /* CONFIG_MMU */
 
+/*
+ * On almost all architectures and configurations, 0 can be used as the
+ * upper ceiling to free_pgtables(): on many architectures it has the same
+ * effect as using TASK_SIZE. However, there is one configuration which
+ * must impose a more careful limit, to avoid freeing kernel pgtables.
+ */
+#ifndef USER_PGTABLES_CEILING
+#define USER_PGTABLES_CEILING	0UL
+#endif
+
+/*
+ * This defines the first usable user address. Platforms
+ * can override its value with custom FIRST_USER_ADDRESS
+ * defined in their respective <asm/pgtable.h>.
+ */
+#ifndef FIRST_USER_ADDRESS
+#define FIRST_USER_ADDRESS	0UL
+#endif
+
+
 /*
  * No-op macros that just return the current protection value. Defined here
  * because these macros can be used even if CONFIG_MMU is not defined.
diff --git a/mm/vma_internal.h b/mm/vma_internal.h
index 2f05735ff190c..2da6d224c1a85 100644
--- a/mm/vma_internal.h
+++ b/mm/vma_internal.h
@@ -46,6 +46,7 @@
 #include
 #include
 #include
+#include <linux/pgtable.h>
 #include
 #include
-- 
2.47.3

Move the trace point later in the function so that it is not skipped in
the event of a failed fork.

Acked-by: Chris Li
Reviewed-by: Lorenzo Stoakes
Reviewed-by: David Hildenbrand
Reviewed-by: Pedro Falcato
Reviewed-by: Suren Baghdasaryan
Signed-off-by: Liam R. Howlett
---
 mm/mmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 4bdb9ffa9e257..1f025edf8d7d0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1307,9 +1307,9 @@ void exit_mmap(struct mm_struct *mm)
 
 	BUG_ON(count != mm->map_count);
 
-	trace_exit_mmap(mm);
 destroy:
 	__mt_destroy(&mm->mm_mt);
+	trace_exit_mmap(mm);
 	mmap_write_unlock(mm);
 	vm_unacct_memory(nr_accounted);
 }
-- 
2.47.3
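For readers outside the mm tree, the effect of the first patch is that the
two fallback limits become visible to !CONFIG_MMU builds as well. Below is a
minimal, userspace-only sketch of the define-if-not-defined pattern the
header relies on; the macro names match the patch, everything else is
illustrative:

```c
#include <stdio.h>

/* An architecture header would normally provide these; generic code can
 * now rely on the 0UL fallbacks regardless of CONFIG_MMU. */
#ifndef USER_PGTABLES_CEILING
#define USER_PGTABLES_CEILING 0UL
#endif

#ifndef FIRST_USER_ADDRESS
#define FIRST_USER_ADDRESS 0UL
#endif

int main(void)
{
	/* A ceiling of 0 acts like "no limit" on most architectures. */
	printf("floor=%#lx ceiling=%#lx\n",
	       (unsigned long)FIRST_USER_ADDRESS,
	       (unsigned long)USER_PGTABLES_CEILING);
	return 0;
}
```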
Create the new function tear_down_vmas() to remove a range of vmas;
exit_mmap() will use it to remove all of the vmas.

This is necessary for future patches.

No functional changes intended.

Reviewed-by: Lorenzo Stoakes
Reviewed-by: David Hildenbrand
Reviewed-by: Pedro Falcato
Reviewed-by: Suren Baghdasaryan
Signed-off-by: Liam R. Howlett
---
 mm/mmap.c | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 1f025edf8d7d0..9c8adc505d3de 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1247,6 +1247,29 @@ int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags)
 }
 EXPORT_SYMBOL(vm_brk_flags);
 
+static
+unsigned long tear_down_vmas(struct mm_struct *mm, struct vma_iterator *vmi,
+		struct vm_area_struct *vma, unsigned long end)
+{
+	unsigned long nr_accounted = 0;
+	int count = 0;
+
+	mmap_assert_write_locked(mm);
+	vma_iter_set(vmi, vma->vm_end);
+	do {
+		if (vma->vm_flags & VM_ACCOUNT)
+			nr_accounted += vma_pages(vma);
+		vma_mark_detached(vma);
+		remove_vma(vma);
+		count++;
+		cond_resched();
+		vma = vma_next(vmi);
+	} while (vma && vma->vm_end <= end);
+
+	VM_WARN_ON_ONCE(count != mm->map_count);
+	return nr_accounted;
+}
+
 /* Release all mmaps. */
 void exit_mmap(struct mm_struct *mm)
 {
@@ -1254,7 +1277,6 @@ void exit_mmap(struct mm_struct *mm)
 	struct vm_area_struct *vma;
 	unsigned long nr_accounted = 0;
 	VMA_ITERATOR(vmi, mm, 0);
-	int count = 0;
 
 	/* mm's last user has gone, and its about to be pulled down */
 	mmu_notifier_release(mm);
@@ -1294,18 +1316,7 @@ void exit_mmap(struct mm_struct *mm)
 	 * enabled, without holding any MM locks besides the unreachable
 	 * mmap_write_lock.
 	 */
-	vma_iter_set(&vmi, vma->vm_end);
-	do {
-		if (vma->vm_flags & VM_ACCOUNT)
-			nr_accounted += vma_pages(vma);
-		vma_mark_detached(vma);
-		remove_vma(vma);
-		count++;
-		cond_resched();
-		vma = vma_next(&vmi);
-	} while (vma && likely(!xa_is_zero(vma)));
-
-	BUG_ON(count != mm->map_count);
+	nr_accounted = tear_down_vmas(mm, &vmi, vma, ULONG_MAX);
 
 destroy:
 	__mt_destroy(&mm->mm_mt);
-- 
2.47.3
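The loop shape in tear_down_vmas() is worth noting: the first vma is removed
unconditionally, and the walk stops once a vma ends beyond the limit, which
is why exit_mmap() can simply pass ULONG_MAX. A userspace model of that loop,
with a linked list standing in for the maple tree and simplified types (a
sketch only, not the kernel code):

```c
#include <stdio.h>
#include <stdlib.h>

struct vma {
	unsigned long vm_start, vm_end;
	int vm_account;		/* stands in for VM_ACCOUNT */
	struct vma *next;	/* stands in for vma_next() on the tree */
};

static unsigned long tear_down(struct vma *vma, unsigned long end)
{
	unsigned long nr_accounted = 0;

	do {	/* first vma is always removed, mirroring the patch */
		struct vma *next = vma->next;

		if (vma->vm_account)
			nr_accounted += (vma->vm_end - vma->vm_start) / 4096;
		free(vma);	/* stands in for remove_vma() */
		vma = next;
	} while (vma && vma->vm_end <= end);

	return nr_accounted;
}

static struct vma *mkvma(unsigned long s, unsigned long e, int acct,
			 struct vma *next)
{
	struct vma *v = malloc(sizeof(*v));

	v->vm_start = s;
	v->vm_end = e;
	v->vm_account = acct;
	v->next = next;
	return v;
}

int main(void)
{
	struct vma *head = mkvma(0x1000, 0x3000, 1,
				 mkvma(0x3000, 0x4000, 0,
				       mkvma(0x5000, 0x9000, 1, NULL)));

	/* ~0UL models exit_mmap() removing every vma: prints 6. */
	printf("accounted pages: %lu\n", tear_down(head, ~0UL));
	return 0;
}
```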
Add a limit to the vma search instead of using the start and end of the
vma passed in.

No functional changes intended.

Reviewed-by: David Hildenbrand
Reviewed-by: Lorenzo Stoakes
Reviewed-by: Pedro Falcato
Reviewed-by: Suren Baghdasaryan
Signed-off-by: Liam R. Howlett
---
 mm/vma.c | 6 ++++--
 mm/vma.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/mm/vma.c b/mm/vma.c
index 3dbe414eff894..0c35cdc0d3b7b 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -473,6 +473,7 @@ void remove_vma(struct vm_area_struct *vma)
  * Called with the mm semaphore held.
  */
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
+		unsigned long vma_start, unsigned long vma_end,
 		struct vm_area_struct *prev, struct vm_area_struct *next)
 {
 	struct mm_struct *mm = vma->vm_mm;
@@ -480,7 +481,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 
 	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end);
+	unmap_vmas(&tlb, mas, vma, vma_start, vma_end, vma_end);
 	mas_set(mas, vma->vm_end);
 	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
 		      next ? next->vm_start : USER_PGTABLES_CEILING,
@@ -2466,7 +2467,8 @@ static int __mmap_new_file_vma(struct mmap_state *map,
 		vma_iter_set(vmi, vma->vm_end);
 		/* Undo any partial mapping done by a device driver. */
-		unmap_region(&vmi->mas, vma, map->prev, map->next);
+		unmap_region(&vmi->mas, vma, vma->vm_start, vma->vm_end,
+				map->prev, map->next);
 
 		return error;
 	}
diff --git a/mm/vma.h b/mm/vma.h
index d51efd9da113f..e671adced3a03 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -264,6 +264,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 void remove_vma(struct vm_area_struct *vma);
 
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
+		unsigned long vma_start, unsigned long vma_end,
 		struct vm_area_struct *prev, struct vm_area_struct *next);
 
 /**
-- 
2.47.3
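The point of the new vma_start/vma_end arguments is that the unmapped range
is no longer tied to the first vma's boundaries; a later caller can pass a
range spanning many vmas. A small, self-contained sketch of the invariant
the explicit range must satisfy (hypothetical helper name, simplified
types):

```c
#include <assert.h>

struct vma { unsigned long start, end; };

/* An explicit range must cover at least the first vma's span, but may
 * extend past it, which the old derive-from-the-vma interface could
 * not express. */
static int range_covers(const struct vma *first,
			unsigned long start, unsigned long end)
{
	return start <= first->start && end >= first->end;
}

int main(void)
{
	struct vma first = { .start = 0x1000, .end = 0x3000 };

	assert(range_covers(&first, first.start, first.end)); /* old behaviour */
	assert(range_covers(&first, 0, ~0UL));	/* new: the whole tree */
	return 0;
}
```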
The ceiling and the tree search limit need to be different arguments for
a future change to the failed fork handling.  The ceiling and floor
variable names are not very descriptive, so change them to
pg_start/pg_end.  Add a new vma_end argument to the function, as it will
differ from pg_end in later patches in the series.

Add kernel-doc for the free_pgtables() function.

The test code is also updated.

No functional changes intended.

Reviewed-by: Lorenzo Stoakes
Reviewed-by: Pedro Falcato
Signed-off-by: Liam R. Howlett
---
 mm/internal.h                    |  6 +++--
 mm/memory.c                      | 42 +++++++++++++++++++++++++-------
 mm/mmap.c                        |  2 +-
 mm/vma.c                         |  3 ++-
 tools/testing/vma/vma_internal.h |  3 ++-
 5 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 5afe55751fe08..2cdc5c9396f10 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -510,8 +510,10 @@ void deactivate_file_folio(struct folio *folio);
 void folio_activate(struct folio *folio);
 
 void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *start_vma, unsigned long floor,
-		unsigned long ceiling, bool mm_wr_locked);
+		struct vm_area_struct *vma, unsigned long pg_start,
+		unsigned long pg_end, unsigned long vma_end,
+		bool mm_wr_locked);
+
 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
 
 struct zap_details;
diff --git a/mm/memory.c b/mm/memory.c
index 4b0790c8fa48e..9043cfda65b94 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -370,23 +370,47 @@ void free_pgd_range(struct mmu_gather *tlb,
 	} while (pgd++, addr = next, addr != end);
 }
 
+/**
+ * free_pgtables() - Free a range of page tables
+ * @tlb: The mmu gather
+ * @mas: The maple state
+ * @vma: The first vma
+ * @pg_start: The lowest page table address (floor)
+ * @pg_end: The highest page table address (ceiling)
+ * @vma_end: The highest vma tree search address
+ * @mm_wr_locked: boolean indicating if the mm is write locked
+ *
+ * Note: pg_start and pg_end are provided to indicate the absolute range of the
+ * page tables that should be removed. This can differ from the vma mappings on
+ * some archs that may have mappings that need to be removed outside the vmas.
+ * Note that the prev->vm_end and next->vm_start are often used.
+ *
+ * The vma_end differs from the pg_end when a dup_mmap() failed and the tree has
+ * unrelated data to the mm_struct being torn down.
+ */
 void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *vma, unsigned long floor,
-		unsigned long ceiling, bool mm_wr_locked)
+		struct vm_area_struct *vma, unsigned long pg_start,
+		unsigned long pg_end, unsigned long vma_end,
+		bool mm_wr_locked)
 {
 	struct unlink_vma_file_batch vb;
 
+	/*
+	 * Note: USER_PGTABLES_CEILING may be passed as the value of pg_end and
+	 * may be 0. Underflow is expected in this case. Otherwise the
+	 * pagetable end is exclusive.
+	 * vma_end is exclusive.
+	 * The last vma address should never be larger than the pagetable end.
+	 */
+	WARN_ON_ONCE(vma_end - 1 > pg_end - 1);
+
 	tlb_free_vmas(tlb);
 
 	do {
 		unsigned long addr = vma->vm_start;
 		struct vm_area_struct *next;
 
-		/*
-		 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
-		 * be 0. This will underflow and is okay.
-		 */
-		next = mas_find(mas, ceiling - 1);
+		next = mas_find(mas, vma_end - 1);
 		if (unlikely(xa_is_zero(next)))
 			next = NULL;
 
@@ -406,7 +430,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		 */
 		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
 			vma = next;
-			next = mas_find(mas, ceiling - 1);
+			next = mas_find(mas, vma_end - 1);
 			if (unlikely(xa_is_zero(next)))
 				next = NULL;
 			if (mm_wr_locked)
@@ -417,7 +441,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		unlink_file_vma_batch_final(&vb);
 
 		free_pgd_range(tlb, addr, vma->vm_end,
-			       floor, next ? next->vm_start : ceiling);
+			       pg_start, next ? next->vm_start : pg_end);
 		vma = next;
 	} while (vma);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 9c8adc505d3de..827a64cdcc681 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1308,7 +1308,7 @@ void exit_mmap(struct mm_struct *mm)
 	mt_clear_in_rcu(&mm->mm_mt);
 	vma_iter_set(&vmi, vma->vm_end);
 	free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
-		      USER_PGTABLES_CEILING, true);
+		      USER_PGTABLES_CEILING, USER_PGTABLES_CEILING, true);
 	tlb_finish_mmu(&tlb);
 
 	/*
diff --git a/mm/vma.c b/mm/vma.c
index 0c35cdc0d3b7b..b2b9e7b3284f3 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -484,6 +484,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 	unmap_vmas(&tlb, mas, vma, vma_start, vma_end, vma_end);
 	mas_set(mas, vma->vm_end);
 	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+		      next ? next->vm_start : USER_PGTABLES_CEILING,
 		      next ? next->vm_start : USER_PGTABLES_CEILING,
 		      /* mm_wr_locked = */ true);
 	tlb_finish_mmu(&tlb);
@@ -1275,7 +1276,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
 	mas_set(mas_detach, 1);
 	/* start and end may be different if there is no prev or next vma. */
 	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
-		      vms->unmap_end, mm_wr_locked);
+		      vms->unmap_end, vms->unmap_end, mm_wr_locked);
 	tlb_finish_mmu(&tlb);
 	vms->clear_ptes = false;
 }
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 7fa56dcc53a6b..f50b8ddee6120 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1139,7 +1139,8 @@ static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 
 static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		struct vm_area_struct *vma, unsigned long floor,
-		unsigned long ceiling, bool mm_wr_locked)
+		unsigned long ceiling, unsigned long tree_max,
+		bool mm_wr_locked)
 {
 }
-- 
2.47.3
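The WARN_ON_ONCE() added above relies on deliberate unsigned wrap-around:
both bounds are exclusive, and a pg_end of 0 (USER_PGTABLES_CEILING on most
architectures) wraps to ULONG_MAX, so any vma_end passes the check. A
compilable model of just that comparison, with made-up example addresses:

```c
#include <assert.h>

/* Returns nonzero when the vma search limit fits under the page-table
 * ceiling; pg_end == 0 wraps to ULONG_MAX and means "no ceiling". */
static int vma_end_fits(unsigned long vma_end, unsigned long pg_end)
{
	return !(vma_end - 1 > pg_end - 1);	/* wraps when pg_end == 0 */
}

int main(void)
{
	assert(vma_end_fits(0x7fff00000000UL, 0));	/* ceiling 0: no limit */
	assert(vma_end_fits(0x1000, 0x2000));		/* inside the ceiling */
	assert(!vma_end_fits(0x3000, 0x2000));		/* would free too much */
	return 0;
}
```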
The unmap_region() calls need to pass the page table limit through for a
future patch.

No functional changes intended.

Reviewed-by: Lorenzo Stoakes
Reviewed-by: Pedro Falcato
Signed-off-by: Liam R. Howlett
---
 mm/vma.c | 7 ++++---
 mm/vma.h | 3 ++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/mm/vma.c b/mm/vma.c
index b2b9e7b3284f3..b92383e5eebd1 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -474,7 +474,8 @@ void remove_vma(struct vm_area_struct *vma)
  */
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 		unsigned long vma_start, unsigned long vma_end,
-		struct vm_area_struct *prev, struct vm_area_struct *next)
+		unsigned long pg_max, struct vm_area_struct *prev,
+		struct vm_area_struct *next)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather tlb;
@@ -484,8 +485,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 	unmap_vmas(&tlb, mas, vma, vma_start, vma_end, vma_end);
 	mas_set(mas, vma->vm_end);
 	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
-		      next ? next->vm_start : USER_PGTABLES_CEILING,
-		      next ? next->vm_start : USER_PGTABLES_CEILING,
+		      pg_max, next ? next->vm_start : USER_PGTABLES_CEILING,
 		      /* mm_wr_locked = */ true);
 	tlb_finish_mmu(&tlb);
 }
@@ -2469,6 +2469,7 @@ static int __mmap_new_file_vma(struct mmap_state *map,
 		vma_iter_set(vmi, vma->vm_end);
 		/* Undo any partial mapping done by a device driver. */
 		unmap_region(&vmi->mas, vma, vma->vm_start, vma->vm_end,
+				map->next ? map->next->vm_start : USER_PGTABLES_CEILING,
 				map->prev, map->next);
 
 		return error;
diff --git a/mm/vma.h b/mm/vma.h
index e671adced3a03..7c2c95fef240b 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -265,7 +265,8 @@ void remove_vma(struct vm_area_struct *vma);
 
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 		unsigned long vma_start, unsigned long vma_end,
-		struct vm_area_struct *prev, struct vm_area_struct *next);
+		unsigned long pg_max, struct vm_area_struct *prev,
+		struct vm_area_struct *next);
 
 /**
-- 
2.47.3
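A caller that used to let unmap_region() derive the page-table ceiling now
has to compute it itself. A sketch of the selection logic the call sites
use (userspace model; the fallback constant and the types are simplified):

```c
#include <stdio.h>

#ifndef USER_PGTABLES_CEILING
#define USER_PGTABLES_CEILING 0UL
#endif

struct vma { unsigned long start, end; };

/* The ceiling is the next vma's start if a neighbour exists, otherwise
 * the architecture fallback (0 here, meaning "no limit"). */
static unsigned long pgtable_ceiling(const struct vma *next)
{
	return next ? next->start : USER_PGTABLES_CEILING;
}

int main(void)
{
	struct vma next = { .start = 0x500000, .end = 0x600000 };

	printf("with neighbour: %#lx\n", pgtable_ceiling(&next));
	printf("last mapping:   %#lx\n", pgtable_ceiling(NULL));
	return 0;
}
```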
When dup_mmap() fails during the vma duplication or setup, don't write
the XA_ZERO entry in the vma tree.  Instead, destroy the tree and free
the new resources, leaving an empty vma tree.

Using XA_ZERO introduced races where the vma could be found between
dup_mmap() dropping all locks and exit_mmap() taking the locks.  The
race can occur because the mm can be reached through the other trees
via successfully copied vmas and by other methods such as the swapoff
code.

XA_ZERO was marking the location to stop vma removal and page table
freeing.  The newly created arguments to unmap_vmas() and
free_pgtables() serve this function instead.

Replacing the XA_ZERO entry use with the new argument list also means
the checks for xa_is_zero() are no longer necessary, so these are also
removed.

Note that dup_mmap() now also cleans up when ALL vmas were successfully
copied but some other aspect of the duplication failed to be completely
set up.

Reviewed-by: Lorenzo Stoakes
Reviewed-by: Pedro Falcato
Reviewed-by: Suren Baghdasaryan
Signed-off-by: Liam R. Howlett
---
 mm/memory.c |  6 +-----
 mm/mmap.c   | 42 +++++++++++++++++++++++++++++++-----------
 2 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 9043cfda65b94..4331a6abe3e4c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -411,8 +411,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		struct vm_area_struct *next;
 
 		next = mas_find(mas, vma_end - 1);
-		if (unlikely(xa_is_zero(next)))
-			next = NULL;
 
 		/*
 		 * Hide vma from rmap and truncate_pagecache before freeing
@@ -431,8 +429,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
 			vma = next;
 			next = mas_find(mas, vma_end - 1);
-			if (unlikely(xa_is_zero(next)))
-				next = NULL;
 			if (mm_wr_locked)
 				vma_start_write(vma);
 			unlink_anon_vmas(vma);
@@ -2125,7 +2121,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 		unmap_single_vma(tlb, vma, start, end, &details);
 		hugetlb_zap_end(vma, &details);
 		vma = mas_find(mas, tree_end - 1);
-	} while (vma && likely(!xa_is_zero(vma)));
+	} while (vma);
 	mmu_notifier_invalidate_range_end(&range);
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 827a64cdcc681..48dae3d48e46f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1285,7 +1285,7 @@ void exit_mmap(struct mm_struct *mm)
 	arch_exit_mmap(mm);
 
 	vma = vma_next(&vmi);
-	if (!vma || unlikely(xa_is_zero(vma))) {
+	if (!vma) {
 		/* Can happen if dup_mmap() received an OOM */
 		mmap_read_unlock(mm);
 		mmap_write_lock(mm);
@@ -1851,20 +1851,40 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		ksm_fork(mm, oldmm);
 		khugepaged_fork(mm, oldmm);
 	} else {
+		unsigned long end;
 		/*
-		 * The entire maple tree has already been duplicated. If the
-		 * mmap duplication fails, mark the failure point with
-		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
-		 * stop releasing VMAs that have not been duplicated after this
-		 * point.
+		 * The entire maple tree has already been duplicated, but
+		 * replacing the vmas failed at mpnt (which could be NULL if
+		 * all were allocated but the last vma was not fully set up).
+		 * Use the start address of the failure point to clean up the
+		 * partially initialized tree.
 		 */
-		if (mpnt) {
-			mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
-			mas_store(&vmi.mas, XA_ZERO_ENTRY);
-			/* Avoid OOM iterating a broken tree */
-			mm_flags_set(MMF_OOM_SKIP, mm);
+		if (!mm->map_count) {
+			/* zero vmas were written to the new tree. */
+			end = 0;
+		} else if (mpnt) {
+			/* partial tree failure */
+			end = mpnt->vm_start;
+		} else {
+			/* All vmas were written to the new tree */
+			end = ULONG_MAX;
 		}
+
+		/* Hide mm from oom killer because the memory is being freed */
+		mm_flags_set(MMF_OOM_SKIP, mm);
+		if (end) {
+			vma_iter_set(&vmi, 0);
+			tmp = vma_next(&vmi);
+			flush_cache_mm(mm);
+			unmap_region(&vmi.mas, /* vma = */ tmp,
+					/* vma_start = */ 0, /* vma_end = */ end,
+					/* pg_end = */ end, /* prev = */ NULL,
+					/* next = */ NULL);
+			charge = tear_down_vmas(mm, &vmi, tmp, end);
+			vm_unacct_memory(charge);
+		}
+		__mt_destroy(&mm->mm_mt);
 		/*
 		 * The mm_struct is going to exit, but the locks will be dropped
 		 * first.  Set the mm_struct as unstable is advisable as it is
-- 
2.47.3
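The error path now distinguishes three cases when picking the cleanup limit,
replacing the old XA_ZERO marker. A userspace model of that selection (the
helper name is hypothetical; the kernel code is the if/else chain in the
hunk above):

```c
#include <assert.h>
#include <limits.h>

struct vma { unsigned long start, end; };

/* Mirrors the three-way choice in the dup_mmap() failure path. */
static unsigned long cleanup_end(int map_count, const struct vma *failed)
{
	if (!map_count)
		return 0;		/* empty tree: nothing to unmap */
	if (failed)
		return failed->start;	/* unmap everything below the failure */
	return ULONG_MAX;		/* full tree; a later setup step failed */
}

int main(void)
{
	struct vma mpnt = { .start = 0x4000, .end = 0x5000 };

	assert(cleanup_end(0, &mpnt) == 0);
	assert(cleanup_end(3, &mpnt) == 0x4000);
	assert(cleanup_end(3, NULL) == ULONG_MAX);
	return 0;
}
```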
The unmap_region() code uses a number of arguments that could use better
documentation.  With the addition of a descriptor for unmapping (called
unmap_desc), the arguments become self-documenting, and the descriptions
can live with the struct declaration.

No functional change intended.

Reviewed-by: Lorenzo Stoakes
Reviewed-by: Pedro Falcato
Signed-off-by: Liam R. Howlett
---
 mm/mmap.c | 14 ++++++++++----
 mm/vma.c  | 25 +++++++++++--------------
 mm/vma.h  | 35 ++++++++++++++++++++++++++++++-----
 3 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 48dae3d48e46f..4500e61a0d5e4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1876,11 +1876,17 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		if (end) {
 			vma_iter_set(&vmi, 0);
 			tmp = vma_next(&vmi);
+			UNMAP_STATE(unmap, &vmi, /* first = */ tmp,
+					/* vma_start = */ 0, /* vma_end = */ end,
+					/* prev = */ NULL, /* next = */ NULL);
+
+			/*
+			 * Don't iterate over vmas beyond the failure point for
+			 * both unmap_vmas() and free_pgtables().
+			 */
+			unmap.tree_end = end;
 			flush_cache_mm(mm);
-			unmap_region(&vmi.mas, /* vma = */ tmp,
-					/* vma_start = */ 0, /* vma_end = */ end,
-					/* pg_end = */ end, /* prev = */ NULL,
-					/* next = */ NULL);
+			unmap_region(&unmap);
 			charge = tear_down_vmas(mm, &vmi, tmp, end);
 			vm_unacct_memory(charge);
 		}
diff --git a/mm/vma.c b/mm/vma.c
index b92383e5eebd1..75c68c74c062e 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -472,21 +472,19 @@ void remove_vma(struct vm_area_struct *vma)
 *
 * Called with the mm semaphore held.
 */
-void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
-		unsigned long vma_start, unsigned long vma_end,
-		unsigned long pg_max, struct vm_area_struct *prev,
-		struct vm_area_struct *next)
+void unmap_region(struct unmap_desc *unmap)
 {
-	struct mm_struct *mm = vma->vm_mm;
+	struct mm_struct *mm = unmap->first->vm_mm;
+	struct ma_state *mas = unmap->mas;
 	struct mmu_gather tlb;
 
 	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, mas, vma, vma_start, vma_end, vma_end);
-	mas_set(mas, vma->vm_end);
-	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
-		      pg_max, next ? next->vm_start : USER_PGTABLES_CEILING,
-		      /* mm_wr_locked = */ true);
+	unmap_vmas(&tlb, mas, unmap->first, unmap->vma_start, unmap->vma_end,
+			unmap->vma_end);
+	mas_set(mas, unmap->tree_reset);
+	free_pgtables(&tlb, mas, unmap->first, unmap->pg_start, unmap->pg_end,
+			unmap->tree_end, unmap->mm_wr_locked);
 	tlb_finish_mmu(&tlb);
 }
 
@@ -2463,15 +2461,14 @@ static int __mmap_new_file_vma(struct mmap_state *map,
 
 	error = mmap_file(vma->vm_file, vma);
 	if (error) {
+		UNMAP_STATE(unmap, vmi, vma, vma->vm_start, vma->vm_end,
+				map->prev, map->next);
 		fput(vma->vm_file);
 		vma->vm_file = NULL;
 
 		vma_iter_set(vmi, vma->vm_end);
 		/* Undo any partial mapping done by a device driver. */
-		unmap_region(&vmi->mas, vma, vma->vm_start, vma->vm_end,
-				map->next ? map->next->vm_start : USER_PGTABLES_CEILING,
-				map->prev, map->next);
-
+		unmap_region(&unmap);
 		return error;
 	}
diff --git a/mm/vma.h b/mm/vma.h
index 7c2c95fef240b..cca7553c7d641 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -155,6 +155,35 @@ struct vma_merge_struct {
 };
 
+struct unmap_desc {
+	struct ma_state *mas;		/* the maple state pointing to the first vma */
+	struct vm_area_struct *first;	/* The first vma */
+	unsigned long pg_start;		/* The first pagetable address to free (floor) */
+	unsigned long pg_end;		/* The last pagetable address to free (ceiling) */
+	unsigned long vma_start;	/* The min vma address */
+	unsigned long vma_end;		/* The max vma address */
+	unsigned long tree_end;		/* Maximum for the vma tree search */
+	unsigned long tree_reset;	/* Where to reset the vma tree walk */
+	bool mm_wr_locked;		/* If the mmap write lock is held */
+};
+
+#define UNMAP_STATE(name, _vmi, _vma, _vma_start, _vma_end, _prev, _next) \
+	struct unmap_desc name = {					\
+		.mas = &(_vmi)->mas,					\
+		.first = _vma,						\
+		.pg_start = _prev ? ((struct vm_area_struct *)_prev)->vm_end : \
+			FIRST_USER_ADDRESS,				\
+		.pg_end = _next ? ((struct vm_area_struct *)_next)->vm_start : \
+			USER_PGTABLES_CEILING,				\
+		.vma_start = _vma_start,				\
+		.vma_end = _vma_end,					\
+		.tree_end = _next ?					\
+			((struct vm_area_struct *)_next)->vm_start :	\
+			USER_PGTABLES_CEILING,				\
+		.tree_reset = _vma->vm_end,				\
+		.mm_wr_locked = true,					\
+	}
+
 static inline bool vmg_nomem(struct vma_merge_struct *vmg)
 {
 	return vmg->state == VMA_MERGE_ERROR_NOMEM;
@@ -262,11 +291,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 		bool unlock);
 void remove_vma(struct vm_area_struct *vma);
 
-void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
-		unsigned long vma_start, unsigned long vma_end,
-		unsigned long pg_max, struct vm_area_struct *prev,
-		struct vm_area_struct *next);
+void unmap_region(struct unmap_desc *unmap);
 
 /**
  * vma_modify_flags() - Perform any necessary split/merge in preparation for
-- 
2.47.3
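The descriptor turns a six-argument call into named fields, so call sites
read like the UNMAP_STATE() initializer above. A minimal userspace model of
the same pattern, with the field set reduced and the types simplified (names
follow the patch, the rest is illustrative):

```c
#include <stdbool.h>
#include <stdio.h>

struct unmap_desc {
	unsigned long pg_start, pg_end;		/* page-table floor/ceiling */
	unsigned long vma_start, vma_end;	/* virtual range to unmap */
	unsigned long tree_end;			/* tree search limit */
	bool mm_wr_locked;
};

static void unmap_region(const struct unmap_desc *d)
{
	/* Each value arrives with a name instead of a position. */
	printf("unmap [%#lx, %#lx), pgtables [%#lx, %#lx), locked=%d\n",
	       d->vma_start, d->vma_end, d->pg_start, d->pg_end,
	       (int)d->mm_wr_locked);
}

int main(void)
{
	struct unmap_desc desc = {
		.pg_start = 0x1000,
		.pg_end = 0x9000,
		.vma_start = 0x2000,
		.vma_end = 0x8000,
		.tree_end = 0x8000,
		.mm_wr_locked = true,
	};

	unmap_region(&desc);
	return 0;
}
```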
Convert vms_clear_ptes() to use unmap_desc to call unmap_vmas() instead
of the large argument list.  The UNMAP_STATE() macro cannot be used
because the vma iterator in the vms does not point to the correct maple
state (mas_detach), and tree_end would be set incorrectly.  Setting up
the arguments manually avoids setting the struct up incorrectly and
doing extra work to get the correct pagetable range.

exit_mmap() also calls unmap_vmas() with many arguments.  Using the
unmap_all_init() function to set up the unmap descriptor for all vmas
makes this a bit easier to read.

An update to the vma test code is necessary to ensure testing continues
to function.

No functional changes intended.

Signed-off-by: Liam R. Howlett
---
 include/linux/mm.h               |  4 ----
 mm/internal.h                    |  3 +++
 mm/memory.c                      | 20 ++++++++------------
 mm/mmap.c                        |  4 +++-
 mm/vma.c                         | 27 ++++++++++++++++++++++-----
 mm/vma.h                         | 14 ++++++++++++++
 tools/testing/vma/vma_internal.h |  6 +++---
 7 files changed, 53 insertions(+), 25 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cb3de0c73d030..3164b897283f1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2624,10 +2624,6 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
 	zap_page_range_single(vma, vma->vm_start,
 			      vma->vm_end - vma->vm_start, NULL);
 }
-void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *start_vma, unsigned long start,
-		unsigned long end, unsigned long tree_end);
-
 struct mmu_notifier_range;
 
 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
diff --git a/mm/internal.h b/mm/internal.h
index 2cdc5c9396f10..25a17eea550b8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -197,6 +197,9 @@ static inline void vma_close(struct vm_area_struct *vma)
 	}
 }
 
+/* unmap_vmas is in mm/memory.c */
+void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
+
 #ifdef CONFIG_MMU
 
 static inline void get_anon_vma(struct anon_vma *anon_vma)
diff --git a/mm/memory.c b/mm/memory.c
index 4331a6abe3e4c..6fd6decc139e9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2083,11 +2083,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
  * @tlb: address of the caller's struct mmu_gather
- * @mas: the maple state
- * @vma: the starting vma
- * @start_addr: virtual address at which to start unmapping
- * @end_addr: virtual address at which to end unmapping
- * @tree_end: The maximum index to check
+ * @unmap: The unmap_desc
 *
 * Unmap all pages in the vma list.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
-void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long tree_end)
+void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
 {
+	struct vm_area_struct *vma;
 	struct mmu_notifier_range range;
 	struct zap_details details = {
 		.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
@@ -2111,16 +2106,17 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 		.even_cows = true,
 	};
 
+	vma = unmap->first;
 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
-				start_addr, end_addr);
+				unmap->vma_start, unmap->vma_end);
 	mmu_notifier_invalidate_range_start(&range);
 	do {
-		unsigned long start = start_addr;
-		unsigned long end = end_addr;
+		unsigned long start = unmap->vma_start;
+		unsigned long end = unmap->vma_end;
 		hugetlb_zap_begin(vma, &start, &end);
 		unmap_single_vma(tlb, vma, start, end, &details);
 		hugetlb_zap_end(vma, &details);
-		vma = mas_find(mas, tree_end - 1);
+		vma = mas_find(unmap->mas, unmap->tree_end - 1);
 	} while (vma);
 	mmu_notifier_invalidate_range_end(&range);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 4500e61a0d5e4..042b6b4b6ab86 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1277,6 +1277,7 @@ void exit_mmap(struct mm_struct *mm)
 	struct vm_area_struct *vma;
 	unsigned long nr_accounted = 0;
 	VMA_ITERATOR(vmi, mm, 0);
+	struct unmap_desc unmap;
 
 	/* mm's last user has gone, and its about to be pulled down */
 	mmu_notifier_release(mm);
@@ -1292,11 +1293,12 @@ void exit_mmap(struct mm_struct *mm)
 		goto destroy;
 	}
 
+	unmap_all_init(&unmap, &vmi, vma);
 	flush_cache_mm(mm);
 	tlb_gather_mmu_fullmm(&tlb, mm);
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
-	unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX);
+	unmap_vmas(&tlb, &unmap);
 	mmap_read_unlock(mm);
 
 	/*
diff --git a/mm/vma.c b/mm/vma.c
index 75c68c74c062e..b46c869d4bb07 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -480,8 +480,7 @@ void unmap_region(struct unmap_desc *unmap)
 
 	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, mas, unmap->first, unmap->vma_start, unmap->vma_end,
-			unmap->vma_end);
+	unmap_vmas(&tlb, unmap);
 	mas_set(mas, unmap->tree_reset);
 	free_pgtables(&tlb, mas, unmap->first, unmap->pg_start, unmap->pg_end,
 			unmap->tree_end, unmap->mm_wr_locked);
@@ -1257,6 +1256,26 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
 		struct ma_state *mas_detach, bool mm_wr_locked)
 {
 	struct mmu_gather tlb;
+	struct unmap_desc unmap = {
+		.mas = mas_detach,
+		.first = vms->vma,
+		/* start and end may be different if there is no prev or next vma. */
+		.pg_start = vms->unmap_start,
+		.pg_end = vms->unmap_end,
+		.vma_start = vms->start,
+		.vma_end = vms->end,
+		/*
+		 * The tree limits and reset differ from the normal case since it's a
+		 * side-tree.
+		 */
+		.tree_reset = 1,
+		.tree_end = vms->vma_count,
+		/*
+		 * We can free page tables without write-locking mmap_lock because VMAs
+		 * were isolated before we downgraded mmap_lock.
+		 */
+		.mm_wr_locked = mm_wr_locked,
+	};
 
 	if (!vms->clear_ptes) /* Nothing to do */
 		return;
@@ -1268,9 +1287,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
 	mas_set(mas_detach, 1);
 	tlb_gather_mmu(&tlb, vms->vma->vm_mm);
 	update_hiwater_rss(vms->vma->vm_mm);
-	unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
-			vms->vma_count);
-
+	unmap_vmas(&tlb, &unmap);
 	mas_set(mas_detach, 1);
 	/* start and end may be different if there is no prev or next vma. */
 	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
diff --git a/mm/vma.h b/mm/vma.h
index cca7553c7d641..bb7fa5d2bde25 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -167,6 +167,20 @@ struct unmap_desc {
 	bool mm_wr_locked; /* If the mmap write lock is held */
 };
 
+static inline void unmap_all_init(struct unmap_desc *unmap,
+		struct vma_iterator *vmi, struct vm_area_struct *vma)
+{
+	unmap->mas = &vmi->mas;
+	unmap->first = vma;
+	unmap->pg_start = FIRST_USER_ADDRESS;
+	unmap->pg_end = USER_PGTABLES_CEILING;
+	unmap->vma_start = 0;
+	unmap->vma_end = ULONG_MAX;
+	unmap->tree_end = ULONG_MAX;
+	unmap->tree_reset = vma->vm_end;
+	unmap->mm_wr_locked = false;
+}
+
 #define UNMAP_STATE(name, _vmi, _vma, _vma_start, _vma_end, _prev, _next) \
 	struct unmap_desc name = {					\
 		.mas = &(_vmi)->mas,					\
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index f50b8ddee6120..0b4918aac8d6d 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1131,9 +1131,9 @@ static inline void update_hiwater_vm(struct mm_struct *mm)
 {
 }
 
-static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long tree_end)
+struct unmap_desc;
+
+static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
 {
 }
-- 
2.47.3

There is no need to open code the unmap operations in vms_clear_ptes()
now that the unmap_desc struct is used.

Signed-off-by: Liam R. Howlett
---
 mm/vma.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/mm/vma.c b/mm/vma.c
index b46c869d4bb07..876d2db5329dd 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -1255,7 +1255,6 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
 static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
 		struct ma_state *mas_detach, bool mm_wr_locked)
 {
-	struct mmu_gather tlb;
 	struct unmap_desc unmap = {
 		.mas = mas_detach,
 		.first = vms->vma,
@@ -1280,19 +1279,8 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
 	if (!vms->clear_ptes) /* Nothing to do */
 		return;
 
-	/*
-	 * We can free page tables without write-locking mmap_lock because VMAs
-	 * were isolated before we downgraded mmap_lock.
-	 */
 	mas_set(mas_detach, 1);
-	tlb_gather_mmu(&tlb, vms->vma->vm_mm);
-	update_hiwater_rss(vms->vma->vm_mm);
-	unmap_vmas(&tlb, &unmap);
-	mas_set(mas_detach, 1);
-	/* start and end may be different if there is no prev or next vma. */
-	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
-			vms->unmap_end, vms->unmap_end, mm_wr_locked);
-	tlb_finish_mmu(&tlb);
+	unmap_region(&unmap);
 	vms->clear_ptes = false;
 }
-- 
2.47.3
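The two initialization styles above differ on purpose: unmap_all_init()
describes the whole address space, while vms_clear_ptes() walks a detached
side-tree whose indices are counts rather than addresses. A compilable
sketch contrasting the two (simplified struct; the kernel's tree_reset uses
the first vma's end, mirrored here by a parameter):

```c
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

struct unmap_desc {
	unsigned long vma_start, vma_end;
	unsigned long tree_reset, tree_end;
	bool mm_wr_locked;
};

/* exit_mmap() style: one descriptor covers every vma in the mm. */
static void unmap_all_init(struct unmap_desc *d, unsigned long first_vm_end)
{
	d->vma_start = 0;
	d->vma_end = ULONG_MAX;
	d->tree_end = ULONG_MAX;
	d->tree_reset = first_vm_end;	/* resume the walk after the first vma */
	d->mm_wr_locked = false;	/* exit_mmap() holds the read lock here */
}

int main(void)
{
	struct unmap_desc all, side;

	unmap_all_init(&all, 0x3000);

	/* vms_clear_ptes() style: a detached side-tree is indexed from 1,
	 * so the reset point and search limit are not addresses at all. */
	side = (struct unmap_desc){
		.vma_start = 0x2000, .vma_end = 0x8000,
		.tree_reset = 1, .tree_end = 3 /* vma_count */,
		.mm_wr_locked = true,
	};

	printf("all:  tree_end=%#lx\n", all.tree_end);
	printf("side: tree_end=%lu (a count, not an address)\n", side.tree_end);
	return 0;
}
```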
Pass the unmap_desc through to free_pgtables(), because it already has
almost everything necessary and is already on the stack.

Update the testing code as necessary.

No functional changes intended.

Reviewed-by: Lorenzo Stoakes
Reviewed-by: Suren Baghdasaryan
Signed-off-by: Liam R. Howlett
---
 mm/internal.h                    |  5 +----
 mm/memory.c                      | 37 ++++++++++++++------------------
 mm/mmap.c                        |  6 +++---
 mm/vma.c                         |  6 ++----
 tools/testing/vma/vma_internal.h |  7 +++---
 5 files changed, 25 insertions(+), 36 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 25a17eea550b8..1cad630f0dcef 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -512,10 +512,7 @@ bool __folio_end_writeback(struct folio *folio);
 void deactivate_file_folio(struct folio *folio);
 void folio_activate(struct folio *folio);
 
-void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *vma, unsigned long pg_start,
-		unsigned long pg_end, unsigned long vma_end,
-		bool mm_wr_locked);
+void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc);
 
 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
 
diff --git a/mm/memory.c b/mm/memory.c
index 6fd6decc139e9..16b25eff19251 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -373,12 +373,7 @@ void free_pgd_range(struct mmu_gather *tlb,
 /**
  * free_pgtables() - Free a range of page tables
  * @tlb: The mmu gather
- * @mas: The maple state
- * @vma: The first vma
- * @pg_start: The lowest page table address (floor)
- * @pg_end: The highest page table address (ceiling)
- * @vma_end: The highest vma tree search address
- * @mm_wr_locked: boolean indicating if the mm is write locked
+ * @unmap: The unmap_desc
 *
 * Note: pg_start and pg_end are provided to indicate the absolute range of the
 * page tables that should be removed. This can differ from the vma mappings on
@@ -388,21 +383,21 @@ void free_pgd_range(struct mmu_gather *tlb,
 * The vma_end differs from the pg_end when a dup_mmap() failed and the tree has
 * unrelated data to the mm_struct being torn down.
 */
-void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *vma, unsigned long pg_start,
-		unsigned long pg_end, unsigned long vma_end,
-		bool mm_wr_locked)
+void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *unmap)
 {
 	struct unlink_vma_file_batch vb;
+	struct ma_state *mas = unmap->mas;
+	struct vm_area_struct *vma = unmap->first;
 
 	/*
 	 * Note: USER_PGTABLES_CEILING may be passed as the value of pg_end and
-	 * may be 0. Underflow is expected in this case. Otherwise the
-	 * pagetable end is exclusive.
-	 * vma_end is exclusive.
-	 * The last vma address should never be larger than the pagetable end.
+	 * may be 0. The underflow here is fine and expected.
+	 * The vma_end is exclusive, which is fine until we use the mas_ instead
+	 * of the vma iterators.
+	 * For freeing the page tables to make sense, the vma_end must be larger
+	 * than the pg_end, so check that after the potential underflow.
 	 */
-	WARN_ON_ONCE(vma_end - 1 > pg_end - 1);
+	WARN_ON_ONCE(unmap->vma_end - 1 > unmap->pg_end - 1);
 
 	tlb_free_vmas(tlb);
 
@@ -410,13 +405,13 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		unsigned long addr = vma->vm_start;
 		struct vm_area_struct *next;
 
-		next = mas_find(mas, vma_end - 1);
+		next = mas_find(mas, unmap->tree_end - 1);
 
 		/*
 		 * Hide vma from rmap and truncate_pagecache before freeing
 		 * pgtables
 		 */
-		if (mm_wr_locked)
+		if (unmap->mm_wr_locked)
 			vma_start_write(vma);
 		unlink_anon_vmas(vma);
 
@@ -428,16 +423,16 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		 */
 		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
 			vma = next;
-			next = mas_find(mas, vma_end - 1);
-			if (mm_wr_locked)
+			next = mas_find(mas, unmap->tree_end - 1);
+			if (unmap->mm_wr_locked)
 				vma_start_write(vma);
 			unlink_anon_vmas(vma);
 			unlink_file_vma_batch_add(&vb, vma);
 		}
 		unlink_file_vma_batch_final(&vb);
 
-		free_pgd_range(tlb, addr, vma->vm_end,
-			       pg_start, next ? next->vm_start : pg_end);
+		free_pgd_range(tlb, addr, vma->vm_end, unmap->pg_start,
+				next ? next->vm_start : unmap->pg_end);
 		vma = next;
 	} while (vma);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 042b6b4b6ab86..8771b276d63db 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1307,10 +1307,10 @@ void exit_mmap(struct mm_struct *mm)
 	 */
 	mm_flags_set(MMF_OOM_SKIP, mm);
 	mmap_write_lock(mm);
+	unmap.mm_wr_locked = true;
 	mt_clear_in_rcu(&mm->mm_mt);
-	vma_iter_set(&vmi, vma->vm_end);
-	free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
-		      USER_PGTABLES_CEILING, USER_PGTABLES_CEILING, true);
+	vma_iter_set(&vmi, unmap.tree_reset);
+	free_pgtables(&tlb, &unmap);
 	tlb_finish_mmu(&tlb);
 
 	/*
diff --git a/mm/vma.c b/mm/vma.c
index 876d2db5329dd..f352d5c722126 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -475,15 +475,13 @@ void remove_vma(struct vm_area_struct *vma)
 void unmap_region(struct unmap_desc *unmap)
 {
 	struct mm_struct *mm = unmap->first->vm_mm;
-	struct ma_state *mas = unmap->mas;
 	struct mmu_gather tlb;
 
 	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, unmap);
-	mas_set(mas, unmap->tree_reset);
-	free_pgtables(&tlb, mas, unmap->first, unmap->pg_start, unmap->pg_end,
-			unmap->tree_end, unmap->mm_wr_locked);
+	mas_set(unmap->mas, unmap->tree_reset);
+	free_pgtables(&tlb, unmap);
 	tlb_finish_mmu(&tlb);
 }
 
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 0b4918aac8d6d..ca4eb563b29ba 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1137,11 +1137,10 @@ static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
 {
 }
 
-static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *vma, unsigned long floor,
-		unsigned long ceiling, unsigned long tree_max,
-		bool mm_wr_locked)
+static inline void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc)
 {
+	(void)tlb;
+	(void)desc;
 }
 
 static inline void mapping_unmap_writable(struct address_space *mapping)
-- 
2.47.3
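The final hunk shows the test-stub pattern used by tools/testing/vma: the
unit tests only need the symbol to exist, so the stub consumes its arguments
with (void) casts to keep unused-parameter warnings quiet. A standalone
model of that pattern (types reduced to the minimum, names illustrative):

```c
#include <stdio.h>

struct mmu_gather { int dummy; };
struct unmap_desc { unsigned long vma_start, vma_end; };

/* Test stub: page tables are not really freed in the tests. */
static inline void free_pgtables(struct mmu_gather *tlb,
				 struct unmap_desc *desc)
{
	(void)tlb;
	(void)desc;
}

int main(void)
{
	struct mmu_gather tlb = { 0 };
	struct unmap_desc desc = { .vma_start = 0, .vma_end = ~0UL };

	free_pgtables(&tlb, &desc);	/* links and runs; does nothing */
	puts("stub ok");
	return 0;
}
```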