From: Alexander Gordeev Since commit b9ef323ea168 ("powerpc/64s: Disable preemption in hash lazy mmu mode") a task can not be preempted while in lazy MMU mode. Therefore, the batch re-activation code is never called, so remove it. Signed-off-by: Alexander Gordeev Signed-off-by: Kevin Brodsky --- arch/powerpc/include/asm/thread_info.h | 2 -- arch/powerpc/kernel/process.c | 25 ------------------------- 2 files changed, 27 deletions(-) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index b0f200aba2b3..97f35f9b1a96 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -154,12 +154,10 @@ void arch_setup_new_exec(void); /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */ #define TLF_NAPPING 0 /* idle thread enabled NAP mode */ #define TLF_SLEEPING 1 /* suspend code enabled SLEEP mode */ -#define TLF_LAZY_MMU 3 /* tlb_batch is active */ #define TLF_RUNLATCH 4 /* Is the runlatch enabled? */ #define _TLF_NAPPING (1 << TLF_NAPPING) #define _TLF_SLEEPING (1 << TLF_SLEEPING) -#define _TLF_LAZY_MMU (1 << TLF_LAZY_MMU) #define _TLF_RUNLATCH (1 << TLF_RUNLATCH) #ifndef __ASSEMBLER__ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index eb23966ac0a9..9237dcbeee4a 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1281,9 +1281,6 @@ struct task_struct *__switch_to(struct task_struct *prev, { struct thread_struct *new_thread, *old_thread; struct task_struct *last; -#ifdef CONFIG_PPC_64S_HASH_MMU - struct ppc64_tlb_batch *batch; -#endif new_thread = &new->thread; old_thread = ¤t->thread; @@ -1291,14 +1288,6 @@ struct task_struct *__switch_to(struct task_struct *prev, WARN_ON(!irqs_disabled()); #ifdef CONFIG_PPC_64S_HASH_MMU - batch = this_cpu_ptr(&ppc64_tlb_batch); - if (batch->active) { - current_thread_info()->local_flags |= _TLF_LAZY_MMU; - if (batch->index) - __flush_tlb_pending(batch); - batch->active = 0; - } - /* * On POWER9 the copy-paste buffer can only paste into * foreign real addresses, so unprivileged processes can not @@ -1369,20 +1358,6 @@ struct task_struct *__switch_to(struct task_struct *prev, */ #ifdef CONFIG_PPC_BOOK3S_64 -#ifdef CONFIG_PPC_64S_HASH_MMU - /* - * This applies to a process that was context switched while inside - * arch_enter_lazy_mmu_mode(), to re-activate the batch that was - * deactivated above, before _switch(). This will never be the case - * for new tasks. - */ - if (current_thread_info()->local_flags & _TLF_LAZY_MMU) { - current_thread_info()->local_flags &= ~_TLF_LAZY_MMU; - batch = this_cpu_ptr(&ppc64_tlb_batch); - batch->active = 1; - } -#endif - /* * Math facilities are masked out of the child MSR in copy_thread. * A new task does not need to restore_math because it will -- 2.47.0 arch_flush_lazy_mmu_mode() is called when outstanding batched pgtable operations must be completed immediately. There should however be no need to leave and re-enter lazy MMU completely. The only part of that sequence that we really need is xen_mc_flush(); call it directly. Signed-off-by: Kevin Brodsky --- arch/x86/xen/mmu_pv.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2a4a8deaf612..7a35c3393df4 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2139,10 +2139,8 @@ static void xen_flush_lazy_mmu(void) { preempt_disable(); - if (xen_get_lazy_mode() == XEN_LAZY_MMU) { - arch_leave_lazy_mmu_mode(); - arch_enter_lazy_mmu_mode(); - } + if (xen_get_lazy_mode() == XEN_LAZY_MMU) + xen_mc_flush(); preempt_enable(); } -- 2.47.0 Upcoming changes to the lazy_mmu API will cause arch_flush_lazy_mmu_mode() to be called when leaving a nested lazy_mmu section. Move the relevant logic from arch_leave_lazy_mmu_mode() to arch_flush_lazy_mmu_mode() and have the former call the latter. Note: the additional this_cpu_ptr() on the arch_leave_lazy_mmu_mode() path will be removed in a subsequent patch. Signed-off-by: Kevin Brodsky --- .../powerpc/include/asm/book3s/64/tlbflush-hash.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 146287d9580f..7704dbe8e88d 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -41,6 +41,16 @@ static inline void arch_enter_lazy_mmu_mode(void) batch->active = 1; } +static inline void arch_flush_lazy_mmu_mode(void) +{ + struct ppc64_tlb_batch *batch; + + batch = this_cpu_ptr(&ppc64_tlb_batch); + + if (batch->index) + __flush_tlb_pending(batch); +} + static inline void arch_leave_lazy_mmu_mode(void) { struct ppc64_tlb_batch *batch; @@ -49,14 +59,11 @@ static inline void arch_leave_lazy_mmu_mode(void) return; batch = this_cpu_ptr(&ppc64_tlb_batch); - if (batch->index) - __flush_tlb_pending(batch); + arch_flush_lazy_mmu_mode(); batch->active = 0; preempt_enable(); } -#define arch_flush_lazy_mmu_mode() do {} while (0) - extern void hash__tlbiel_all(unsigned int action); extern void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, -- 2.47.0 Upcoming changes to the lazy_mmu API will cause arch_flush_lazy_mmu_mode() to be called when leaving a nested lazy_mmu section. Move the relevant logic from arch_leave_lazy_mmu_mode() to arch_flush_lazy_mmu_mode() and have the former call the latter. Note: the additional this_cpu_ptr() on the arch_leave_lazy_mmu_mode() path will be removed in a subsequent patch. Signed-off-by: Kevin Brodsky --- arch/sparc/include/asm/tlbflush_64.h | 2 +- arch/sparc/mm/tlb.c | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h index 8b8cdaa69272..925bb5d7a4e1 100644 --- a/arch/sparc/include/asm/tlbflush_64.h +++ b/arch/sparc/include/asm/tlbflush_64.h @@ -43,8 +43,8 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end); void flush_tlb_pending(void); void arch_enter_lazy_mmu_mode(void); +void arch_flush_lazy_mmu_mode(void); void arch_leave_lazy_mmu_mode(void); -#define arch_flush_lazy_mmu_mode() do {} while (0) /* Local cpu only. */ void __flush_tlb_all(void); diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index a35ddcca5e76..7b5dfcdb1243 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -59,12 +59,19 @@ void arch_enter_lazy_mmu_mode(void) tb->active = 1; } -void arch_leave_lazy_mmu_mode(void) +void arch_flush_lazy_mmu_mode(void) { struct tlb_batch *tb = this_cpu_ptr(&tlb_batch); if (tb->tlb_nr) flush_tlb_pending(); +} + +void arch_leave_lazy_mmu_mode(void) +{ + struct tlb_batch *tb = this_cpu_ptr(&tlb_batch); + + arch_flush_lazy_mmu_mode(); tb->active = 0; preempt_enable(); } -- 2.47.0 Architectures currently opt in for implementing lazy_mmu helpers by defining __HAVE_ARCH_ENTER_LAZY_MMU_MODE. In preparation for introducing a generic lazy_mmu layer that will require storage in task_struct, let's switch to a cleaner approach: instead of defining a macro, select a CONFIG option. This patch introduces CONFIG_ARCH_HAS_LAZY_MMU_MODE and has each arch select it when it implements lazy_mmu helpers. __HAVE_ARCH_ENTER_LAZY_MMU_MODE is removed and relies on the new CONFIG instead. On x86, lazy_mmu helpers are only implemented if PARAVIRT_XXL is selected. This creates some complications in arch/x86/boot/, because a few files manually undefine PARAVIRT* options. As a result does not define the lazy_mmu helpers, but this breaks the build as only defines them if !CONFIG_ARCH_HAS_LAZY_MMU_MODE. There does not seem to be a clean way out of this - let's just undefine that new CONFIG too. Signed-off-by: Kevin Brodsky --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/pgtable.h | 1 - arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 2 -- arch/powerpc/platforms/Kconfig.cputype | 1 + arch/sparc/Kconfig | 1 + arch/sparc/include/asm/tlbflush_64.h | 2 -- arch/x86/Kconfig | 1 + arch/x86/boot/compressed/misc.h | 1 + arch/x86/boot/startup/sme.c | 1 + arch/x86/include/asm/paravirt.h | 1 - include/linux/pgtable.h | 2 +- mm/Kconfig | 3 +++ 12 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6663ffd23f25..e6bf5c7311b5 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -122,6 +122,7 @@ config ARM64 select ARCH_WANTS_NO_INSTR select ARCH_WANTS_THP_SWAP if ARM64_4K_PAGES select ARCH_HAS_UBSAN + select ARCH_HAS_LAZY_MMU_MODE select ARM_AMBA select ARM_ARCH_TIMER select ARM_GIC diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0944e296dd4a..54f8d6bb6f22 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -80,7 +80,6 @@ static inline void queue_pte_barriers(void) } } -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { /* diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 7704dbe8e88d..623a8a8b2d0e 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -24,8 +24,6 @@ DECLARE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch); -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE - static inline void arch_enter_lazy_mmu_mode(void) { struct ppc64_tlb_batch *batch; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 7b527d18aa5e..2942d57cf59c 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -93,6 +93,7 @@ config PPC_BOOK3S_64 select IRQ_WORK select PPC_64S_HASH_MMU if !PPC_RADIX_MMU select KASAN_VMALLOC if KASAN + select ARCH_HAS_LAZY_MMU_MODE config PPC_BOOK3E_64 bool "Embedded processors" diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index a630d373e645..2bad14744ca4 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -112,6 +112,7 @@ config SPARC64 select NEED_PER_CPU_PAGE_FIRST_CHUNK select ARCH_SUPPORTS_SCHED_SMT if SMP select ARCH_SUPPORTS_SCHED_MC if SMP + select ARCH_HAS_LAZY_MMU_MODE config ARCH_PROC_KCORE_TEXT def_bool y diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h index 925bb5d7a4e1..4e1036728e2f 100644 --- a/arch/sparc/include/asm/tlbflush_64.h +++ b/arch/sparc/include/asm/tlbflush_64.h @@ -39,8 +39,6 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, void flush_tlb_kernel_range(unsigned long start, unsigned long end); -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE - void flush_tlb_pending(void); void arch_enter_lazy_mmu_mode(void); void arch_flush_lazy_mmu_mode(void); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fa3b616af03a..ef4332d720ab 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -804,6 +804,7 @@ config PARAVIRT config PARAVIRT_XXL bool depends on X86_64 + select ARCH_HAS_LAZY_MMU_MODE config PARAVIRT_DEBUG bool "paravirt-ops debugging" diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index db1048621ea2..cdd7f692d9ee 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -11,6 +11,7 @@ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS +#undef CONFIG_ARCH_HAS_LAZY_MMU_MODE #undef CONFIG_KASAN #undef CONFIG_KASAN_GENERIC diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index e7ea65f3f1d6..b76a7c95dfe1 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -24,6 +24,7 @@ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS +#undef CONFIG_ARCH_HAS_LAZY_MMU_MODE /* * This code runs before CPU feature bits are set. By default, the diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index b5e59a7ba0d0..13f9cd31c8f8 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -526,7 +526,6 @@ static inline void arch_end_context_switch(struct task_struct *next) PVOP_VCALL1(cpu.end_context_switch, next); } -#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.enter); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 32e8457ad535..9894366e768b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -231,7 +231,7 @@ static inline int pmd_dirty(pmd_t pmd) * held, but for kernel PTE updates, no lock is held). Nesting is not permitted * and the mode cannot be used in interrupt context. */ -#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE +#ifndef CONFIG_ARCH_HAS_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) {} static inline void arch_leave_lazy_mmu_mode(void) {} static inline void arch_flush_lazy_mmu_mode(void) {} diff --git a/mm/Kconfig b/mm/Kconfig index 0e26f4fc8717..5480c9a1bfb2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1372,6 +1372,9 @@ config PT_RECLAIM config FIND_NORMAL_PAGE def_bool n +config ARCH_HAS_LAZY_MMU_MODE + bool + source "mm/damon/Kconfig" endmenu -- 2.47.0 The implementation of the lazy MMU mode is currently entirely arch-specific; core code directly calls arch helpers: arch_{enter,leave}_lazy_mmu_mode(). We are about to introduce support for nested lazy MMU sections. As things stand we'd have to duplicate that logic in every arch implementing lazy_mmu - adding to a fair amount of logic already duplicated across lazy_mmu implementations. This patch therefore introduces a new generic layer that calls the existing arch_* helpers. Two pair of calls are introduced: * lazy_mmu_mode_enable() ... lazy_mmu_mode_disable() This is the standard case where the mode is enabled for a given block of code by surrounding it with enable() and disable() calls. * lazy_mmu_mode_pause() ... lazy_mmu_mode_resume() This is for situations where the mode is temporarily disabled by first calling pause() and then resume() (e.g. to prevent any batching from occurring in a critical section). The documentation in will be updated in a subsequent patch. No functional change should be introduced at this stage. The implementation of enable()/resume() and disable()/pause() is currently identical, but nesting support will change that. Most of the call sites have been updated using the following Coccinelle script: @@ @@ { ... - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); ... - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); ... } @@ @@ { ... - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); ... - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); ... } A couple of notes regarding x86: * Xen is currently the only case where explicit handling is required for lazy MMU when context-switching. This is purely an implementation detail and using the generic lazy_mmu_mode_* functions would cause trouble when nesting support is introduced, because the generic functions must be called from the current task. For that reason we still use arch_leave() and arch_enter() there. * x86 calls arch_flush_lazy_mmu_mode() unconditionally in a few places, but only defines it if PARAVIRT_XXL is selected, and we are removing the fallback in . Add a new fallback definition to to keep things building. Signed-off-by: Kevin Brodsky --- arch/arm64/mm/mmu.c | 4 ++-- arch/arm64/mm/pageattr.c | 4 ++-- arch/powerpc/mm/book3s64/hash_tlb.c | 8 +++---- arch/powerpc/mm/book3s64/subpage_prot.c | 4 ++-- arch/x86/include/asm/pgtable.h | 3 ++- fs/proc/task_mmu.c | 4 ++-- include/linux/pgtable.h | 29 +++++++++++++++++++++---- mm/kasan/shadow.c | 8 +++---- mm/madvise.c | 18 +++++++-------- mm/memory.c | 16 +++++++------- mm/migrate_device.c | 4 ++-- mm/mprotect.c | 4 ++-- mm/mremap.c | 4 ++-- mm/userfaultfd.c | 4 ++-- mm/vmalloc.c | 12 +++++----- mm/vmscan.c | 12 +++++----- 16 files changed, 80 insertions(+), 58 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b8d37eb037fc..d9c8e94f140f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -731,7 +731,7 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) return -EINVAL; mutex_lock(&pgtable_split_lock); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); /* * The split_kernel_leaf_mapping_locked() may sleep, it is not a @@ -753,7 +753,7 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) ret = split_kernel_leaf_mapping_locked(end); } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); mutex_unlock(&pgtable_split_lock); return ret; } diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 5135f2d66958..e4059f13c4ed 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -110,7 +110,7 @@ static int update_range_prot(unsigned long start, unsigned long size, if (WARN_ON_ONCE(ret)) return ret; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); /* * The caller must ensure that the range we are operating on does not @@ -119,7 +119,7 @@ static int update_range_prot(unsigned long start, unsigned long size, */ ret = walk_kernel_page_table_range_lockless(start, start + size, &pageattr_ops, NULL, &data); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); return ret; } diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index 21fcad97ae80..787f7a0e27f0 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -205,7 +205,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end) * way to do things but is fine for our needs here. */ local_irq_save(flags); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; start < end; start += PAGE_SIZE) { pte_t *ptep = find_init_mm_pte(start, &hugepage_shift); unsigned long pte; @@ -217,7 +217,7 @@ void __flush_hash_table_range(unsigned long start, unsigned long end) continue; hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift); } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); local_irq_restore(flags); } @@ -237,7 +237,7 @@ void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long * way to do things but is fine for our needs here. */ local_irq_save(flags); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); start_pte = pte_offset_map(pmd, addr); if (!start_pte) goto out; @@ -249,6 +249,6 @@ void flush_hash_table_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long } pte_unmap(start_pte); out: - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); local_irq_restore(flags); } diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index ec98e526167e..07c47673bba2 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -73,13 +73,13 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; npages > 0; --npages) { pte_update(mm, addr, pte, 0, 0, 0); addr += PAGE_SIZE; ++pte; } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte - 1, ptl); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e33df3da6980..14fd672bc9b2 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -117,7 +117,8 @@ extern pmdval_t early_pmd_flags; #define pte_val(x) native_pte_val(x) #define __pte(x) native_make_pte(x) -#define arch_end_context_switch(prev) do {} while(0) +#define arch_end_context_switch(prev) do {} while (0) +#define arch_flush_lazy_mmu_mode() do {} while (0) #endif /* CONFIG_PARAVIRT_XXL */ static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fc35a0543f01..d16ba1d32169 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2703,7 +2703,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, return 0; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { /* Fast path for performing exclusive WP */ @@ -2773,7 +2773,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, if (flush_end) flush_tlb_range(vma, start, addr); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); cond_resched(); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 9894366e768b..b5fdf32c437f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -231,10 +231,31 @@ static inline int pmd_dirty(pmd_t pmd) * held, but for kernel PTE updates, no lock is held). Nesting is not permitted * and the mode cannot be used in interrupt context. */ -#ifndef CONFIG_ARCH_HAS_LAZY_MMU_MODE -static inline void arch_enter_lazy_mmu_mode(void) {} -static inline void arch_leave_lazy_mmu_mode(void) {} -static inline void arch_flush_lazy_mmu_mode(void) {} +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE +static inline void lazy_mmu_mode_enable(void) +{ + arch_enter_lazy_mmu_mode(); +} + +static inline void lazy_mmu_mode_disable(void) +{ + arch_leave_lazy_mmu_mode(); +} + +static inline void lazy_mmu_mode_pause(void) +{ + arch_leave_lazy_mmu_mode(); +} + +static inline void lazy_mmu_mode_resume(void) +{ + arch_enter_lazy_mmu_mode(); +} +#else +static inline void lazy_mmu_mode_enable(void) {} +static inline void lazy_mmu_mode_disable(void) {} +static inline void lazy_mmu_mode_pause(void) {} +static inline void lazy_mmu_mode_resume(void) {} #endif #ifndef pte_batch_hint diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 5d2a876035d6..c49b029d3593 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -305,7 +305,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int index; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); index = PFN_DOWN(addr - data->start); page = data->pages[index]; @@ -319,7 +319,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, } spin_unlock(&init_mm.page_table_lock); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); return 0; } @@ -482,7 +482,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int none; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); spin_lock(&init_mm.page_table_lock); pte = ptep_get(ptep); @@ -494,7 +494,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, if (likely(!none)) __free_page(pfn_to_page(pte_pfn(pte))); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); return 0; } diff --git a/mm/madvise.c b/mm/madvise.c index fb1c86e630b6..536026772160 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -455,7 +455,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, if (!start_pte) return 0; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { nr = 1; ptent = ptep_get(pte); @@ -463,7 +463,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; @@ -499,7 +499,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, if (!folio_trylock(folio)) continue; folio_get(folio); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); @@ -510,7 +510,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, if (!start_pte) break; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (!err) nr = 0; continue; @@ -558,7 +558,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, } if (start_pte) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); } if (pageout) @@ -677,7 +677,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!start_pte) return 0; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { nr = 1; ptent = ptep_get(pte); @@ -727,7 +727,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!folio_trylock(folio)) continue; folio_get(folio); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); @@ -738,7 +738,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!start_pte) break; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (!err) nr = 0; continue; @@ -778,7 +778,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (nr_swap) add_mm_counter(mm, MM_SWAPENTS, nr_swap); if (start_pte) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); } cond_resched(); diff --git a/mm/memory.c b/mm/memory.c index 74b45e258323..2d662dee5ae7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1254,7 +1254,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { nr = 1; @@ -1323,7 +1323,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(orig_src_pte, src_ptl); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); @@ -1842,7 +1842,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, return addr; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { bool any_skipped = false; @@ -1874,7 +1874,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); add_mm_rss_vec(mm, rss); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); /* Do the actual TLB flush before dropping ptl */ if (force_flush) { @@ -2817,7 +2817,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { @@ -2827,7 +2827,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(mapped_pte, ptl); return err; } @@ -3134,7 +3134,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, return -EINVAL; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (fn) { do { @@ -3147,7 +3147,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, } *mask |= PGTBL_PTE_MODIFIED; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (mm != &init_mm) pte_unmap_unlock(mapped_pte, ptl); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index abd9f6850db6..dcdc46b96cc7 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -110,7 +110,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); if (!ptep) goto again; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; addr < end; addr += PAGE_SIZE, ptep++) { struct dev_pagemap *pgmap; @@ -287,7 +287,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, if (unmapped) flush_tlb_range(walk->vma, start, end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(ptep - 1, ptl); return 0; diff --git a/mm/mprotect.c b/mm/mprotect.c index 113b48985834..bcb183a6fd2f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -293,7 +293,7 @@ static long change_pte_range(struct mmu_gather *tlb, target_node = numa_node_id(); flush_tlb_batched_pending(vma->vm_mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { nr_ptes = 1; oldpte = ptep_get(pte); @@ -439,7 +439,7 @@ static long change_pte_range(struct mmu_gather *tlb, } } } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte - 1, ptl); return pages; diff --git a/mm/mremap.c b/mm/mremap.c index bd7314898ec5..a2e2cd8f279a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -256,7 +256,7 @@ static int move_ptes(struct pagetable_move_control *pmc, if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); flush_tlb_batched_pending(vma->vm_mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE, new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) { @@ -301,7 +301,7 @@ static int move_ptes(struct pagetable_move_control *pmc, } } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (force_flush) flush_tlb_range(vma, old_end - len, old_end); if (new_ptl != old_ptl) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index af61b95c89e4..e01f7813e15c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1100,7 +1100,7 @@ static long move_present_ptes(struct mm_struct *mm, /* It's safe to drop the reference now as the page-table is holding one. */ folio_put(*first_src_folio); *first_src_folio = NULL; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); while (true) { orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); @@ -1138,7 +1138,7 @@ static long move_present_ptes(struct mm_struct *mm, break; } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (src_addr > src_start) flush_tlb_range(src_vma, src_start, src_addr); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 798b2ed21e46..b9940590a40d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -105,7 +105,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { if (unlikely(!pte_none(ptep_get(pte)))) { @@ -131,7 +131,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -359,7 +359,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long size = PAGE_SIZE; pte = pte_offset_kernel(pmd, addr); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { #ifdef CONFIG_HUGETLB_PAGE @@ -378,7 +378,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; } @@ -526,7 +526,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { struct page *page = pages[*nr]; @@ -548,7 +548,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; return err; diff --git a/mm/vmscan.c b/mm/vmscan.c index b2fc8b626d3d..7d2d87069530 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3551,7 +3551,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, return false; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; @@ -3592,7 +3592,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte, ptl); return suitable_to_scan(total, young); @@ -3633,7 +3633,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area if (!spin_trylock(ptl)) goto done; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { unsigned long pfn; @@ -3680,7 +3680,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area walk_update_folio(walk, last, gen, dirty); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); spin_unlock(ptl); done: *first = -1; @@ -4279,7 +4279,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); pte -= (addr - start) / PAGE_SIZE; @@ -4313,7 +4313,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) walk_update_folio(walk, last, gen, dirty); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); /* feedback from rmap walkers to page table walkers */ if (mm_state && suitable_to_scan(i, young)) -- 2.47.0 Despite recent efforts to prevent lazy_mmu sections from nesting, it remains difficult to ensure that it never occurs - and in fact it does occur on arm64 in certain situations (CONFIG_DEBUG_PAGEALLOC). Commit 1ef3095b1405 ("arm64/mm: Permit lazy_mmu_mode to be nested") made nesting tolerable on arm64, but without truly supporting it: the inner call to leave() disables the batching optimisation before the outer section ends. This patch actually enables lazy_mmu sections to nest by tracking the nesting level in task_struct, in a similar fashion to e.g. pagefault_{enable,disable}(). This is fully handled by the generic lazy_mmu helpers that were recently introduced. lazy_mmu sections were not initially intended to nest, so we need to clarify the semantics w.r.t. the arch_*_lazy_mmu_mode() callbacks. This patch takes the following approach: * The outermost calls to lazy_mmu_mode_{enable,disable}() trigger calls to arch_{enter,leave}_lazy_mmu_mode() - this is unchanged. * Nested calls to lazy_mmu_mode_{enable,disable}() are not forwarded to the arch via arch_{enter,leave} - lazy MMU remains enabled so the assumption is that these callbacks are not relevant. However, existing code may rely on a call to disable() to flush any batched state, regardless of nesting. arch_flush_lazy_mmu_mode() is therefore called in that situation. A separate interface was recently introduced to temporarily pause the lazy MMU mode: lazy_mmu_mode_{pause,resume}(). pause() fully exits the mode *regardless of the nesting level*, and resume() restores the mode at the same nesting level. Whether the mode is actually enabled or not at any point is tracked by a separate "active" field in task_struct; this makes it possible to check invariants in the generic API, and to expose a new in_lazy_mmu_mode() helper to replace the various ways arch's currently track whether the mode is enabled (this will be done in later patches). In summary (nesting/active represent the values *after* the call): lazy_mmu_mode_enable() -> arch_enter() nesting=1 active=1 lazy_mmu_mode_enable() -> ΓΈ nesting=2 active=1 lazy_mmu_mode_pause() -> arch_leave() nesting=2 active=0 lazy_mmu_mode_resume() -> arch_enter() nesting=2 active=1 lazy_mmu_mode_disable() -> arch_flush() nesting=1 active=1 lazy_mmu_mode_disable() -> arch_leave() nesting=0 active=0 Note: in_lazy_mmu_mode() is added to to allow arch headers included by to use it. Signed-off-by: Kevin Brodsky --- arch/arm64/include/asm/pgtable.h | 12 ------ include/linux/mm_types_task.h | 5 +++ include/linux/pgtable.h | 67 ++++++++++++++++++++++++++++++-- include/linux/sched.h | 16 ++++++++ 4 files changed, 84 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 54f8d6bb6f22..535435248923 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -82,18 +82,6 @@ static inline void queue_pte_barriers(void) static inline void arch_enter_lazy_mmu_mode(void) { - /* - * lazy_mmu_mode is not supposed to permit nesting. But in practice this - * does happen with CONFIG_DEBUG_PAGEALLOC, where a page allocation - * inside a lazy_mmu_mode section (such as zap_pte_range()) will change - * permissions on the linear map with apply_to_page_range(), which - * re-enters lazy_mmu_mode. So we tolerate nesting in our - * implementation. The first call to arch_leave_lazy_mmu_mode() will - * flush and clear the flag such that the remainder of the work in the - * outer nest behaves as if outside of lazy mmu mode. This is safe and - * keeps tracking simple. - */ - if (in_interrupt()) return; diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index a82aa80c0ba4..632d404f8191 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -88,4 +88,9 @@ struct tlbflush_unmap_batch { #endif }; +struct lazy_mmu_state { + u8 nesting_level; + bool active; +}; + #endif /* _LINUX_MM_TYPES_TASK_H */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index b5fdf32c437f..e6064e00b22d 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -228,27 +228,86 @@ static inline int pmd_dirty(pmd_t pmd) * of the lazy mode. So the implementation must assume preemption may be enabled * and cpu migration is possible; it must take steps to be robust against this. * (In practice, for user PTE updates, the appropriate page table lock(s) are - * held, but for kernel PTE updates, no lock is held). Nesting is not permitted - * and the mode cannot be used in interrupt context. + * held, but for kernel PTE updates, no lock is held). The mode cannot be used + * in interrupt context. + * + * The lazy MMU mode is enabled for a given block of code using: + * + * lazy_mmu_mode_enable(); + * + * lazy_mmu_mode_disable(); + * + * Nesting is permitted: may itself use an enable()/disable() pair. + * A nested call to enable() has no functional effect; however disable() causes + * any batched architectural state to be flushed regardless of nesting. After a + * call to disable(), the caller can therefore rely on all previous page table + * modifications to have taken effect, but the lazy MMU mode may still be + * enabled. + * + * In certain cases, it may be desirable to temporarily pause the lazy MMU mode. + * This can be done using: + * + * lazy_mmu_mode_pause(); + * + * lazy_mmu_mode_resume(); + * + * This sequence must only be used if the lazy MMU mode is already enabled. + * pause() ensures that the mode is exited regardless of the nesting level; + * resume() re-enters the mode at the same nesting level. must not modify + * the lazy MMU state (i.e. it must not call any of the lazy_mmu_mode_* + * helpers). + * + * in_lazy_mmu_mode() can be used to check whether the lazy MMU mode is + * currently enabled. */ #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE static inline void lazy_mmu_mode_enable(void) { - arch_enter_lazy_mmu_mode(); + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + + VM_WARN_ON_ONCE(state->nesting_level == U8_MAX); + /* enable() must not be called while paused */ + VM_WARN_ON(state->nesting_level > 0 && !state->active); + + if (state->nesting_level++ == 0) { + state->active = true; + arch_enter_lazy_mmu_mode(); + } } static inline void lazy_mmu_mode_disable(void) { - arch_leave_lazy_mmu_mode(); + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + + VM_WARN_ON_ONCE(state->nesting_level == 0); + VM_WARN_ON(!state->active); + + if (--state->nesting_level == 0) { + state->active = false; + arch_leave_lazy_mmu_mode(); + } else { + /* Exiting a nested section */ + arch_flush_lazy_mmu_mode(); + } } static inline void lazy_mmu_mode_pause(void) { + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + + VM_WARN_ON(state->nesting_level == 0 || !state->active); + + state->active = false; arch_leave_lazy_mmu_mode(); } static inline void lazy_mmu_mode_resume(void) { + struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + + VM_WARN_ON(state->nesting_level == 0 || state->active); + + state->active = true; arch_enter_lazy_mmu_mode(); } #else diff --git a/include/linux/sched.h b/include/linux/sched.h index cbb7340c5866..11566d973f42 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1441,6 +1441,10 @@ struct task_struct { struct page_frag task_frag; +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE + struct lazy_mmu_state lazy_mmu_state; +#endif + #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif @@ -1724,6 +1728,18 @@ static inline char task_state_to_char(struct task_struct *tsk) return task_index_to_char(task_state_index(tsk)); } +#ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE +static inline bool in_lazy_mmu_mode(void) +{ + return current->lazy_mmu_state.active; +} +#else +static inline bool in_lazy_mmu_mode(void) +{ + return false; +} +#endif + extern struct pid *cad_pid; /* -- 2.47.0 The generic lazy_mmu layer now tracks whether a task is in lazy MMU mode. As a result we no longer need a TIF flag for that purpose - let's use the new in_lazy_mmu_mode() helper instead. Signed-off-by: Kevin Brodsky --- arch/arm64/include/asm/pgtable.h | 16 +++------------- arch/arm64/include/asm/thread_info.h | 3 +-- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 535435248923..61ca88f94551 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -62,30 +62,21 @@ static inline void emit_pte_barriers(void) static inline void queue_pte_barriers(void) { - unsigned long flags; - if (in_interrupt()) { emit_pte_barriers(); return; } - flags = read_thread_flags(); - - if (flags & BIT(TIF_LAZY_MMU)) { - /* Avoid the atomic op if already set. */ - if (!(flags & BIT(TIF_LAZY_MMU_PENDING))) - set_thread_flag(TIF_LAZY_MMU_PENDING); - } else { + if (in_lazy_mmu_mode()) + test_and_set_thread_flag(TIF_LAZY_MMU_PENDING); + else emit_pte_barriers(); - } } static inline void arch_enter_lazy_mmu_mode(void) { if (in_interrupt()) return; - - set_thread_flag(TIF_LAZY_MMU); } static inline void arch_flush_lazy_mmu_mode(void) @@ -103,7 +94,6 @@ static inline void arch_leave_lazy_mmu_mode(void) return; arch_flush_lazy_mmu_mode(); - clear_thread_flag(TIF_LAZY_MMU); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index f241b8601ebd..4ff8da0767d9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -84,8 +84,7 @@ void arch_setup_new_exec(void); #define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */ #define TIF_KERNEL_FPSTATE 29 /* Task is in a kernel mode FPSIMD section */ #define TIF_TSC_SIGSEGV 30 /* SIGSEGV on counter-timer access */ -#define TIF_LAZY_MMU 31 /* Task in lazy mmu mode */ -#define TIF_LAZY_MMU_PENDING 32 /* Ops pending for lazy mmu mode exit */ +#define TIF_LAZY_MMU_PENDING 31 /* Ops pending for lazy mmu mode exit */ #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -- 2.47.0 A per-CPU batch struct is activated when entering lazy MMU mode; its lifetime is the same as the lazy MMU section (it is deactivated when leaving the mode). Preemption is disabled in that interval to ensure that the per-CPU reference remains valid. The generic lazy_mmu layer now tracks whether a task is in lazy MMU mode. We can therefore use the generic helper in_lazy_mmu_mode() to tell whether a batch struct is active instead of tracking it explicitly. Signed-off-by: Kevin Brodsky --- arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 9 --------- arch/powerpc/mm/book3s64/hash_tlb.c | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 623a8a8b2d0e..bbc54690d374 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -12,7 +12,6 @@ #define PPC64_TLB_BATCH_NR 192 struct ppc64_tlb_batch { - int active; unsigned long index; struct mm_struct *mm; real_pte_t pte[PPC64_TLB_BATCH_NR]; @@ -26,8 +25,6 @@ extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch); static inline void arch_enter_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch; - if (radix_enabled()) return; /* @@ -35,8 +32,6 @@ static inline void arch_enter_lazy_mmu_mode(void) * operating on kernel page tables. */ preempt_disable(); - batch = this_cpu_ptr(&ppc64_tlb_batch); - batch->active = 1; } static inline void arch_flush_lazy_mmu_mode(void) @@ -51,14 +46,10 @@ static inline void arch_flush_lazy_mmu_mode(void) static inline void arch_leave_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch; - if (radix_enabled()) return; - batch = this_cpu_ptr(&ppc64_tlb_batch); arch_flush_lazy_mmu_mode(); - batch->active = 0; preempt_enable(); } diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index 787f7a0e27f0..72b83f582b6d 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -100,7 +100,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, * Check if we have an active batch on this CPU. If not, just * flush now and return. */ - if (!batch->active) { + if (!in_lazy_mmu_mode()) { flush_hash_page(vpn, rpte, psize, ssize, mm_is_thread_local(mm)); put_cpu_var(ppc64_tlb_batch); return; -- 2.47.0 A per-CPU batch struct is activated when entering lazy MMU mode; its lifetime is the same as the lazy MMU section (it is deactivated when leaving the mode). Preemption is disabled in that interval to ensure that the per-CPU reference remains valid. The generic lazy_mmu layer now tracks whether a task is in lazy MMU mode. We can therefore use the generic helper in_lazy_mmu_mode() to tell whether a batch struct is active instead of tracking it explicitly. Signed-off-by: Kevin Brodsky --- arch/sparc/include/asm/tlbflush_64.h | 1 - arch/sparc/mm/tlb.c | 9 +-------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/sparc/include/asm/tlbflush_64.h b/arch/sparc/include/asm/tlbflush_64.h index 4e1036728e2f..6133306ba59a 100644 --- a/arch/sparc/include/asm/tlbflush_64.h +++ b/arch/sparc/include/asm/tlbflush_64.h @@ -12,7 +12,6 @@ struct tlb_batch { unsigned int hugepage_shift; struct mm_struct *mm; unsigned long tlb_nr; - unsigned long active; unsigned long vaddrs[TLB_BATCH_NR]; }; diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 7b5dfcdb1243..879e22c86e5c 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -52,11 +52,7 @@ void flush_tlb_pending(void) void arch_enter_lazy_mmu_mode(void) { - struct tlb_batch *tb; - preempt_disable(); - tb = this_cpu_ptr(&tlb_batch); - tb->active = 1; } void arch_flush_lazy_mmu_mode(void) @@ -69,10 +65,7 @@ void arch_flush_lazy_mmu_mode(void) void arch_leave_lazy_mmu_mode(void) { - struct tlb_batch *tb = this_cpu_ptr(&tlb_batch); - arch_flush_lazy_mmu_mode(); - tb->active = 0; preempt_enable(); } @@ -93,7 +86,7 @@ static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr, nr = 0; } - if (!tb->active) { + if (!in_lazy_mmu_mode()) { flush_tsb_user_page(mm, vaddr, hugepage_shift); global_flush_tlb_page(mm, vaddr); goto out; -- 2.47.0 We currently set a TIF flag when scheduling out a task that is in lazy MMU mode, in order to restore it when the task is scheduled again. The generic lazy_mmu layer now tracks whether a task is in lazy MMU mode in task_struct::lazy_mmu_state. We can therefore check that state when switching to the new task, instead of using a separate TIF flag. Signed-off-by: Kevin Brodsky --- arch/x86/include/asm/thread_info.h | 4 +--- arch/x86/xen/enlighten_pv.c | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e71e0e8362ed..0067684afb5b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -100,8 +100,7 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_SINGLESTEP 25 /* reenable singlestep on user return*/ #define TIF_BLOCKSTEP 26 /* set when we want DEBUGCTLMSR_BTF */ -#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ -#define TIF_ADDR32 28 /* 32-bit address space on 64 bits */ +#define TIF_ADDR32 27 /* 32-bit address space on 64 bits */ #define _TIF_SSBD BIT(TIF_SSBD) #define _TIF_SPEC_IB BIT(TIF_SPEC_IB) @@ -114,7 +113,6 @@ struct thread_info { #define _TIF_FORCED_TF BIT(TIF_FORCED_TF) #define _TIF_BLOCKSTEP BIT(TIF_BLOCKSTEP) #define _TIF_SINGLESTEP BIT(TIF_SINGLESTEP) -#define _TIF_LAZY_MMU_UPDATES BIT(TIF_LAZY_MMU_UPDATES) #define _TIF_ADDR32 BIT(TIF_ADDR32) /* flags to check in __switch_to() */ diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 4806cc28d7ca..f40f5999352e 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -426,7 +426,6 @@ static void xen_start_context_switch(struct task_struct *prev) if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) { arch_leave_lazy_mmu_mode(); - set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); } enter_lazy(XEN_LAZY_CPU); } @@ -437,7 +436,7 @@ static void xen_end_context_switch(struct task_struct *next) xen_mc_flush(); leave_lazy(XEN_LAZY_CPU); - if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) + if (next->lazy_mmu_state.active) arch_enter_lazy_mmu_mode(); } -- 2.47.0 The lazy MMU mode cannot be used in interrupt context. This is documented in , but isn't consistently handled across architectures. arm64 ensures that calls to lazy_mmu_mode_* have no effect in interrupt context, because such calls do occur in certain configurations - see commit b81c688426a9 ("arm64/mm: Disable barrier batching in interrupt contexts"). Other architectures do not check this situation, most likely because it hasn't occurred so far. Both arm64 and x86/Xen also ensure that any lazy MMU optimisation is disabled while in interrupt mode (see queue_pte_barriers() and xen_get_lazy_mode() respectively). Let's handle this in the new generic lazy_mmu layer, in the same fashion as arm64: bail out of lazy_mmu_mode_* if in_interrupt(), and have in_lazy_mmu_mode() return false to disable any optimisation. Also remove the arm64 handling that is now redundant; x86/Xen has its own internal tracking so it is left unchanged. Signed-off-by: Kevin Brodsky --- arch/arm64/include/asm/pgtable.h | 17 +---------------- include/linux/pgtable.h | 16 ++++++++++++++-- include/linux/sched.h | 3 +++ 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 61ca88f94551..96987a49e83b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -62,37 +62,22 @@ static inline void emit_pte_barriers(void) static inline void queue_pte_barriers(void) { - if (in_interrupt()) { - emit_pte_barriers(); - return; - } - if (in_lazy_mmu_mode()) test_and_set_thread_flag(TIF_LAZY_MMU_PENDING); else emit_pte_barriers(); } -static inline void arch_enter_lazy_mmu_mode(void) -{ - if (in_interrupt()) - return; -} +static inline void arch_enter_lazy_mmu_mode(void) {} static inline void arch_flush_lazy_mmu_mode(void) { - if (in_interrupt()) - return; - if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING)) emit_pte_barriers(); } static inline void arch_leave_lazy_mmu_mode(void) { - if (in_interrupt()) - return; - arch_flush_lazy_mmu_mode(); } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e6064e00b22d..e6069ce4ec83 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -228,8 +228,8 @@ static inline int pmd_dirty(pmd_t pmd) * of the lazy mode. So the implementation must assume preemption may be enabled * and cpu migration is possible; it must take steps to be robust against this. * (In practice, for user PTE updates, the appropriate page table lock(s) are - * held, but for kernel PTE updates, no lock is held). The mode cannot be used - * in interrupt context. + * held, but for kernel PTE updates, no lock is held). The mode is disabled + * in interrupt context and calls to the lazy_mmu API have no effect. * * The lazy MMU mode is enabled for a given block of code using: * @@ -265,6 +265,9 @@ static inline void lazy_mmu_mode_enable(void) { struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + if (in_interrupt()) + return; + VM_WARN_ON_ONCE(state->nesting_level == U8_MAX); /* enable() must not be called while paused */ VM_WARN_ON(state->nesting_level > 0 && !state->active); @@ -279,6 +282,9 @@ static inline void lazy_mmu_mode_disable(void) { struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + if (in_interrupt()) + return; + VM_WARN_ON_ONCE(state->nesting_level == 0); VM_WARN_ON(!state->active); @@ -295,6 +301,9 @@ static inline void lazy_mmu_mode_pause(void) { struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + if (in_interrupt()) + return; + VM_WARN_ON(state->nesting_level == 0 || !state->active); state->active = false; @@ -305,6 +314,9 @@ static inline void lazy_mmu_mode_resume(void) { struct lazy_mmu_state *state = ¤t->lazy_mmu_state; + if (in_interrupt()) + return; + VM_WARN_ON(state->nesting_level == 0 || state->active); state->active = true; diff --git a/include/linux/sched.h b/include/linux/sched.h index 11566d973f42..bb873016ffcf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1731,6 +1731,9 @@ static inline char task_state_to_char(struct task_struct *tsk) #ifdef CONFIG_ARCH_HAS_LAZY_MMU_MODE static inline bool in_lazy_mmu_mode(void) { + if (in_interrupt()) + return false; + return current->lazy_mmu_state.active; } #else -- 2.47.0