From: Lance Yang

When unsharing hugetlb PMD page tables, we currently send two IPIs: one
for TLB invalidation, and another to synchronize with concurrent GUP-fast
walkers. However, if the TLB flush is already delivered by IPI to all
relevant CPUs, the second IPI is redundant: GUP-fast runs with IRQs
disabled, so once the TLB flush IPI has completed, any concurrent GUP-fast
walk must have finished.

Add tlb_table_flush_implies_ipi_broadcast() to let architectures indicate
that their TLB flush provides this full synchronization, enabling the
redundant IPI to be skipped.

Suggested-by: David Hildenbrand (Red Hat)
Signed-off-by: Lance Yang
---
 include/asm-generic/tlb.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 4d679d2a206b..e8d99b5e831f 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -261,6 +261,20 @@ static inline void tlb_remove_table_sync_one(void) { }
 
 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 
+/*
+ * Architectures can override if their TLB flush already broadcasts IPIs to all
+ * CPUs when freeing or unsharing page tables.
+ *
+ * Return true only when the flush guarantees:
+ * - IPIs reach all CPUs with potentially stale paging-structure cache entries
+ * - Synchronization with IRQ-disabled code like GUP-fast
+ */
+#ifndef tlb_table_flush_implies_ipi_broadcast
+static inline bool tlb_table_flush_implies_ipi_broadcast(void)
+{
+	return false;
+}
+#endif
 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 
 /*
-- 
2.49.0
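For illustration only (not part of the patch): a minimal sketch of how an
architecture whose table-freeing TLB flush is always IPI-based could opt in
to the new hook from its own <asm/tlb.h>. The unconditional "return true" is
an assumption made up for this example; a real architecture would encode its
actual conditions, as the x86 patch below does.

	/* Example only: define the macro before <asm-generic/tlb.h> is
	 * included so the generic fallback is not used. */
	#define tlb_table_flush_implies_ipi_broadcast tlb_table_flush_implies_ipi_broadcast
	static inline bool tlb_table_flush_implies_ipi_broadcast(void)
	{
		/* Assumed: this arch's table flush always IPIs every CPU
		 * that may hold stale paging-structure cache entries. */
		return true;
	}
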
From: Lance Yang

Add a callback function flush_tlb_multi_implies_ipi_broadcast to pv_mmu_ops
to explicitly track whether flush_tlb_multi IPIs provide sufficient
synchronization for GUP-fast when freeing or unsharing page tables.

Pass both freed_tables and unshared_tables to flush_tlb_mm_range() to
ensure lazy-TLB CPUs receive IPIs and flush their paging-structure caches:

	flush_tlb_mm_range(..., freed_tables || unshared_tables);

Suggested-by: David Hildenbrand (Red Hat)
Signed-off-by: Lance Yang
---
 arch/x86/include/asm/paravirt_types.h |  6 ++++++
 arch/x86/include/asm/tlb.h            | 19 ++++++++++++++++++-
 arch/x86/kernel/paravirt.c            | 10 ++++++++++
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 3502939415ad..a5bd0983da1f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -133,6 +133,12 @@ struct pv_mmu_ops {
 	void (*flush_tlb_multi)(const struct cpumask *cpus,
 				const struct flush_tlb_info *info);
 
+	/*
+	 * Indicates whether flush_tlb_multi IPIs provide sufficient
+	 * synchronization for GUP-fast when freeing or unsharing page tables.
+	 */
+	bool (*flush_tlb_multi_implies_ipi_broadcast)(void);
+
 	/* Hook for intercepting the destruction of an mm_struct. */
 	void (*exit_mmap)(struct mm_struct *mm);
 	void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 866ea78ba156..3a7cdfdcea8e 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -5,10 +5,26 @@
 #define tlb_flush tlb_flush
 static inline void tlb_flush(struct mmu_gather *tlb);
 
+#define tlb_table_flush_implies_ipi_broadcast tlb_table_flush_implies_ipi_broadcast
+static inline bool tlb_table_flush_implies_ipi_broadcast(void);
+
 #include <asm-generic/tlb.h>
 #include <linux/kernel.h>
 #include <vdso/bits.h>
 #include <vdso/page.h>
+#include <asm/paravirt.h>
+
+static inline bool tlb_table_flush_implies_ipi_broadcast(void)
+{
+#ifdef CONFIG_PARAVIRT
+	if (pv_ops.mmu.flush_tlb_multi_implies_ipi_broadcast)
+		return pv_ops.mmu.flush_tlb_multi_implies_ipi_broadcast();
+
+	return false;
+#else
+	return !cpu_feature_enabled(X86_FEATURE_INVLPGB);
+#endif
+}
 
 static inline void tlb_flush(struct mmu_gather *tlb)
 {
@@ -20,7 +36,8 @@ static inline void tlb_flush(struct mmu_gather *tlb)
 		end = tlb->end;
 	}
 
-	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
+	flush_tlb_mm_range(tlb->mm, start, end, stride_shift,
+			   tlb->freed_tables || tlb->unshared_tables);
 }
 
 static inline void invlpg(unsigned long addr)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ab3e172dcc69..4eaa44800b39 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -60,6 +60,15 @@ void __init native_pv_lock_init(void)
 		static_branch_enable(&virt_spin_lock_key);
 }
 
+static bool native_flush_tlb_multi_implies_ipi_broadcast(void)
+{
+	/* Paravirt may use hypercalls that don't send real IPIs. */
+	if (pv_ops.mmu.flush_tlb_multi != native_flush_tlb_multi)
+		return false;
+
+	return !cpu_feature_enabled(X86_FEATURE_INVLPGB);
+}
+
 struct static_key paravirt_steal_enabled;
 struct static_key paravirt_steal_rq_enabled;
 
@@ -173,6 +182,7 @@ struct paravirt_patch_template pv_ops = {
 	.mmu.flush_tlb_kernel	= native_flush_tlb_global,
 	.mmu.flush_tlb_one_user	= native_flush_tlb_one_user,
 	.mmu.flush_tlb_multi	= native_flush_tlb_multi,
+	.mmu.flush_tlb_multi_implies_ipi_broadcast = native_flush_tlb_multi_implies_ipi_broadcast,
 
 	.mmu.exit_mmap		= paravirt_nop,
 	.mmu.notify_page_enc_status_changed	= paravirt_nop,
-- 
2.49.0
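For illustration only (not part of the patch; the hv_* names are invented
for the example): a paravirtualized guest whose flush_tlb_multi is a
hypercall that sends no guest-visible IPIs would simply leave the new
callback unset, so tlb_table_flush_implies_ipi_broadcast() falls back to
false, or could install a helper that spells the answer out:

	static bool hv_flush_tlb_multi_implies_ipi_broadcast(void)
	{
		/* A hypercall-based flush never interrupts IRQ-disabled
		 * GUP-fast on other vCPUs, so the sync IPI is still needed. */
		return false;
	}

	/* During guest setup, alongside the hypercall-based flush: */
	pv_ops.mmu.flush_tlb_multi_implies_ipi_broadcast =
		hv_flush_tlb_multi_implies_ipi_broadcast;
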
From: Lance Yang

Embed the tlb_table_flush_implies_ipi_broadcast() check directly inside
tlb_remove_table_sync_one() instead of requiring every caller to check it
explicitly.

This relies on callers to do the right thing: flush with freed_tables=true
or unshared_tables=true beforehand. All existing callers satisfy this
requirement:

1. mm/khugepaged.c:1188 (collapse_huge_page):

	pmdp_collapse_flush(vma, address, pmd)
	  -> flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE)
	    -> flush_tlb_mm_range(mm, ..., freed_tables = true)
	      -> flush_tlb_multi(mm_cpumask(mm), info)

   So freed_tables=true before calling tlb_remove_table_sync_one().

2. include/asm-generic/tlb.h:861 (tlb_flush_unshared_tables):

	tlb_flush_mmu_tlbonly(tlb)
	  -> tlb_flush(tlb)
	    -> flush_tlb_mm_range(mm, ..., unshared_tables = true)
	      -> flush_tlb_multi(mm_cpumask(mm), info)

   unshared_tables=true (equivalent to freed_tables for sending IPIs).

3. mm/mmu_gather.c:341 (__tlb_remove_table_one):

   When we can't allocate a batch page in tlb_remove_table(), we do:

	tlb_table_invalidate(tlb)
	  -> tlb_flush_mmu_tlbonly(tlb)
	    -> flush_tlb_mm_range(mm, ..., freed_tables = true)
	      -> flush_tlb_multi(mm_cpumask(mm), info)

   Then:

	tlb_remove_table_one(table)
	  -> __tlb_remove_table_one(table)	// if !CONFIG_PT_RECLAIM
	    -> tlb_remove_table_sync_one()

   freed_tables=true, and this should work too.

   Why is tlb->freed_tables guaranteed? Because callers like pte_free_tlb()
   (via free_pte_range) set freed_tables=true before calling __pte_free_tlb(),
   which then calls tlb_remove_table(). We cannot free page tables without
   freed_tables=true.

   Note that tlb_remove_table_sync_one() was a NOP on bare metal x86
   (CONFIG_MMU_GATHER_RCU_TABLE_FREE=n) before commit a37259732a7d
   ("x86/mm: Make MMU_GATHER_RCU_TABLE_FREE unconditional").

4-5. mm/khugepaged.c:1683,1819 (pmdp_get_lockless_sync macro):

   Same as #1. These also use pmdp_collapse_flush() beforehand.

Suggested-by: David Hildenbrand (Red Hat)
Signed-off-by: Lance Yang
---
 mm/mmu_gather.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 7468ec388455..7b588643cbae 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -276,6 +276,10 @@ static void tlb_remove_table_smp_sync(void *arg)
 
 void tlb_remove_table_sync_one(void)
 {
+	/* Skip the IPI if the TLB flush already synchronized with other CPUs. */
+	if (tlb_table_flush_implies_ipi_broadcast())
+		return;
+
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
 	 * assumed to be actually RCU-freed.
-- 
2.49.0
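For illustration (condensed, not from the patch): the caller ordering that
the embedded check depends on, using the names from the call chains above.

	/* A table-freeing (or table-unsharing) flush must come first. */
	tlb->freed_tables = 1;		/* or tlb->unshared_tables = 1 */
	tlb_flush_mmu_tlbonly(tlb);	/* IPIs every relevant CPU when the
					   arch reports ipi_broadcast */
	tlb_remove_table_sync_one();	/* returns immediately in that case,
					   sends the sync IPI otherwise */
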