The page faults may be spurious because of the racy access to the page table. For example, a non-populated virtual page is accessed on 2 CPUs simultaneously, thus the page faults are triggered on both CPUs. However, it's possible that one CPU (say CPU A) cannot find the reason for the page fault if the other CPU (say CPU B) has changed the page table before the PTE is checked on CPU A. Most of the time, the spurious page faults can be ignored safely. However, if the page fault is for the write access, it's possible that a stale read-only TLB entry exists in the local CPU and needs to be flushed on some architectures. This is called the spurious page fault fixing. In the current kernel, there is spurious fault fixing support for pte, but not for huge pmd because no architectures need it. But in the next patch in the series, we will change the write protection fault handling logic on arm64, so that some stale huge pmd entries may remain in the TLB. These entries need to be flushed via the huge pmd spurious fault fixing mechanism. Signed-off-by: Huang Ying Cc: Catalin Marinas Cc: Will Deacon Cc: Andrew Morton Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Vlastimil Babka Cc: Zi Yan Cc: Baolin Wang Cc: Ryan Roberts Cc: Yang Shi Cc: "Christoph Lameter (Ampere)" Cc: Dev Jain Cc: Barry Song Cc: Anshuman Khandual Cc: Kefeng Wang Cc: Kevin Brodsky Cc: Yin Fengwei Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org --- include/linux/huge_mm.h | 2 +- include/linux/pgtable.h | 4 +++ mm/huge_memory.c | 33 ++++++++++++++-------- mm/internal.h | 2 +- mm/memory.c | 62 ++++++++++++++++++++++++++++++----------- 5 files changed, 73 insertions(+), 30 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f327d62fc985..887a632ce7a0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -11,7 +11,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -void huge_pmd_set_accessed(struct vm_fault *vmf); +bool huge_pmd_set_accessed(struct vm_fault *vmf); int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 32e8457ad535..ee3148ef87f6 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1232,6 +1232,10 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif +#ifndef flush_tlb_fix_spurious_fault_pmd +#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0) +#endif + /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1b81680b4225..6a8679907eaa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1641,17 +1641,30 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, EXPORT_SYMBOL_GPL(vmf_insert_folio_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -void touch_pmd(struct vm_area_struct *vma, unsigned long addr, +/** + * touch_pmd - Mark page table pmd entry as accessed and dirty (for write) + * @vma: The VMA covering @addr + * @addr: The virtual address + * @pmd: pmd pointer into the page table mapping @addr + * @write: Whether it's a write access + * + * Return: whether the pmd entry is changed + */ +bool touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write) { - pmd_t _pmd; + pmd_t entry; - _pmd = pmd_mkyoung(*pmd); + entry = pmd_mkyoung(*pmd); if (write) - _pmd = pmd_mkdirty(_pmd); + entry = pmd_mkdirty(entry); if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, - pmd, _pmd, write)) + pmd, entry, write)) { update_mmu_cache_pmd(vma, addr, pmd); + return true; + } + + return false; } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -1841,18 +1854,14 @@ void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -void huge_pmd_set_accessed(struct vm_fault *vmf) +bool huge_pmd_set_accessed(struct vm_fault *vmf) { bool write = vmf->flags & FAULT_FLAG_WRITE; - vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) - goto unlock; - - touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); + return false; -unlock: - spin_unlock(vmf->ptl); + return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); } static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) diff --git a/mm/internal.h b/mm/internal.h index 1561fc2ff5b8..27ad37a41868 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1402,7 +1402,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs, */ void touch_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, bool write); -void touch_pmd(struct vm_area_struct *vma, unsigned long addr, +bool touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); /* diff --git a/mm/memory.c b/mm/memory.c index 74b45e258323..6e5a08c4fd2e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6115,6 +6115,45 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) return VM_FAULT_FALLBACK; } +/* + * The page faults may be spurious because of the racy access to the + * page table. For example, a non-populated virtual page is accessed + * on 2 CPUs simultaneously, thus the page faults are triggered on + * both CPUs. However, it's possible that one CPU (say CPU A) cannot + * find the reason for the page fault if the other CPU (say CPU B) has + * changed the page table before the PTE is checked on CPU A. Most of + * the time, the spurious page faults can be ignored safely. However, + * if the page fault is for the write access, it's possible that a + * stale read-only TLB entry exists in the local CPU and needs to be + * flushed on some architectures. This is called the spurious page + * fault fixing. + * + * Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page() + * by default and used as such on most architectures, while + * flush_tlb_fix_spurious_fault_pmd() is defined as NOP by default and + * used as such on most architectures. + */ +static void fix_spurious_fault(struct vm_fault *vmf, + enum pgtable_level ptlevel) +{ + /* Skip spurious TLB flush for retried page fault */ + if (vmf->flags & FAULT_FLAG_TRIED) + return; + /* + * This is needed only for protection faults but the arch code + * is not yet telling us if this is a protection fault or not. + * This still avoids useless tlb flushes for .text page faults + * with threads. + */ + if (vmf->flags & FAULT_FLAG_WRITE) { + if (ptlevel == PGTABLE_LEVEL_PTE) + flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, + vmf->pte); + else + flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address, + vmf->pmd); + } +} /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -6196,23 +6235,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, - vmf->flags & FAULT_FLAG_WRITE)) { + vmf->flags & FAULT_FLAG_WRITE)) update_mmu_cache_range(vmf, vmf->vma, vmf->address, vmf->pte, 1); - } else { - /* Skip spurious TLB flush for retried page fault */ - if (vmf->flags & FAULT_FLAG_TRIED) - goto unlock; - /* - * This is needed only for protection faults but the arch code - * is not yet telling us if this is a protection fault or not. - * This still avoids useless tlb flushes for .text page faults - * with threads. - */ - if (vmf->flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, - vmf->pte); - } + else + fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -6309,7 +6336,10 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(&vmf); + vmf.ptl = pmd_lock(mm, vmf.pmd); + if (!huge_pmd_set_accessed(&vmf)) + fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD); + spin_unlock(vmf.ptl); return 0; } } -- 2.39.5