mm->context.untag_mask is written once, when LAM is enabled (mm_enable_lam(), under mmap_write_lock and while the process is still single-threaded), and is otherwise stable and never reverted; dup_lam() and mm_reset_untag_mask() also store to it and are annotated below as well. untagged_addr_remote() reads it for a remote mm, and untagged_addr_remote_unlocked() (added later in this series and used by the per-VMA-lock access_remote_vm() fast path) reads it without the mmap lock. The field is a single aligned word, so the hardware access itself is atomic, but annotate the reads and writes with READ_ONCE()/WRITE_ONCE() to make the lockless access explicit and keep the compiler from reloading or tearing it. No functional change. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Rik van Riel --- arch/x86/include/asm/mmu_context.h | 6 +++--- arch/x86/include/asm/uaccess_64.h | 2 +- arch/x86/kernel/process_64.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index ef5b507de34e..cee710f64658 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -100,18 +100,18 @@ static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) { mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask; - mm->context.untag_mask = oldmm->context.untag_mask; + WRITE_ONCE(mm->context.untag_mask, READ_ONCE(oldmm->context.untag_mask)); } #define mm_untag_mask mm_untag_mask static inline unsigned long mm_untag_mask(struct mm_struct *mm) { - return mm->context.untag_mask; + return READ_ONCE(mm->context.untag_mask); } static inline void mm_reset_untag_mask(struct mm_struct *mm) { - mm->context.untag_mask = -1UL; + WRITE_ONCE(mm->context.untag_mask, -1UL); } #define arch_pgtable_dma_compat arch_pgtable_dma_compat diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 20de34cc9aa6..4a52497ba6a1 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -43,7 +43,7 @@ 
static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, unsigned long addr) { mmap_assert_locked(mm); - return addr & (mm)->context.untag_mask; + return addr & READ_ONCE((mm)->context.untag_mask); } #define untagged_addr_remote(mm, addr) ({ \ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d44afbe005bb..55096136de53 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -814,7 +814,7 @@ static void enable_lam_func(void *__mm) static void mm_enable_lam(struct mm_struct *mm) { mm->context.lam_cr3_mask = X86_CR3_LAM_U57; - mm->context.untag_mask = ~GENMASK(62, 57); + WRITE_ONCE(mm->context.untag_mask, ~GENMASK(62, 57)); /* * Even though the process must still be single-threaded at this -- 2.53.0-Meta folio_walk_start() asserts the mmap lock is held. For callers that only need to read a single, already-present page, the mmap lock is a heavy and often badly contended hammer. Such a caller can instead hold the per-VMA lock, which keeps the VMA itself stable. The per-VMA lock does not, however, keep the page tables walked below that VMA from being freed. A concurrent munmap() or THP collapse of an adjacent region in the same mm can free a shared upper-level table, and THP collapse (collapse_huge_page() -> retract_page_tables()) frees page tables of VMAs whose lock it does not hold. Page table freeing synchronizes against lockless walkers the way gup_fast relies on: tlb_remove_table_sync_one() sends an IPI and waits for every CPU to enable interrupts, so a walker that keeps interrupts disabled across the walk cannot be observing a table that is about to be freed. rcu_read_lock() is not sufficient -- it does not block that IPI -- so the caller must keep interrupts disabled, not merely hold an RCU read-side critical section. Add an FW_VMA_LOCKED flag. 
When passed, folio_walk_start() asserts the per-VMA lock and that interrupts are disabled, instead of asserting the mmap lock; it requires CONFIG_MMU_GATHER_RCU_TABLE_FREE and refuses hugetlb VMAs (PMD sharing maps page tables this VMA's lock does not cover). The caller must keep interrupts disabled until folio_walk_end(). No existing caller passes FW_VMA_LOCKED, so behaviour is unchanged. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Rik van Riel --- include/linux/pagewalk.h | 7 +++++++ mm/pagewalk.c | 29 +++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index b41d7265c01b..d0387470d732 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -150,6 +150,13 @@ typedef int __bitwise folio_walk_flags_t; /* Walk shared zeropages (small + huge) as well. */ #define FW_ZEROPAGE ((__force folio_walk_flags_t)BIT(0)) +/* + * The caller holds the per-VMA lock instead of the mmap lock, with interrupts + * disabled across the walk (until folio_walk_end()) to serialize against page + * table freeing, the same way gup_fast does. Only valid with RCU-freed page + * tables (CONFIG_MMU_GATHER_RCU_TABLE_FREE) and not for hugetlb. + */ +#define FW_VMA_LOCKED ((__force folio_walk_flags_t)BIT(1)) enum folio_walk_level { FW_LEVEL_PTE, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 3ae2586ff45b..ab1e81983cb8 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -890,7 +890,10 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might * not correspond to the first physical entry of a logical hugetlb entry. * - * The mmap lock must be held in read mode. + * The mmap lock must be held in read mode. 
Alternatively, if @FW_VMA_LOCKED is + * passed, the VMA's per-VMA lock must be held and interrupts must be disabled + * across the walk and until folio_walk_end() (only supported with RCU-freed page + * tables, i.e. CONFIG_MMU_GATHER_RCU_TABLE_FREE, and not for hugetlb). * * Return: folio pointer on success, otherwise NULL. */ @@ -908,7 +911,29 @@ struct folio *folio_walk_start(struct folio_walk *fw, pgd_t *pgdp; p4d_t *p4dp; - mmap_assert_locked(vma->vm_mm); + if (flags & FW_VMA_LOCKED) { + /* + * Lockless walk under the per-VMA lock instead of the mmap + * lock. The VMA lock keeps the VMA stable, but the page tables + * walked below it can still be freed concurrently: a munmap() or + * THP collapse of an adjacent region in the same mm can free a + * shared upper-level table, and collapse_huge_page() -> + * retract_page_tables() frees page tables of VMAs whose lock it + * does not hold. Page table freeing serializes against lockless + * walkers via tlb_remove_table_sync_one(), which IPIs and waits + * for every CPU to enable interrupts; an RCU read-side critical + * section does not block that IPI, so the caller must keep + * interrupts disabled across the whole walk, like gup_fast. + * Hugetlb (PMD sharing) maps page tables not covered by this + * VMA's lock and is not supported. + */ + VM_WARN_ON_ONCE(!IS_ENABLED(CONFIG_MMU_GATHER_RCU_TABLE_FREE)); + VM_WARN_ON_ONCE(is_vm_hugetlb_page(vma)); + lockdep_assert_irqs_disabled(); + vma_assert_locked(vma); + } else { + mmap_assert_locked(vma->vm_mm); + } vma_pgtable_walk_begin(vma); if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end)) -- 2.53.0-Meta __access_remote_vm() takes mmap_read_lock() for the entire transfer and uses get_user_pages_remote(), which faults pages in. For the common case of reading memory that is already resident -- /proc/PID/cmdline, /proc/PID/environ, ptrace PEEK of resident pages -- the mmap lock is unnecessary and is badly contended on large machines. 
Add an opportunistic, read-only fast path. It takes the per-VMA lock with lock_vma_under_rcu() and, only when the whole request lies within that one VMA, copies the resident pages out using folio_walk_start(FW_VMA_LOCKED) to grab a short-lived page reference from a page table walk run with interrupts disabled. Interrupts are disabled only across the walk (until the folio is pinned): page table freeing -- a concurrent munmap() or THP collapse of an adjacent region -- serializes against lockless walkers via tlb_remove_table_sync_one(), which IPIs and waits for every CPU to enable interrupts, the same contract gup_fast relies on. The copy then runs with interrupts on, holding only the folio reference. A request that spans more than one VMA is left entirely to the mmap_lock path: relocking per VMA could observe a structurally inconsistent address space (a neighbouring VMA unmapped and a different one mapped in its place between locks), whereas the mmap_lock path sees a stable VMA tree for the whole transfer. The per-VMA permission check mirrors the read side of check_vma_flags(), including the FOLL_ANON restriction that /proc/PID/{cmdline,environ} rely on (CVE-2018-1120). Anything not positively allowed -- a not-present page, a hugetlb or VM_IO/VM_PFNMAP or secretmem mapping, or a race with a VMA writer -- falls back to the mmap_lock path for the remainder, which re-validates everything. Pages read on the fast path are marked accessed, matching the FOLL_TOUCH behaviour of the get_user_pages_remote() slow path. untagged_addr_remote() asserts the mmap lock, so add an unlocked variant for the fast path; the untag mask is a stable per-mm value. Only reads are handled here; writes keep using the slow path. 
Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Rik van Riel --- arch/x86/include/asm/uaccess_64.h | 14 ++- include/linux/uaccess.h | 11 ++ mm/memory.c | 195 +++++++++++++++++++++++++++++- 3 files changed, 217 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 4a52497ba6a1..933b0b8b4d60 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -39,11 +39,23 @@ static inline unsigned long __untagged_addr(unsigned long addr) (__force __typeof__(addr))__untagged_addr(__addr); \ }) +/* Strip the tag bits from a remote mm's address; usable without the mmap lock. */ +static inline unsigned long __untagged_addr_remote_unlocked(struct mm_struct *mm, + unsigned long addr) +{ + return addr & READ_ONCE(mm->context.untag_mask); +} + +#define untagged_addr_remote_unlocked(mm, addr) ({ \ + unsigned long __addr = (__force unsigned long)(addr); \ + (__force __typeof__(addr))__untagged_addr_remote_unlocked(mm, __addr); \ +}) + static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, unsigned long addr) { mmap_assert_locked(mm); - return addr & READ_ONCE((mm)->context.untag_mask); + return __untagged_addr_remote_unlocked(mm, addr); } #define untagged_addr_remote(mm, addr) ({ \ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 8a264662b242..c8c83372c9d8 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -34,6 +34,17 @@ }) #endif +/* + * Like untagged_addr_remote(), but for callers that stabilize @mm by other + * means (e.g. a per-VMA lock) and must not assert the mmap lock. 
+ */ +#ifndef untagged_addr_remote_unlocked +#define untagged_addr_remote_unlocked(mm, addr) ({ \ + (void)(mm); \ + untagged_addr(addr); \ +}) +#endif + #ifdef masked_user_access_begin #define can_do_masked_user_access() 1 # ifndef masked_user_write_access_begin diff --git a/mm/memory.c b/mm/memory.c index 86a973119bd4..d2b2f0014a0c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -42,6 +42,8 @@ #include #include #include +#include +#include #include #include #include @@ -7062,6 +7064,180 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, EXPORT_SYMBOL_GPL(generic_access_phys); #endif +/* + * The fast path uses folio_walk_start(FW_VMA_LOCKED), which needs the per-VMA + * lock and RCU-freed page tables to walk page tables without the mmap lock. + */ +#if defined(CONFIG_PER_VMA_LOCK) && defined(CONFIG_MMU_GATHER_RCU_TABLE_FREE) +/* + * Read-side VMA checks for the lockless fast path, mirroring the read side of + * check_vma_flags(): reject what FW_VMA_LOCKED cannot handle (hugetlb), what + * needs the ->access() handler (VM_IO/VM_PFNMAP), or what the kernel must not + * read because it is removed from the direct map (secretmem); enforce the + * FOLL_ANON restriction that + * /proc/PID/{cmdline,environ} rely on (CVE-2018-1120); and require read access + * (honoring FOLL_FORCE). Anything not positively allowed falls back to the slow + * path, which re-validates everything. + */ +static bool vma_permits_fast_access(struct vm_area_struct *vma, + unsigned int gup_flags) +{ + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + return false; + if (is_vm_hugetlb_page(vma) || vma_is_secretmem(vma)) + return false; + if ((gup_flags & FOLL_ANON) && !vma_is_anonymous(vma)) + return false; + if (!(vma->vm_flags & VM_READ) && + (!(gup_flags & FOLL_FORCE) || !(vma->vm_flags & VM_MAYREAD))) + return false; + return true; +} + +/* Size of the single mapping entry folio_walk_start() landed on. 
*/ +static unsigned long fw_entry_size(enum folio_walk_level level) +{ + switch (level) { + case FW_LEVEL_PUD: + return PUD_SIZE; + case FW_LEVEL_PMD: + return PMD_SIZE; + default: + return PAGE_SIZE; + } +} + +/* + * Copy @len bytes of the pinned @folio out to @buf, starting at byte offset + * @folio_off within the folio (the position of @addr). Maps and copies one + * page at a time -- kmap_local_folio() for HIGHMEM, copy_from_user_page() for + * the per-page flush on aliasing caches -- without re-walking page tables. + * Each page borrows the caller's single folio reference, so the mapping is + * dropped with kunmap_local() rather than folio_release_kmap(). + */ +static void copy_folio_pages(struct vm_area_struct *vma, struct folio *folio, + unsigned long folio_off, unsigned long addr, + void *buf, unsigned long len) +{ + unsigned long done = 0; + + while (done < len) { + unsigned long pos = folio_off + done; + unsigned long page_idx = pos >> PAGE_SHIFT; + unsigned int page_off = pos & ~PAGE_MASK; + unsigned int chunk = min_t(unsigned long, len - done, + PAGE_SIZE - page_off); + void *kaddr = kmap_local_folio(folio, page_idx << PAGE_SHIFT); + + copy_from_user_page(vma, folio_page(folio, page_idx), + addr + done, buf + done, kaddr + page_off, + chunk); + kunmap_local(kaddr); + done += chunk; + } +} + +/* + * Opportunistic lockless fast path for __access_remote_vm() reads. + * + * Memory already resident in @mm can be read without taking the frequently + * contended mmap_lock: a per-VMA lock stabilizes the VMA, and folio_walk_start() + * with FW_VMA_LOCKED grabs a short-lived reference to a present page from a page + * table walk run with interrupts disabled, which serializes against concurrent + * page table freeing the same way gup_fast does (relying on + * MMU_GATHER_RCU_TABLE_FREE). 
+ * + * Only a request that lies entirely within a single VMA is handled here, + * which should not be an issue in practice since every caller has a + * buffer of PAGE_SIZE or smaller. Loop iteration inside this function + * should be rare, too. + * + * Returns the number of bytes transferred via the fast path. + */ +static int access_remote_vm_fast(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + void *old_buf = buf; + struct vm_area_struct *vma; + + addr = untagged_addr_remote_unlocked(mm, addr); + + vma = lock_vma_under_rcu(mm, addr); + if (!vma) + return 0; + + /* Only handle a request contained entirely within this one VMA. */ + if (len > vma->vm_end - addr) + goto out_unlock; + + if (!vma_permits_fast_access(vma, gup_flags)) + goto out_unlock; + + while (len) { + struct folio_walk fw; + struct folio *folio; + struct page *page; + unsigned long entry_size, folio_off, span, irq_flags; + + /* + * The lockless page table walk must run with interrupts + * disabled: page table freeing (munmap or THP collapse, which + * IPI via tlb_remove_table_sync_one() and wait) then cannot free + * a table mid-walk -- the same contract gup_fast relies on. IRQs + * are restored once the folio is pinned; the copy below holds only + * the folio reference. + */ + local_irq_save(irq_flags); + folio = folio_walk_start(&fw, vma, addr, FW_VMA_LOCKED); + if (!folio) { + local_irq_restore(irq_flags); + goto out_unlock; /* not present: let the slow path fault it in */ + } + page = fw.page; + if (!page) { + /* No struct page to copy (e.g. a special PTE). */ + folio_walk_end(&fw, vma); + local_irq_restore(irq_flags); + goto out_unlock; + } + entry_size = fw_entry_size(fw.level); + folio_get(folio); + folio_walk_end(&fw, vma); + local_irq_restore(irq_flags); + + /* + * folio_walk_start() validated one present mapping entry + * (PAGE/PMD/PUD_SIZE). 
Copy to the end of that entry, bounded by + * the folio and the remaining length (already within the VMA), so + * a huge mapping is handled in a single walk. + */ + folio_off = (folio_page_idx(folio, page) << PAGE_SHIFT) + + offset_in_page(addr); + span = min3((unsigned long)len, + entry_size - (addr & (entry_size - 1)), + (folio_nr_pages(folio) << PAGE_SHIFT) - folio_off); + + copy_folio_pages(vma, folio, folio_off, addr, buf, span); + + /* Match the FOLL_TOUCH behaviour of the slow (GUP) path. */ + folio_mark_accessed(folio); + folio_put(folio); + len -= span; + buf += span; + addr += span; + } + +out_unlock: + vma_end_read(vma); + return buf - old_buf; +} +#else +static int access_remote_vm_fast(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + return 0; +} +#endif /* CONFIG_PER_VMA_LOCK && CONFIG_MMU_GATHER_RCU_TABLE_FREE */ + /* * Access another process' address space as given in mm. */ @@ -7071,15 +7247,30 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *old_buf = buf; int write = gup_flags & FOLL_WRITE; + /* + * Try the lockless fast path for reads first; it transfers what it can + * from resident memory without taking mmap_lock, and leaves the + * remainder (if any) to the slow path below. + */ + if (!write) { + int done = access_remote_vm_fast(mm, addr, buf, len, gup_flags); + + addr += done; + buf += done; + len -= done; + if (!len) + return buf - old_buf; + } + if (mmap_read_lock_killable(mm)) - return 0; + return buf - old_buf; /* Untag the address before looking up the VMA */ addr = untagged_addr_remote(mm, addr); /* Avoid triggering the temporary warning in __get_user_pages */ if (!vma_lookup(mm, addr) && !expand_stack(mm, addr)) - return 0; + return buf - old_buf; /* ignore errors, just check how much was successfully transferred */ while (len) { -- 2.53.0-Meta