The mermap provides a fast way to create ephemeral mm-local mappings of physical pages. The purpose of this is to access pages that have been removed from the direct map. Potential use cases are: 1. For zeroing __GFP_UNMAPPED pages (added in a later patch). 2. For populating guest_memfd pages that are protected by the GUEST_MEMFD_NO_DIRECT_MAP feature [0]. 3. For efficient access of pages protected by Address Space Isolation [1]. [0] https://lore.kernel.org/all/20250924151101.2225820-1-patrick.roy@campus.lmu.de/ [1] https://linuxasi.dev The details of this mechanism are described in the API comments. However the key idea is to use CPU-local virtual regions to avoid a need for synchronizing. On x86, this can also be used to prevent TLB shootdowns. Because the virtual region is CPU-local, allocating from the mermap disables migration. The caller is forbidden to use the returned value from any other context, and migration is re-enabled when it's freed. One might notice that mermap_get() bears a strong similarity to kmap_local_page(). The most important differences between mermap_get() and kmap_local_page() are: 1. mermap_get() allows mapping variable sizes while kmap_local_page() specifically maps a single order-0 page. 2. As a consequence of 1 (combined with the need for mermap_get() to be an extremely simple allocator), mermap_get() should be expected to fail, while kmap_local_page() is guaranteed to work up to a certain degree of nesting. 3. While the mappings provided by kmap_local_page() are _logically_ local to the calling context (it's a bug for software to access them from elsewhere), they are _physically_ installed into the shared kernel pagetables. This means their locality doesn't provide any protection from hardware attacks. In contrast, the mermap is physically local to the creating mm, taking advantage of the new mm-local kernel address region. 
So that the mermap is available even in contexts where failure is not tolerable there is also a _reserved() variant, which is fixed at allocating a single base page. This is useful, for example, for zeroing __GFP_UNMAPPED pages, where handling failure would be extremely inconvenient. The _reserved() variant is simply implemented by leaving one base-page space unavailable for non-_reserved allocations, and requiring an atomic context. Signed-off-by: Brendan Jackman --- arch/x86/Kconfig | 1 + arch/x86/include/asm/mermap.h | 23 +++ arch/x86/include/asm/pgtable_64_types.h | 8 +- include/linux/mermap.h | 63 ++++++ include/linux/mermap_types.h | 41 ++++ include/linux/mm_types.h | 4 + kernel/fork.c | 5 + mm/Kconfig | 10 + mm/Makefile | 1 + mm/mermap.c | 334 ++++++++++++++++++++++++++++++++ mm/pgalloc-track.h | 6 + 11 files changed, 495 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d7073b6077c62..f093252b5eab5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -37,6 +37,7 @@ config X86_64 select ZONE_DMA32 select EXECMEM if DYNAMIC_FTRACE select ACPI_MRRM if ACPI + select ARCH_SUPPORTS_MERMAP config FORCE_DYNAMIC_FTRACE def_bool y diff --git a/arch/x86/include/asm/mermap.h b/arch/x86/include/asm/mermap.h new file mode 100644 index 0000000000000..9d7614716b718 --- /dev/null +++ b/arch/x86/include/asm/mermap.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_MERMAP_H +#define _ASM_X86_MERMAP_H + +#include + +static inline void arch_mermap_flush_tlb(void) +{ + /* + * No shootdown allowed, IRQs may be off. Luckily other CPUs are not + * allowed to access our region so the stale mappings are harmless, as + * long as they still point to data belonging to this process. + */ + __flush_tlb_all(); +} + +static inline bool arch_mermap_pgprot_allowed(pgprot_t prot) +{ + /* Mermap is mm-local so global mappings would be a bug. 
*/ + return !(pgprot_val(prot) & _PAGE_GLOBAL); +} + +#endif /* _ASM_X86_MERMAP_H */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 1181565966405..fb6c3daacfeb8 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -105,11 +105,17 @@ extern unsigned int ptrs_per_p4d; #define MM_LOCAL_PGD_ENTRY -240UL #define MM_LOCAL_BASE_ADDR (MM_LOCAL_PGD_ENTRY << PGDIR_SHIFT) -#define MM_LOCAL_END_ADDR ((MM_LOCAL_PGD_ENTRY + 1) << PGDIR_SHIFT) +#define MM_LOCAL_START_ADDR ((MM_LOCAL_PGD_ENTRY) << PGDIR_SHIFT) +#define MM_LOCAL_END_ADDR (MM_LOCAL_START_ADDR + (1UL << PGDIR_SHIFT)) #define LDT_BASE_ADDR MM_LOCAL_BASE_ADDR #define LDT_END_ADDR (LDT_BASE_ADDR + PMD_SIZE) +#define MERMAP_BASE_ADDR LDT_END_ADDR +#define MERMAP_CPU_REGION_SIZE PMD_SIZE +#define MERMAP_SIZE (MERMAP_CPU_REGION_SIZE * NR_CPUS) +#define MERMAP_END_ADDR (MERMAP_BASE_ADDR + (NR_CPUS * MERMAP_CPU_REGION_SIZE)) + #define __VMALLOC_BASE_L4 0xffffc90000000000UL #define __VMALLOC_BASE_L5 0xffa0000000000000UL diff --git a/include/linux/mermap.h b/include/linux/mermap.h new file mode 100644 index 0000000000000..5457dcb8c9789 --- /dev/null +++ b/include/linux/mermap.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MERMAP_H +#define _LINUX_MERMAP_H + +#include +#include + +#ifdef CONFIG_MERMAP + +#include + +int mermap_mm_prepare(struct mm_struct *mm); +void mermap_mm_init(struct mm_struct *mm); +void mermap_mm_teardown(struct mm_struct *mm); + +/* Can the mermap be called from this context? 
*/ +static inline bool mermap_ready(void) +{ + return in_task() && current->mm && current->mm->mermap.cpu; +} + +struct mermap_alloc *mermap_get(struct page *page, unsigned long size, pgprot_t prot); +void *mermap_get_reserved(struct page *page, pgprot_t prot); +void mermap_put(struct mermap_alloc *alloc); + +static inline void *mermap_addr(struct mermap_alloc *alloc) +{ + return (void *)alloc->base; +} + +/* + * arch_mermap_flush_tlb() is called before a part of the local CPU's mermap + * region is remapped to a new address. No other CPU is allowed to _access_ that + * region, but the region was mapped there. + * + * This may be called with IRQs off. + * + * On arm64, this will need to be a broadcast TLB flush. Although the other CPUs + * are forbidden to access the region, they can leak the data that was mapped + * there via CPU exploits. Violating break-before-make would mean the data + * available to these CPU exploits is unpredictable. + */ +extern void arch_mermap_flush_tlb(void); +extern bool arch_mermap_pgprot_allowed(pgprot_t prot); + +#if IS_ENABLED(CONFIG_KUNIT) +struct mermap_alloc *__mermap_get(struct mm_struct *mm, struct page *page, + unsigned long size, pgprot_t prot, bool use_reserve); +void __mermap_put(struct mm_struct *mm, struct mermap_alloc *alloc); +unsigned long mermap_cpu_base(int cpu); +unsigned long mermap_cpu_end(int cpu); +#endif + +#else /* CONFIG_MERMAP */ + +static inline int mermap_mm_prepare(struct mm_struct *mm) { return 0; } +static inline void mermap_mm_init(struct mm_struct *mm) { } +static inline void mermap_mm_teardown(struct mm_struct *mm) { } +static inline bool mermap_ready(void) { return false; } + +#endif /* CONFIG_MERMAP */ + +#endif /* _LINUX_MERMAP_H */ diff --git a/include/linux/mermap_types.h b/include/linux/mermap_types.h new file mode 100644 index 0000000000000..c1c83b223c28d --- /dev/null +++ b/include/linux/mermap_types.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MERMAP_TYPES_H 
+#define _LINUX_MERMAP_TYPES_H + +#include +#include +#include + +#ifdef CONFIG_MERMAP + +/* Tracks an individual allocation in the mermap. */ +struct mermap_alloc { + /* Currently allocated. */ + bool in_use; + /* Requires flush before reallocating. */ + bool need_flush; + unsigned long base; + /* Non-inclusive. */ + unsigned long end; +}; + +struct mermap_cpu { + /* Next address immediately available for alloc (no TLB flush needed). */ + unsigned long next_addr; + struct mermap_alloc normal_allocs[3]; + struct mermap_alloc reserve_alloc; +}; + +struct mermap { + struct mutex init_lock; + struct mermap_cpu __percpu *cpu; +}; + +#else /* CONFIG_MERMAP */ + +struct mermap {}; + +#endif /* CONFIG_MERMAP */ + +#endif /* _LINUX_MERMAP_TYPES_H */ + diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0ca7cb7da918f..2c60a451f96e7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,7 @@ struct address_space; struct futex_private_hash; struct mem_cgroup; +struct mermap; typedef struct { unsigned long f; @@ -1172,6 +1174,8 @@ struct mm_struct { atomic_t membarrier_state; #endif + struct mermap mermap; + /** * @mm_users: The number of users including userspace. 
* diff --git a/kernel/fork.c b/kernel/fork.c index ff075c74333fe..2770b4d296846 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include @@ -1144,6 +1145,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); + + mermap_mm_init(mm); + return mm; fail_pcpu: @@ -1187,6 +1191,7 @@ static inline void __mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); + mermap_mm_teardown(mm); mm_put_huge_zero_folio(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { diff --git a/mm/Kconfig b/mm/Kconfig index 2813059df9c1c..2bf1dbcc8cb10 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1484,4 +1484,14 @@ config MM_LOCAL_REGION source "mm/damon/Kconfig" +config ARCH_SUPPORTS_MERMAP + bool + +config MERMAP + bool "Support for epheMERal mappings within the kernel" + depends on ARCH_SUPPORTS_MERMAP + depends on MM_LOCAL_REGION + help + Support for epheMERal mappings within the kernel. 
+ endmenu diff --git a/mm/Makefile b/mm/Makefile index ffd06cf7a04e6..0c45677f4a538 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -150,3 +150,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_EXECMEM) += execmem.o obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o obj-$(CONFIG_LAZY_MMU_MODE_KUNIT_TEST) += tests/lazy_mmu_mode_kunit.o +obj-$(CONFIG_MERMAP) += mermap.o diff --git a/mm/mermap.c b/mm/mermap.c new file mode 100644 index 0000000000000..7cddc202755ee --- /dev/null +++ b/mm/mermap.c @@ -0,0 +1,334 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" + +static inline int set_unmapped_pte(pte_t *ptep, unsigned long addr, void *data) +{ + set_pte(ptep, __pte(0)); + return 0; +} + +VISIBLE_IF_KUNIT void __mermap_put(struct mm_struct *mm, struct mermap_alloc *alloc) +{ + unsigned long size = PAGE_ALIGN(alloc->end - alloc->base); + + if (WARN_ON_ONCE(!alloc->in_use)) + return; + + __apply_to_page_range(mm, alloc->base, size, set_unmapped_pte, + NULL, PGRANGE_CREATE | PGRANGE_NOLOCK); + + WRITE_ONCE(alloc->in_use, false); + + migrate_enable(); +} +EXPORT_SYMBOL_IF_KUNIT(__mermap_put); + +/* Return a region allocated by mermap_get(). 
+ */
+void mermap_put(struct mermap_alloc *alloc)
+{
+	__mermap_put(current->mm, alloc);
+}
+EXPORT_SYMBOL(mermap_put);
+
+VISIBLE_IF_KUNIT inline unsigned long mermap_cpu_base(int cpu)
+{
+	return MERMAP_BASE_ADDR + (cpu * MERMAP_CPU_REGION_SIZE);
+}
+EXPORT_SYMBOL_IF_KUNIT(mermap_cpu_base);
+
+/* Non-inclusive */
+VISIBLE_IF_KUNIT inline unsigned long mermap_cpu_end(int cpu)
+{
+	return MERMAP_BASE_ADDR + ((cpu + 1) * MERMAP_CPU_REGION_SIZE);
+}
+EXPORT_SYMBOL_IF_KUNIT(mermap_cpu_end);
+
+static inline void mermap_flush_tlb(int cpu, struct mermap_cpu *mc)
+{
+#if IS_ENABLED(CONFIG_MERMAP_KUNIT_TEST)
+	mc->tlb_flushes++;
+#endif
+	arch_mermap_flush_tlb();
+}
+
+/* Call with preemption disabled if use_reserve, else with migration disabled. */
+static inline struct mermap_alloc *mermap_alloc(struct mm_struct *mm,
+						unsigned long size, bool use_reserve)
+{
+	int cpu = raw_smp_processor_id();
+	struct mermap_cpu *mc = this_cpu_ptr(mm->mermap.cpu);
+	unsigned long cpu_end = mermap_cpu_end(cpu);
+	struct mermap_alloc *alloc = NULL;
+
+	/*
+	 * This is an extremely stupid allocator, there can only ever be a small
+	 * number of allocations so everything just works on linear search.
+	 *
+	 * Allocations are "in order", i.e. if the whole region is free it
+	 * allocates from the beginning. If there are any existing allocations
+	 * it allocates from right after the last (highest address) one. Any
+	 * free space before that goes unused.
+	 *
+	 * Once an allocation has been freed, the space it occupied must be flushed
+	 * from the TLB before it can be reused.
+	 *
+	 * Visual example of how this is supposed to behave (A for allocated, T for
+	 * TLB-flush-pending):
+	 *
+	 * _______________ Start with everything free.
+	 * AaaA___________ Allocate something.
+	 * TttT___________ Free it. (Region needs a TLB flush now).
+	 * TttTAaaaaaaaA__ Allocate something else.
+	 * TttTAaaaaaaaAAA Allocate the remaining space.
+	 * TttTTtttttttTAA Free the allocation before last.
+ * ^^^^^^^^^^^^^ This could all be reused now but for simplicity it + * isn't. Another allocation at this point will fail. + * TttTTtttttttTTT Free the last allocation. + * _______________ Next time we allocate, first flush the TLB. + * AA_____________ Now we're back at the beginning. + */ + + /* Keep one page for mermap_get_reserved(). */ + if (use_reserve) { + if (WARN_ON_ONCE(size != PAGE_SIZE)) + return NULL; + lockdep_assert_preemption_disabled(); + } else { + cpu_end -= PAGE_SIZE; + } + + if (WARN_ON_ONCE(!in_task())) + return NULL; + guard(preempt)(); + + /* Out of already-available space? */ + if (mc->next_addr + size > cpu_end) { + unsigned long new_next = mermap_cpu_base(cpu); + + /* Would we have space after a TLB flush? */ + for (int i = 0; i < ARRAY_SIZE(mc->normal_allocs); i++) { + struct mermap_alloc *alloc = &mc->normal_allocs[i]; + + /* + * The space between the uppermost allocated alloc->end + * (or the base of the CPU's region if there are no + * current allocations) and mc->next_addr has been + * unmapped in the pagetables, but not flushed from the + * TLB. Set new_next to point to the beginning of that + * space. + */ + if (READ_ONCE(alloc->in_use)) + new_next = max(new_next, alloc->end); + } + if (size > cpu_end - new_next) + return NULL; + + mermap_flush_tlb(cpu, mc); + mc->next_addr = new_next; + } + + /* + * Find an alloc-tracking structure to use. Keep one for + * mermap_get_reserved() - that should never be contended since it can + * only be allocated with preemption off. 
+	 */
+	if (WARN_ON_ONCE(mc->reserve_alloc.in_use))
+		return NULL;
+	if (use_reserve) {
+		alloc = &mc->reserve_alloc;
+	} else {
+		for (int i = 0; i < ARRAY_SIZE(mc->normal_allocs); i++) {
+			if (!READ_ONCE(mc->normal_allocs[i].in_use)) {
+				alloc = &mc->normal_allocs[i];
+				break;
+			}
+		}
+		if (!alloc)
+			return NULL;
+	}
+	alloc->in_use = true;
+	alloc->base = mc->next_addr;
+	alloc->end = alloc->base + size;
+	mc->next_addr += size;
+
+	return alloc;
+}
+
+struct set_pte_ctx {
+	pgprot_t prot;
+	unsigned long next_pfn;
+};
+
+static inline int do_set_pte(pte_t *pte, unsigned long addr, void *data)
+{
+	struct set_pte_ctx *ctx = data;
+
+	set_pte(pte, pfn_pte(ctx->next_pfn, ctx->prot));
+	ctx->next_pfn++;
+
+	return 0;
+}
+
+VISIBLE_IF_KUNIT struct mermap_alloc *
+__mermap_get(struct mm_struct *mm, struct page *page,
+	     unsigned long size, pgprot_t prot, bool use_reserve)
+{
+	struct mermap_alloc *alloc = NULL;
+	struct set_pte_ctx ctx;
+	int err;
+
+	if (size > MERMAP_CPU_REGION_SIZE || WARN_ON_ONCE(!mm || !mm->mermap.cpu))
+		return NULL;
+	if (WARN_ON_ONCE(!arch_mermap_pgprot_allowed(prot)))
+		return NULL;
+
+	size = PAGE_ALIGN(size);
+
+	migrate_disable();
+
+	alloc = mermap_alloc(mm, size, use_reserve);
+	if (!alloc) {
+		migrate_enable();
+		return NULL;
+	}
+
+	/* This probably wants to be optimised. */
+	ctx.prot = prot;
+	ctx.next_pfn = page_to_pfn(page);
+	err = __apply_to_page_range(mm, alloc->base, size, do_set_pte, &ctx,
+				    PGRANGE_CREATE | PGRANGE_NOLOCK);
+	if (err) {
+		/*
+		 * This error path must also undo migrate_disable(); otherwise
+		 * a failed mapping would leak a migration-disable (the success
+		 * path keeps it held until mermap_put()).
+		 */
+		WRITE_ONCE(alloc->in_use, false);
+		migrate_enable();
+		return NULL;
+	}
+
+	return alloc;
+}
+EXPORT_SYMBOL_IF_KUNIT(__mermap_get);
+
+/*
+ * Allocate a region of virtual memory, and map the page into it. This tries
+ * pretty hard to be fast but doesn't try very hard at all to actually succeed.
+ *
+ * The returned region is physically local to the current mm. It is _logically_
+ * local to the current CPU but this is not enforced by hardware so it can be
+ * exploited via CPU vulns.
+ * This means the caller must not map memory
+ * here that doesn't belong to the current process. The caller must also perform
+ * a full TLB flush of the region before freeing the pages that have been mapped
+ * here.
+ *
+ * This may only be called from process context, and the caller must arrange to
+ * first call mermap_mm_prepare(). (It would be possible to support this in IRQ,
+ * but it seems unlikely there's a valid usecase given the TLB flushing
+ * requirements). If it succeeds, it disables migration until you call
+ * mermap_put().
+ *
+ * This is guaranteed not to allocate.
+ *
+ * Use mermap_addr() to get the actual address of the mapped region.
+ */
+struct mermap_alloc *mermap_get(struct page *page, unsigned long size, pgprot_t prot)
+{
+	return __mermap_get(current->mm, page, size, prot, false);
+}
+EXPORT_SYMBOL(mermap_get);
+ALLOW_ERROR_INJECTION(mermap_get, NULL);
+
+/*
+ * Allocate a single PAGE_SIZE page via mermap_get(), requiring preemption to be
+ * off until it is freed. This always succeeds.
+ */
+void *mermap_get_reserved(struct page *page, pgprot_t prot)
+{
+	struct mermap_alloc *alloc;
+
+	lockdep_assert_preemption_disabled();
+	/*
+	 * The declared return type is the mapped address (void *), not the
+	 * struct mermap_alloc handle that __mermap_get() returns, so convert
+	 * with mermap_addr() rather than leaking the handle to the caller.
+	 */
+	alloc = __mermap_get(current->mm, page, PAGE_SIZE, prot, true);
+	return alloc ? mermap_addr(alloc) : NULL;
+}
+EXPORT_SYMBOL(mermap_get_reserved);
+
+/*
+ * Internal - do unconditional (cheap) setup that's done for every mm. This
+ * doesn't actually prepare the mermap for use until someone calls
+ * mermap_mm_prepare().
+ */
+void mermap_mm_init(struct mm_struct *mm)
+{
+	mutex_init(&mm->mermap.init_lock);
+}
+
+/*
+ * Set up the mermap for this mm. The caller doesn't need to call
+ * mermap_mm_teardown(), that's taken care of by the normal mm teardown
+ * mechanism. This is idempotent and thread-safe.
+ */
+int mermap_mm_prepare(struct mm_struct *mm)
+{
+	int err = 0;
+	int cpu;
+
+	guard(mutex)(&mm->mermap.init_lock);
+
+	/* Already done?
+	 */
+	if (likely(mm->mermap.cpu))
+		return 0;
+
+	mm->mermap.cpu = alloc_percpu_gfp(struct mermap_cpu,
+					  GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!mm->mermap.cpu)
+		return -ENOMEM;
+
+	/* So we can use this from the page allocator, preallocate pagetables. */
+	mm_flags_set(MMF_LOCAL_REGION_USED, mm);
+	for_each_possible_cpu(cpu) {
+		unsigned long base = mermap_cpu_base(cpu);
+
+		/* Note this pointlessly iterates over PTEs to initialise. */
+		err = apply_to_page_range(mm, base, MERMAP_CPU_REGION_SIZE,
+					  set_unmapped_pte, NULL);
+		if (err) {
+			/*
+			 * Clear .cpu now to inform mermap_ready(). Any partial
+			 * page tables get cleared up by mm teardown.
+			 */
+			free_percpu(mm->mermap.cpu);
+			mm->mermap.cpu = NULL;
+			break;
+		}
+		per_cpu_ptr(mm->mermap.cpu, cpu)->next_addr = base;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mermap_mm_prepare);
+
+/* Clean up mermap stuff on mm teardown. */
+void mermap_mm_teardown(struct mm_struct *mm)
+{
+	int cpu;
+
+	if (!mm->mermap.cpu)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		/*
+		 * Must inspect each CPU's slot: this_cpu_ptr() would check the
+		 * local CPU's structure on every iteration.
+		 */
+		struct mermap_cpu *mc = per_cpu_ptr(mm->mermap.cpu, cpu);
+
+		for (int i = 0; i < ARRAY_SIZE(mc->normal_allocs); i++)
+			WARN_ON_ONCE(mc->normal_allocs[i].in_use);
+		WARN_ON_ONCE(mc->reserve_alloc.in_use);
+	}
+
+	free_percpu(mm->mermap.cpu);
+}
diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h
index e9e879de8649b..51fc4668d7177 100644
--- a/mm/pgalloc-track.h
+++ b/mm/pgalloc-track.h
@@ -2,6 +2,12 @@
 #ifndef _LINUX_PGALLOC_TRACK_H
 #define _LINUX_PGALLOC_TRACK_H
 
+#include
+#include
+#include
+
+#include "internal.h"
+
 #if defined(CONFIG_MMU)
 static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
 				     unsigned long address,
-- 
2.51.2