From: Ard Biesheuvel Before moving the empty_zero_page into the __ro_after_init section, make sure it has the memory-tagged type. This is needed to ensure that cpu_enable_mte() will be able to initialize the tags correctly. Signed-off-by: Ard Biesheuvel --- arch/arm64/mm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index dd85e093ffdb..f084993024ab 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1049,7 +1049,7 @@ void __init mark_linear_text_alias_ro(void) */ update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), (unsigned long)__init_begin - (unsigned long)_text, - PAGE_KERNEL_RO); + pgprot_tagged(PAGE_KERNEL_RO)); } #ifdef CONFIG_KFENCE -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel The empty zero page is used to back any kernel or user space mapping that is supposed to remain cleared, and so the page itself is never supposed to be modified. So make it __ro_after_init rather than __page_aligned_bss: on most architectures, this ensures that both the kernel's mapping of it and any aliases that are accessible via the kernel direct (linear) map are mapped read-only, and cannot be used (inadvertently or maliciously) to corrupt the contents of the zero page. Signed-off-by: Ard Biesheuvel --- mm/mm_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index f9f8e1af921c..6ca01ed2a5a4 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -57,7 +57,7 @@ unsigned long zero_page_pfn __ro_after_init; EXPORT_SYMBOL(zero_page_pfn); #ifndef __HAVE_COLOR_ZERO_PAGE -uint8_t empty_zero_page[PAGE_SIZE] __page_aligned_bss; +uint8_t empty_zero_page[PAGE_SIZE] __ro_after_init __aligned(PAGE_SIZE); EXPORT_SYMBOL(empty_zero_page); struct page *__zero_page __ro_after_init; -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel Instead of blindly overwriting an existing table entry when mapping DRAM regions, take care not to replace a pre-existing table entry with a block entry. This permits the logic of mapping the kernel's linear alias to be simplified in a subsequent patch. Reviewed-by: Ryan Roberts Signed-off-by: Ard Biesheuvel --- arch/arm64/mm/mmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f084993024ab..801a54ade76f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -256,7 +256,8 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, /* try section mapping first */ if (((addr | next | phys) & ~PMD_MASK) == 0 && - (flags & NO_BLOCK_MAPPINGS) == 0) { + (flags & NO_BLOCK_MAPPINGS) == 0 && + !pmd_table(old_pmd)) { pmd_set_huge(pmdp, phys, prot); /* @@ -379,7 +380,8 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, */ if (pud_sect_supported() && ((addr | next | phys) & ~PUD_MASK) == 0 && - (flags & NO_BLOCK_MAPPINGS) == 0) { + (flags & NO_BLOCK_MAPPINGS) == 0 && + !pud_table(old_pud)) { pud_set_huge(pudp, phys, prot); /* -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel Instead of blindly overwriting existing live entries with the contiguous bit cleared when mapping DRAM regions, check whether the contiguous region in question starts with a descriptor that has the valid bit set and the contiguous bit cleared, and in that case, leave the contiguous bit unset on the entire region. This permits the logic of mapping the kernel's linear alias to be simplified in a subsequent patch. Note that not setting the contiguous bit on any of the descriptors in the contiguous region can only result in an invalid configuration if it was already invalid to begin with. Reviewed-by: Ryan Roberts Signed-off-by: Ard Biesheuvel --- arch/arm64/include/asm/pgtable.h | 4 ++++ arch/arm64/mm/mmu.c | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 4dfa42b7d053..a1c5894332d9 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -181,6 +181,10 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) * Returns true if the pte is valid and has the contiguous bit set. */ #define pte_valid_cont(pte) (pte_valid(pte) && pte_cont(pte)) +/* + * Returns true if the pte is valid and has the contiguous bit cleared. + */ +#define pte_valid_noncont(pte) (pte_valid(pte) && !pte_cont(pte)) /* * Could the pte be present in the TLB? We must check mm_tlb_flush_pending * so that we don't erroneously return false for pages that have been diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 801a54ade76f..005844e521bd 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -224,7 +224,8 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, /* use a contiguous mapping if the range is suitably aligned */ if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) && - (flags & NO_CONT_MAPPINGS) == 0) + (flags & NO_CONT_MAPPINGS) == 0 && + !pte_valid_noncont(__ptep_get(ptep))) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); init_pte(ptep, addr, next, phys, __prot); @@ -324,7 +325,8 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, /* use a contiguous mapping if the range is suitably aligned */ if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) && - (flags & NO_CONT_MAPPINGS) == 0) + (flags & NO_CONT_MAPPINGS) == 0 && + !pte_valid_noncont(pmd_pte(READ_ONCE(*pmdp)))) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); ret = init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags); -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel The memblock API guarantees that start is not greater than or equal to end, so there is no need to test it. And if it were, it is doubtful that breaking out of the loop would be a reasonable course of action here (rather than attempting to map the remaining regions) So let's drop this check. Reviewed-by: Ryan Roberts Signed-off-by: Ard Biesheuvel --- arch/arm64/mm/mmu.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 005844e521bd..bfbf3fe0d1be 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1177,8 +1177,6 @@ static void __init map_mem(pgd_t *pgdp) /* map all the memory banks */ for_each_mem_range(i, &start, &end) { - if (start >= end) - break; /* * The linear map must allow allocation tags reading/writing * if MTE is present. Otherwise, it has the same attributes as -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel __map_memblock() and map_mem() always operate on swapper_pg_dir, so there is no need to pass around a pgd_t pointer between them. Reviewed-by: Ryan Roberts Signed-off-by: Ard Biesheuvel --- arch/arm64/mm/mmu.c | 25 ++++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index bfbf3fe0d1be..9610dd2d7bd9 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1039,11 +1039,11 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, flush_tlb_kernel_range(virt, virt + size); } -static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start, - phys_addr_t end, pgprot_t prot, int flags) +static void __init __map_memblock(phys_addr_t start, phys_addr_t end, + pgprot_t prot, int flags) { - early_create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start, - prot, early_pgtable_alloc, flags); + early_create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), + end - start, prot, early_pgtable_alloc, flags); } void __init mark_linear_text_alias_ro(void) @@ -1091,13 +1091,13 @@ static phys_addr_t __init arm64_kfence_alloc_pool(void) return kfence_pool; } -static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) +static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool) { if (!kfence_pool) return; /* KFENCE pool needs page-level mapping. */ - __map_memblock(pgdp, kfence_pool, kfence_pool + KFENCE_POOL_SIZE, + __map_memblock(kfence_pool, kfence_pool + KFENCE_POOL_SIZE, pgprot_tagged(PAGE_KERNEL), NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS); memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); @@ -1133,11 +1133,11 @@ bool arch_kfence_init_pool(void) #else /* CONFIG_KFENCE */ static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; } -static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { } +static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool) { } #endif /* CONFIG_KFENCE */ -static void __init map_mem(pgd_t *pgdp) +static void __init map_mem(void) { static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); phys_addr_t kernel_start = __pa_symbol(_text); @@ -1182,7 +1182,7 @@ static void __init map_mem(pgd_t *pgdp) * if MTE is present. Otherwise, it has the same attributes as * PAGE_KERNEL. */ - __map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL), + __map_memblock(start, end, pgprot_tagged(PAGE_KERNEL), flags); } @@ -1196,10 +1196,9 @@ static void __init map_mem(pgd_t *pgdp) * Note that contiguous mappings cannot be remapped in this way, * so we should avoid them here. */ - __map_memblock(pgdp, kernel_start, kernel_end, - PAGE_KERNEL, NO_CONT_MAPPINGS); + __map_memblock(kernel_start, kernel_end, PAGE_KERNEL, NO_CONT_MAPPINGS); memblock_clear_nomap(kernel_start, kernel_end - kernel_start); - arm64_kfence_map_pool(early_kfence_pool, pgdp); + arm64_kfence_map_pool(early_kfence_pool); } void mark_rodata_ro(void) @@ -1421,7 +1420,7 @@ static void __init create_idmap(void) void __init paging_init(void) { - map_mem(swapper_pg_dir); + map_mem(); memblock_allow_resize(); -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel Currently, pgattr_change_is_safe() is overly pedantic when it comes to descriptors with the contiguous hint attribute set, as it rejects assignments even if the old and the new value are the same. So relax the check to allow that. Signed-off-by: Ard Biesheuvel --- arch/arm64/mm/mmu.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9610dd2d7bd9..bfb2f1cae724 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -134,10 +134,6 @@ bool pgattr_change_is_safe(pteval_t old, pteval_t new) if (pte_pfn(__pte(old)) != pte_pfn(__pte(new))) return false; - /* live contiguous mappings may not be manipulated at all */ - if ((old | new) & PTE_CONT) - return false; - /* Transitioning from Non-Global to Global is unsafe */ if (old & ~new & PTE_NG) return false; -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel Now that the map_mem() routines respect existing page mappings and contiguous granule sized blocks with the contiguous bit cleared, there is no longer a reason to play tricks with the memblock NOMAP attribute. Instead, the kfence pool can be allocated and mapped with page granularity first, and this granularity will be respected when the rest of DRAM is mapped later, even if block and contiguous mappings are allowed for the remainder of those mappings. Signed-off-by: Ard Biesheuvel --- arch/arm64/mm/mmu.c | 25 ++++---------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index bfb2f1cae724..4eab40f4aa6f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1067,36 +1067,24 @@ static int __init parse_kfence_early_init(char *arg) } early_param("kfence.sample_interval", parse_kfence_early_init); -static phys_addr_t __init arm64_kfence_alloc_pool(void) +static void __init arm64_kfence_map_pool(void) { phys_addr_t kfence_pool; if (!kfence_early_init) - return 0; + return; kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); if (!kfence_pool) { pr_err("failed to allocate kfence pool\n"); kfence_early_init = false; - return 0; - } - - /* Temporarily mark as NOMAP. */ - memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE); - - return kfence_pool; -} - -static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool) -{ - if (!kfence_pool) return; + } /* KFENCE pool needs page-level mapping. */ __map_memblock(kfence_pool, kfence_pool + KFENCE_POOL_SIZE, pgprot_tagged(PAGE_KERNEL), NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS); - memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); __kfence_pool = phys_to_virt(kfence_pool); } @@ -1128,8 +1116,7 @@ bool arch_kfence_init_pool(void) } #else /* CONFIG_KFENCE */ -static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; } -static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool) { } +static inline void arm64_kfence_map_pool(void) { } #endif /* CONFIG_KFENCE */ @@ -1139,7 +1126,6 @@ static void __init map_mem(void) phys_addr_t kernel_start = __pa_symbol(_text); phys_addr_t kernel_end = __pa_symbol(__init_begin); phys_addr_t start, end; - phys_addr_t early_kfence_pool; int flags = NO_EXEC_MAPPINGS; u64 i; @@ -1156,7 +1142,7 @@ static void __init map_mem(void) BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end) && pgd_index(_PAGE_OFFSET(VA_BITS_MIN)) != PTRS_PER_PGD - 1); - early_kfence_pool = arm64_kfence_alloc_pool(); + arm64_kfence_map_pool(); linear_map_requires_bbml2 = !force_pte_mapping() && can_set_direct_map(); @@ -1194,7 +1180,6 @@ static void __init map_mem(void) */ __map_memblock(kernel_start, kernel_end, PAGE_KERNEL, NO_CONT_MAPPINGS); memblock_clear_nomap(kernel_start, kernel_end - kernel_start); - arm64_kfence_map_pool(early_kfence_pool); } void mark_rodata_ro(void) -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel There are a few cases where we omit the contiguous hint for mappings that start out as read-write and are remapped read-only later, on the basis that manipulating live descriptors with the PTE_CONT attribute set is unsafe. When support for the contiguous hint was added to the code, the ARM ARM was ambiguous about this, and so we erred on the side of caution. In the meantime, this has been clarified [0], and regions that will be remapped in their entirety can use the contiguous hint both in the initial mapping as well as the one that replaces it. Note that this requires that the logic that may be called to remap overlapping regions respects existing valid descriptors that have the contiguous bit cleared. So omit the NO_CONT_MAPPINGS flag in places where it is unneeded. Thanks to Ryan for the reference. [0] RJQQTC For a TLB lookup in a contiguous region mapped by translation table entries that have consistent values for the Contiguous bit, but have the OA, attributes, or permissions misprogrammed, that TLB lookup is permitted to produce an OA, access permissions, and memory attributes that are consistent with any one of the programmed translation table values. Signed-off-by: Ard Biesheuvel --- arch/arm64/mm/mmu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 4eab40f4aa6f..5e2348b15783 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1000,8 +1000,7 @@ void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, - NO_CONT_MAPPINGS); + early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, 0); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, @@ -1028,8 +1027,7 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, return; } - early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, - NO_CONT_MAPPINGS); + early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, 0); /* flush the TLBs after updating live kernel mappings */ flush_tlb_kernel_range(virt, virt + size); @@ -1175,10 +1173,8 @@ static void __init map_mem(void) * alternative patching has completed). This makes the contents * of the region accessible to subsystems such as hibernate, * but protects it from inadvertent modification or execution. - * Note that contiguous mappings cannot be remapped in this way, - * so we should avoid them here. */ - __map_memblock(kernel_start, kernel_end, PAGE_KERNEL, NO_CONT_MAPPINGS); + __map_memblock(kernel_start, kernel_end, PAGE_KERNEL, 0); memblock_clear_nomap(kernel_start, kernel_end - kernel_start); } -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel Move the fixmap page tables out of the BSS section, and place them at the end of the image, right before the init_pg_dir section where some of the other statically allocated page tables live. These page tables are currently the only data objects in vmlinux that are meant to be accessed via the kernel image's linear alias, and so placing them together allows the remainder of the data/bss section to be remapped read-only or unmapped entirely. Signed-off-by: Ard Biesheuvel --- arch/arm64/kernel/vmlinux.lds.S | 5 +++++ arch/arm64/mm/fixmap.c | 7 ++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index e1ac876200a3..2dca18574619 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -353,6 +353,11 @@ SECTIONS __pi___bss_start = __bss_start; . = ALIGN(PAGE_SIZE); + .fixmap_pgdir : { + __fixmap_pgdir_start = .; + *(.fixmap_bss) + } + __pi_init_pg_dir = .; . += INIT_DIR_SIZE; __pi_init_pg_end = .; diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index c5c5425791da..b649ea1a46e4 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -31,9 +31,10 @@ static_assert(NR_BM_PMD_TABLES == 1); #define BM_PTE_TABLE_IDX(addr) __BM_TABLE_IDX(addr, PMD_SHIFT) -static pte_t bm_pte[NR_BM_PTE_TABLES][PTRS_PER_PTE] __page_aligned_bss; -static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused; -static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused; +#define __fixmap_bss __section(".fixmap_bss") __aligned(PAGE_SIZE) +static pte_t bm_pte[NR_BM_PTE_TABLES][PTRS_PER_PTE] __fixmap_bss; +static pmd_t bm_pmd[PTRS_PER_PMD] __fixmap_bss __maybe_unused; +static pud_t bm_pud[PTRS_PER_PUD] __fixmap_bss __maybe_unused; static inline pte_t *fixmap_pte(unsigned long addr) { -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel Now that the DRAM mapping routines respect existing table mappings and contiguous block and page mappings, it is no longer needed to fiddle with the memblock tables to set and clear the NOMAP attribute in order to omit text and rodata when creating the linear map. Instead, map the kernel text and rodata alias first with the desired attributes, so that they will not be remapped later with different attributes when mapping the memblocks. Signed-off-by: Ard Biesheuvel --- arch/arm64/mm/mmu.c | 24 +++++++------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 5e2348b15783..1a4b4337d29a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1148,12 +1148,15 @@ static void __init map_mem(void) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* - * Take care not to create a writable alias for the - * read-only text and rodata sections of the kernel image. - * So temporarily mark them as NOMAP to skip mappings in - * the following for-loop + * Map the linear alias of the [_text, __init_begin) interval + * as non-executable now, and remove the write permission in + * mark_linear_text_alias_ro() above (which will be called after + * alternative patching has completed). This makes the contents + * of the region accessible to subsystems such as hibernate, + * but protects it from inadvertent modification or execution. */ - memblock_mark_nomap(kernel_start, kernel_end - kernel_start); + __map_memblock(kernel_start, kernel_end, pgprot_tagged(PAGE_KERNEL), + flags); /* map all the memory banks */ for_each_mem_range(i, &start, &end) { @@ -1165,17 +1168,6 @@ static void __init map_mem(void) __map_memblock(start, end, pgprot_tagged(PAGE_KERNEL), flags); } - - /* - * Map the linear alias of the [_text, __init_begin) interval - * as non-executable now, and remove the write permission in - * mark_linear_text_alias_ro() below (which will be called after - * alternative patching has completed). This makes the contents - * of the region accessible to subsystems such as hibernate, - * but protects it from inadvertent modification or execution. - */ - __map_memblock(kernel_start, kernel_end, PAGE_KERNEL, 0); - memblock_clear_nomap(kernel_start, kernel_end - kernel_start); } void mark_rodata_ro(void) -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel On systems where the bootloader adheres to the original arm64 boot protocol, the placement of the kernel in the physical address space is highly predictable, and this makes the placement of its linear alias in the kernel virtual address space equally predictable, given the lack of randomization of the linear map. The linear aliases of the kernel text and rodata regions are already mapped read-only, but the kernel data and bss are mapped read-write in this region. This is not needed, so map them read-only as well. Note that the statically allocated kernel page tables do need to be modifiable via the linear map, so leave these mapped read-write. Signed-off-by: Ard Biesheuvel --- arch/arm64/include/asm/sections.h | 1 + arch/arm64/mm/mmu.c | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 51b0d594239e..32ec21af0823 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -23,6 +23,7 @@ extern char __irqentry_text_start[], __irqentry_text_end[]; extern char __mmuoff_data_start[], __mmuoff_data_end[]; extern char __entry_tramp_text_start[], __entry_tramp_text_end[]; extern char __relocate_new_kernel_start[], __relocate_new_kernel_end[]; +extern char __fixmap_pgdir_start[]; static inline size_t entry_tramp_text_size(void) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1a4b4337d29a..9361b7efb848 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1122,7 +1122,9 @@ static void __init map_mem(void) { static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); phys_addr_t kernel_start = __pa_symbol(_text); - phys_addr_t kernel_end = __pa_symbol(__init_begin); + phys_addr_t init_begin = __pa_symbol(__init_begin); + phys_addr_t init_end = __pa_symbol(__init_end); + phys_addr_t kernel_end = __pa_symbol(__fixmap_pgdir_start); phys_addr_t start, end; int flags = NO_EXEC_MAPPINGS; u64 i; @@ -1155,7 +1157,11 @@ static void __init map_mem(void) * of the region accessible to subsystems such as hibernate, * but protects it from inadvertent modification or execution. */ - __map_memblock(kernel_start, kernel_end, pgprot_tagged(PAGE_KERNEL), + __map_memblock(kernel_start, init_begin, pgprot_tagged(PAGE_KERNEL), + flags); + + /* Map the kernel data/bss so it can be remapped later */ + __map_memblock(init_end, kernel_end, pgprot_tagged(PAGE_KERNEL), flags); /* map all the memory banks */ @@ -1168,6 +1174,12 @@ static void __init map_mem(void) __map_memblock(start, end, pgprot_tagged(PAGE_KERNEL), flags); } + + /* Map the kernel data/bss read-only in the linear map */ + __map_memblock(init_end, kernel_end, pgprot_tagged(PAGE_KERNEL_RO), + flags); + flush_tlb_kernel_range((unsigned long)lm_alias(__init_end), + (unsigned long)lm_alias(__fixmap_pgdir_start)); } void mark_rodata_ro(void) -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel The linear aliases of the kernel text and rodata are mapped read-only in the linear map as well. Given that the contents of these regions are mostly identical to the version in the loadable image, mapping them read-only and leaving their contents visible is a reasonable hardening measure. Data and bss, however, are now also mapped read-only but the contents of these regions are more likely to contain data that we'd rather not leak. So let's unmap these entirely in the linear map when the kernel is running normally. When going into hibernation or waking up from it, these regions need to be mapped, so map the region initially, and toggle the valid bit so map/unmap the region as needed. Signed-off-by: Ard Biesheuvel --- arch/arm64/mm/mmu.c | 44 ++++++++++++++++---- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9361b7efb848..a464f3d2d2df 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1040,6 +1041,31 @@ static void __init __map_memblock(phys_addr_t start, phys_addr_t end, end - start, prot, early_pgtable_alloc, flags); } +static void remap_linear_data_alias(bool unmap) +{ + set_memory_valid((unsigned long)lm_alias(__init_end), + (unsigned long)(__fixmap_pgdir_start - __init_end) / PAGE_SIZE, + !unmap); +} + +static int arm64_hibernate_pm_notify(struct notifier_block *nb, + unsigned long mode, void *unused) +{ + switch (mode) { + default: + break; + case PM_POST_HIBERNATION: + case PM_POST_RESTORE: + remap_linear_data_alias(true); + break; + case PM_HIBERNATION_PREPARE: + case PM_RESTORE_PREPARE: + remap_linear_data_alias(false); + break; + } + return 0; +} + void __init mark_linear_text_alias_ro(void) { /* @@ -1048,6 +1074,16 @@ void __init mark_linear_text_alias_ro(void) update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), (unsigned long)__init_begin - (unsigned long)_text, pgprot_tagged(PAGE_KERNEL_RO)); + + remap_linear_data_alias(true); + + if (IS_ENABLED(CONFIG_HIBERNATION)) { + static struct notifier_block nb = { + .notifier_call = arm64_hibernate_pm_notify + }; + + register_pm_notifier(&nb); + } } #ifdef CONFIG_KFENCE @@ -1162,7 +1198,7 @@ static void __init map_mem(void) /* Map the kernel data/bss so it can be remapped later */ __map_memblock(init_end, kernel_end, pgprot_tagged(PAGE_KERNEL), - flags); + flags | NO_BLOCK_MAPPINGS); /* map all the memory banks */ for_each_mem_range(i, &start, &end) { @@ -1174,12 +1210,6 @@ static void __init map_mem(void) __map_memblock(start, end, pgprot_tagged(PAGE_KERNEL), flags); } - - /* Map the kernel data/bss read-only in the linear map */ - __map_memblock(init_end, kernel_end, pgprot_tagged(PAGE_KERNEL_RO), - flags); - flush_tlb_kernel_range((unsigned long)lm_alias(__init_end), - (unsigned long)lm_alias(__fixmap_pgdir_start)); } void mark_rodata_ro(void) -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel Before moving the fixmap PUD/PMD tables into .rodata, update the existing descriptor manipulation code so it will fallback to the fixmap for any descriptor located in the .pgdir_rodata section. This is slightly more costly, as it evaluates whether or not a descriptor is in the kernel's rodata region at levels PMD and higher for any configuration, rather than only when the level in question is the root level. Signed-off-by: Ard Biesheuvel --- arch/arm64/include/asm/pgtable.h | 27 ++++++++++---------- arch/arm64/kernel/vmlinux.lds.S | 8 ++++-- arch/arm64/mm/mmu.c | 24 ++++++++--------- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index a1c5894332d9..94235dd428be 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -816,23 +816,22 @@ extern pgd_t swapper_pg_dir[]; extern pgd_t idmap_pg_dir[]; extern pgd_t tramp_pg_dir[]; extern pgd_t reserved_pg_dir[]; +extern pgd_t __pgdir_rodata_start[], __pgdir_rodata_end[]; -extern void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd); +extern void set_rodata_pte(pte_t *ptep, pte_t pte); -static inline bool in_swapper_pgdir(void *addr) +static inline bool in_pgdir_rodata(void *addr) { - return ((unsigned long)addr & PAGE_MASK) == - ((unsigned long)swapper_pg_dir & PAGE_MASK); + return addr >= (void *)__pgdir_rodata_start && + addr < (void *)__pgdir_rodata_end; } static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { -#ifdef __PAGETABLE_PMD_FOLDED - if (in_swapper_pgdir(pmdp)) { - set_swapper_pgd((pgd_t *)pmdp, __pgd(pmd_val(pmd))); + if (in_pgdir_rodata(pmdp)) { + set_rodata_pte((pte_t *)pmdp, __pte(pmd_val(pmd))); return; } -#endif /* __PAGETABLE_PMD_FOLDED */ WRITE_ONCE(*pmdp, pmd); @@ -893,8 +892,8 @@ static inline bool pgtable_l4_enabled(void); static inline void set_pud(pud_t *pudp, pud_t pud) { - if (!pgtable_l4_enabled() && in_swapper_pgdir(pudp)) { - set_swapper_pgd((pgd_t *)pudp, __pgd(pud_val(pud))); + if (in_pgdir_rodata(pudp)) { + set_rodata_pte((pte_t *)pudp, __pte(pud_val(pud))); return; } @@ -974,8 +973,8 @@ static inline bool mm_pud_folded(const struct mm_struct *mm) static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { - if (in_swapper_pgdir(p4dp)) { - set_swapper_pgd((pgd_t *)p4dp, __pgd(p4d_val(p4d))); + if (in_pgdir_rodata(p4dp)) { + set_rodata_pte((pte_t *)p4dp, __pte(p4d_val(p4d))); return; } @@ -1102,8 +1101,8 @@ static inline bool mm_p4d_folded(const struct mm_struct *mm) static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) { - if (in_swapper_pgdir(pgdp)) { - set_swapper_pgd(pgdp, __pgd(pgd_val(pgd))); + if (in_pgdir_rodata(pgdp)) { + set_rodata_pte((pte_t *)pgdp, __pte(pgd_val(pgd))); return; } diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 2dca18574619..e5e1d0fd7f27 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -243,8 +243,12 @@ SECTIONS reserved_pg_dir = .; . += PAGE_SIZE; - swapper_pg_dir = .; - . += PAGE_SIZE; + .pgdir_rodata : { + __pgdir_rodata_start = .; + swapper_pg_dir = .; + . += PAGE_SIZE; + __pgdir_rodata_end = .; + } . = ALIGN(SEGMENT_ALIGN); __init_begin = .; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a464f3d2d2df..84d81bae07a7 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -65,34 +65,34 @@ static bool rodata_is_rw __ro_after_init = true; */ long __section(".mmuoff.data.write") __early_cpu_boot_status; -static DEFINE_SPINLOCK(swapper_pgdir_lock); +static DEFINE_SPINLOCK(rodata_pgdir_lock); static DEFINE_MUTEX(fixmap_lock); -void noinstr set_swapper_pgd(pgd_t *pgdp, pgd_t pgd) +void noinstr set_rodata_pte(pte_t *ptep, pte_t pte) { - pgd_t *fixmap_pgdp; + pte_t *fixmap_ptep; /* - * Don't bother with the fixmap if swapper_pg_dir is still mapped - * writable in the kernel mapping. + * Don't bother with the fixmap if rodata is still mapped + * writable in the kernel and linear mappings. */ if (rodata_is_rw) { - WRITE_ONCE(*pgdp, pgd); + WRITE_ONCE(*ptep, pte); dsb(ishst); isb(); return; } - spin_lock(&swapper_pgdir_lock); - fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp)); - WRITE_ONCE(*fixmap_pgdp, pgd); + spin_lock(&rodata_pgdir_lock); + fixmap_ptep = pte_set_fixmap(__pa_nodebug(ptep)); + WRITE_ONCE(*fixmap_ptep, pte); /* * We need dsb(ishst) here to ensure the page-table-walker sees * our new entry before set_p?d() returns. The fixmap's * flush_tlb_kernel_range() via clear_fixmap() does this for us. */ - pgd_clear_fixmap(); - spin_unlock(&swapper_pgdir_lock); + pte_clear_fixmap(); + spin_unlock(&rodata_pgdir_lock); } pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, @@ -1071,6 +1071,7 @@ void __init mark_linear_text_alias_ro(void) /* * Remove the write permissions from the linear alias of .text/.rodata */ + WRITE_ONCE(rodata_is_rw, false); update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), (unsigned long)__init_begin - (unsigned long)_text, pgprot_tagged(PAGE_KERNEL_RO)); @@ -1221,7 +1222,6 @@ void mark_rodata_ro(void) * to cover NOTES and EXCEPTION_TABLE. */ section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; - WRITE_ONCE(rodata_is_rw, false); update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); /* mark the range between _text and _stext as read only. */ -- 2.54.0.rc2.544.gc7ae2d5bb8-goog From: Ard Biesheuvel The fixmap page tables are statically allocated, and are currently mapped read-write both in the kernel mapping as well as its linear alias. Due to lack of randomization of the linear map, these tables will appear at a priori known offsets in the virtual address space when booting without physical randomization, which means that a single kernel write primitive is sufficient for an attacker to map memory of their own choosing with any permissions at a known virtual address in the kernel's address space. To harden against this, move the fixmap PUD and PMD tables to .pgdir_rodata, so that both their kernel mappings as well as their linear aliases are mapped read-only during ordinary execution. The PTE table needs to remain read-write accessible via the kernel mapping, but its linear alias can be remapped read-only as well. Signed-off-by: Ard Biesheuvel --- arch/arm64/include/asm/pgtable.h | 6 ++++-- arch/arm64/kernel/vmlinux.lds.S | 1 + arch/arm64/mm/fixmap.c | 5 +++-- arch/arm64/mm/mmu.c | 5 +++++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 94235dd428be..21afe923cd71 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -822,8 +822,10 @@ extern void set_rodata_pte(pte_t *ptep, pte_t pte); static inline bool in_pgdir_rodata(void *addr) { - return addr >= (void *)__pgdir_rodata_start && - addr < (void *)__pgdir_rodata_end; + phys_addr_t pa = __pa_nodebug(addr); + + return pa >= __pa_symbol_nodebug(__pgdir_rodata_start) && + pa < __pa_symbol_nodebug(__pgdir_rodata_end); } static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index e5e1d0fd7f27..9b346dd24d1c 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -247,6 +247,7 @@ SECTIONS __pgdir_rodata_start = .; swapper_pg_dir = .; . += PAGE_SIZE; + *(.fixmap_rodata) __pgdir_rodata_end = .; } diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index b649ea1a46e4..ad6d46e5c23e 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -32,9 +32,10 @@ static_assert(NR_BM_PMD_TABLES == 1); #define BM_PTE_TABLE_IDX(addr) __BM_TABLE_IDX(addr, PMD_SHIFT) #define __fixmap_bss __section(".fixmap_bss") __aligned(PAGE_SIZE) +#define __fixmap_rodata __section(".fixmap_rodata") __aligned(PAGE_SIZE) static pte_t bm_pte[NR_BM_PTE_TABLES][PTRS_PER_PTE] __fixmap_bss; -static pmd_t bm_pmd[PTRS_PER_PMD] __fixmap_bss __maybe_unused; -static pud_t bm_pud[PTRS_PER_PUD] __fixmap_bss __maybe_unused; +static pmd_t bm_pmd[PTRS_PER_PMD] __fixmap_rodata __maybe_unused; +static pud_t bm_pud[PTRS_PER_PUD] __fixmap_rodata __maybe_unused; static inline pte_t *fixmap_pte(unsigned long addr) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 84d81bae07a7..e76fe5b0c5fe 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1076,6 +1076,11 @@ void __init mark_linear_text_alias_ro(void) (unsigned long)__init_begin - (unsigned long)_text, pgprot_tagged(PAGE_KERNEL_RO)); + /* Map the fixmap PTE table at __fixmap_pgdir_start R/O in linear map too */ + update_mapping_prot(__pa_symbol(__fixmap_pgdir_start), + (unsigned long)lm_alias(__fixmap_pgdir_start), + PAGE_SIZE, pgprot_tagged(PAGE_KERNEL_RO)); + remap_linear_data_alias(true); if (IS_ENABLED(CONFIG_HIBERNATION)) { -- 2.54.0.rc2.544.gc7ae2d5bb8-goog