Historically, several core x86 mm subsystems (vmemmap, vmalloc, and CPA) have abused `pfn_pte()` to generate PMD and PUD entries by passing pgprot values containing the _PAGE_PSE flag, and then casting the resulting pte_t to a pmd_t or pud_t. This violates strict type safety and prevents us from enforcing the rule that `pfn_pte()` should strictly generate pte without huge page attributes. Fix these abuses by explicitly using the correct level-specific helpers (`pfn_pmd()` and `pfn_pud()`) and their corresponding setters (`set_pmd()`, `set_pud()`). For the CPA (Change Page Attribute) code, which uses `pte_t` as a generic container for page table entries across all levels in __should_split_large_page(), pack the correctly generated PMD/PUD values into the pte_t container. This cleanup prepares the ground for making `pfn_pte()` strictly filter out huge page attributes. Signed-off-by: Yin Tirui --- arch/x86/mm/init_64.c | 6 +++--- arch/x86/mm/pat/set_memory.c | 6 +++++- arch/x86/mm/pgtable.c | 4 ++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index df2261fa4f98..d65f3d05c66f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1518,11 +1518,11 @@ static int __meminitdata node_start; void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next) { - pte_t entry; + pmd_t entry; - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + entry = pfn_pmd(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL_LARGE); - set_pmd(pmd, __pmd(pte_val(entry))); + set_pmd(pmd, entry); /* check to see if we have contiguous blocks */ if (p_end != p || node_start != node) { diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 40581a720fe8..87aa0e9a8f82 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1059,7 +1059,11 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, return 1; /* All checks passed. 
Update the large page mapping. */ - new_pte = pfn_pte(old_pfn, new_prot); + if (level == PG_LEVEL_2M) + new_pte = __pte(pmd_val(pfn_pmd(old_pfn, new_prot))); + else + new_pte = __pte(pud_val(pfn_pud(old_pfn, new_prot))); + __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; cpa_inc_lp_preserved(level); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 2e5ecfdce73c..61320fd44e16 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -644,7 +644,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) if (pud_present(*pud) && !pud_leaf(*pud)) return 0; - set_pte((pte_t *)pud, pfn_pte( + set_pud(pud, pfn_pud( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); @@ -676,7 +676,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) if (pmd_present(*pmd) && !pmd_leaf(*pmd)) return 0; - set_pte((pte_t *)pmd, pfn_pte( + set_pmd(pmd, pfn_pmd( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); -- 2.22.0 A fundamental principle of page table type safety is that `pte_t` represents the lowest level page table entry and should never carry huge page attributes. Currently, passing a pgprot with huge page bits (e.g., extracted via pmd_pgprot()) into pfn_pte() creates a malformed PTE that retains the huge attribute, leading to the necessity of the ugly `pte_clrhuge()` anti-pattern. Enforce type safety by making `pfn_pte()` inherently filter out huge page attributes: - On x86: Strip the `_PAGE_PSE` bit. - On ARM64: Mask out the block descriptor bits in `PTE_TYPE_MASK` and enforce the `PTE_TYPE_PAGE` format. - On RISC-V: No changes required, as RISC-V leaf PMDs and PTEs share the exact same hardware format and do not use a distinct huge bit. 
Signed-off-by: Yin Tirui --- arch/arm64/include/asm/pgtable.h | 4 +++- arch/x86/include/asm/pgtable.h | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b3e58735c49b..f2a7a40106d2 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -141,7 +141,9 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) #define pte_pfn(pte) (__pte_to_phys(pte) >> PAGE_SHIFT) #define pfn_pte(pfn,prot) \ - __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) + __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | \ + ((pgprot_val(prot) & ~(PTE_TYPE_MASK & ~PTE_VALID)) | \ + (PTE_TYPE_PAGE & ~PTE_VALID))) #define pte_none(pte) (!pte_val(pte)) #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1662c5a8f445..a4dbd81d42bf 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -738,6 +738,10 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot) static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + + /* Filter out _PAGE_PSE to ensure PTEs never carry the huge page bit */ + pgprot = __pgprot(pgprot_val(pgprot) & ~_PAGE_PSE); + /* This bit combination is used to mark shadow stacks */ WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == _PAGE_DIRTY); -- 2.22.0 With `pfn_pte()` now guaranteeing that it will natively filter out huge page attributes like `_PAGE_PSE`, the `pte_clrhuge()` helper has become obsolete. Remove `pte_clrhuge()` entirely. Concurrently, clean up the ugly type-casting anti-pattern in `arch/x86/mm/init_64.c` where `(pte_t *)` was forcibly cast from `pmd_t *` to call `pte_clrhuge()`. 
Now, we can simply extract the pgprot directly via `pmd_pgprot()` and safely pass it downstream, knowing that `pfn_pte()` will strip the huge bit automatically. Signed-off-by: Yin Tirui --- arch/x86/include/asm/pgtable.h | 5 ----- arch/x86/mm/init_64.c | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a4dbd81d42bf..e8564d4ce318 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -483,11 +483,6 @@ static inline pte_t pte_mkhuge(pte_t pte) return pte_set_flags(pte, _PAGE_PSE); } -static inline pte_t pte_clrhuge(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_PSE); -} - static inline pte_t pte_mkglobal(pte_t pte) { return pte_set_flags(pte, _PAGE_GLOBAL); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d65f3d05c66f..a1ddcf793a8a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -572,7 +572,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, paddr_last = paddr_next; continue; } - new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); + new_prot = pmd_pgprot(*pmd); } if (page_size_mask & (1<<PG_LEVEL_2M)) { [NOTE(review): the source appears to have stripped all text between '<' and '>' during extraction (the same artifact dropped every Signed-off-by email address). The span from this point up to the next patch's diffstat is lost — likely the remainder of this phys_pmd_init() hunk, the "-- 2.22.0" trailer, and the subject line plus changelog of the following mm/huge_memory.c patch. Recover from the original mail archive; the "(1<<PG_LEVEL_2M))" fragment above is restored from the upstream kernel source of phys_pmd_init().] --- mm/huge_memory.c | 36 ++++++++++++++++++++++++++++++++++-- mm/memory.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d4ca8cfd7f9d..e463d51005ee 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1857,6 +1857,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd = pmdp_get_lockless(src_pmd); if (unlikely(pmd_present(pmd) && pmd_special(pmd) && !is_huge_zero_pmd(pmd))) { + pgtable = pte_alloc_one(dst_mm); + if (unlikely(!pgtable)) + goto out; dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -1870,6 +1873,12 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * able to 
wrongly write to the backend MMIO. */ VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd)); + + /* dax won't reach here, it will be intercepted at vma_needs_copy() */ + VM_WARN_ON_ONCE(vma_is_dax(src_vma)); + + mm_inc_nr_ptes(dst_mm); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); goto set_pmd; } @@ -2360,6 +2369,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, arch_check_zapped_pmd(vma, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { + if (pmd_special(orig_pmd)) + zap_deposited_table(tlb->mm, pmd); if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); @@ -3005,14 +3016,35 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (!vma_is_anonymous(vma)) { old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); + + if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { + pte_t entry; + + if (!pmd_special(old_pmd)) { + zap_deposited_table(mm, pmd); + return; + } + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + if (unlikely(!pgtable)) + return; + pmd_populate(mm, &_pmd, pgtable); + pte = pte_offset_map(&_pmd, haddr); + entry = pfn_pte(pmd_pfn(old_pmd), pmd_pgprot(old_pmd)); + set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR); + pte_unmap(pte); + + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + return; + } + /* * We are going to unmap this huge page. 
So * just go ahead and zap it */ if (arch_needs_pgtable_deposit()) zap_deposited_table(mm, pmd); - if (!vma_is_dax(vma) && vma_is_special_huge(vma)) - return; + if (unlikely(pmd_is_migration_entry(old_pmd))) { const softleaf_t old_entry = softleaf_from_pmd(old_pmd); diff --git a/mm/memory.c b/mm/memory.c index 07778814b4a8..affccf38cbcf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2890,6 +2890,40 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, return err; } +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pgtable_t pgtable; + spinlock_t *ptl; + + if ((end - addr) != PMD_SIZE) + return 0; + + if (!IS_ALIGNED(addr, PMD_SIZE)) + return 0; + + if (!IS_ALIGNED(pfn, HPAGE_PMD_NR)) + return 0; + + if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) + return 0; + + pgtable = pte_alloc_one(mm); + if (unlikely(!pgtable)) + return 0; + + mm_inc_nr_ptes(mm); + ptl = pmd_lock(mm, pmd); + set_pmd_at(mm, addr, pmd, pmd_mkspecial(pmd_mkhuge(pfn_pmd(pfn, prot)))); + pgtable_trans_huge_deposit(mm, pmd, pgtable); + spin_unlock(ptl); + + return 1; +} +#endif + static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) @@ -2905,6 +2939,12 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP + if (remap_try_huge_pmd(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) { + continue; + } +#endif err = remap_pte_range(mm, pmd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) -- 2.22.0