Page tables do not use the reference count. That means we can avoid two atomic operations (one on alloc, one on free) by allocating frozen pages here. This does not interfere with compaction as page tables are non-movable allocations. pagetable_alloc() and pagetable_free() need to move out of line to make this work as alloc_frozen_pages() and free_frozen_pages() are not exported outside the mm for now. We'll want them out of line anyway soon. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 53 +++++--------------------------------------- mm/memory.c | 34 ++++++++++++++++++++++++++++ mm/pgtable-generic.c | 3 ++- 3 files changed, 42 insertions(+), 48 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5087deecdd9c..e168ee23091e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2995,58 +2995,17 @@ static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc) */ static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc) { +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE return test_bit(PT_kernel, &ptdesc->pt_flags.f); +#else + return false; +#endif } -/** - * pagetable_alloc - Allocate pagetables - * @gfp: GFP flags - * @order: desired pagetable order - * - * pagetable_alloc allocates memory for page tables as well as a page table - * descriptor to describe that memory. - * - * Return: The ptdesc describing the allocated page tables. - */ -static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) -{ - struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); - - return page_ptdesc(page); -} +struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order); #define pagetable_alloc(...) 
alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) - -static inline void __pagetable_free(struct ptdesc *pt) -{ - struct page *page = ptdesc_page(pt); - - __free_pages(page, compound_order(page)); -} - -#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +void pagetable_free(struct ptdesc *pt); void pagetable_free_kernel(struct ptdesc *pt); -#else -static inline void pagetable_free_kernel(struct ptdesc *pt) -{ - __pagetable_free(pt); -} -#endif -/** - * pagetable_free - Free pagetables - * @pt: The page table descriptor - * - * pagetable_free frees the memory of all page tables described by a page - * table descriptor and the memory for the descriptor itself. - */ -static inline void pagetable_free(struct ptdesc *pt) -{ - if (ptdesc_test_kernel(pt)) { - ptdesc_clear_kernel(pt); - pagetable_free_kernel(pt); - } else { - __pagetable_free(pt); - } -} #if defined(CONFIG_SPLIT_PTE_PTLOCKS) #if ALLOC_SPLIT_PTLOCKS diff --git a/mm/memory.c b/mm/memory.c index 1c66ee83a7ab..781cd7f607f7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -7338,6 +7338,40 @@ long copy_folio_from_user(struct folio *dst_folio, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ +/** + * pagetable_alloc - Allocate pagetables + * @gfp: GFP flags + * @order: desired pagetable order + * + * pagetable_alloc allocates memory for page tables as well as a page table + * descriptor to describe that memory. + * + * Return: The ptdesc describing the allocated page tables. + */ +struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) +{ + struct page *page = alloc_frozen_pages_noprof(gfp | __GFP_COMP, order); + + return page_ptdesc(page); +} + +/** + * pagetable_free - Free pagetables + * @pt: The page table descriptor + * + * pagetable_free frees the memory of all page tables described by a page + * table descriptor and the memory for the descriptor itself. 
+ */ +void pagetable_free(struct ptdesc *pt) +{ + struct page *page = ptdesc_page(pt); + + if (ptdesc_test_kernel(pt)) + pagetable_free_kernel(pt); + else + free_frozen_pages(page, compound_order(page)); +} + #if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS static struct kmem_cache *page_ptl_cachep; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index d3aec7a9926a..597049e21ac1 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -434,11 +434,12 @@ static void kernel_pgtable_work_func(struct work_struct *work) iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL); list_for_each_entry_safe(pt, next, &page_list, pt_list) - __pagetable_free(pt); + pagetable_free(pt); } void pagetable_free_kernel(struct ptdesc *pt) { + ptdesc_clear_kernel(pt); spin_lock(&kernel_pgtable_work.lock); list_add(&pt->pt_list, &kernel_pgtable_work.list); spin_unlock(&kernel_pgtable_work.lock); -- 2.47.2 Move the accounting from the constructor to the allocation site. Some of the architecture code is a little complex to reason about, but I think this is all correct (and slightly more efficient due to having 'order' as an argument instead of having to retrieve it from struct page again). 
Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 11 ----------- mm/memory.c | 16 +++++++++++++--- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e168ee23091e..17f783c04c87 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3082,26 +3082,15 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ -static inline unsigned long ptdesc_nr_pages(const struct ptdesc *ptdesc) -{ - return compound_nr(ptdesc_page(ptdesc)); -} - static inline void __pagetable_ctor(struct ptdesc *ptdesc) { - pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); - __SetPageTable(ptdesc_page(ptdesc)); - mod_node_page_state(pgdat, NR_PAGETABLE, ptdesc_nr_pages(ptdesc)); } static inline void pagetable_dtor(struct ptdesc *ptdesc) { - pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); - ptlock_free(ptdesc); __ClearPageTable(ptdesc_page(ptdesc)); - mod_node_page_state(pgdat, NR_PAGETABLE, -ptdesc_nr_pages(ptdesc)); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) diff --git a/mm/memory.c b/mm/memory.c index 781cd7f607f7..35886fde189c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -7351,7 +7351,13 @@ long copy_folio_from_user(struct folio *dst_folio, struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) { struct page *page = alloc_frozen_pages_noprof(gfp | __GFP_COMP, order); + pg_data_t *pgdat; + if (!page) + return NULL; + + pgdat = NODE_DATA(page_to_nid(page)); + mod_node_page_state(pgdat, NR_PAGETABLE, 1 << order); return page_ptdesc(page); } @@ -7364,12 +7370,16 @@ struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) */ void pagetable_free(struct ptdesc *pt) { + pg_data_t *pgdat = NODE_DATA(memdesc_nid(pt->pt_flags)); struct page *page = ptdesc_page(pt); + unsigned int order = compound_order(page); - if (ptdesc_test_kernel(pt)) + 
if (ptdesc_test_kernel(pt)) { pagetable_free_kernel(pt); - else - free_frozen_pages(page, compound_order(page)); + return; + } + mod_node_page_state(pgdat, NR_PAGETABLE, -(1L << order)); + free_frozen_pages(page, order); } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS -- 2.47.2 Move the page type setting from the constructor to the allocation site. Some of the architecture code is a little complex to reason about, but I think this is all correct. This makes __pagetable_ctor() empty, so remove it. While pagetable_pud_ctor() and higher levels are now empty, leave them alone as there may be call to have them do something in future. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 11 ----------- mm/memory.c | 2 ++ 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 17f783c04c87..3111344b8d05 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3082,15 +3082,9 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ -static inline void __pagetable_ctor(struct ptdesc *ptdesc) -{ - __SetPageTable(ptdesc_page(ptdesc)); -} - static inline void pagetable_dtor(struct ptdesc *ptdesc) { ptlock_free(ptdesc); - __ClearPageTable(ptdesc_page(ptdesc)); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) @@ -3104,7 +3098,6 @@ static inline bool pagetable_pte_ctor(struct mm_struct *mm, { if (mm != &init_mm && !ptlock_init(ptdesc)) return false; - __pagetable_ctor(ptdesc); return true; } @@ -3212,7 +3205,6 @@ static inline bool pagetable_pmd_ctor(struct mm_struct *mm, if (mm != &init_mm && !pmd_ptlock_init(ptdesc)) return false; ptdesc_pmd_pts_init(ptdesc); - __pagetable_ctor(ptdesc); return true; } @@ -3237,17 +3229,14 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) { - 
__pagetable_ctor(ptdesc); } static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { - __pagetable_ctor(ptdesc); } static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) { - __pagetable_ctor(ptdesc); } extern void __init pagecache_init(void); diff --git a/mm/memory.c b/mm/memory.c index 35886fde189c..54480b12eb8c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -7358,6 +7358,7 @@ struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) pgdat = NODE_DATA(page_to_nid(page)); mod_node_page_state(pgdat, NR_PAGETABLE, 1 << order); + __SetPageTable(page); return page_ptdesc(page); } @@ -7379,6 +7380,7 @@ void pagetable_free(struct ptdesc *pt) return; } mod_node_page_state(pgdat, NR_PAGETABLE, -(1L << order)); + __ClearPageTable(page); free_frozen_pages(page, order); } -- 2.47.2 Use ptdesc->pt_list instead of page->lru. These are the same bits for now, but will be different when ptdesc is allocated separately. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm_types.h | 1 + mm/pgtable-generic.c | 24 +++++++++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a0f4bd6099cc..5e08c4a41777 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -602,6 +602,7 @@ struct ptdesc { TABLE_MATCH(flags, pt_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); +TABLE_MATCH(lru, pt_list); TABLE_MATCH(mapping, __page_mapping); TABLE_MATCH(__folio_index, pt_index); TABLE_MATCH(rcu_head, pt_rcu_head); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 597049e21ac1..a3990c04b31e 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -166,13 +166,14 @@ pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { + struct ptdesc *ptdesc = page_ptdesc(pgtable); assert_spin_locked(pmd_lockptr(mm, pmdp)); /* 
FIFO */ if (!pmd_huge_pte(mm, pmdp)) - INIT_LIST_HEAD(&pgtable->lru); + INIT_LIST_HEAD(&ptdesc->pt_list); else - list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); + list_add(&ptdesc->pt_list, &page_ptdesc(pmd_huge_pte(mm, pmdp))->pt_list); pmd_huge_pte(mm, pmdp) = pgtable; } #endif @@ -181,17 +182,22 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, /* no "address" argument so destroys page coloring of some arch */ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { - pgtable_t pgtable; + struct ptdesc *ptdesc, *next; + struct page *page; assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ - pgtable = pmd_huge_pte(mm, pmdp); - pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru, - struct page, lru); - if (pmd_huge_pte(mm, pmdp)) - list_del(&pgtable->lru); - return pgtable; + page = pmd_huge_pte(mm, pmdp); + ptdesc = page_ptdesc(page); + next = list_first_entry_or_null(&ptdesc->pt_list, struct ptdesc, pt_list); + if (next) { + pmd_huge_pte(mm, pmdp) = ptdesc_page(next); + list_del(&ptdesc->pt_list); + } else { + pmd_huge_pte(mm, pmdp) = NULL; + } + return page; } #endif -- 2.47.2