Page tables do not use the reference count. That means we can avoid two
atomic operations (one on alloc, one on free) by allocating frozen pages
here. This does not interfere with compaction, as page tables are
non-movable allocations.

pagetable_alloc() and pagetable_free() need to move out of line to make
this work, as alloc_frozen_pages() and free_frozen_pages() are not
available outside mm for now. We'll want them out of line anyway soon.

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/mm.h | 32 ++------------------------------
 mm/memory.c        | 31 +++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 30 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d16b33bacc32..ec9365375d9c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2965,37 +2965,9 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt)
         return test_bit(PT_reserved, &pt->pt_flags.f);
 }
 
-/**
- * pagetable_alloc - Allocate pagetables
- * @gfp: GFP flags
- * @order: desired pagetable order
- *
- * pagetable_alloc allocates memory for page tables as well as a page table
- * descriptor to describe that memory.
- *
- * Return: The ptdesc describing the allocated page tables.
- */
-static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
-{
-        struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);
-
-        return page_ptdesc(page);
-}
+struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order);
 #define pagetable_alloc(...)	alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
-
-/**
- * pagetable_free - Free pagetables
- * @pt: The page table descriptor
- *
- * pagetable_free frees the memory of all page tables described by a page
- * table descriptor and the memory for the descriptor itself.
- */
-static inline void pagetable_free(struct ptdesc *pt)
-{
-        struct page *page = ptdesc_page(pt);
-
-        __free_pages(page, compound_order(page));
-}
+void pagetable_free(struct ptdesc *pt);
 
 #if defined(CONFIG_SPLIT_PTE_PTLOCKS)
 #if ALLOC_SPLIT_PTLOCKS
diff --git a/mm/memory.c b/mm/memory.c
index 74b45e258323..de9f999ffcf6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -7267,6 +7267,37 @@ long copy_folio_from_user(struct folio *dst_folio,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
+/**
+ * pagetable_alloc - Allocate pagetables
+ * @gfp: GFP flags
+ * @order: desired pagetable order
+ *
+ * pagetable_alloc allocates memory for page tables as well as a page table
+ * descriptor to describe that memory.
+ *
+ * Return: The ptdesc describing the allocated page tables.
+ */
+struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
+{
+        struct page *page = alloc_frozen_pages_noprof(gfp | __GFP_COMP, order);
+
+        return page_ptdesc(page);
+}
+
+/**
+ * pagetable_free - Free pagetables
+ * @pt: The page table descriptor
+ *
+ * pagetable_free frees the memory of all page tables described by a page
+ * table descriptor and the memory for the descriptor itself.
+ */
+void pagetable_free(struct ptdesc *pt)
+{
+        struct page *page = ptdesc_page(pt);
+
+        free_frozen_pages(page, compound_order(page));
+}
+
 #if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS
 static struct kmem_cache *page_ptl_cachep;
-- 
2.47.2
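To make the saving concrete: a refcounted page gets a reference published on allocation and dropped on the final free, each touching an atomic, while a frozen page's refcount simply stays at zero because nothing else ever takes a reference to a page table page. Below is a rough userspace model of the difference (plain C, not kernel code; struct fake_page, page_alloc()/frozen_alloc() and friends are invented for illustration, and the kernel's real alloc/free paths place their atomics differently):

#include <stdatomic.h>
#include <stdlib.h>

struct fake_page {
	atomic_uint refcount;
	void *mem;
};

/* Refcounted allocation: one atomic on alloc... */
static struct fake_page *page_alloc(size_t size)
{
	struct fake_page *p = calloc(1, sizeof(*p));

	if (!p)
		return NULL;
	p->mem = malloc(size);
	atomic_store(&p->refcount, 1);
	return p;
}

/* ...and one atomic on free. */
static void page_free(struct fake_page *p)
{
	if (atomic_fetch_sub(&p->refcount, 1) == 1) {
		free(p->mem);
		free(p);
	}
}

/* "Frozen" allocation: the refcount is never touched at all. */
static struct fake_page *frozen_alloc(size_t size)
{
	struct fake_page *p = calloc(1, sizeof(*p));

	if (p)
		p->mem = malloc(size);
	return p;
}

static void frozen_free(struct fake_page *p)
{
	free(p->mem);
	free(p);
}

int main(void)
{
	page_free(page_alloc(4096));
	frozen_free(frozen_alloc(4096));
	return 0;
}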
Move the NR_PAGETABLE accounting from the constructor to the allocation
site. Some of the architecture code is a little complex to reason about,
but I think this is all correct (and slightly more efficient, as 'order'
is passed as an argument instead of having to be retrieved from struct
page again).

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/mm.h | 11 -----------
 mm/memory.c        | 11 ++++++++++-
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ec9365375d9c..ade37df5ea2b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3044,26 +3044,15 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
 static inline void ptlock_free(struct ptdesc *ptdesc) {}
 #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
 
-static inline unsigned long ptdesc_nr_pages(const struct ptdesc *ptdesc)
-{
-        return compound_nr(ptdesc_page(ptdesc));
-}
-
 static inline void __pagetable_ctor(struct ptdesc *ptdesc)
 {
-        pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags));
-
         __SetPageTable(ptdesc_page(ptdesc));
-        mod_node_page_state(pgdat, NR_PAGETABLE, ptdesc_nr_pages(ptdesc));
 }
 
 static inline void pagetable_dtor(struct ptdesc *ptdesc)
 {
-        pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags));
-
         ptlock_free(ptdesc);
         __ClearPageTable(ptdesc_page(ptdesc));
-        mod_node_page_state(pgdat, NR_PAGETABLE, -ptdesc_nr_pages(ptdesc));
 }
 
 static inline void pagetable_dtor_free(struct ptdesc *ptdesc)
diff --git a/mm/memory.c b/mm/memory.c
index de9f999ffcf6..033fc4e29232 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -7280,7 +7280,13 @@ long copy_folio_from_user(struct folio *dst_folio,
 struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
 {
         struct page *page = alloc_frozen_pages_noprof(gfp | __GFP_COMP, order);
+        pg_data_t *pgdat;
 
+        if (!page)
+                return NULL;
+
+        pgdat = NODE_DATA(page_to_nid(page));
+        mod_node_page_state(pgdat, NR_PAGETABLE, 1 << order);
         return page_ptdesc(page);
 }
 
@@ -7293,9 +7299,12 @@ struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
  */
 void pagetable_free(struct ptdesc *pt)
 {
+        pg_data_t *pgdat = NODE_DATA(memdesc_nid(pt->pt_flags));
         struct page *page = ptdesc_page(pt);
+        unsigned int order = compound_order(page);
 
-        free_frozen_pages(page, compound_order(page));
+        mod_node_page_state(pgdat, NR_PAGETABLE, -(1L << order));
+        free_frozen_pages(page, order);
 }
 
 #if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS
-- 
2.47.2
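As a sanity check on the arithmetic, here is a tiny standalone model of the per-node counter updates (plain C, not the kernel's mod_node_page_state()): an order-2 page table spans 1 << 2 = 4 pages, so the allocation side adds 4, the free side subtracts the same 4, and passing 'order' straight through avoids re-deriving it from the page.

#include <assert.h>

static long nr_pagetable;	/* stands in for one node's NR_PAGETABLE counter */

static void account_alloc(unsigned int order)
{
	nr_pagetable += 1L << order;
}

static void account_free(unsigned int order)
{
	nr_pagetable -= 1L << order;	/* mirrors -(1L << order) in the patch */
}

int main(void)
{
	account_alloc(2);		/* order-2 allocation covers 4 pages */
	assert(nr_pagetable == 4);
	account_free(2);
	assert(nr_pagetable == 0);
	return 0;
}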
Move the page type setting from the constructor to the allocation site.
Some of the architecture code is a little complex to reason about, but I
think this is all correct.

This makes __pagetable_ctor() empty, so remove it. While
pagetable_pud_ctor() and the higher-level constructors are now empty,
leave them alone as there may be a need for them to do something in
future.

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/mm.h | 11 -----------
 mm/memory.c        |  2 ++
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ade37df5ea2b..edcb7d75542f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3044,15 +3044,9 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
 static inline void ptlock_free(struct ptdesc *ptdesc) {}
 #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
 
-static inline void __pagetable_ctor(struct ptdesc *ptdesc)
-{
-        __SetPageTable(ptdesc_page(ptdesc));
-}
-
 static inline void pagetable_dtor(struct ptdesc *ptdesc)
 {
         ptlock_free(ptdesc);
-        __ClearPageTable(ptdesc_page(ptdesc));
 }
 
 static inline void pagetable_dtor_free(struct ptdesc *ptdesc)
@@ -3066,7 +3060,6 @@ static inline bool pagetable_pte_ctor(struct mm_struct *mm,
 {
         if (mm != &init_mm && !ptlock_init(ptdesc))
                 return false;
-        __pagetable_ctor(ptdesc);
         return true;
 }
 
@@ -3174,7 +3167,6 @@ static inline bool pagetable_pmd_ctor(struct mm_struct *mm,
         if (mm != &init_mm && !pmd_ptlock_init(ptdesc))
                 return false;
         ptdesc_pmd_pts_init(ptdesc);
-        __pagetable_ctor(ptdesc);
         return true;
 }
 
@@ -3199,17 +3191,14 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
 
 static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
 {
-        __pagetable_ctor(ptdesc);
 }
 
 static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc)
 {
-        __pagetable_ctor(ptdesc);
 }
 
 static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc)
 {
-        __pagetable_ctor(ptdesc);
 }
 
 extern void __init pagecache_init(void);
diff --git a/mm/memory.c b/mm/memory.c
index 033fc4e29232..47eb5834db23 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -7287,6 +7287,7 @@ struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
 
         pgdat = NODE_DATA(page_to_nid(page));
         mod_node_page_state(pgdat, NR_PAGETABLE, 1 << order);
+        __SetPageTable(page);
         return page_ptdesc(page);
 }
 
@@ -7304,6 +7305,7 @@ void pagetable_free(struct ptdesc *pt)
         unsigned int order = compound_order(page);
 
         mod_node_page_state(pgdat, NR_PAGETABLE, -(1L << order));
+        __ClearPageTable(page);
         free_frozen_pages(page, order);
 }
 
-- 
2.47.2
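After the three patches above, the division of labour is: pagetable_alloc() takes frozen pages, accounts NR_PAGETABLE and sets the PageTable type; the pagetable_*_ctor() helpers are left with only ptlock setup; and pagetable_dtor()/pagetable_free() undo those steps in reverse. A sketch of how an architecture's PTE allocation helper looks under that split, modelled on the generic __pte_alloc_one()/pte_free() pattern (the example_* names are illustrative, not any particular architecture's code):

static pgtable_t example_pte_alloc_one(struct mm_struct *mm)
{
	struct ptdesc *ptdesc;

	/* Allocation itself now sets PageTable and bumps NR_PAGETABLE. */
	ptdesc = pagetable_alloc(GFP_PGTABLE_USER, 0);
	if (!ptdesc)
		return NULL;

	/* The constructor only has the split ptlock left to set up. */
	if (!pagetable_pte_ctor(mm, ptdesc)) {
		pagetable_free(ptdesc);
		return NULL;
	}

	return ptdesc_page(ptdesc);
}

static void example_pte_free(struct mm_struct *mm, pgtable_t pte)
{
	struct ptdesc *ptdesc = page_ptdesc(pte);

	pagetable_dtor(ptdesc);		/* frees the ptlock, nothing else now */
	pagetable_free(ptdesc);		/* clears PageTable, un-accounts, frees pages */
}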
Use ptdesc->pt_list instead of page->lru. These are the same bits for
now, but will be different when ptdesc is allocated separately.

Signed-off-by: Matthew Wilcox (Oracle)
---
 mm/pgtable-generic.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 567e2d084071..f40bab9e6c46 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -164,13 +164,14 @@ pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                 pgtable_t pgtable)
 {
+        struct ptdesc *ptdesc = page_ptdesc(pgtable);
         assert_spin_locked(pmd_lockptr(mm, pmdp));
 
         /* FIFO */
         if (!pmd_huge_pte(mm, pmdp))
-                INIT_LIST_HEAD(&pgtable->lru);
+                INIT_LIST_HEAD(&ptdesc->pt_list);
         else
-                list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
+                list_add(&ptdesc->pt_list, &page_ptdesc(pmd_huge_pte(mm, pmdp))->pt_list);
         pmd_huge_pte(mm, pmdp) = pgtable;
 }
 #endif
@@ -179,17 +180,22 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 /* no "address" argument so destroys page coloring of some arch */
 pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 {
-        pgtable_t pgtable;
+        struct ptdesc *ptdesc, *next;
+        struct page *page;
 
         assert_spin_locked(pmd_lockptr(mm, pmdp));
 
         /* FIFO */
-        pgtable = pmd_huge_pte(mm, pmdp);
-        pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru,
-                                                          struct page, lru);
-        if (pmd_huge_pte(mm, pmdp))
-                list_del(&pgtable->lru);
-        return pgtable;
+        page = pmd_huge_pte(mm, pmdp);
+        ptdesc = page_ptdesc(page);
+        next = list_first_entry_or_null(&ptdesc->pt_list, struct ptdesc, pt_list);
+        if (next) {
+                pmd_huge_pte(mm, pmdp) = ptdesc_page(next);
+                list_del(&ptdesc->pt_list);
+        } else {
+                pmd_huge_pte(mm, pmdp) = NULL;
+        }
+        return page;
 }
 #endif
-- 
2.47.2
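The deposit/withdraw list manipulation above is easier to follow in isolation. Below is a standalone userspace model of the same mechanics (a minimal list implementation stands in for <linux/list.h>, and deposit()/withdraw()/pt_entry() are invented names): the first deposited table's pt_list acts as the ring anchor, later deposits are linked behind the currently recorded table, and withdraw hands back the recorded table and promotes the next entry, or clears the slot when the ring is down to a single entry.

#include <assert.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{
	new->prev = head;
	new->next = head->next;
	head->next->prev = new;
	head->next = new;
}

static void list_del(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
}

struct ptdesc { int id; struct list_head pt_list; };

#define pt_entry(ptr) \
	((struct ptdesc *)((char *)(ptr) - offsetof(struct ptdesc, pt_list)))

static struct ptdesc *deposited;	/* stands in for pmd_huge_pte(mm, pmdp) */

static void deposit(struct ptdesc *pt)
{
	if (!deposited)
		INIT_LIST_HEAD(&pt->pt_list);	/* first table's pt_list anchors the ring */
	else
		list_add(&pt->pt_list, &deposited->pt_list);
	deposited = pt;
}

static struct ptdesc *withdraw(void)
{
	struct ptdesc *pt = deposited;

	if (pt->pt_list.next == &pt->pt_list)	/* list_first_entry_or_null() == NULL */
		deposited = NULL;
	else {
		deposited = pt_entry(pt->pt_list.next);
		list_del(&pt->pt_list);
	}
	return pt;
}

int main(void)
{
	struct ptdesc a = { .id = 1 }, b = { .id = 2 };
	struct ptdesc *first, *second;

	deposit(&a);
	deposit(&b);
	first = withdraw();
	second = withdraw();
	/* both tables come back exactly once and the slot ends up empty */
	assert(first != second);
	assert((first == &a || first == &b) && (second == &a || second == &b));
	assert(deposited == NULL);
	return 0;
}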
When separately allocating ptdesc from struct page, calling
preallocate_vmalloc_pages() from mem_init() is too early, as the slab
allocator hasn't been set up yet. Move preallocate_vmalloc_pages() to
vmalloc_init(), which is called after the slab allocator has been set up.

Honestly, this patch is a bit bobbins and I'm sure it'll be reworked
before it goes upstream.

Signed-off-by: Matthew Wilcox (Oracle)
---
 arch/x86/mm/init_64.c |  4 +---
 include/linux/mm.h    | 33 +++++++++++++++++++++++++++++++--
 mm/vmalloc.c          |  2 ++
 3 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0e4270e20fad..5270fc24f6f6 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1322,7 +1322,7 @@ static void __init register_page_bootmem_info(void)
  * Only the level which needs to be synchronized between all page-tables is
  * allocated because the synchronization can be expensive.
  */
-static void __init preallocate_vmalloc_pages(void)
+void __init preallocate_vmalloc_pages(void)
 {
         unsigned long addr;
         const char *lvl;
@@ -1390,8 +1390,6 @@ void __init mem_init(void)
         /* Register memory areas for /proc/kcore */
         if (get_gate_vma(&init_mm))
                 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
-
-        preallocate_vmalloc_pages();
 }
 
 int kernel_set_to_readonly;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index edcb7d75542f..e60b181da3df 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1160,6 +1160,12 @@ static inline int is_vmalloc_or_module_addr(const void *x)
 }
 #endif
 
+#ifdef CONFIG_X86
+void __init preallocate_vmalloc_pages(void);
+#else
+static inline void preallocate_vmalloc_pages(void) { }
+#endif
+
 /*
  * How many times the entire folio is mapped as a single unit (eg by a
  * PMD or PUD entry). This is probably not what you want, except for
@@ -2939,9 +2945,32 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU */
 
+static inline struct page *ptdesc_page(const struct ptdesc *pt)
+{
+        return pt->pt_page;
+}
+
+static inline struct ptdesc *page_ptdesc(const struct page *page)
+{
+        memdesc_t memdesc = READ_ONCE(page->memdesc);
+
+        if (memdesc_type(memdesc) != MEMDESC_TYPE_PAGE_TABLE) {
+                printk(KERN_EMERG "memdesc %lx index %lx\n", memdesc.v, page->__folio_index);
+                VM_BUG_ON_PAGE(1, page);
+                return NULL;
+        }
+        return (void *)(memdesc.v - MEMDESC_TYPE_PAGE_TABLE);
+}
+
+/**
+ * enum pt_flags - How the ptdesc flags bits are used.
+ * @PT_reserved: Used by PowerPC
+ *
+ * The pt flags are stored in a memdesc_flags_t.
+ * The high bits are used for information like zone/node/section.
+ */
 enum pt_flags {
         PT_reserved = PG_reserved,
-        /* High bits are used for zone/node/section */
 };
 
 static inline struct ptdesc *virt_to_ptdesc(const void *x)
@@ -2957,7 +2986,7 @@ static inline struct ptdesc *virt_to_ptdesc(const void *x)
  */
 static inline void *ptdesc_address(const struct ptdesc *pt)
 {
-        return folio_address(ptdesc_folio(pt));
+        return page_address(pt->pt_page);
 }
 
 static inline bool pagetable_is_reserved(struct ptdesc *pt)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 798b2ed21e46..9b349051a83a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -5264,6 +5264,8 @@ void __init vmalloc_init(void)
         struct vm_struct *tmp;
         int i;
 
+        preallocate_vmalloc_pages();
+
         /*
          * Create the cache for vmap_area objects.
          */
-- 
2.47.2
Convert the alloc_frozen_pages implementations into alloc_pages_memdesc
and add wrappers to keep the frozen-pages users working.

This hasn't been widely tested; I bet the build bots will find something
I missed.

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/gfp.h      | 13 +++++++++++++
 include/linux/mm_types.h | 25 +++++++++++++++++++++++++
 mm/internal.h            | 13 ++++++++++---
 mm/mempolicy.c           | 28 ++++++++++++++++------------
 mm/page_alloc.c          | 12 +++++++++---
 5 files changed, 73 insertions(+), 18 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0ceb4e09306c..6e13e0b829f8 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -225,6 +225,10 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_n
                 nodemask_t *nodemask);
 #define __alloc_pages(...)		alloc_hooks(__alloc_pages_noprof(__VA_ARGS__))
 
+struct page *__alloc_pages_memdesc_noprof(gfp_t gfp, unsigned int order,
+                memdesc_t memdesc, int preferred_nid, nodemask_t *nodemask);
+#define __alloc_pages_memdesc(...)	alloc_hooks(__alloc_pages_memdesc_noprof(__VA_ARGS__))
+
 struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
                 nodemask_t *nodemask);
 #define __folio_alloc(...)		alloc_hooks(__folio_alloc_noprof(__VA_ARGS__))
@@ -315,6 +319,8 @@ static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask,
 
 #ifdef CONFIG_NUMA
 struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order);
+struct page *alloc_pages_memdesc_noprof(gfp_t gfp, unsigned int order,
+                memdesc_t memdesc);
 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order);
 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
                 struct mempolicy *mpol, pgoff_t ilx, int nid);
@@ -325,6 +331,12 @@ static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order
 {
         return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order);
 }
+static inline struct page *alloc_pages_memdesc_noprof(gfp_t gfp,
+                unsigned int order, memdesc_t memdesc)
+{
+        return __alloc_pages_memdesc_noprof(gfp, order, memdesc,
+                        numa_node_id(), NULL);
+}
 static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
 {
         return __folio_alloc_node_noprof(gfp, order, numa_node_id());
@@ -339,6 +351,7 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde
 #endif
 
 #define alloc_pages(...)			alloc_hooks(alloc_pages_noprof(__VA_ARGS__))
+#define alloc_pages_memdesc(...)		alloc_hooks(alloc_pages_memdesc_noprof(__VA_ARGS__))
 #define folio_alloc(...)			alloc_hooks(folio_alloc_noprof(__VA_ARGS__))
 #define folio_alloc_mpol(...)			alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__))
 #define vma_alloc_folio(...)			alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__))
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 90e5790c318f..f5d9e0afe0fa 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -38,6 +38,30 @@ typedef struct {
         unsigned long f;
 } memdesc_flags_t;
 
+/**
+ * typedef memdesc_t - A typed memory descriptor.
+ *
+ * The bottom few bits of this encoded pointer determine the type
+ * of the memdesc.
+ */
+typedef struct {
+        unsigned long v;
+} memdesc_t;
+
+#define MEMDESC_TYPE_PAGE_TABLE	15
+
+static inline memdesc_t memdesc_create(void *p, unsigned long type)
+{
+        VM_BUG_ON((unsigned long)p & 15);
+        VM_BUG_ON(type > 15);
+        return (memdesc_t) { .v = type | (unsigned long)p };
+}
+
+static inline unsigned long memdesc_type(memdesc_t memdesc)
+{
+        return memdesc.v & 15;
+}
+
 /*
  * Each physical page in the system has a struct page associated with
  * it to keep track of whatever it is we are using the page for at the
@@ -126,6 +150,7 @@ struct page {
                 };
                 struct {	/* Tail pages of compound page */
                         unsigned long compound_head;	/* Bit zero is set */
+                        memdesc_t memdesc;	/* All pages, not just tail */
                 };
                 struct {	/* ZONE_DEVICE pages */
                         /*
diff --git a/mm/internal.h b/mm/internal.h
index 1561fc2ff5b8..15d64601289b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -824,15 +824,22 @@ extern bool free_pages_prepare(struct page *page, unsigned int order);
 
 extern int user_min_free_kbytes;
 
-struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid,
-                nodemask_t *);
+static inline struct page *__alloc_frozen_pages_noprof(gfp_t gfp,
+                unsigned int order, int nid, nodemask_t *mask)
+{
+        return __alloc_pages_memdesc_noprof(gfp, order,
+                        memdesc_create(NULL, 0), nid, mask);
+}
 #define __alloc_frozen_pages(...) \
         alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__))
 void free_frozen_pages(struct page *page, unsigned int order);
 void free_unref_folios(struct folio_batch *fbatch);
 
 #ifdef CONFIG_NUMA
-struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order);
+static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order)
+{
+        return alloc_pages_memdesc_noprof(gfp, order, memdesc_create(NULL, 0));
+}
 #else
 static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order)
 {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eb83cff7db8c..866d6609a758 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2338,7 +2338,7 @@ bool mempolicy_in_oom_domain(struct task_struct *tsk,
 }
 
 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
-                int nid, nodemask_t *nodemask)
+                memdesc_t memdesc, int nid, nodemask_t *nodemask)
 {
         struct page *page;
         gfp_t preferred_gfp;
@@ -2351,9 +2351,11 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
          */
         preferred_gfp = gfp | __GFP_NOWARN;
         preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
-        page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
+        page = __alloc_pages_memdesc_noprof(preferred_gfp, order, memdesc,
+                        nid, nodemask);
         if (!page)
-                page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
+                page = __alloc_pages_memdesc_noprof(gfp, order, memdesc,
+                                nid, NULL);
 
         return page;
 }
@@ -2362,6 +2364,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
  * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
  * @gfp: GFP flags.
  * @order: Order of the page allocation.
+ * @memdesc: Memory descriptor.
  * @pol: Pointer to the NUMA mempolicy.
  * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
  * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
@@ -2369,7 +2372,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
  * Return: The page on success or NULL if allocation fails.
  */
 static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
-                struct mempolicy *pol, pgoff_t ilx, int nid)
+                memdesc_t memdesc, struct mempolicy *pol, pgoff_t ilx, int nid)
 {
         nodemask_t *nodemask;
         struct page *page;
@@ -2377,7 +2380,7 @@ static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
         nodemask = policy_nodemask(gfp, pol, ilx, &nid);
 
         if (pol->mode == MPOL_PREFERRED_MANY)
-                return alloc_pages_preferred_many(gfp, order, nid, nodemask);
+                return alloc_pages_preferred_many(gfp, order, memdesc, nid, nodemask);
 
         if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
             /* filter "hugepage" allocation, unless from alloc_pages() */
@@ -2399,9 +2402,9 @@ static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
                          * First, try to allocate THP only on local node, but
                          * don't reclaim unnecessarily, just compact.
                          */
-                        page = __alloc_frozen_pages_noprof(
+                        page = __alloc_pages_memdesc_noprof(
                                 gfp | __GFP_THISNODE | __GFP_NORETRY, order,
-                                nid, NULL);
+                                memdesc, nid, NULL);
                         if (page || !(gfp & __GFP_DIRECT_RECLAIM))
                                 return page;
                         /*
@@ -2413,7 +2416,7 @@ static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
                 }
         }
 
-        page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
+        page = __alloc_pages_memdesc_noprof(gfp, order, memdesc, nid, nodemask);
 
         if (unlikely(pol->mode == MPOL_INTERLEAVE ||
                      pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
@@ -2432,8 +2435,8 @@ static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
                 struct mempolicy *pol, pgoff_t ilx, int nid)
 {
-        struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
-                        ilx, nid);
+        struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order,
+                        memdesc_create(NULL, 0), pol, ilx, nid);
 
         if (!page)
                 return NULL;
@@ -2473,7 +2476,8 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
 }
 EXPORT_SYMBOL(vma_alloc_folio_noprof);
 
-struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
+struct page *alloc_pages_memdesc_noprof(gfp_t gfp, unsigned order,
+                memdesc_t memdesc)
 {
         struct mempolicy *pol = &default_policy;
 
@@ -2484,7 +2488,7 @@ struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
                 pol = get_task_policy(current);
 
-        return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
+        return alloc_pages_mpol(gfp, order, memdesc, pol, NO_INTERLEAVE_INDEX,
                         numa_node_id());
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 600d9e981c23..c1451ca0acc1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5144,8 +5144,8 @@ EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
-struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order,
-                int preferred_nid, nodemask_t *nodemask)
+struct page *__alloc_pages_memdesc_noprof(gfp_t gfp, unsigned int order,
+                memdesc_t memdesc, int preferred_nid, nodemask_t *nodemask)
 {
         struct page *page;
         unsigned int alloc_flags = ALLOC_WMARK_LOW;
@@ -5205,9 +5205,15 @@ struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order,
         trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
         kmsan_alloc_page(page, order, alloc_gfp);
 
+        if (page && memdesc.v) {
+                unsigned long i, max = 1UL << order;
+
+                for (i = 0; i < max; i++)
+                        (page + i)->memdesc = memdesc;
+        }
         return page;
 }
-EXPORT_SYMBOL(__alloc_frozen_pages_noprof);
+EXPORT_SYMBOL(__alloc_pages_memdesc_noprof);
 
 struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
                 int preferred_nid, nodemask_t *nodemask)
-- 
2.47.2
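The memdesc encoding introduced here is plain pointer tagging: a descriptor pointer with at least 16-byte alignment is OR'd with a 4-bit type, memdesc_type() masks the low bits back out, and page_ptdesc() (added earlier in the series) recovers the pointer by subtracting the type. A standalone userspace model of the round trip (the names mirror the patch, but this is not the kernel code; aligned_alloc() stands in for the 16-byte-aligned slab allocation):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct { unsigned long v; } memdesc_t;

#define MEMDESC_TYPE_PAGE_TABLE	15

/* Pack a 16-byte-aligned pointer and a 4-bit type into one word. */
static memdesc_t memdesc_create(void *p, unsigned long type)
{
	assert(((uintptr_t)p & 15) == 0);	/* alignment provides the tag bits */
	assert(type <= 15);
	return (memdesc_t){ .v = type | (uintptr_t)p };
}

static unsigned long memdesc_type(memdesc_t memdesc)
{
	return memdesc.v & 15;
}

/* Recover the descriptor pointer, as page_ptdesc() does in the patch. */
static void *memdesc_ptr(memdesc_t memdesc)
{
	return (void *)(memdesc.v - memdesc_type(memdesc));
}

int main(void)
{
	void *ptdesc = aligned_alloc(16, 64);
	memdesc_t md = memdesc_create(ptdesc, MEMDESC_TYPE_PAGE_TABLE);

	assert(memdesc_type(md) == MEMDESC_TYPE_PAGE_TABLE);
	assert(memdesc_ptr(md) == ptdesc);

	/* memdesc_create(NULL, 0) is the "no descriptor" case used by the
	 * frozen-pages wrappers: the whole word is simply zero. */
	assert(memdesc_create(NULL, 0).v == 0);

	free(ptdesc);
	return 0;
}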
Create a slab cache for ptdescs and point to the struct page from the
ptdesc. Remove all the padding from ptdesc that makes it line up with
struct page.

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/mm.h       |  1 +
 include/linux/mm_types.h | 50 ++++------------------------------------
 mm/internal.h            |  1 +
 mm/memory.c              | 35 ++++++++++++++++++++++++----
 mm/mm_init.c             |  1 +
 5 files changed, 37 insertions(+), 51 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e60b181da3df..e8bb52061b0c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2970,6 +2970,7 @@ static inline struct ptdesc *page_ptdesc(const struct page *page)
  * The high bits are used for information like zone/node/section.
  */
 enum pt_flags {
+        /* Bits 0-3 used for pt_order */
         PT_reserved = PG_reserved,
 };
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f5d9e0afe0fa..efdf29b8b478 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -548,38 +548,30 @@ FOLIO_MATCH(compound_head, _head_3);
 /**
  * struct ptdesc -    Memory descriptor for page tables.
  * @pt_flags:         enum pt_flags plus zone/node/section.
+ * @pt_page:          page allocated to store page table entries.
  * @pt_rcu_head:      For freeing page table pages.
  * @pt_list:          List of used page tables. Used for s390 gmap shadow pages
  *                    (which are not linked into the user page tables) and x86
  *                    pgds.
- * @_pt_pad_1:        Padding that aliases with page's compound head.
  * @pmd_huge_pte:     Protected by ptdesc->ptl, used for THPs.
- * @__page_mapping:   Aliases with page->mapping. Unused for page tables.
  * @pt_index:         Used for s390 gmap.
  * @pt_mm:            Used for x86 pgds.
  * @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
  * @pt_share_count:   Used for HugeTLB PMD page table share count.
- * @_pt_pad_2:        Padding to ensure proper alignment.
  * @ptl:              Lock for the page table.
- * @__page_type:      Same as page->page_type. Unused for page tables.
- * @__page_refcount:  Same as page refcount.
- * @pt_memcg_data:    Memcg data. Tracked for page tables here.
  *
 * This struct overlays struct page for now. Do not modify without a good
 * understanding of the issues.
 */
 struct ptdesc {
         memdesc_flags_t pt_flags;
+        struct page *pt_page;
 
         union {
                 struct rcu_head pt_rcu_head;
                 struct list_head pt_list;
-                struct {
-                        unsigned long _pt_pad_1;
-                        pgtable_t pmd_huge_pte;
-                };
+                pgtable_t pmd_huge_pte;
         };
-        unsigned long __page_mapping;
 
         union {
                 pgoff_t pt_index;
@@ -591,47 +583,13 @@ struct ptdesc {
         };
 
         union {
-                unsigned long _pt_pad_2;
 #if ALLOC_SPLIT_PTLOCKS
                 spinlock_t *ptl;
 #else
                 spinlock_t ptl;
 #endif
         };
-        unsigned int __page_type;
-        atomic_t __page_refcount;
-#ifdef CONFIG_MEMCG
-        unsigned long pt_memcg_data;
-#endif
-};
-
-#define TABLE_MATCH(pg, pt)						\
-        static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt))
-TABLE_MATCH(flags, pt_flags);
-TABLE_MATCH(compound_head, pt_list);
-TABLE_MATCH(compound_head, _pt_pad_1);
-TABLE_MATCH(mapping, __page_mapping);
-TABLE_MATCH(__folio_index, pt_index);
-TABLE_MATCH(rcu_head, pt_rcu_head);
-TABLE_MATCH(page_type, __page_type);
-TABLE_MATCH(_refcount, __page_refcount);
-#ifdef CONFIG_MEMCG
-TABLE_MATCH(memcg_data, pt_memcg_data);
-#endif
-#undef TABLE_MATCH
-static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
-
-#define ptdesc_page(pt)			(_Generic((pt),			\
-        const struct ptdesc *:		(const struct page *)(pt),	\
-        struct ptdesc *:		(struct page *)(pt)))
-
-#define ptdesc_folio(pt)		(_Generic((pt),			\
-        const struct ptdesc *:		(const struct folio *)(pt),	\
-        struct ptdesc *:		(struct folio *)(pt)))
-
-#define page_ptdesc(p)			(_Generic((p),			\
-        const struct page *:		(const struct ptdesc *)(p),	\
-        struct page *:			(struct ptdesc *)(p)))
+} __aligned(16);
 
 #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
 static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
diff --git a/mm/internal.h b/mm/internal.h
index 15d64601289b..d57487ba443d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -100,6 +100,7 @@ struct pagetable_move_control {
                 unlikely(__ret_warn_once);				\
 })
 
+void __init ptcache_init(void);
 void page_writeback_init(void);
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index 47eb5834db23..331582bec495 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -7267,10 +7267,17 @@ long copy_folio_from_user(struct folio *dst_folio,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
+static struct kmem_cache *ptcache;
+
+void __init ptcache_init(void)
+{
+        ptcache = KMEM_CACHE(ptdesc, 0);
+}
+
 /**
  * pagetable_alloc - Allocate pagetables
  * @gfp: GFP flags
- * @order: desired pagetable order
+ * @order: pagetable order
  *
  * pagetable_alloc allocates memory for page tables as well as a page table
  * descriptor to describe that memory.
@@ -7279,16 +7286,34 @@ long copy_folio_from_user(struct folio *dst_folio,
  */
 struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
 {
-        struct page *page = alloc_frozen_pages_noprof(gfp | __GFP_COMP, order);
+        struct page *page;
         pg_data_t *pgdat;
+        struct ptdesc *ptdesc;
+
+        BUG_ON(!ptcache);
 
-        if (!page)
+        ptdesc = kmem_cache_alloc(ptcache, gfp);
+        if (!ptdesc)
                 return NULL;
+        page = alloc_pages_memdesc(gfp, order,
+                        memdesc_create(ptdesc, MEMDESC_TYPE_PAGE_TABLE));
+        if (!page) {
+                kmem_cache_free(ptcache, ptdesc);
+                return NULL;
+        }
+
+        VM_BUG_ON_PAGE(memdesc_type(page->memdesc) != MEMDESC_TYPE_PAGE_TABLE, page);
 
         pgdat = NODE_DATA(page_to_nid(page));
         mod_node_page_state(pgdat, NR_PAGETABLE, 1 << order);
         __SetPageTable(page);
-        return page_ptdesc(page);
+        page->__folio_index = (unsigned long)ptdesc;
+
+        ptdesc->pt_flags = page->flags;
+        ptdesc->pt_flags.f |= order;
+        ptdesc->pt_page = page;
+
+        return ptdesc;
 }
 
 /**
@@ -7302,7 +7327,7 @@ void pagetable_free(struct ptdesc *pt)
 {
         pg_data_t *pgdat = NODE_DATA(memdesc_nid(pt->pt_flags));
         struct page *page = ptdesc_page(pt);
-        unsigned int order = compound_order(page);
+        unsigned int order = pt->pt_flags.f & 0xf;
 
         mod_node_page_state(pgdat, NR_PAGETABLE, -(1L << order));
         __ClearPageTable(page);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 3db2dea7db4c..dc6d2f81b692 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2712,6 +2712,7 @@ void __init mm_core_init(void)
          */
         page_ext_init_flatmem_late();
         kmemleak_init();
+        ptcache_init();
         ptlock_cache_init();
         pgtable_cache_init();
         debug_objects_mem_init();
-- 
2.47.2
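Two details of the final allocation scheme are worth seeing in isolation: the two-step allocate-descriptor-then-pages pattern with cleanup when the second step fails, and stashing the allocation order in the low four bits of the descriptor's flags so that free no longer needs compound_order(). A standalone userspace sketch of both (struct fake_ptdesc and the fake_pagetable_* names are illustrative, not the kernel code):

#include <assert.h>
#include <stdlib.h>

struct fake_ptdesc {
	unsigned long pt_flags;	/* bits 0-3 hold the order, as in the patch */
	void *pt_page;		/* backing memory for the page table entries */
};

static struct fake_ptdesc *fake_pagetable_alloc(unsigned int order)
{
	struct fake_ptdesc *pt = calloc(1, sizeof(*pt));

	if (!pt)
		return NULL;
	/* Second allocation: the actual page-table pages (4KiB << order). */
	pt->pt_page = malloc(4096UL << order);
	if (!pt->pt_page) {
		free(pt);		/* undo step one if step two fails */
		return NULL;
	}
	pt->pt_flags |= order;		/* remember the order for free */
	return pt;
}

static void fake_pagetable_free(struct fake_ptdesc *pt)
{
	unsigned int order = pt->pt_flags & 0xf;	/* no compound_order() needed */

	(void)order;	/* a real implementation would un-account 1 << order pages */
	free(pt->pt_page);
	free(pt);
}

int main(void)
{
	struct fake_ptdesc *pt = fake_pagetable_alloc(2);

	assert(pt && (pt->pt_flags & 0xf) == 2);
	fake_pagetable_free(pt);
	return 0;
}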