Add the swap hooks 'prepare_to_swap()' (before swap-out and page-free), 'swap_restore()' (after page-alloc and swap-in), and 'swap_invalidate()' for page and area. The first two hooks are the core of the new functionality. They store the (initial) allocation stack trace at swap-out and load it at swap-in, in order to 'maintain' the allocation stack trace over swap-out/swap-in. The refcounts for the initial allocation and the allocation at swap-in are adjusted/fixed-up (incremented and decremented, respectively), as the initial allocation is decremented at swap-out (page free) later, and the swap-in allocation is incremented at swap-in (page alloc) earlier. This is based on the swap hooks implementation for memory tags on arm64 ('arch/arm64/mm/mteswap.c'; thanks!) Signed-off-by: Mauricio Faria de Oliveira --- include/linux/page_owner.h | 49 +++++++++++++++ mm/page_owner.c | 120 +++++++++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+) diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 3328357f6dba..cd95aacceba7 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -75,4 +75,53 @@ static inline void dump_page_owner(const struct page *page) { } #endif /* CONFIG_PAGE_OWNER */ + +#ifdef CONFIG_SWAP_PAGE_OWNER +extern struct static_key_false swap_page_owner_inited; + +extern int __page_owner_prepare_to_swap(struct folio *folio); +extern void __page_owner_swap_restore(swp_entry_t entry, struct folio *folio); +extern void __page_owner_swap_invalidate_page(int type, pgoff_t offset); +extern void __page_owner_swap_invalidate_area(int type); + +static inline int page_owner_prepare_to_swap(struct folio *folio) +{ + if (static_branch_unlikely(&swap_page_owner_inited)) + return __page_owner_prepare_to_swap(folio); + + return 0; +} + +static inline void page_owner_swap_restore(swp_entry_t entry, struct folio *folio) +{ + if (static_branch_unlikely(&swap_page_owner_inited)) + return __page_owner_swap_restore(entry, 
folio); +} + +static inline void page_owner_swap_invalidate_page(int type, pgoff_t offset) +{ + if (static_branch_unlikely(&swap_page_owner_inited)) + return __page_owner_swap_invalidate_page(type, offset); +} + +static inline void page_owner_swap_invalidate_area(int type) +{ + if (static_branch_unlikely(&swap_page_owner_inited)) + return __page_owner_swap_invalidate_area(type); +} +#else +static inline int page_owner_prepare_to_swap(struct folio *folio) +{ + return 0; +} +static inline void page_owner_swap_restore(swp_entry_t entry, struct folio *folio) +{ +} +static inline void page_owner_swap_invalidate_page(int type, pgoff_t offset) +{ +} +static inline void page_owner_swap_invalidate_area(int type) +{ +} +#endif /* CONFIG_SWAP_PAGE_OWNER */ #endif /* __LINUX_PAGE_OWNER_H */ diff --git a/mm/page_owner.c b/mm/page_owner.c index 5cd7de1f8023..d256f58deca4 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -525,6 +525,126 @@ static void copy_from_swap_page_owner(struct page_owner *page_owner, page_owner->tgid = spo->tgid; strscpy(page_owner->comm, spo->comm, sizeof(page_owner->comm)); } + +/* Store the initial stack information from page_owner to xarray. */ +int __page_owner_prepare_to_swap(struct folio *folio) +{ + struct page_ext_iter iter; + struct page_ext *page_ext; + struct page_owner *page_owner; + struct swap_page_owner *spo; + depot_stack_handle_t handle = 0; + swp_entry_t entry; + long i = 0, nr_pages = folio_nr_pages(folio); + int err; + + rcu_read_lock(); + for_each_page_ext(&folio->page, nr_pages, page_ext, iter) { + spo = alloc_swap_page_owner(); + if (!spo) { + err = -ENOMEM; + goto out_locked; + } + + page_owner = get_page_owner(page_ext); + copy_to_swap_page_owner(spo, page_owner); + entry = page_swap_entry(folio_page(folio, i)); + err = store_swap_page_owner(spo, entry); + if (err) + goto out_locked; + + if (!handle) + handle = page_owner->handle; + i++; + } + rcu_read_unlock(); + + /* + * Fix-up: increment refcount of the initial allocation. 
+ * It will be decremented by page-free at swap-out. + */ + inc_stack_record_count(handle, GFP_KERNEL, nr_pages); + + return 0; + +out_locked: + for_each_page_ext(&folio->page, nr_pages, page_ext, iter) { + if (!i--) + break; + + entry = page_swap_entry(folio_page(folio, i)); + erase_swap_page_owner(entry, true); + + page_owner = get_page_owner(page_ext); + + } + rcu_read_unlock(); + return err; +} + +/* Load the initial stack information from xarray to page_owner. */ +void __page_owner_swap_restore(swp_entry_t entry, struct folio *folio) +{ + struct page_ext_iter iter; + struct page_ext *page_ext; + struct page_owner *page_owner; + struct swap_page_owner *spo; + depot_stack_handle_t handle = 0; + long i = 0, nr_pages = folio_nr_pages(folio); + + rcu_read_lock(); + for_each_page_ext(&folio->page, nr_pages, page_ext, iter) { + spo = (struct swap_page_owner *) load_swap_page_owner(entry); + if (!spo) { + rcu_read_unlock(); + return; + } + + page_owner = get_page_owner(page_ext); + copy_from_swap_page_owner(page_owner, spo); + + if (!handle) + handle = page_owner->handle; + i++; + entry.val++; + } + rcu_read_unlock(); + + /* + * Fix-up: decrement refcount of the swap-in allocation. + * It was incremented by the page-alloc at swap-in. + * (early_handle: see __reset_page_owner().) + * + * FIXME(mfo): 'dec_stack_record_count: refcount went to 0 ...' + * with stack_depot oops is hit occasionally on tests or shutdown. 
+ */ + if (handle != early_handle) + dec_stack_record_count(handle, nr_pages); +} + +void __page_owner_swap_invalidate_page(int type, pgoff_t offset) +{ + swp_entry_t entry = swp_entry(type, offset); + + erase_swap_page_owner(entry, true); +} + +void __page_owner_swap_invalidate_area(int type) +{ + swp_entry_t first_entry = swp_entry(type, 0); + swp_entry_t last_entry = swp_entry(type + 1, 0); + swp_entry_t entry; + void *spo; + + XA_STATE(xa_state, &swap_page_owners, first_entry.val); + + xa_lock(&swap_page_owners); + xas_for_each(&xa_state, spo, last_entry.val - 1) { + entry.val = xa_state.xa_index; + erase_swap_page_owner(entry, false); + } + xa_unlock(&swap_page_owners); +} #endif void pagetypeinfo_showmixedcount_print(struct seq_file *m, -- 2.51.0