Core functionality implementation for asynchronous release of swap
entries:

1. For eligible processes, swap entries are first asynchronously
   gathered onto a global list.

2. A batch release is triggered once a defined threshold of gathered
   entries is reached.

3. The asynchronous release is carried out by the kworkers of a
   dedicated workqueue; a max_active configuration macro is provided
   to control the number of concurrent work items and to address NUMA
   release efficiency issues.

Signed-off-by: Lei Liu
---
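[Note for reviewers, not intended for the changelog: below is a minimal
userspace sketch for watching the two observability points this patch
adds -- the /proc/aswap_free_counts file and the async_swap_count event
in /proc/vmstat. The file and counter names are taken from the patch
itself; the program is purely illustrative and assumes the patch is
applied on a kernel with swap in use.

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/* Print every line of @path that starts with @key (all lines
	 * when @key is NULL).
	 */
	static void dump(const char *path, const char *key)
	{
		char buf[256];
		FILE *f = fopen(path, "r");

		if (!f)
			return;
		while (fgets(buf, sizeof(buf), f))
			if (!key || !strncmp(buf, key, strlen(key)))
				fputs(buf, stdout);
		fclose(f);
	}

	int main(void)
	{
		for (;;) {
			dump("/proc/aswap_free_counts", NULL);
			dump("/proc/vmstat", "async_swap_count");
			sleep(1);
		}
	}

Run it while killing a process with a large swap footprint:
exiting_tasks should rise briefly and async_swap_count should advance
as the worker flushes gathered batches.]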
 include/linux/oom.h           |  23 ++++++
 include/linux/swapfile.h      |   1 +
 include/linux/vm_event_item.h |   1 +
 kernel/exit.c                 |   2 +
 mm/memcontrol.c               |   6 --
 mm/memory.c                   |   4 +-
 mm/swapfile.c                 | 136 ++++++++++++++++++++++++++++++++++
 mm/vmstat.c                   |   1 +
 8 files changed, 167 insertions(+), 7 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 1e0fc6931ce9..aa34429cc83b 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -56,6 +56,23 @@ struct oom_control {
 extern struct mutex oom_lock;
 extern struct mutex oom_adj_mutex;
 
+extern atomic_t exiting_task_count;	/* number of tasks currently exiting */
+
+static inline int get_exiting_task_count(void)
+{
+	return atomic_read(&exiting_task_count);
+}
+
+static inline void inc_exiting_task_count(void)
+{
+	atomic_inc(&exiting_task_count);
+}
+
+static inline void dec_exiting_task_count(void)
+{
+	atomic_dec(&exiting_task_count);
+}
+
 static inline void set_current_oom_origin(void)
 {
 	current->signal->oom_flag_origin = true;
@@ -76,6 +93,12 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
 	return tsk->signal->oom_mm;
 }
 
+static inline bool task_is_dying(void)
+{
+	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
+		(current->flags & PF_EXITING);
+}
+
 /*
  * Checks whether a page fault on the given mm is still reliable.
  * This is no longer true if the oom reaper started to reap the
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 99e3ed469e88..dc43464cd838 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -4,6 +4,7 @@
 extern unsigned long generic_max_swapfile_size(void);
 unsigned long arch_max_swapfile_size(void);
+int add_to_swap_gather_cache(struct mm_struct *mm, swp_entry_t entry, int nr);
 
 /* Maximum swapfile size supported for the arch (not inclusive). */
 extern unsigned long swapfile_maximum_size;
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 9e15a088ba38..05f33d26d459 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -186,6 +186,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		KSTACK_REST,
 #endif
 #endif /* CONFIG_DEBUG_STACK_USAGE */
+		ASYNC_SWAP_COUNTS,
 		NR_VM_EVENT_ITEMS
 };
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 343eb97543d5..c879fe32aa0e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -897,6 +897,7 @@ void __noreturn do_exit(long code)
 	WARN_ON(irqs_disabled());
 	WARN_ON(tsk->plug);
 
+	inc_exiting_task_count();
 	kcov_task_exit(tsk);
 	kmsan_task_exit(tsk);
 
@@ -1001,6 +1002,7 @@
 	exit_tasks_rcu_finish();
 
 	lockdep_free_task(tsk);
+	dec_exiting_task_count();
 	do_task_dead();
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8dd7fbed5a94..79bc4321cbb3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -102,12 +102,6 @@ static struct kmem_cache *memcg_pn_cachep;
 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
 #endif
 
-static inline bool task_is_dying(void)
-{
-	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
-		(current->flags & PF_EXITING);
-}
-
 /* Some nice accessors for the vmpressure. */
 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 {
diff --git a/mm/memory.c b/mm/memory.c
index 0ba4f6b71847..e09db2932b25 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -75,6 +75,7 @@
 #include
 #include
 #include
+#include <linux/swapfile.h>
 #include
 
@@ -1617,7 +1618,8 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
 			nr = swap_pte_batch(pte, max_nr, ptent);
 			rss[MM_SWAPENTS] -= nr;
-			free_swap_and_cache_nr(entry, nr);
+			if (add_to_swap_gather_cache(tlb->mm, entry, nr))
+				free_swap_and_cache_nr(entry, nr);
 		} else if (is_migration_entry(entry)) {
 			struct folio *folio = pfn_swap_entry_folio(entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b4f3cc712580..7c69e726b075 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -42,6 +42,10 @@
 #include
 #include
 #include
+#include <linux/oom.h>
+#include <linux/workqueue.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include
 #include
 
@@ -170,6 +174,138 @@ static long swap_usage_in_pages(struct swap_info_struct *si)
 /* Reclaim the swap entry if swap is getting full */
 #define TTRS_FULL		0x4
 
+/* Minimum number of exiting tasks, adjustable based on system load */
+#define MIN_EXITING_TASKS_THRESHOLD	1
+/*
+ * Max active work items for the async release workqueue.  Zero selects
+ * the workqueue default; it can also be tuned based on system load.
+ */
+#define NUM_ASYNC_SWAP_WORK_ITEMS	0
+
+static struct workqueue_struct *release_wq;
+static LIST_HEAD(swap_cache_list);
+static DEFINE_SPINLOCK(swap_cache_lock);
+static int cache_count;
+static int max_cache_entries = 32;
+static struct kmem_cache *swap_entry_cachep;
+atomic_t exiting_task_count = ATOMIC_INIT(0);
+
+/* A gathered batch of swap entries awaiting asynchronous release */
+struct swap_entry_cache {
+	swp_entry_t entry;
+	int nr;
+	struct list_head list;
+};
+
+static int async_swap_free_counts_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "exiting_tasks:%d cache_counts:%d\n",
+		   get_exiting_task_count(), cache_count);
+	return 0;
+}
+
+static void async_release_func(struct work_struct *work)
+{
+	struct swap_entry_cache *sec, *tmp;
+	unsigned int counts = 0;
+	LIST_HEAD(temp_list);
+
+	if (cache_count) {
+		spin_lock_irq(&swap_cache_lock);
+		list_splice_init(&swap_cache_list, &temp_list);
+		cache_count = 0;
+		spin_unlock_irq(&swap_cache_lock);
+	} else {
+		goto out;
+	}
+
+	list_for_each_entry_safe(sec, tmp, &temp_list, list) {
+		free_swap_and_cache_nr(sec->entry, sec->nr);
+		kmem_cache_free(swap_entry_cachep, sec);
+		counts++;
+	}
+	count_vm_events(ASYNC_SWAP_COUNTS, counts);
+out:
+	kfree(work);
+}
+
+static void flush_cache_if_needed(bool check_cache_count)
+{
+	struct work_struct *release_work;
+
+	if ((!check_cache_count && cache_count) ||
+	    cache_count >= max_cache_entries) {
+		release_work = kmalloc(sizeof(*release_work), GFP_ATOMIC);
+		if (release_work) {
+			INIT_WORK(release_work, async_release_func);
+			queue_work(release_wq, release_work);
+		}
+	}
+}
+
+/*
+ * add_to_swap_gather_cache - defer freeing of swap entries to a worker.
+ * @mm: mm_struct the entries are being zapped from.
+ * @entry: first swap entry of the batch.
+ * @nr: number of contiguous swap entries starting at @entry.
+ *
+ * Returns 0 on success, -1 when the entries do not qualify for deferred
+ * freeing (the caller must free them synchronously), or -ENOMEM on
+ * allocation failure.  Gathered batches sit on swap_cache_list until a
+ * worker flushes them with free_swap_and_cache_nr().
+ */
+int add_to_swap_gather_cache(struct mm_struct *mm, swp_entry_t entry, int nr)
+{
+	struct swap_entry_cache *sec;
+
+	if (!swap_entry_cachep || !mm ||
+	    get_exiting_task_count() < MIN_EXITING_TASKS_THRESHOLD)
+		return -1;
+
+	if (!task_is_dying() ||
+	    get_mm_counter(mm, MM_SWAPENTS) < (100 * SWAP_CLUSTER_MAX))
+		return -1;
+
+	sec = kmem_cache_alloc(swap_entry_cachep, GFP_ATOMIC);
+	if (!sec)
+		return -ENOMEM;
+
+	sec->entry = entry;
+	sec->nr = nr;
+	INIT_LIST_HEAD(&sec->list);
+
+	spin_lock_irq(&swap_cache_lock);
+	list_add_tail(&sec->list, &swap_cache_list);
+	cache_count++;
+	spin_unlock_irq(&swap_cache_lock);
+
+	flush_cache_if_needed(true);
+
+	return 0;
+}
+
+static int __init swap_async_free_setup(void)
+{
+	release_wq = alloc_workqueue("async_swap_free",
+			WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM,
+			NUM_ASYNC_SWAP_WORK_ITEMS);
+	if (!release_wq)
+		return -ENOMEM;
+
+	swap_entry_cachep = KMEM_CACHE(swap_entry_cache, SLAB_ACCOUNT);
+	if (!swap_entry_cachep) {
+		destroy_workqueue(release_wq);
+		return -ENOMEM;
+	}
+
+	proc_create_single("aswap_free_counts", 0, NULL,
+			async_swap_free_counts_show);
+
+	return 0;
+}
+
+postcore_initcall(swap_async_free_setup);
+
 static bool swap_only_has_cache(struct swap_info_struct *si,
 				unsigned long offset, int nr_pages)
 {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 71cd1ceba191..fa7fe910becf 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1494,6 +1494,7 @@ const char * const vmstat_text[] = {
 	[I(KSTACK_REST)] = "kstack_rest",
 #endif
 #endif
+	[I(ASYNC_SWAP_COUNTS)] = "async_swap_count",
 #undef I
 #endif /* CONFIG_VM_EVENT_COUNTERS */
 };
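[Post-diff note, also not for the changelog: some arithmetic on the
thresholds above, assuming 4 KiB pages. SWAP_CLUSTER_MAX is 32, so the
100 * SWAP_CLUSTER_MAX eligibility gate requires an exiting mm to hold
at least 3200 swap entries (about 12.5 MiB of swapped-out memory)
before its entries are diverted to the gather path; smaller exiters
keep using the synchronous free_swap_and_cache_nr() path. With
max_cache_entries = 32, a release work item is queued once 32 gathered
batches accumulate, and passing NUM_ASYNC_SWAP_WORK_ITEMS == 0 as
max_active to alloc_workqueue() selects the workqueue default
concurrency (WQ_DFL_ACTIVE).]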
-- 
2.34.1