The cost of the pcpu memory allocation when forking a new task is non-negligible, as reported on a few occasions, such as [1]. But it can also be fully avoided for single-threaded applications, where we know the vast majority of updates happen from the local task context.

For the trivial benchmark (bound to cpu 0 to reduce the cost of migrations), like below:

for (( i = 0; i < 20000; i++ )); do /bin/true; done

on an 80c machine, this patchset yielded a 6% improvement in system time. On a 256c machine, system time was reduced by 11%.

Profiling shows mm_init went from 13.5% of samples to 3.33% on the same 256c machine:

Before:
- 13.50% 3.93% benchmark.sh [kernel.kallsyms] [k] mm_init
   - 9.57% mm_init
      + 4.80% pcpu_alloc_noprof
      + 3.87% __percpu_counter_init_many

After:
- 3.33% 0.80% benchmark.sh [kernel.kallsyms] [k] mm_init
   - 2.53% mm_init
      + 2.05% pcpu_alloc_noprof

For kernbench on 256c, the patchset yields a 1.4% improvement in system time. For gitsource, the improvement in system time I'm measuring is around 3.12%.

The upgrade adds some overhead to the second fork, in particular an atomic operation, besides the expensive allocation that was moved from the first fork to the second. So a fair question is the impact of this patchset on multi-threaded applications. I wrote a microbenchmark similar to the /bin/true one above, but that just spawns a second pthread and waits for it to finish. The second thread just returns immediately. This is executed in a loop, bound to a single NUMA node, with:

for (( i = 0; i < 20000; i++ )); do /bin/parallel-true; done

Profiling shows the lazy upgrade's impact on performance is minimal:

- 0.68% 0.04% parallel-true [kernel.kallsyms] [k] __lazy_percpu_counter_upgrade_many
   - 0.64% __lazy_percpu_counter_upgrade_many
      0.62% pcpu_alloc_noprof

This is confirmed by the measured system time. With 20k runs, I'm still getting a slight improvement over baseline for the 2t case (2-4%).
[1] https://lore.kernel.org/all/20230608111408.s2minsenlcjow7q3@quack3 Suggested-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi --- include/linux/mm.h | 24 ++++++++---------------- include/linux/mm_types.h | 4 ++-- include/trace/events/kmem.h | 4 ++-- kernel/fork.c | 14 ++++++-------- 4 files changed, 18 insertions(+), 28 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d16b33bacc32..29de4c60ac6c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2679,36 +2679,28 @@ static inline bool get_user_page_fast_only(unsigned long addr, */ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { - return percpu_counter_read_positive(&mm->rss_stat[member]); + return lazy_percpu_counter_read_positive(&mm->rss_stat[member]); } static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member) { - return percpu_counter_sum_positive(&mm->rss_stat[member]); + return lazy_percpu_counter_sum_positive(&mm->rss_stat[member]); } void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { - percpu_counter_add(&mm->rss_stat[member], value); - - mm_trace_rss_stat(mm, member); -} - -static inline void inc_mm_counter(struct mm_struct *mm, int member) -{ - percpu_counter_inc(&mm->rss_stat[member]); + if (READ_ONCE(current->mm) == mm) + lazy_percpu_counter_add_fast(&mm->rss_stat[member], value); + else + lazy_percpu_counter_add_atomic(&mm->rss_stat[member], value); mm_trace_rss_stat(mm, member); } -static inline void dec_mm_counter(struct mm_struct *mm, int member) -{ - percpu_counter_dec(&mm->rss_stat[member]); - - mm_trace_rss_stat(mm, member); -} +#define inc_mm_counter(mm, member) add_mm_counter(mm, member, 1) +#define dec_mm_counter(mm, member) add_mm_counter(mm, member, -1) /* Optimized variant when folio is already known not to be anon */ static inline int mm_counter_file(struct folio *folio) diff --git a/include/linux/mm_types.h 
b/include/linux/mm_types.h index 90e5790c318f..5a8d677efa85 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include @@ -1119,7 +1119,7 @@ struct mm_struct { unsigned long saved_e_flags; #endif - struct percpu_counter rss_stat[NR_MM_COUNTERS]; + struct lazy_percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 7f93e754da5c..e21572f4d8a6 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -442,8 +442,8 @@ TRACE_EVENT(rss_stat, __entry->mm_id = mm_ptr_to_hash(mm); __entry->curr = !!(current->mm == mm); __entry->member = member; - __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member]) - << PAGE_SHIFT); + __entry->size = (lazy_percpu_counter_sum_positive(&mm->rss_stat[member]) + << PAGE_SHIFT); ), TP_printk("mm_id=%u curr=%d type=%s size=%ldB", diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..92698c60922e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -583,7 +583,7 @@ static void check_mm(struct mm_struct *mm) "Please make sure 'struct resident_page_types[]' is updated as well"); for (i = 0; i < NR_MM_COUNTERS; i++) { - long x = percpu_counter_sum(&mm->rss_stat[i]); + long x = lazy_percpu_counter_sum_local(&mm->rss_stat[i]); if (unlikely(x)) { pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n", @@ -688,7 +688,7 @@ void __mmdrop(struct mm_struct *mm) put_user_ns(mm->user_ns); mm_pasid_drop(mm); mm_destroy_cid(mm); - percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); + lazy_percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); free_mm(mm); } @@ -1083,16 +1083,11 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (mm_alloc_cid(mm, p)) goto fail_cid; - if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, - NR_MM_COUNTERS)) - goto fail_pcpu; - + 
lazy_percpu_counter_init_many(mm->rss_stat, 0, NR_MM_COUNTERS); mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; -fail_pcpu: - mm_destroy_cid(mm); fail_cid: destroy_context(mm); fail_nocontext: @@ -1535,6 +1530,9 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk) return 0; if (clone_flags & CLONE_VM) { + if (lazy_percpu_counter_upgrade_many(oldmm->rss_stat, + NR_MM_COUNTERS, GFP_KERNEL_ACCOUNT)) + return -ENOMEM; mmget(oldmm); mm = oldmm; } else { -- 2.51.0