Replace the mutex-protected damon_access_reports[] single buffer with a
per-CPU SPSC ring. The producer (damon_report_access()) is called from NMI
context by perf overflow handlers; the consumer
(kdamond_check_reported_accesses()) runs once per sample tick.

 - 256-entry ring per CPU with cache-line-aligned head/tail
 - per-CPU damon_report_ring_busy guards against an NMI nesting on top of
   a process-context producer on the same CPU
 - per-CPU damon_ring_pending flag so the consumer iterates only over CPUs
   that produced samples this tick
 - smp_mb() between the flag clear and the head read on the consumer side
   pairs with the producer's head-publish ordering

Replace the O(N) per-region scan in kdamond_apply_access_report() with a
binary search over a per-tick, per-target snapshot built into a reusable
damon_ctx::drain_snapshot buffer. The pid-based ctx early-reject is no
longer needed: kdamond_apply_access_report() already discriminates
report->vaddr vs report->paddr per ctx.

Wire up the damon_perf_event lifecycle: each attached event is initialized
when kdamond starts, torn down when the ctx is destroyed, and replayed
across damon_commit_ctx(). Add the matching forward declaration and the
drain_snapshot field to struct damon_ctx.

Signed-off-by: Ravi Jonnalagadda --- include/trace/events/damon.h | 17 ++ mm/damon/core.c | 383 ++++++++++++++++++++++++++++++----- 2 files changed, 344 insertions(+), 56 deletions(-) diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index b131bee27cc4a..e97e70579a8c8 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -74,6 +74,23 @@ TRACE_EVENT(damos_esz, __entry->esz) ); +TRACE_EVENT(damon_perf_ring_overflow, + + TP_PROTO(int cpu), + + TP_ARGS(cpu), + + TP_STRUCT__entry( + __field(int, cpu) + ), + + TP_fast_assign( + __entry->cpu = cpu; + ), + + TP_printk("cpu=%d", __entry->cpu) +); + TRACE_EVENT_CONDITION(damos_before_apply, TP_PROTO(unsigned int context_idx, unsigned int scheme_idx, diff --git a/mm/damon/core.c b/mm/damon/core.c index 23311189b589e..1e6966e45144f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "damon: " fmt #include +#include #include #include #include @@ -24,22 +25,43 @@ #define CREATE_TRACE_POINTS #include -#define DAMON_ACCESS_REPORTS_CAP 1000 +#define DAMON_REPORT_RING_SIZE 256 +#define DAMON_REPORT_RING_MASK (DAMON_REPORT_RING_SIZE - 1) + +/* Per-target region lookup snapshot for the drain loop. */ +struct damon_target_lookup { + struct damon_target *t; + struct damon_region **regions; + unsigned int nr_regions; +}; + +struct damon_report_ring { + unsigned int head; /* written by producer (NMI) */ + unsigned int tail /* written by consumer (kdamond) */ + ____cacheline_aligned_in_smp; + struct damon_access_report entries[DAMON_REPORT_RING_SIZE] + ____cacheline_aligned_in_smp; +}; + +static DEFINE_PER_CPU(struct damon_report_ring, damon_report_rings); +static DEFINE_PER_CPU(local_t, damon_report_ring_busy); +/* + * Producer (NMI) sets after publishing a report; consumer (kdamond) clears + * before draining the corresponding ring. Per-CPU to avoid cross-CPU + * cacheline bouncing under sampling load on large systems. 
+ */ +static DEFINE_PER_CPU(unsigned long, damon_ring_pending); static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; static bool running_exclusive_ctxs; +static struct damon_ctx *damon_perf_owner; static DEFINE_MUTEX(damon_ops_lock); static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; static struct kmem_cache *damon_region_cache __ro_after_init; -static DEFINE_MUTEX(damon_access_reports_lock); -static struct damon_access_report damon_access_reports[ - DAMON_ACCESS_REPORTS_CAP]; -static int damon_access_reports_len; - /* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */ static bool __damon_is_registered_ops(enum damon_ops_id id) { @@ -805,11 +827,24 @@ struct damon_ctx *damon_new_ctx(void) INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); + INIT_LIST_HEAD(&ctx->perf_events); + prandom_seed_state(&ctx->rnd_state, get_random_u64()); return ctx; } +static void damon_perf_destroy(struct damon_ctx *ctx) +{ + struct damon_perf_event *event, *next; + + list_for_each_entry_safe(event, next, &ctx->perf_events, list) { + damon_perf_cleanup(ctx, event); + list_del(&event->list); + kfree(event); + } +} + static void damon_destroy_targets(struct damon_ctx *ctx) { struct damon_target *t, *next_t; @@ -835,6 +870,11 @@ void damon_destroy_ctx(struct damon_ctx *ctx) damon_for_each_sample_filter_safe(f, next_f, &ctx->sample_control) damon_destroy_sample_filter(f, &ctx->sample_control); + damon_perf_destroy(ctx); + + kfree(ctx->drain_snapshot.lookups); + kfree(ctx->drain_snapshot.region_buf); + kfree(ctx); } @@ -1694,6 +1734,45 @@ static int damon_commit_sample_control( return damon_commit_sample_filters(dst, src); } +static int damon_commit_perf_events(struct damon_ctx *dst, + struct damon_ctx *src) +{ + struct damon_perf_event *src_event, *new_event; + int err = 0; + + damon_perf_destroy(dst); + + list_for_each_entry(src_event, &src->perf_events, list) { + new_event = kzalloc(sizeof(*new_event), GFP_KERNEL); + if 
(!new_event) { + err = -ENOMEM; + goto out; + } + + new_event->attr = src_event->attr; + + if (damon_is_running(dst)) { + err = damon_perf_init(dst, new_event); + if (err) { + kfree(new_event); + goto out; + } + /* + * Events are created with attr.disabled=1 and only fire while + * the kdamond runs. Arm now if we are committing into a + * running ctx whose substrate is already armed. + */ + if (dst->perf_events_active) + damon_perf_event_arm(new_event); + } + list_add_tail(&new_event->list, &dst->perf_events); + } + return 0; +out: + damon_perf_destroy(dst); + return err; +} + static int __damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) { int err; @@ -1742,6 +1821,9 @@ static int __damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) return err; err = damon_commit_sample_control(&dst->sample_control, &src->sample_control); + if (err) + return err; + err = damon_commit_perf_events(dst, src); if (err) return err; dst->addr_unit = src->addr_unit; @@ -1929,12 +2011,40 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive) return -EBUSY; } + /* + * The per-CPU PMU events backing the perf-event substrate are a single + * shared resource; only one ctx may own them. Reject the start if + * another already-running ctx owns the substrate, or if more than one + * ctx in this batch wants it. 
+ */ + for (i = 0; i < nr_ctxs; i++) { + if (!list_empty(&ctxs[i]->perf_events)) { + int j; + + if (damon_perf_owner) { + mutex_unlock(&damon_lock); + return -EBUSY; + } + for (j = i + 1; j < nr_ctxs; j++) { + if (!list_empty(&ctxs[j]->perf_events)) { + mutex_unlock(&damon_lock); + return -EBUSY; + } + } + damon_perf_owner = ctxs[i]; + break; + } + } + for (i = 0; i < nr_ctxs; i++) { err = __damon_start(ctxs[i]); if (err) break; nr_running_ctxs++; } + if (err && damon_perf_owner && + !damon_perf_owner->kdamond) + damon_perf_owner = NULL; if (exclusive && nr_running_ctxs) running_exclusive_ctxs = true; mutex_unlock(&damon_lock); @@ -2113,29 +2223,47 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) * damon_report_access() - Report identified access events to DAMON. * @report: The reporting access information. * - * Report access events to DAMON. + * Report access events to DAMON via a per-CPU SPSC lockless ring. Producer + * is the local CPU (typically NMI from a hardware-sampling backend); + * consumer is the kdamond drain in kdamond_check_reported_accesses(). * - * Context: May sleep. + * Context: any (NMI-safe). An NMI nesting on top of a process-context + * producer on the same CPU would otherwise stomp the same entries[head] + * slot; the busy guard detects and drops in that case. * - * NOTE: we may be able to implement this as a lockless queue, and allow any - * context. As the overhead is unknown, and region-based DAMON logics would - * guarantee the reports would be not made that frequently, let's start with - * this simple implementation. + * If the ring is full, the sample is dropped and the per-CPU overflow + * counter incremented. 
*/ void damon_report_access(struct damon_access_report *report) { - struct damon_access_report *dst; + struct damon_report_ring *ring; + unsigned int head, next; - /* silently fail for races */ - if (!mutex_trylock(&damon_access_reports_lock)) - return; - dst = &damon_access_reports[damon_access_reports_len++]; - /* just drop all existing reports in favor of simplicity. */ - if (damon_access_reports_len == DAMON_ACCESS_REPORTS_CAP) - damon_access_reports_len = 0; - *dst = *report; - dst->report_jiffies = jiffies; - mutex_unlock(&damon_access_reports_lock); + /* Pin to a CPU so the SPSC invariant holds for preemptible callers. */ + preempt_disable(); + if (local_inc_return(this_cpu_ptr(&damon_report_ring_busy)) != 1) { + /* NMI nested on a process-context producer; drop. */ + trace_damon_perf_ring_overflow(smp_processor_id()); + goto out; + } + + ring = this_cpu_ptr(&damon_report_rings); + head = ring->head; + next = (head + 1) & DAMON_REPORT_RING_MASK; + + if (next == READ_ONCE(ring->tail)) { + trace_damon_perf_ring_overflow(smp_processor_id()); + goto out; + } + + ring->entries[head] = *report; + ring->entries[head].report_jiffies = jiffies; + smp_wmb(); /* publish entry before head advance */ + WRITE_ONCE(ring->head, next); + WRITE_ONCE(*this_cpu_ptr(&damon_ring_pending), 1); +out: + local_dec(this_cpu_ptr(&damon_report_ring_busy)); + preempt_enable(); } #ifdef CONFIG_MMU @@ -2145,7 +2273,8 @@ void damon_report_page_fault(struct vm_fault *vmf, bool huge_pmd) .vaddr = vmf->address, .size = 1, /* todo: set appripriately */ .cpu = smp_processor_id(), - .tid = task_pid_vnr(current), + .tid = current->pid, + .tgid = task_tgid_nr(current), .is_write = vmf->flags & FAULT_FLAG_WRITE, }; @@ -3700,6 +3829,7 @@ static void kdamond_init_ctx(struct damon_ctx *ctx) unsigned long sample_interval = ctx->attrs.sample_interval ? 
ctx->attrs.sample_interval : 1; struct damos *scheme; + struct damon_perf_event *event, *next; ctx->passed_sample_intervals = 0; ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval; @@ -3713,6 +3843,15 @@ static void kdamond_init_ctx(struct damon_ctx *ctx) damos_set_next_apply_sis(scheme, ctx); damos_set_filters_default_reject(scheme); } + + list_for_each_entry_safe(event, next, &ctx->perf_events, list) { + int err = damon_perf_init(ctx, event); + + if (err) { + list_del(&event->list); + kfree(event); + } + } } static bool damon_sample_filter_matching(struct damon_access_report *report, @@ -3759,26 +3898,46 @@ static bool damon_sample_filter_out(struct damon_access_report *report, } static void kdamond_apply_access_report(struct damon_access_report *report, - struct damon_target *t, struct damon_ctx *ctx) + struct damon_target *t, + struct damon_region **regions, unsigned int nr_regions, + struct damon_ctx *ctx) { struct damon_region *r; unsigned long addr; + int left, right, mid; - if (damon_sample_filter_out(report, &ctx->sample_control)) - return; - if (damon_target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) { + if (pid_nr(t->pid) != report->tgid) + return; addr = report->vaddr; - else + } else { addr = report->paddr; + } - /* todo: make search faster, e.g., binary search? */ - damon_for_each_region(r, t) { - if (addr < r->ar.start) - continue; - if (r->ar.end < addr + report->size) - continue; - if (!r->access_reported) - damon_update_region_access_rate(r, true, &ctx->attrs); + /* Binary search the snapshot for the region containing addr. */ + left = 0; + right = nr_regions - 1; + r = NULL; + while (left <= right) { + /* Avoid (left + right) overflow at large nr_regions. */ + mid = left + (right - left) / 2; + if (addr < regions[mid]->ar.start) + right = mid - 1; + else if (addr >= regions[mid]->ar.end) + left = mid + 1; + else { + r = regions[mid]; + break; + } + } + + if (!r) + return; + /* Reject reports straddling a region boundary. 
*/ + if (addr + report->size > r->ar.end) + return; + if (!r->access_reported) { + damon_update_region_access_rate(r, true, &ctx->attrs); r->access_reported = true; } } @@ -3802,28 +3961,120 @@ static unsigned int kdamond_apply_zero_access_report(struct damon_ctx *ctx) return max_nr_accesses; } -static unsigned int kdamond_check_reported_accesses(struct damon_ctx *ctx) +/* + * Build a snapshot of the ctx's targets and their region arrays for use + * by the ring drain loop. The snapshot buffer is reused across ticks, + * grown via krealloc only when a new high water mark is reached. + * + * The two-pass walk over adaptive_targets is safe even though + * krealloc_array() may sleep: target list mutation is funneled through + * damon_call onto the kdamond itself, so no other thread can mutate the + * list while kdamond is running this function. + */ +static struct damon_target_lookup *damon_build_target_lookup( + struct damon_ctx *ctx, unsigned int *nr_targets_out) { - int i; - struct damon_access_report *report; struct damon_target *t; + struct damon_target_lookup *tbl; + unsigned int nr_targets = 0, total_regions = 0, ti = 0, ri = 0; - /* currently damon_access_report supports only physical address */ - if (damon_target_has_pid(ctx)) - return 0; + damon_for_each_target(t, ctx) { + nr_targets++; + total_regions += damon_nr_regions(t); + } - mutex_lock(&damon_access_reports_lock); - for (i = 0; i < damon_access_reports_len; i++) { - report = &damon_access_reports[i]; - if (time_before(report->report_jiffies, - jiffies - - usecs_to_jiffies( - ctx->attrs.sample_interval))) + if (nr_targets > ctx->drain_snapshot.nr_lookups) { + tbl = krealloc_array(ctx->drain_snapshot.lookups, + nr_targets, sizeof(*tbl), GFP_KERNEL); + if (!tbl) + return NULL; + ctx->drain_snapshot.lookups = tbl; + ctx->drain_snapshot.nr_lookups = nr_targets; + } + tbl = ctx->drain_snapshot.lookups; + + if (total_regions > ctx->drain_snapshot.region_buf_cap) { + struct damon_region **buf; + + buf = 
krealloc_array(ctx->drain_snapshot.region_buf, + total_regions, sizeof(*buf), GFP_KERNEL); + if (!buf) + return NULL; + ctx->drain_snapshot.region_buf = buf; + ctx->drain_snapshot.region_buf_cap = total_regions; + } + + damon_for_each_target(t, ctx) { + struct damon_region *r; + + tbl[ti].t = t; + tbl[ti].regions = &ctx->drain_snapshot.region_buf[ri]; + tbl[ti].nr_regions = damon_nr_regions(t); + damon_for_each_region(r, t) + ctx->drain_snapshot.region_buf[ri++] = r; + ti++; + } + + *nr_targets_out = nr_targets; + return tbl; +} + +static unsigned int kdamond_check_reported_accesses(struct damon_ctx *ctx) +{ + int cpu; + struct damon_target_lookup *tbl; + unsigned int nr_targets = 0; + unsigned int i; + + tbl = damon_build_target_lookup(ctx, &nr_targets); + if (!tbl) { + pr_warn_ratelimited( + "damon: target-lookup alloc failed; ring drain skipped this tick\n"); + return kdamond_apply_zero_access_report(ctx); + } + + for_each_online_cpu(cpu) { + struct damon_report_ring *ring; + unsigned int head, tail; + + if (!READ_ONCE(*per_cpu_ptr(&damon_ring_pending, cpu))) continue; - damon_for_each_target(t, ctx) - kdamond_apply_access_report(report, t, ctx); + ring = per_cpu_ptr(&damon_report_rings, cpu); + + WRITE_ONCE(*per_cpu_ptr(&damon_ring_pending, cpu), 0); + /* + * Pair with the producer's smp_wmb between entry and head + * publish: order our flag clear before the head read so that + * a producer publishing between our clear and READ_ONCE(head) + * is observed via the flag it re-sets, not lost as a + * stale-head drain. 
+ */ + smp_mb(); + head = READ_ONCE(ring->head); + smp_rmb(); /* pair with smp_wmb in producer */ + tail = ring->tail; + + while (tail != head) { + struct damon_access_report *report = + &ring->entries[tail]; + + if (time_before(report->report_jiffies, + jiffies - usecs_to_jiffies( + ctx->attrs.sample_interval))) + goto next; + if (damon_sample_filter_out(report, + &ctx->sample_control)) + goto next; + for (i = 0; i < nr_targets; i++) + kdamond_apply_access_report(report, + tbl[i].t, + tbl[i].regions, + tbl[i].nr_regions, ctx); +next: + tail = (tail + 1) & DAMON_REPORT_RING_MASK; + } + WRITE_ONCE(ring->tail, tail); } - mutex_unlock(&damon_access_reports_lock); /* For nr_accesses_bp, absence of access should also be reported. */ return kdamond_apply_zero_access_report(ctx); } @@ -3848,6 +4099,14 @@ static int kdamond_fn(void *data) complete(&ctx->kdamond_started); kdamond_init_ctx(ctx); + if (!list_empty(&ctx->perf_events)) { + struct damon_perf_event *event; + + WRITE_ONCE(ctx->perf_events_active, true); + list_for_each_entry(event, &ctx->perf_events, list) + damon_perf_event_arm(event); + } + if (ctx->ops.init) ctx->ops.init(ctx); ctx->regions_score_histogram = kmalloc_array(DAMOS_MAX_SCORE + 1, @@ -3871,14 +4130,15 @@ static int kdamond_fn(void *data) if (kdamond_wait_activation(ctx)) break; - if (ctx->ops.prepare_access_checks) + if (list_empty(&ctx->perf_events) && + ctx->ops.prepare_access_checks) ctx->ops.prepare_access_checks(ctx); kdamond_usleep(sample_interval); ctx->passed_sample_intervals++; - /* todo: make these non-exclusive */ - if (ctx->sample_control.primitives_enabled.page_fault) + if (!list_empty(&ctx->perf_events) || + ctx->sample_control.primitives_enabled.page_fault) max_nr_accesses = kdamond_check_reported_accesses(ctx); else if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); @@ -3965,6 +4225,15 @@ static int kdamond_fn(void *data) } } done: + if (ctx->perf_events_active) { + struct damon_perf_event *event; + + 
WRITE_ONCE(ctx->perf_events_active, false); + list_for_each_entry(event, &ctx->perf_events, list) + damon_perf_event_disarm(event); + /* Drain any in-flight reports queued before disarm took effect. */ + kdamond_check_reported_accesses(ctx); + } damon_destroy_targets(ctx); kfree(ctx->regions_score_histogram); @@ -3986,6 +4255,8 @@ static int kdamond_fn(void *data) nr_running_ctxs--; if (!nr_running_ctxs && running_exclusive_ctxs) running_exclusive_ctxs = false; + if (damon_perf_owner == ctx) + damon_perf_owner = NULL; mutex_unlock(&damon_lock); return 0; -- 2.43.0