From: Steven Rostedt Some trace events want to expose in their output whether they were triggered in an interrupt or softirq context. Instead of recording this in the event structure itself, since this information is already stored in the flags portion of the event header, add helper macros that can be used in the print format: TP_printk("val=%d %s", __entry->val, __event_in_irq() ? "(in-irq)" : "") This will output "(in-irq)" for the event in the trace data if the event was triggered in hard or soft interrupt context. Link: https://lore.kernel.org/all/20251229132942.31a2b583@gandalf.local.home/ Signed-off-by: Steven Rostedt (Google) --- include/trace/stages/stage3_trace_output.h | 8 ++++++++ include/trace/stages/stage7_class_define.h | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/trace/stages/stage3_trace_output.h b/include/trace/stages/stage3_trace_output.h index 1e7b0bef95f52..53a23988a3b8a 100644 --- a/include/trace/stages/stage3_trace_output.h +++ b/include/trace/stages/stage3_trace_output.h @@ -150,3 +150,11 @@ #undef __get_buf #define __get_buf(len) trace_seq_acquire(p, (len)) + +#undef __event_in_hardirq +#undef __event_in_softirq +#undef __event_in_irq + +#define __event_in_hardirq() (__entry->ent.flags & TRACE_FLAG_HARDIRQ) +#define __event_in_softirq() (__entry->ent.flags & TRACE_FLAG_SOFTIRQ) +#define __event_in_irq() (__entry->ent.flags & (TRACE_FLAG_HARDIRQ | TRACE_FLAG_SOFTIRQ)) diff --git a/include/trace/stages/stage7_class_define.h b/include/trace/stages/stage7_class_define.h index fcd564a590f43..47008897a7956 100644 --- a/include/trace/stages/stage7_class_define.h +++ b/include/trace/stages/stage7_class_define.h @@ -26,6 +26,25 @@ #undef __print_hex_dump #undef __get_buf +#undef __event_in_hardirq +#undef __event_in_softirq +#undef __event_in_irq + +/* + * The TRACE_FLAG_* are enums. Instead of using TRACE_DEFINE_ENUM(), + * use their hardcoded values. 
These values are parsed by user space + * tooling elsewhere so they will never change. + * + * See "enum trace_flag_type" in linux/trace_events.h: + * TRACE_FLAG_HARDIRQ + * TRACE_FLAG_SOFTIRQ + */ + +/* This is what is displayed in the format files */ +#define __event_in_hardirq() (REC->common_flags & 0x8) +#define __event_in_softirq() (REC->common_flags & 0x10) +#define __event_in_irq() (REC->common_flags & 0x18) + /* * The below is not executed in the kernel. It is only what is * displayed in the print format for userspace to parse. -- 2.33.8 Memory reclaim events are currently difficult to attribute to specific cgroups, making debugging memory pressure issues challenging. This patch adds memory cgroup ID (memcg_id) to key vmscan tracepoints to enable better correlation and analysis. For operations not associated with a specific cgroup, the field is defaulted to 0. Signed-off-by: Thomas Ballasi --- include/trace/events/vmscan.h | 83 ++++++++++++++++++++--------------- mm/shrinker.c | 6 ++- mm/vmscan.c | 17 +++---- 3 files changed, 61 insertions(+), 45 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 490958fa10dee..20160e79eb0d7 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -114,85 +114,92 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd, DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, - TP_PROTO(int order, gfp_t gfp_flags), + TP_PROTO(gfp_t gfp_flags, int order, u64 memcg_id), - TP_ARGS(order, gfp_flags), + TP_ARGS(gfp_flags, order, memcg_id), TP_STRUCT__entry( - __field( int, order ) __field( unsigned long, gfp_flags ) + __field( u64, memcg_id ) + __field( int, order ) ), TP_fast_assign( - __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; + __entry->order = order; + __entry->memcg_id = memcg_id; ), - TP_printk("order=%d gfp_flags=%s", + TP_printk("order=%d gfp_flags=%s memcg_id=%llu", __entry->order, - show_gfp_flags(__entry->gfp_flags)) + 
show_gfp_flags(__entry->gfp_flags), + __entry->memcg_id) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, - TP_PROTO(int order, gfp_t gfp_flags), + TP_PROTO(gfp_t gfp_flags, int order, u64 memcg_id), - TP_ARGS(order, gfp_flags) + TP_ARGS(gfp_flags, order, memcg_id) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, - TP_PROTO(int order, gfp_t gfp_flags), + TP_PROTO(gfp_t gfp_flags, int order, u64 memcg_id), - TP_ARGS(order, gfp_flags) + TP_ARGS(gfp_flags, order, memcg_id) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, - TP_PROTO(int order, gfp_t gfp_flags), + TP_PROTO(gfp_t gfp_flags, int order, u64 memcg_id), - TP_ARGS(order, gfp_flags) + TP_ARGS(gfp_flags, order, memcg_id) ); #endif /* CONFIG_MEMCG */ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, u64 memcg_id), - TP_ARGS(nr_reclaimed), + TP_ARGS(nr_reclaimed, memcg_id), TP_STRUCT__entry( __field( unsigned long, nr_reclaimed ) + __field( u64, memcg_id ) ), TP_fast_assign( __entry->nr_reclaimed = nr_reclaimed; + __entry->memcg_id = memcg_id; ), - TP_printk("nr_reclaimed=%lu", __entry->nr_reclaimed) + TP_printk("nr_reclaimed=%lu memcg_id=%llu", + __entry->nr_reclaimed, + __entry->memcg_id) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, u64 memcg_id), - TP_ARGS(nr_reclaimed) + TP_ARGS(nr_reclaimed, memcg_id) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_reclaim_end, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, u64 memcg_id), - TP_ARGS(nr_reclaimed) + TP_ARGS(nr_reclaimed, memcg_id) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end, - 
TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, u64 memcg_id), - TP_ARGS(nr_reclaimed) + TP_ARGS(nr_reclaimed, memcg_id) ); #endif /* CONFIG_MEMCG */ @@ -200,39 +207,42 @@ TRACE_EVENT(mm_shrink_slab_start, TP_PROTO(struct shrinker *shr, struct shrink_control *sc, long nr_objects_to_shrink, unsigned long cache_items, unsigned long long delta, unsigned long total_scan, - int priority), + int priority, u64 memcg_id), TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan, - priority), + priority, memcg_id), TP_STRUCT__entry( __field(struct shrinker *, shr) __field(void *, shrink) - __field(int, nid) __field(long, nr_objects_to_shrink) __field(unsigned long, gfp_flags) __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) __field(int, priority) + __field(int, nid) + __field(u64, memcg_id) ), TP_fast_assign( __entry->shr = shr; __entry->shrink = shr->scan_objects; - __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = (__force unsigned long)sc->gfp_mask; __entry->cache_items = cache_items; __entry->delta = delta; __entry->total_scan = total_scan; __entry->priority = priority; + __entry->nid = sc->nid; + __entry->memcg_id = memcg_id; ), - TP_printk("%pS %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", + TP_printk("%pS %p: nid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", __entry->shrink, __entry->shr, __entry->nid, + __entry->memcg_id, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), __entry->cache_items, @@ -243,35 +253,38 @@ TRACE_EVENT(mm_shrink_slab_start, TRACE_EVENT(mm_shrink_slab_end, TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval, - long unused_scan_cnt, long new_scan_cnt, long total_scan), + long unused_scan_cnt, long new_scan_cnt, long total_scan, u64 memcg_id), TP_ARGS(shr, 
nid, shrinker_retval, unused_scan_cnt, new_scan_cnt, - total_scan), + total_scan, memcg_id), TP_STRUCT__entry( __field(struct shrinker *, shr) - __field(int, nid) __field(void *, shrink) __field(long, unused_scan) __field(long, new_scan) - __field(int, retval) __field(long, total_scan) + __field(int, nid) + __field(int, retval) + __field(u64, memcg_id) ), TP_fast_assign( __entry->shr = shr; - __entry->nid = nid; __entry->shrink = shr->scan_objects; __entry->unused_scan = unused_scan_cnt; __entry->new_scan = new_scan_cnt; - __entry->retval = shrinker_retval; __entry->total_scan = total_scan; + __entry->nid = nid; + __entry->retval = shrinker_retval; + __entry->memcg_id = memcg_id; ), - TP_printk("%pS %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + TP_printk("%pS %p: nid: %d memcg_id: %llu unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", __entry->shrink, __entry->shr, __entry->nid, + __entry->memcg_id, __entry->unused_scan, __entry->new_scan, __entry->total_scan, @@ -504,9 +517,9 @@ TRACE_EVENT(mm_vmscan_node_reclaim_begin, DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end, - TP_PROTO(unsigned long nr_reclaimed), + TP_PROTO(unsigned long nr_reclaimed, u64 memcg_id), - TP_ARGS(nr_reclaimed) + TP_ARGS(nr_reclaimed, memcg_id) ); TRACE_EVENT(mm_vmscan_throttled, diff --git a/mm/shrinker.c b/mm/shrinker.c index 4a93fd433689a..ccfbe3fb3b378 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -410,7 +410,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - freeable, delta, total_scan, priority); + freeable, delta, total_scan, priority, + mem_cgroup_id(shrinkctl->memcg)); /* * Normally, we should not scan less than batch_size objects in one @@ -461,7 +462,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, */ new_nr = 
add_nr_deferred(next_deferred, shrinker, shrinkctl); - trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan, + mem_cgroup_id(shrinkctl->memcg)); return freed; } diff --git a/mm/vmscan.c b/mm/vmscan.c index b2fc8b626d3df..9a3cd975a9f30 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6642,11 +6642,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, return 1; set_task_reclaim_state(current, &sc.reclaim_state); - trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); + trace_mm_vmscan_direct_reclaim_begin(sc.gfp_mask, order, 0); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + trace_mm_vmscan_direct_reclaim_end(nr_reclaimed, 0); set_task_reclaim_state(current, NULL); return nr_reclaimed; @@ -6675,8 +6675,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); - trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, - sc.gfp_mask); + trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.gfp_mask, + sc.order, + mem_cgroup_id(memcg)); /* * NOTE: Although we can get the priority field, using it @@ -6687,7 +6688,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, */ shrink_lruvec(lruvec, &sc); - trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed, mem_cgroup_id(memcg)); *nr_scanned = sc.nr_scanned; @@ -6723,13 +6724,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); set_task_reclaim_state(current, &sc.reclaim_state); - trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); + trace_mm_vmscan_memcg_reclaim_begin(sc.gfp_mask, 0, mem_cgroup_id(memcg)); noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = 
do_try_to_free_pages(zonelist, &sc); memalloc_noreclaim_restore(noreclaim_flag); - trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed, mem_cgroup_id(memcg)); set_task_reclaim_state(current, NULL); return nr_reclaimed; @@ -7675,7 +7676,7 @@ static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, delayacct_freepages_end(); psi_memstall_leave(&pflags); - trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed); + trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed, 0); return sc->nr_reclaimed; } -- 2.33.8 This change aims at adding additional tracepoint variables to help debuggers attribute events to specific processes. The PID field uses in_task() to reliably detect when we're in process context and can safely access current->pid. When not in process context (such as in an interrupt or an asynchronous RCU context), the field is set to -1 as a sentinel value. Signed-off-by: Thomas Ballasi --- include/trace/events/vmscan.h | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 20160e79eb0d7..c7f7621e48af5 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -122,18 +122,22 @@ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, __field( unsigned long, gfp_flags ) __field( u64, memcg_id ) __field( int, order ) + __field( int, pid ) ), TP_fast_assign( __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->order = order; + __entry->pid = current->pid; __entry->memcg_id = memcg_id; ), - TP_printk("order=%d gfp_flags=%s memcg_id=%llu", + TP_printk("order=%d gfp_flags=%s pid=%d memcg_id=%llu %s", __entry->order, show_gfp_flags(__entry->gfp_flags), - __entry->memcg_id) + __entry->pid, + __entry->memcg_id, + __event_in_irq() ? 
"(in-irq)" : "") ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, @@ -168,16 +172,20 @@ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, TP_STRUCT__entry( __field( unsigned long, nr_reclaimed ) __field( u64, memcg_id ) + __field( int, pid ) ), TP_fast_assign( __entry->nr_reclaimed = nr_reclaimed; __entry->memcg_id = memcg_id; + __entry->pid = current->pid; ), - TP_printk("nr_reclaimed=%lu memcg_id=%llu", + TP_printk("nr_reclaimed=%lu pid=%d memcg_id=%llu %s", __entry->nr_reclaimed, - __entry->memcg_id) + __entry->pid, + __entry->memcg_id, + __event_in_irq() ? "(in-irq)" : "") ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end, @@ -220,9 +228,10 @@ TRACE_EVENT(mm_shrink_slab_start, __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) + __field(u64, memcg_id) __field(int, priority) __field(int, nid) - __field(u64, memcg_id) + __field(int, pid) ), TP_fast_assign( @@ -236,19 +245,22 @@ TRACE_EVENT(mm_shrink_slab_start, __entry->priority = priority; __entry->nid = sc->nid; __entry->memcg_id = memcg_id; + __entry->pid = current->pid; ), - TP_printk("%pS %p: nid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", + TP_printk("%pS %p: nid: %d pid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d %s", __entry->shrink, __entry->shr, __entry->nid, + __entry->pid, __entry->memcg_id, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), __entry->cache_items, __entry->delta, __entry->total_scan, - __entry->priority) + __entry->priority, + __event_in_irq() ? 
"(in-irq)" : "") ); TRACE_EVENT(mm_shrink_slab_end, @@ -266,29 +278,32 @@ TRACE_EVENT(mm_shrink_slab_end, __field(long, total_scan) __field(int, nid) __field(int, retval) + __field(int, pid) __field(u64, memcg_id) ), TP_fast_assign( __entry->shr = shr; - __entry->shrink = shr->scan_objects; __entry->unused_scan = unused_scan_cnt; __entry->new_scan = new_scan_cnt; __entry->total_scan = total_scan; __entry->nid = nid; __entry->retval = shrinker_retval; + __entry->pid = current->pid; __entry->memcg_id = memcg_id; ), - TP_printk("%pS %p: nid: %d memcg_id: %llu unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + TP_printk("%pS %p: nid: %d pid: %d memcg_id: %llu unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d %s", __entry->shrink, __entry->shr, __entry->nid, + __entry->pid, __entry->memcg_id, __entry->unused_scan, __entry->new_scan, __entry->total_scan, - __entry->retval) + __entry->retval, + __event_in_irq() ? "(in-irq)" : "") ); TRACE_EVENT(mm_vmscan_lru_isolate, -- 2.33.8