From: Jakub Brnak Replace custom syscall structs with the standard trace_event_raw_sys_enter and trace_event_raw_sys_exit from vmlinux.h. This fixes a data structure misalignment issue discovered on RHEL-9, which prevented BPF programs from correctly accessing syscall arguments. This change also aims to improve compatibility between different versions of the perf tool and kernel by using CO-RE so BPF code can correctly adjust field offsets. Signed-off-by: Jakub Brnak [ coding style updates and fix a BPF verifier issue ] Signed-off-by: Namhyung Kim --- .../bpf_skel/augmented_raw_syscalls.bpf.c | 62 ++++++++----------- tools/perf/util/bpf_skel/vmlinux/vmlinux.h | 14 +++++ 2 files changed, 40 insertions(+), 36 deletions(-) diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c index cb86e261b4de0685..2c9bcc6b8cb0c06c 100644 --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c @@ -60,18 +60,6 @@ struct syscalls_sys_exit { __uint(max_entries, 512); } syscalls_sys_exit SEC(".maps"); -struct syscall_enter_args { - unsigned long long common_tp_fields; - long syscall_nr; - unsigned long args[6]; -}; - -struct syscall_exit_args { - unsigned long long common_tp_fields; - long syscall_nr; - long ret; -}; - /* * Desired design of maximum size and alignment (see RFC2553) */ @@ -115,7 +103,7 @@ struct pids_filtered { } pids_filtered SEC(".maps"); struct augmented_args_payload { - struct syscall_enter_args args; + struct trace_event_raw_sys_enter args; struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc) }; @@ -135,7 +123,7 @@ struct beauty_map_enter { } beauty_map_enter SEC(".maps"); struct beauty_payload_enter { - struct syscall_enter_args args; + struct trace_event_raw_sys_enter args; struct augmented_arg aug_args[6]; }; @@ -192,7 +180,7 @@ unsigned int augmented_arg__read_str(struct augmented_arg 
*augmented_arg, const } SEC("tp/raw_syscalls/sys_enter") -int syscall_unaugmented(struct syscall_enter_args *args) +int syscall_unaugmented(struct trace_event_raw_sys_enter *args) { return 1; } @@ -204,7 +192,7 @@ int syscall_unaugmented(struct syscall_enter_args *args) * filename. */ SEC("tp/syscalls/sys_enter_connect") -int sys_enter_connect(struct syscall_enter_args *args) +int sys_enter_connect(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *sockaddr_arg = (const void *)args->args[1]; @@ -225,7 +213,7 @@ int sys_enter_connect(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_sendto") -int sys_enter_sendto(struct syscall_enter_args *args) +int sys_enter_sendto(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *sockaddr_arg = (const void *)args->args[4]; @@ -243,7 +231,7 @@ int sys_enter_sendto(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_open") -int sys_enter_open(struct syscall_enter_args *args) +int sys_enter_open(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *filename_arg = (const void *)args->args[0]; @@ -258,7 +246,7 @@ int sys_enter_open(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_openat") -int sys_enter_openat(struct syscall_enter_args *args) +int sys_enter_openat(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *filename_arg = (const void *)args->args[1]; @@ -273,7 +261,7 @@ int sys_enter_openat(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_rename") -int sys_enter_rename(struct syscall_enter_args *args) +int sys_enter_rename(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *oldpath_arg = (const void 
*)args->args[0], @@ -304,7 +292,7 @@ int sys_enter_rename(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_renameat2") -int sys_enter_renameat2(struct syscall_enter_args *args) +int sys_enter_renameat2(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *oldpath_arg = (const void *)args->args[1], @@ -346,7 +334,7 @@ struct perf_event_attr_size { }; SEC("tp/syscalls/sys_enter_perf_event_open") -int sys_enter_perf_event_open(struct syscall_enter_args *args) +int sys_enter_perf_event_open(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read; @@ -378,7 +366,7 @@ int sys_enter_perf_event_open(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_clock_nanosleep") -int sys_enter_clock_nanosleep(struct syscall_enter_args *args) +int sys_enter_clock_nanosleep(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *rqtp_arg = (const void *)args->args[2]; @@ -399,7 +387,7 @@ int sys_enter_clock_nanosleep(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_nanosleep") -int sys_enter_nanosleep(struct syscall_enter_args *args) +int sys_enter_nanosleep(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *req_arg = (const void *)args->args[0]; @@ -429,7 +417,7 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid) return bpf_map_lookup_elem(pids, &pid) != NULL; } -static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) +static int augment_sys_enter(void *ctx, struct trace_event_raw_sys_enter *args) { bool augmented, do_output = false; int zero = 0, index, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, 
value); @@ -444,7 +432,7 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) return 1; /* use syscall number to get beauty_map entry */ - nr = (__u32)args->syscall_nr; + nr = (__u32)args->id; beauty_map = bpf_map_lookup_elem(&beauty_map_enter, &nr); /* set up payload for output */ @@ -454,8 +442,8 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) if (beauty_map == NULL || payload == NULL) return 1; - /* copy the sys_enter header, which has the syscall_nr */ - __builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args)); + /* copy the sys_enter header, which has the id */ + __builtin_memcpy(&payload->args, args, sizeof(*args)); /* * Determine what type of argument and how many bytes to read from user space, using the @@ -489,9 +477,11 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) index = -(size + 1); barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick. index &= 7; // Satisfy the bounds checking with the verifier in some kernels. - aug_size = args->args[index] > TRACE_AUG_MAX_BUF ? 
TRACE_AUG_MAX_BUF : args->args[index]; + aug_size = args->args[index]; if (aug_size > 0) { + if (aug_size > TRACE_AUG_MAX_BUF) + aug_size = TRACE_AUG_MAX_BUF; if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg)) augmented = true; } @@ -515,14 +505,14 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) } } - if (!do_output || (sizeof(struct syscall_enter_args) + output) > sizeof(struct beauty_payload_enter)) + if (!do_output || (sizeof(*args) + output) > sizeof(*payload)) return 1; - return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output); + return augmented__beauty_output(ctx, payload, sizeof(*args) + output); } SEC("tp/raw_syscalls/sys_enter") -int sys_enter(struct syscall_enter_args *args) +int sys_enter(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args; /* @@ -550,16 +540,16 @@ int sys_enter(struct syscall_enter_args *args) * unaugmented tracepoint payload. */ if (augment_sys_enter(args, &augmented_args->args)) - bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr); + bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.id); // If not found on the PROG_ARRAY syscalls map, then we're filtering it: return 0; } SEC("tp/raw_syscalls/sys_exit") -int sys_exit(struct syscall_exit_args *args) +int sys_exit(struct trace_event_raw_sys_exit *args) { - struct syscall_exit_args exit_args; + struct trace_event_raw_sys_exit exit_args; if (pid_filter__has(&pids_filtered, getpid())) return 0; @@ -570,7 +560,7 @@ int sys_exit(struct syscall_exit_args *args) * "!raw_syscalls:unaugmented" that will just return 1 to return the * unaugmented tracepoint payload. 
*/ - bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr); + bpf_tail_call(args, &syscalls_sys_exit, exit_args.id); /* * If not found on the PROG_ARRAY syscalls map, then we're filtering it: */ diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h index a59ce912be18cd0f..b8b2347268633cdf 100644 --- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h +++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h @@ -212,4 +212,18 @@ struct pglist_data { int nr_zones; } __attribute__((preserve_access_index)); +struct trace_event_raw_sys_enter { + struct trace_entry ent; + long int id; + long unsigned int args[6]; + char __data[0]; +} __attribute__((preserve_access_index)); + +struct trace_event_raw_sys_exit { + struct trace_entry ent; + long int id; + long int ret; + char __data[0]; +} __attribute__((preserve_access_index)); + #endif // __VMLINUX_H -- 2.51.0.rc1.167.g924127e9c0-goog We want to handle syscall exit path differently so let's split the unaugmented exit BPF program. Currently it does nothing (same as sys_enter). Signed-off-by: Namhyung Kim --- tools/perf/builtin-trace.c | 8 +++++--- tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c | 8 +++++++- tools/perf/util/bpf_trace_augment.c | 9 +++++++-- tools/perf/util/trace_augment.h | 10 ++++++++-- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index fe737b3ac6e67d3b..1bc912273af2db66 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3770,13 +3770,15 @@ static void trace__init_syscall_bpf_progs(struct trace *trace, int e_machine, in static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int e_machine, int id) { struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, id); - return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(unaugmented_prog); + return sc ? 
bpf_program__fd(sc->bpf_prog.sys_enter) : + bpf_program__fd(augmented_syscalls__unaugmented_enter()); } static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int e_machine, int id) { struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, id); - return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(unaugmented_prog); + return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : + bpf_program__fd(augmented_syscalls__unaugmented_exit()); } static int trace__bpf_sys_enter_beauty_map(struct trace *trace, int e_machine, int key, unsigned int *beauty_array) @@ -3977,7 +3979,7 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace, int e_m if (augmented_syscalls__get_map_fds(&map_enter_fd, &map_exit_fd, &beauty_map_fd) < 0) return -1; - unaugmented_prog = augmented_syscalls__unaugmented(); + unaugmented_prog = augmented_syscalls__unaugmented_enter(); for (int i = 0, num_idx = syscalltbl__num_idx(e_machine); i < num_idx; ++i) { int prog_fd, key = syscalltbl__id_at_idx(e_machine, i); diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c index 2c9bcc6b8cb0c06c..0016deb321fe0d97 100644 --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c @@ -180,7 +180,13 @@ unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const } SEC("tp/raw_syscalls/sys_enter") -int syscall_unaugmented(struct trace_event_raw_sys_enter *args) +int sys_enter_unaugmented(struct trace_event_raw_sys_enter *args) +{ + return 1; +} + +SEC("tp/raw_syscalls/sys_exit") +int sys_exit_unaugmented(struct trace_event_raw_sys_exit *args) { return 1; } diff --git a/tools/perf/util/bpf_trace_augment.c b/tools/perf/util/bpf_trace_augment.c index 56ed17534caa4f3f..f2792ede0249ab89 100644 --- a/tools/perf/util/bpf_trace_augment.c +++ b/tools/perf/util/bpf_trace_augment.c @@ -115,9 +115,14 @@ int 
augmented_syscalls__get_map_fds(int *enter_fd, int *exit_fd, int *beauty_fd) return 0; } -struct bpf_program *augmented_syscalls__unaugmented(void) +struct bpf_program *augmented_syscalls__unaugmented_enter(void) { - return skel->progs.syscall_unaugmented; + return skel->progs.sys_enter_unaugmented; +} + +struct bpf_program *augmented_syscalls__unaugmented_exit(void) +{ + return skel->progs.sys_exit_unaugmented; } struct bpf_program *augmented_syscalls__find_by_title(const char *name) diff --git a/tools/perf/util/trace_augment.h b/tools/perf/util/trace_augment.h index 4f729bc6775304b4..70b11d3f52906c36 100644 --- a/tools/perf/util/trace_augment.h +++ b/tools/perf/util/trace_augment.h @@ -14,7 +14,8 @@ void augmented_syscalls__setup_bpf_output(void); int augmented_syscalls__set_filter_pids(unsigned int nr, pid_t *pids); int augmented_syscalls__get_map_fds(int *enter_fd, int *exit_fd, int *beauty_fd); struct bpf_program *augmented_syscalls__find_by_title(const char *name); -struct bpf_program *augmented_syscalls__unaugmented(void); +struct bpf_program *augmented_syscalls__unaugmented_enter(void); +struct bpf_program *augmented_syscalls__unaugmented_exit(void); void augmented_syscalls__cleanup(void); #else /* !HAVE_BPF_SKEL */ @@ -52,7 +53,12 @@ augmented_syscalls__find_by_title(const char *name __maybe_unused) return NULL; } -static inline struct bpf_program *augmented_syscalls__unaugmented(void) +static inline struct bpf_program *augmented_syscalls__unaugmented_enter(void) +{ + return NULL; +} + +static inline struct bpf_program *augmented_syscalls__unaugmented_exit(void) { return NULL; } -- 2.51.0.rc1.167.g924127e9c0-goog Howard reported that returning 0 from the BPF resulted in affecting global syscall tracepoint handling. What we want to do is just to drop syscall output in the current perf session. So we need a different approach. 
Currently perf trace uses bpf-output event for augmented arguments and raw_syscalls:sys_{enter,exit} tracepoint events for normal arguments. But I think we can just use bpf-output in both cases and drop the trace point events. Then it needs to distinguish bpf-output data if it's for enter or exit. Repurpose struct trace_entry.type which is common in both syscall entry and exit tracepoints. Closes: https://lore.kernel.org/r/20250529065537.529937-1-howardchu95@gmail.com Suggested-by: Howard Chu Signed-off-by: Namhyung Kim --- tools/perf/builtin-trace.c | 119 ++++++++++++++---- .../bpf_skel/augmented_raw_syscalls.bpf.c | 37 ++++-- tools/perf/util/bpf_skel/perf_trace_u.h | 14 +++ 3 files changed, 133 insertions(+), 37 deletions(-) create mode 100644 tools/perf/util/bpf_skel/perf_trace_u.h diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 1bc912273af2db66..e1caa82bc427b68b 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -22,6 +22,7 @@ #include #endif #include "util/bpf_map.h" +#include "util/bpf_skel/perf_trace_u.h" #include "util/rlimit.h" #include "builtin.h" #include "util/cgroup.h" @@ -535,6 +536,61 @@ static struct evsel *perf_evsel__raw_syscall_newtp(const char *direction, void * return NULL; } +static struct syscall_tp sys_enter_tp; +static struct syscall_tp sys_exit_tp; + +static int evsel__init_bpf_output_tp(struct evsel *evsel) +{ + struct tep_event *event; + struct tep_format_field *field; + struct syscall_tp *sc; + + if (evsel == NULL) + return 0; + + event = trace_event__tp_format("raw_syscalls", "sys_enter"); + if (IS_ERR(event)) + event = trace_event__tp_format("syscalls", "sys_enter"); + if (IS_ERR(event)) + return PTR_ERR(event); + + field = tep_find_field(event, "id"); + if (field == NULL) + return -EINVAL; + + tp_field__init_uint(&sys_enter_tp.id, field, evsel->needs_swap); + __tp_field__init_ptr(&sys_enter_tp.args, sys_enter_tp.id.offset + sizeof(u64)); + + /* ID is at the same offset, use evsel sc 
for convenience */ + sc = evsel__syscall_tp(evsel); + if (sc == NULL) + return -ENOMEM; + + event = trace_event__tp_format("raw_syscalls", "sys_exit"); + if (IS_ERR(event)) + event = trace_event__tp_format("syscalls", "sys_exit"); + if (IS_ERR(event)) + return PTR_ERR(event); + + field = tep_find_field(event, "id"); + if (field == NULL) + return -EINVAL; + + tp_field__init_uint(&sys_exit_tp.id, field, evsel->needs_swap); + + field = tep_find_field(event, "ret"); + if (field == NULL) + return -EINVAL; + + tp_field__init_uint(&sys_exit_tp.ret, field, evsel->needs_swap); + + /* Save the common part to the evsel sc */ + BUG_ON(sys_enter_tp.id.offset != sys_exit_tp.id.offset); + sc->id = sys_enter_tp.id; + + return 0; +} + #define perf_evsel__sc_tp_uint(evsel, name, sample) \ ({ struct syscall_tp *fields = __evsel__syscall_tp(evsel); \ fields->name.integer(&fields->name, sample); }) @@ -2777,7 +2833,10 @@ static int trace__sys_enter(struct trace *trace, struct evsel *evsel, trace__fprintf_sample(trace, evsel, sample, thread); - args = perf_evsel__sc_tp_ptr(evsel, args, sample); + if (evsel == trace->syscalls.events.bpf_output) + args = sys_enter_tp.args.pointer(&sys_enter_tp.args, sample); + else + args = perf_evsel__sc_tp_ptr(evsel, args, sample); if (ttrace->entry_str == NULL) { ttrace->entry_str = malloc(trace__entry_str_size); @@ -2797,8 +2856,10 @@ static int trace__sys_enter(struct trace *trace, struct evsel *evsel, * thinking that the extra 2 u64 args are the augmented filename, so just check * here and avoid using augmented syscalls when the evsel is the raw_syscalls one. 
*/ - if (evsel != trace->syscalls.events.sys_enter) - augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size); + if (evsel == trace->syscalls.events.bpf_output) { + augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, + trace->raw_augmented_syscalls_args_size); + } ttrace->entry_time = sample->time; msg = ttrace->entry_str; printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name); @@ -2922,7 +2983,10 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel, trace__fprintf_sample(trace, evsel, sample, thread); - ret = perf_evsel__sc_tp_uint(evsel, ret, sample); + if (evsel == trace->syscalls.events.bpf_output) + ret = sys_exit_tp.ret.integer(&sys_exit_tp.ret, sample); + else + ret = perf_evsel__sc_tp_uint(evsel, ret, sample); if (trace->summary) thread__update_stats(thread, ttrace, id, sample, ret, trace); @@ -3252,6 +3316,17 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, } } + if (evsel == trace->syscalls.events.bpf_output) { + short *event_type = sample->raw_data; + + if (*event_type == SYSCALL_TRACE_ENTER) + trace__sys_enter(trace, evsel, event, sample); + else + trace__sys_exit(trace, evsel, event, sample); + + goto printed; + } + trace__printf_interrupted_entry(trace); trace__fprintf_tstamp(trace, sample->time, trace->output); @@ -3261,25 +3336,6 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, if (thread) trace__fprintf_comm_tid(trace, thread, trace->output); - if (evsel == trace->syscalls.events.bpf_output) { - int id = perf_evsel__sc_tp_uint(evsel, id, sample); - int e_machine = thread ? 
thread__e_machine(thread, trace->host) : EM_HOST; - struct syscall *sc = trace__syscall_info(trace, evsel, e_machine, id); - - if (sc) { - fprintf(trace->output, "%s(", sc->name); - trace__fprintf_sys_enter(trace, evsel, sample); - fputc(')', trace->output); - goto newline; - } - - /* - * XXX: Not having the associated syscall info or not finding/adding - * the thread should never happen, but if it does... - * fall thru and print it as a bpf_output event. - */ - } - fprintf(trace->output, "%s(", evsel->name); if (evsel__is_bpf_output(evsel)) { @@ -3299,7 +3355,6 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, } } -newline: fprintf(trace->output, ")\n"); if (callchain_ret > 0) @@ -3307,6 +3362,7 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, else if (callchain_ret < 0) pr_err("Problem processing %s callchain, skipping...\n", evsel__name(evsel)); +printed: ++trace->nr_events_printed; if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) { @@ -4527,7 +4583,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 || perf_thread_map__nr(evlist->core.threads) > 1 || - evlist__first(evlist)->core.attr.inherit; + !trace->opts.no_inherit; /* * Now that we already used evsel->core.attr to ask the kernel to setup the @@ -5552,8 +5608,6 @@ int cmd_trace(int argc, const char **argv) if (err < 0) goto skip_augmentation; - trace__add_syscall_newtp(&trace); - err = augmented_syscalls__create_bpf_output(trace.evlist); if (err == 0) trace.syscalls.events.bpf_output = evlist__last(trace.evlist); @@ -5589,6 +5643,7 @@ int cmd_trace(int argc, const char **argv) if (trace.evlist->core.nr_entries > 0) { bool use_btf = false; + struct evsel *augmented = trace.syscalls.events.bpf_output; evlist__set_default_evsel_handler(trace.evlist, trace__event_handler); if 
(evlist__set_syscall_tp_fields(trace.evlist, &use_btf)) { @@ -5598,6 +5653,16 @@ int cmd_trace(int argc, const char **argv) if (use_btf) trace__load_vmlinux_btf(&trace); + + if (augmented) { + if (evsel__init_bpf_output_tp(augmented) < 0) { + perror("failed to initialize bpf output fields\n"); + goto out; + } + trace.raw_augmented_syscalls_args_size = sys_enter_tp.id.offset; + trace.raw_augmented_syscalls_args_size += (6 + 1) * sizeof(long); + trace.raw_augmented_syscalls = true; + } } /* diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c index 0016deb321fe0d97..979d60d7dce6565b 100644 --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c @@ -7,6 +7,7 @@ */ #include "vmlinux.h" +#include "perf_trace_u.h" #include #include @@ -140,7 +141,7 @@ static inline struct augmented_args_payload *augmented_args_payload(void) return bpf_map_lookup_elem(&augmented_args_tmp, &key); } -static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len) +static inline int augmented__output(void *ctx, void *args, int len) { /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */ return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len); @@ -182,12 +183,20 @@ unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const SEC("tp/raw_syscalls/sys_enter") int sys_enter_unaugmented(struct trace_event_raw_sys_enter *args) { + struct augmented_args_payload *augmented_args = augmented_args_payload(); + + if (augmented_args) + augmented__output(args, &augmented_args->args, sizeof(*args)); return 1; } SEC("tp/raw_syscalls/sys_exit") int sys_exit_unaugmented(struct trace_event_raw_sys_exit *args) { + struct augmented_args_payload *augmented_args = augmented_args_payload(); + + if (augmented_args) + augmented__output(args, &augmented_args->args, 
sizeof(*args)); return 1; } @@ -450,6 +459,7 @@ static int augment_sys_enter(void *ctx, struct trace_event_raw_sys_enter *args) /* copy the sys_enter header, which has the id */ __builtin_memcpy(&payload->args, args, sizeof(*args)); + payload->args.ent.type = SYSCALL_TRACE_ENTER; /* * Determine what type of argument and how many bytes to read from user space, using the @@ -532,13 +542,14 @@ int sys_enter(struct trace_event_raw_sys_enter *args) */ if (pid_filter__has(&pids_filtered, getpid())) - return 0; + return 1; augmented_args = augmented_args_payload(); if (augmented_args == NULL) return 1; bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args); + augmented_args->args.ent.type = SYSCALL_TRACE_ENTER; /* * Jump to syscall specific augmenter, even if the default one, @@ -548,29 +559,35 @@ int sys_enter(struct trace_event_raw_sys_enter *args) if (augment_sys_enter(args, &augmented_args->args)) bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.id); - // If not found on the PROG_ARRAY syscalls map, then we're filtering it: - return 0; + return 1; } SEC("tp/raw_syscalls/sys_exit") int sys_exit(struct trace_event_raw_sys_exit *args) { - struct trace_event_raw_sys_exit exit_args; + struct augmented_args_payload *augmented_args; if (pid_filter__has(&pids_filtered, getpid())) - return 0; + return 1; + + augmented_args = augmented_args_payload(); + if (augmented_args == NULL) + return 1; + + bpf_probe_read_kernel(&augmented_args->args, sizeof(*args), args); + augmented_args->args.ent.type = SYSCALL_TRACE_EXIT; - bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args); /* * Jump to syscall specific return augmenter, even if the default one, * "!raw_syscalls:unaugmented" that will just return 1 to return the * unaugmented tracepoint payload. 
*/ - bpf_tail_call(args, &syscalls_sys_exit, exit_args.id); + bpf_tail_call(args, &syscalls_sys_exit, args->id); /* - * If not found on the PROG_ARRAY syscalls map, then we're filtering it: + * If not found on the PROG_ARRAY syscalls map, then we're filtering it + * by not emitting bpf-output event. */ - return 0; + return 1; } char _license[] SEC("license") = "GPL"; diff --git a/tools/perf/util/bpf_skel/perf_trace_u.h b/tools/perf/util/bpf_skel/perf_trace_u.h new file mode 100644 index 0000000000000000..5b41afa734331d89 --- /dev/null +++ b/tools/perf/util/bpf_skel/perf_trace_u.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2025 Google + +// This file will be shared between BPF and userspace. + +#ifndef __PERF_TRACE_U_H +#define __PERF_TRACE_U_H + +enum syscall_trace_type { + SYSCALL_TRACE_ENTER = 0, + SYSCALL_TRACE_EXIT, +}; + +#endif /* __PERF_TRACE_U_H */ -- 2.51.0.rc1.167.g924127e9c0-goog Now syscall init for augmented arguments is simplified. Let's get rid of dead code. 
Signed-off-by: Namhyung Kim --- tools/perf/builtin-trace.c | 110 ------------------------------------- 1 file changed, 110 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index e1caa82bc427b68b..a7a49d8997d55594 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -470,38 +470,6 @@ static int evsel__init_syscall_tp(struct evsel *evsel) return -ENOMEM; } -static int evsel__init_augmented_syscall_tp(struct evsel *evsel, struct evsel *tp) -{ - struct syscall_tp *sc = evsel__syscall_tp(evsel); - - if (sc != NULL) { - struct tep_format_field *syscall_id = evsel__field(tp, "id"); - if (syscall_id == NULL) - syscall_id = evsel__field(tp, "__syscall_nr"); - if (syscall_id == NULL || - __tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap)) - return -EINVAL; - - return 0; - } - - return -ENOMEM; -} - -static int evsel__init_augmented_syscall_tp_args(struct evsel *evsel) -{ - struct syscall_tp *sc = __evsel__syscall_tp(evsel); - - return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)); -} - -static int evsel__init_augmented_syscall_tp_ret(struct evsel *evsel) -{ - struct syscall_tp *sc = __evsel__syscall_tp(evsel); - - return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap); -} - static int evsel__init_raw_syscall_tp(struct evsel *evsel, void *handler) { if (evsel__syscall_tp(evsel) != NULL) { @@ -5506,7 +5474,6 @@ int cmd_trace(int argc, const char **argv) }; bool __maybe_unused max_stack_user_set = true; bool mmap_pages_user_set = true; - struct evsel *evsel; const char * const trace_subcommands[] = { "record", NULL }; int err = -1; char bf[BUFSIZ]; @@ -5665,83 +5632,6 @@ int cmd_trace(int argc, const char **argv) } } - /* - * If we are augmenting syscalls, then combine what we put in the - * __augmented_syscalls__ BPF map with what is in the - * syscalls:sys_exit_FOO tracepoints, i.e. 
just like we do without BPF, - * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit. - * - * We'll switch to look at two BPF maps, one for sys_enter and the - * other for sys_exit when we start augmenting the sys_exit paths with - * buffers that are being copied from kernel to userspace, think 'read' - * syscall. - */ - if (trace.syscalls.events.bpf_output) { - evlist__for_each_entry(trace.evlist, evsel) { - bool raw_syscalls_sys_exit = evsel__name_is(evsel, "raw_syscalls:sys_exit"); - - if (raw_syscalls_sys_exit) { - trace.raw_augmented_syscalls = true; - goto init_augmented_syscall_tp; - } - - if (trace.syscalls.events.bpf_output->priv == NULL && - strstr(evsel__name(evsel), "syscalls:sys_enter")) { - struct evsel *augmented = trace.syscalls.events.bpf_output; - if (evsel__init_augmented_syscall_tp(augmented, evsel) || - evsel__init_augmented_syscall_tp_args(augmented)) - goto out; - /* - * Augmented is __augmented_syscalls__ BPF_OUTPUT event - * Above we made sure we can get from the payload the tp fields - * that we get from syscalls:sys_enter tracefs format file. - */ - augmented->handler = trace__sys_enter; - /* - * Now we do the same for the *syscalls:sys_enter event so that - * if we handle it directly, i.e. if the BPF prog returns 0 so - * as not to filter it, then we'll handle it just like we would - * for the BPF_OUTPUT one: - */ - if (evsel__init_augmented_syscall_tp(evsel, evsel) || - evsel__init_augmented_syscall_tp_args(evsel)) - goto out; - evsel->handler = trace__sys_enter; - } - - if (strstarts(evsel__name(evsel), "syscalls:sys_exit_")) { - struct syscall_tp *sc; -init_augmented_syscall_tp: - if (evsel__init_augmented_syscall_tp(evsel, evsel)) - goto out; - sc = __evsel__syscall_tp(evsel); - /* - * For now with BPF raw_augmented we hook into - * raw_syscalls:sys_enter and there we get all - * 6 syscall args plus the tracepoint common - * fields and the syscall_nr (another long). 
- * So we check if that is the case and if so - * don't look after the sc->args_size but - * always after the full raw_syscalls:sys_enter - * payload, which is fixed. - * - * We'll revisit this later to pass - * s->args_size to the BPF augmenter (now - * tools/perf/examples/bpf/augmented_raw_syscalls.c, - * so that it copies only what we need for each - * syscall, like what happens when we use - * syscalls:sys_enter_NAME, so that we reduce - * the kernel/userspace traffic to just what is - * needed for each syscall. - */ - if (trace.raw_augmented_syscalls) - trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset; - evsel__init_augmented_syscall_tp_ret(evsel); - evsel->handler = trace__sys_exit; - } - } - } - if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) { err = trace__record(&trace, argc-1, &argv[1]); goto out; -- 2.51.0.rc1.167.g924127e9c0-goog Now it's safe to run multiple perf trace commands at the same time. Let's make them non-exclusive so that they can run in parallel. 
$ sudo perf test 'perf trace' 113: Check open filename arg using perf trace + vfs_getname : Skip 114: perf trace enum augmentation tests : Ok 115: perf trace BTF general tests : Ok 116: perf trace exit race : Ok 117: perf trace record and replay : Ok 118: perf trace summary : Ok Signed-off-by: Namhyung Kim --- tools/perf/tests/shell/trace+probe_vfs_getname.sh | 2 +- tools/perf/tests/shell/trace_summary.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/shell/trace+probe_vfs_getname.sh b/tools/perf/tests/shell/trace+probe_vfs_getname.sh index 7a0b1145d0cd744b..ff7c2f8d41db5802 100755 --- a/tools/perf/tests/shell/trace+probe_vfs_getname.sh +++ b/tools/perf/tests/shell/trace+probe_vfs_getname.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Check open filename arg using perf trace + vfs_getname (exclusive) +# Check open filename arg using perf trace + vfs_getname # Uses the 'perf test shell' library to add probe:vfs_getname to the system # then use it with 'perf trace' using 'touch' to write to a temp file, then diff --git a/tools/perf/tests/shell/trace_summary.sh b/tools/perf/tests/shell/trace_summary.sh index 22e2651d59191676..1a99a125492955ad 100755 --- a/tools/perf/tests/shell/trace_summary.sh +++ b/tools/perf/tests/shell/trace_summary.sh @@ -1,5 +1,5 @@ #!/bin/bash -# perf trace summary (exclusive) +# perf trace summary # SPDX-License-Identifier: GPL-2.0 # Check that perf trace works with various summary mode -- 2.51.0.rc1.167.g924127e9c0-goog