From: Jakub Brnak Replace custom syscall structs with the standard trace_event_raw_sys_enter and trace_event_raw_sys_exit from vmlinux.h. This fixes a data structure misalignment issue discovered on RHEL-9, which prevented BPF programs from correctly accessing syscall arguments. This change also aims to improve compatibility between different versions of the perf tool and kernel by using CO-RE so BPF code can correctly adjust field offsets. Signed-off-by: Jakub Brnak [ coding style updates and fix a BPF verifier issue ] Signed-off-by: Namhyung Kim --- .../bpf_skel/augmented_raw_syscalls.bpf.c | 62 ++++++++----------- tools/perf/util/bpf_skel/vmlinux/vmlinux.h | 14 +++++ 2 files changed, 40 insertions(+), 36 deletions(-) diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c index cb86e261b4de0685..2c9bcc6b8cb0c06c 100644 --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c @@ -60,18 +60,6 @@ struct syscalls_sys_exit { __uint(max_entries, 512); } syscalls_sys_exit SEC(".maps"); -struct syscall_enter_args { - unsigned long long common_tp_fields; - long syscall_nr; - unsigned long args[6]; -}; - -struct syscall_exit_args { - unsigned long long common_tp_fields; - long syscall_nr; - long ret; -}; - /* * Desired design of maximum size and alignment (see RFC2553) */ @@ -115,7 +103,7 @@ struct pids_filtered { } pids_filtered SEC(".maps"); struct augmented_args_payload { - struct syscall_enter_args args; + struct trace_event_raw_sys_enter args; struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc) }; @@ -135,7 +123,7 @@ struct beauty_map_enter { } beauty_map_enter SEC(".maps"); struct beauty_payload_enter { - struct syscall_enter_args args; + struct trace_event_raw_sys_enter args; struct augmented_arg aug_args[6]; }; @@ -192,7 +180,7 @@ unsigned int augmented_arg__read_str(struct augmented_arg 
*augmented_arg, const } SEC("tp/raw_syscalls/sys_enter") -int syscall_unaugmented(struct syscall_enter_args *args) +int syscall_unaugmented(struct trace_event_raw_sys_enter *args) { return 1; } @@ -204,7 +192,7 @@ int syscall_unaugmented(struct syscall_enter_args *args) * filename. */ SEC("tp/syscalls/sys_enter_connect") -int sys_enter_connect(struct syscall_enter_args *args) +int sys_enter_connect(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *sockaddr_arg = (const void *)args->args[1]; @@ -225,7 +213,7 @@ int sys_enter_connect(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_sendto") -int sys_enter_sendto(struct syscall_enter_args *args) +int sys_enter_sendto(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *sockaddr_arg = (const void *)args->args[4]; @@ -243,7 +231,7 @@ int sys_enter_sendto(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_open") -int sys_enter_open(struct syscall_enter_args *args) +int sys_enter_open(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *filename_arg = (const void *)args->args[0]; @@ -258,7 +246,7 @@ int sys_enter_open(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_openat") -int sys_enter_openat(struct syscall_enter_args *args) +int sys_enter_openat(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *filename_arg = (const void *)args->args[1]; @@ -273,7 +261,7 @@ int sys_enter_openat(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_rename") -int sys_enter_rename(struct syscall_enter_args *args) +int sys_enter_rename(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *oldpath_arg = (const void 
*)args->args[0], @@ -304,7 +292,7 @@ int sys_enter_rename(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_renameat2") -int sys_enter_renameat2(struct syscall_enter_args *args) +int sys_enter_renameat2(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *oldpath_arg = (const void *)args->args[1], @@ -346,7 +334,7 @@ struct perf_event_attr_size { }; SEC("tp/syscalls/sys_enter_perf_event_open") -int sys_enter_perf_event_open(struct syscall_enter_args *args) +int sys_enter_perf_event_open(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read; @@ -378,7 +366,7 @@ int sys_enter_perf_event_open(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_clock_nanosleep") -int sys_enter_clock_nanosleep(struct syscall_enter_args *args) +int sys_enter_clock_nanosleep(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *rqtp_arg = (const void *)args->args[2]; @@ -399,7 +387,7 @@ int sys_enter_clock_nanosleep(struct syscall_enter_args *args) } SEC("tp/syscalls/sys_enter_nanosleep") -int sys_enter_nanosleep(struct syscall_enter_args *args) +int sys_enter_nanosleep(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args = augmented_args_payload(); const void *req_arg = (const void *)args->args[0]; @@ -429,7 +417,7 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid) return bpf_map_lookup_elem(pids, &pid) != NULL; } -static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) +static int augment_sys_enter(void *ctx, struct trace_event_raw_sys_enter *args) { bool augmented, do_output = false; int zero = 0, index, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, 
value); @@ -444,7 +432,7 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) return 1; /* use syscall number to get beauty_map entry */ - nr = (__u32)args->syscall_nr; + nr = (__u32)args->id; beauty_map = bpf_map_lookup_elem(&beauty_map_enter, &nr); /* set up payload for output */ @@ -454,8 +442,8 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) if (beauty_map == NULL || payload == NULL) return 1; - /* copy the sys_enter header, which has the syscall_nr */ - __builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args)); + /* copy the sys_enter header, which has the id */ + __builtin_memcpy(&payload->args, args, sizeof(*args)); /* * Determine what type of argument and how many bytes to read from user space, using the @@ -489,9 +477,11 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) index = -(size + 1); barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick. index &= 7; // Satisfy the bounds checking with the verifier in some kernels. - aug_size = args->args[index] > TRACE_AUG_MAX_BUF ? 
TRACE_AUG_MAX_BUF : args->args[index]; + aug_size = args->args[index]; if (aug_size > 0) { + if (aug_size > TRACE_AUG_MAX_BUF) + aug_size = TRACE_AUG_MAX_BUF; if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg)) augmented = true; } @@ -515,14 +505,14 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) } } - if (!do_output || (sizeof(struct syscall_enter_args) + output) > sizeof(struct beauty_payload_enter)) + if (!do_output || (sizeof(*args) + output) > sizeof(*payload)) return 1; - return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output); + return augmented__beauty_output(ctx, payload, sizeof(*args) + output); } SEC("tp/raw_syscalls/sys_enter") -int sys_enter(struct syscall_enter_args *args) +int sys_enter(struct trace_event_raw_sys_enter *args) { struct augmented_args_payload *augmented_args; /* @@ -550,16 +540,16 @@ int sys_enter(struct syscall_enter_args *args) * unaugmented tracepoint payload. */ if (augment_sys_enter(args, &augmented_args->args)) - bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr); + bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.id); // If not found on the PROG_ARRAY syscalls map, then we're filtering it: return 0; } SEC("tp/raw_syscalls/sys_exit") -int sys_exit(struct syscall_exit_args *args) +int sys_exit(struct trace_event_raw_sys_exit *args) { - struct syscall_exit_args exit_args; + struct trace_event_raw_sys_exit exit_args; if (pid_filter__has(&pids_filtered, getpid())) return 0; @@ -570,7 +560,7 @@ int sys_exit(struct syscall_exit_args *args) * "!raw_syscalls:unaugmented" that will just return 1 to return the * unaugmented tracepoint payload. 
*/ - bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr); + bpf_tail_call(args, &syscalls_sys_exit, exit_args.id); /* * If not found on the PROG_ARRAY syscalls map, then we're filtering it: */ diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h index a59ce912be18cd0f..b8b2347268633cdf 100644 --- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h +++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h @@ -212,4 +212,18 @@ struct pglist_data { int nr_zones; } __attribute__((preserve_access_index)); +struct trace_event_raw_sys_enter { + struct trace_entry ent; + long int id; + long unsigned int args[6]; + char __data[0]; +} __attribute__((preserve_access_index)); + +struct trace_event_raw_sys_exit { + struct trace_entry ent; + long int id; + long int ret; + char __data[0]; +} __attribute__((preserve_access_index)); + #endif // __VMLINUX_H -- 2.51.0.rc1.167.g924127e9c0-goog