From: Mykyta Yatsenko Add BPF_TRACE_RAW_TP to the set of tracing program attach types that can be loaded as sleepable in can_be_sleepable(). The actual enforcement that the target tracepoint supports sleepable execution (i.e., is faultable) is deferred to attach time, since the target tracepoint is not known at program load time. Signed-off-by: Mykyta Yatsenko --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0162f946032fe317ce1e5cf4a82e86a9357eca2b..1973184306e41d1279bafe7b2870cc9f5772c40e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -25186,6 +25186,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: case BPF_TRACE_FSESSION: + case BPF_TRACE_RAW_TP: return true; default: return false; @@ -25215,7 +25216,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } if (prog->sleepable && !can_be_sleepable(prog)) { - verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); + verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, struct_ops, and raw_tp programs can be sleepable\n"); return -EINVAL; } -- 2.53.0 From: Mykyta Yatsenko Add an attach-time check in bpf_raw_tp_link_attach() to ensure that sleepable BPF programs can only attach to faultable tracepoints. Faultable tracepoints (e.g., sys_enter, sys_exit) are guaranteed to run in a context where sleeping is safe, using rcu_tasks_trace for protection. Non-faultable tracepoints may run in NMI or other non-sleepable contexts. This complements the verifier-side change that allows BPF_TRACE_RAW_TP programs to be loaded as sleepable. 
Signed-off-by: Mykyta Yatsenko --- kernel/bpf/syscall.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dd89bf809772501c109789b39dd5f58f688354e2..af6c93d332fb70a8a14d7768048000b0b7f34650 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4261,6 +4261,11 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, if (!btp) return -ENOENT; + if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { + err = -EINVAL; + goto out_put_btp; + } + link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; -- 2.53.0 From: Mykyta Yatsenko Remove preempt_disable_notrace()/preempt_enable_notrace() from __BPF_DECLARE_TRACE_SYSCALL, the BPF probe callback wrapper for faultable (syscall) tracepoints. Preemption management is now handled inside __bpf_trace_run(): migrate_disable()/migrate_enable() wrap every program run, and non-sleepable programs additionally run under rcu_read_lock() (which implies preempt-off in non-PREEMPT_RCU configs). This allows sleepable BPF programs to actually sleep when attached to faultable tracepoints. Signed-off-by: Mykyta Yatsenko --- include/trace/bpf_probe.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 9391d54d3f124ab0d56ec57445cfc79baeffc28c..d1de8f9aa07fb76e9ee8037ce43099efb95b05d5 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -58,9 +58,7 @@ static notrace void \ __bpf_trace_##call(void *__data, proto) \ { \ might_fault(); \ - preempt_disable_notrace(); \ CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \ - preempt_enable_notrace(); \ } #undef DECLARE_EVENT_SYSCALL_CLASS -- 2.53.0 From: Mykyta Yatsenko Modify __bpf_trace_run() to support both sleepable and non-sleepable BPF programs.
When the program is sleepable: - Skip cant_sleep() and instead call might_fault() to annotate the faultable context - Run without rcu_read_lock()/rcu_read_unlock(), relying on migrate_disable()/migrate_enable() (now taken for every program) to allow sleeping while still protecting percpu data access - The outer rcu_tasks_trace lock is already held by the faultable tracepoint callback (__DECLARE_TRACE_SYSCALL), providing lifetime protection for the BPF program For non-sleepable programs, rcu_read_lock() protection is kept, and cant_sleep() is asserted only when the link is not marked sleepable, since a non-sleepable program may also be attached to a faultable tracepoint. This allows multiple BPF programs with different sleepable settings to coexist on the same faultable tracepoint, since __bpf_trace_run() is invoked per-link. Signed-off-by: Mykyta Yatsenko --- kernel/trace/bpf_trace.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f7baeb8278cac8c4f93bfd62469dab26586f4531..6ac9f2dc36549269937c58f4d4add244d7739edb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2076,7 +2076,8 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; - cant_sleep(); + migrate_disable(); + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); goto out; @@ -2085,13 +2086,26 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) run_ctx.bpf_cookie = link->cookie; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - rcu_read_lock(); - (void) bpf_prog_run(prog, args); - rcu_read_unlock(); + if (prog->sleepable) { + might_fault(); + (void)bpf_prog_run(prog, args); + } else { + /* + * Non-sleepable programs may run in the faultable context, + * do cant_sleep() only if program is non-sleepable and context + * is non-faultable.
+ */ + if (!link->link.sleepable) + cant_sleep(); + rcu_read_lock(); + (void)bpf_prog_run(prog, args); + rcu_read_unlock(); + } bpf_reset_run_ctx(old_run_ctx); out: bpf_prog_put_recursion_context(prog); + migrate_enable(); } #define UNPACK(...) __VA_ARGS__ -- 2.53.0 From: Mykyta Yatsenko Add SEC_DEF for "tp_btf.s+" section prefix, enabling userspace BPF programs to use SEC("tp_btf.s/") to load sleepable raw tracepoint programs. This follows the existing pattern used for fentry.s, fexit.s, fmod_ret.s, and lsm.s section definitions. Signed-off-by: Mykyta Yatsenko --- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0c8bf0b5cce44462414e99a958260e940148e818..e5be88238d3fe589b0ded1c46bf03f26a4e64bb4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9854,6 +9854,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("raw_tracepoint.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), SEC_DEF("raw_tp.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), SEC_DEF("tp_btf+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("tp_btf.s+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), SEC_DEF("fentry+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF, attach_trace), SEC_DEF("fmod_ret+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF, attach_trace), SEC_DEF("fexit+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF, attach_trace), -- 2.53.0 From: Mykyta Yatsenko Add two subtests: - success: Attach a sleepable BPF program to the faultable sys_enter tracepoint (tp_btf.s/sys_enter). Verify the program is triggered by a syscall. - reject_non_faultable: Attempt to attach a sleepable BPF program to a non-faultable tracepoint (tp_btf.s/sched_switch). Verify that attachment is rejected. 
Signed-off-by: Mykyta Yatsenko --- .../selftests/bpf/prog_tests/sleepable_raw_tp.c | 56 ++++++++++++++++++++++ .../selftests/bpf/progs/test_sleepable_raw_tp.c | 43 +++++++++++++++++ .../bpf/progs/test_sleepable_raw_tp_fail.c | 16 +++++++ tools/testing/selftests/bpf/verifier/sleepable.c | 5 +- 4 files changed, 117 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sleepable_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/sleepable_raw_tp.c new file mode 100644 index 0000000000000000000000000000000000000000..9b0ec7cc4cacf6ee3d2e0cdc23f63388c9613384 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sleepable_raw_tp.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "test_sleepable_raw_tp.skel.h" +#include "test_sleepable_raw_tp_fail.skel.h" + +static void test_sleepable_raw_tp_success(void) +{ + struct test_sleepable_raw_tp *skel; + int err; + + skel = test_sleepable_raw_tp__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_load")) + return; + + skel->bss->target_pid = getpid(); + + err = test_sleepable_raw_tp__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + syscall(__NR_nanosleep, &(struct timespec){ .tv_nsec = 555 }, NULL); + + ASSERT_EQ(skel->bss->triggered, 1, "triggered"); + ASSERT_EQ(skel->bss->err, 0, "err"); + ASSERT_EQ(skel->bss->copied_tv_nsec, 555, "copied_tv_nsec"); + +cleanup: + test_sleepable_raw_tp__destroy(skel); +} + +static void test_sleepable_raw_tp_reject(void) +{ + struct test_sleepable_raw_tp_fail *skel; + int err; + + skel = test_sleepable_raw_tp_fail__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_load")) + goto cleanup; + + err = test_sleepable_raw_tp_fail__attach(skel); + ASSERT_ERR(err, "skel_attach_should_fail"); + +cleanup: + test_sleepable_raw_tp_fail__destroy(skel); +} + +void test_sleepable_raw_tp(void) +{ + if (test__start_subtest("success")) + 
test_sleepable_raw_tp_success(); + if (test__start_subtest("reject_non_faultable")) + test_sleepable_raw_tp_reject(); +} diff --git a/tools/testing/selftests/bpf/progs/test_sleepable_raw_tp.c b/tools/testing/selftests/bpf/progs/test_sleepable_raw_tp.c new file mode 100644 index 0000000000000000000000000000000000000000..ebacc766df573c4ab725202b90c0a9b6d32970a1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sleepable_raw_tp.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +int target_pid; +int triggered; +long err; +long copied_tv_nsec; + +SEC("tp_btf.s/sys_enter") +int BPF_PROG(test_sleepable_sys_enter, struct pt_regs *regs, long id) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct __kernel_timespec *ts; + long tv_nsec; + + if (task->pid != target_pid) + return 0; + + if (id != __NR_nanosleep) + return 0; + + ts = (void *)PT_REGS_PARM1_CORE_SYSCALL(regs); + + /* + * Use bpf_copy_from_user() - a sleepable helper - to read user memory. + * This exercises the sleepable execution path of raw tracepoints. + */ + err = bpf_copy_from_user(&tv_nsec, sizeof(tv_nsec), &ts->tv_nsec); + if (err) + return err; + + copied_tv_nsec = tv_nsec; + triggered = 1; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_sleepable_raw_tp_fail.c b/tools/testing/selftests/bpf/progs/test_sleepable_raw_tp_fail.c new file mode 100644 index 0000000000000000000000000000000000000000..ef5dc3888df6d826f6b1d1adb211b439b71d6322 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sleepable_raw_tp_fail.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +/* Sleepable program on a non-faultable tracepoint should fail at attach */ +SEC("tp_btf.s/sched_switch") +int BPF_PROG(test_sleepable_sched_switch, bool preempt, + struct task_struct *prev, struct task_struct *next) +{ + return 0; +} diff --git a/tools/testing/selftests/bpf/verifier/sleepable.c b/tools/testing/selftests/bpf/verifier/sleepable.c index 1f0d2bdc673f6e84e8e44be96c72977da0f73ab7..39522b7cd317080de42233afc180347b49fdff34 100644 --- a/tools/testing/selftests/bpf/verifier/sleepable.c +++ b/tools/testing/selftests/bpf/verifier/sleepable.c @@ -76,7 +76,7 @@ .runs = -1, }, { - "sleepable raw tracepoint reject", + "sleepable raw tracepoint accept", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), @@ -84,8 +84,7 @@ .prog_type = BPF_PROG_TYPE_TRACING, .expected_attach_type = BPF_TRACE_RAW_TP, .kfunc = "sched_switch", - .result = REJECT, - .errstr = "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable", + .result = ACCEPT, .flags = BPF_F_SLEEPABLE, .runs = -1, }, -- 2.53.0