From: Alexei Starovoitov Though hrtimer_start/cancel() inline all of the smaller helpers in hrtimer.c and only call timerqueue_add/del() from lib/timerqueue.c where everything is not traceable and not kprobe-able (because all files in lib/ are not traceable), there are tracepoints within hrtimer that are called with locks held. Therefore prevent the deadlock by tightening conditions when timer/wq can be called synchronously. hrtimer/wq are using raw_spin_lock_irqsave(), so irqs_disabled() is enough. Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d4aedac14a60..0517e9a8fc7c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1430,8 +1430,6 @@ static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op, u64 nsec, u32 timer_mode) { - WARN_ON_ONCE(!in_hardirq()); - struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE); if (!cmd) { @@ -1473,6 +1471,11 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = { .arg2_type = ARG_PTR_TO_FUNC, }; +static bool defer_timer_wq_op(void) +{ + return in_hardirq() || irqs_disabled(); +} + BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; @@ -1500,7 +1503,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, fla if (!refcount_inc_not_zero(&t->cb.refcnt)) return -ENOENT; - if (!in_hardirq()) { + if (!defer_timer_wq_op()) { hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); bpf_async_refcount_put(&t->cb); return 0; @@ -1524,7 +1527,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async) bool inc = false; int ret = 0; - if (in_hardirq()) + if (defer_timer_wq_op()) return -EOPNOTSUPP; t = 
READ_ONCE(async->timer); @@ -1650,7 +1653,7 @@ static void bpf_async_cancel_and_free(struct bpf_async_kern *async) * refcnt. Either synchronously or asynchronously in irq_work. */ - if (!in_hardirq()) { + if (!defer_timer_wq_op()) { bpf_async_process_op(cb, BPF_ASYNC_CANCEL, 0, 0); } else { (void)bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); @@ -3161,7 +3164,7 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) if (!refcount_inc_not_zero(&w->cb.refcnt)) return -ENOENT; - if (!in_hardirq()) { + if (!defer_timer_wq_op()) { schedule_work(&w->work); bpf_async_refcount_put(&w->cb); return 0; @@ -4461,7 +4464,7 @@ __bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer) if (!refcount_inc_not_zero(&cb->refcnt)) return -ENOENT; - if (!in_hardirq()) { + if (!defer_timer_wq_op()) { struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); ret = hrtimer_try_to_cancel(&t->timer); -- 2.47.3 From: Alexei Starovoitov Add a testcase that checks that deadlock avoidance is working as expected. Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/timer_start_deadlock.c | 33 ++++++++ .../bpf/progs/timer_start_deadlock.c | 75 +++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c create mode 100644 tools/testing/selftests/bpf/progs/timer_start_deadlock.c diff --git a/tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c b/tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c new file mode 100644 index 000000000000..9f1f9aec8888 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ +#include <test_progs.h> +#include "timer_start_deadlock.skel.h" +void test_timer_start_deadlock(void) +{ + struct timer_start_deadlock *skel; + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, opts); + + skel = timer_start_deadlock__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = timer_start_deadlock__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.start_timer); + + /* + * Run the syscall program that attempts to deadlock. + * If the kernel deadlocks, this call will never return. + */ + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_OK(err, "prog_test_run"); + ASSERT_EQ(opts.retval, 0, "prog_retval"); + + ASSERT_EQ(skel->bss->tp_called, 1, "tp_called"); +cleanup: + timer_start_deadlock__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/timer_start_deadlock.c b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c new file mode 100644 index 000000000000..368563747a46 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define CLOCK_MONOTONIC 1 + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} timer_map SEC(".maps"); + +volatile int in_timer_start; +volatile int tp_called; + +static int timer_cb(void *map, int *key, struct elem *value) +{ + return 0; +} + +SEC("tp_btf/hrtimer_cancel") +int BPF_PROG(tp_hrtimer_cancel, struct hrtimer *hrtimer) +{ + struct bpf_timer *timer; + static bool called = false; + int key = 0; + + if (!in_timer_start) + return 0; + + tp_called = 1; + timer = bpf_map_lookup_elem(&timer_map, &key); + + /* + * Call bpf_timer_start() from the tracepoint within hrtimer logic + * on the same timer to make sure it doesn't deadlock, + * and do it once. + */ + if (!called) { + called = true; + bpf_timer_start(timer, 1000000000, 0); + } + return 0; +} + +SEC("syscall") +int start_timer(void *ctx) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer_map, &key); + /* claude may complain here that there is no NULL check. Ignoring it. */ + bpf_timer_init(timer, &timer_map, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, timer_cb); + + /* + * call hrtimer_start() twice, so that 2nd call does + * remove_hrtimer() and trace_hrtimer_cancel() tracepoint. + */ + in_timer_start = 1; + bpf_timer_start(timer, 1000000000, 0); + bpf_timer_start(timer, 1000000000, 0); + in_timer_start = 0; + return 0; +} -- 2.47.3 From: Alexei Starovoitov Do not schedule timer/wq operation on a cpu that is in irq_work callback that is processing async_cmds queue. Otherwise the following loop is possible: bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue(). irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start(). 
Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0517e9a8fc7c..01052f8664eb 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1427,9 +1427,23 @@ static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, return 0; } +static DEFINE_PER_CPU(struct bpf_async_cb *, async_cb_running); + static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op, u64 nsec, u32 timer_mode) { + /* + * Do not schedule another operation on this cpu if it's in irq_work + * callback that is processing async_cmds queue. Otherwise the following + * loop is possible: + * bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue(). + * irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start(). + */ + if (this_cpu_read(async_cb_running) == cb) { + bpf_async_refcount_put(cb); + return -EDEADLK; + } + struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE); if (!cmd) { @@ -1628,6 +1642,7 @@ static void bpf_async_irq_worker(struct irq_work *work) return; list = llist_reverse_order(list); + this_cpu_write(async_cb_running, cb); llist_for_each_safe(pos, n, list) { struct bpf_async_cmd *cmd; @@ -1635,6 +1650,7 @@ static void bpf_async_irq_worker(struct irq_work *work) bpf_async_process_op(cb, cmd->op, cmd->nsec, cmd->mode); kfree_nolock(cmd); } + this_cpu_write(async_cb_running, NULL); } static void bpf_async_cancel_and_free(struct bpf_async_kern *async) -- 2.47.3 From: Alexei Starovoitov Strengthen timer_start_deadlock test and check for recursion now Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/timer_start_deadlock.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/timer_start_deadlock.c 
b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c index 368563747a46..019518ee18cd 100644 --- a/tools/testing/selftests/bpf/progs/timer_start_deadlock.c +++ b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c @@ -31,7 +31,6 @@ SEC("tp_btf/hrtimer_cancel") int BPF_PROG(tp_hrtimer_cancel, struct hrtimer *hrtimer) { struct bpf_timer *timer; - static bool called = false; int key = 0; if (!in_timer_start) @@ -42,13 +41,9 @@ int BPF_PROG(tp_hrtimer_cancel, struct hrtimer *hrtimer) /* * Call bpf_timer_start() from the tracepoint within hrtimer logic - * on the same timer to make sure it doesn't deadlock, - * and do it once. + * on the same timer to make sure it doesn't deadlock. */ - if (!called) { - called = true; - bpf_timer_start(timer, 1000000000, 0); - } + bpf_timer_start(timer, 1000000000, 0); return 0; } -- 2.47.3