From: Mykyta Yatsenko

Refactor the bpf timer and workqueue helpers so they can be called from
NMI context by making all operations lock-free and deferring NMI-unsafe
work to irq_work.

Previously, bpf_timer_start() and bpf_wq_start() could not be called
from NMI context because they acquired bpf_spin_lock and called the
hrtimer/schedule_work APIs directly. This patch removes these
limitations.

Key changes:

* Remove bpf_spin_lock from struct bpf_async_kern. Replace locked
  operations with atomic cmpxchg() for initialization and xchg() for
  cancel and free.

* Add a per-async irq_work to defer NMI-unsafe operations
  (hrtimer_start, hrtimer_try_to_cancel, schedule_work) from NMI to
  softirq context.

* Use the lock-free seqcount_latch_t to pass operation commands
  (start/cancel/free) along with their parameters (nsec, mode) from
  NMI-safe callers to the irq_work handler.

* Add reference counting to bpf_async_cb to ensure the object stays
  alive until all scheduled irq_work completes and the timer/work
  callback finishes.

* Move bpf_prog_put() to an RCU callback to handle races between
  set_callback() and cancel_and_free().

* Refactor __bpf_async_set_callback() to get rid of locks. Each
  iteration acquires a new prog reference and stores it in cb->prog via
  xchg(); the previous value is retrieved and released. The loop
  condition checks whether both cb->prog and cb->callback_fn still
  match what we just wrote. If either differs, a concurrent writer
  overwrote our value and we must retry. When we retry, the prog we
  previously stored has either already been put() by the concurrent
  writer, or we put() the value received back from xchg() ourselves.

This enables BPF programs attached to NMI-context hooks (perf events)
to use timers and workqueues for deferred processing.

Signed-off-by: Mykyta Yatsenko
---
 kernel/bpf/helpers.c | 335 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 225 insertions(+), 110 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 61ba4f6b741cc05b4a7a73a0322a23874bfd8e83..8c1ed75af072f64f2a63afd90ad40a962e8e46b0 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "../../lib/kstrtox.h"
 
@@ -1095,6 +1096,23 @@ static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
 	return (void *)value - round_up(map->key_size, 8);
 }
 
+enum bpf_async_type {
+	BPF_ASYNC_TYPE_TIMER = 0,
+	BPF_ASYNC_TYPE_WQ,
+};
+
+enum bpf_async_op {
+	BPF_ASYNC_START,
+	BPF_ASYNC_CANCEL,
+	BPF_ASYNC_CANCEL_AND_FREE,
+};
+
+struct bpf_async_cmd {
+	u64 nsec;
+	u32 mode;
+	u32 op;
+};
+
 struct bpf_async_cb {
 	struct bpf_map *map;
 	struct bpf_prog *prog;
@@ -1105,6 +1123,13 @@ struct bpf_async_cb {
 		struct work_struct delete_work;
 	};
 	u64 flags;
+	struct irq_work worker;
+	atomic_t writer;
+	seqcount_latch_t latch;
+	struct bpf_async_cmd cmd[2];
+	atomic_t last_seq;
+	refcount_t refcnt;
+	enum bpf_async_type type;
 };
 
 /* BPF map elements can contain 'struct bpf_timer'.
@@ -1142,18 +1167,9 @@ struct bpf_async_kern {
 		struct bpf_hrtimer *timer;
 		struct bpf_work *work;
 	};
-	/* bpf_spin_lock is used here instead of spinlock_t to make
-	 * sure that it always fits into space reserved by struct bpf_timer
-	 * regardless of LOCKDEP and spinlock debug flags.
-	 */
-	struct bpf_spin_lock lock;
+	u32 __opaque;
 } __attribute__((aligned(8)));
 
-enum bpf_async_type {
-	BPF_ASYNC_TYPE_TIMER = 0,
-	BPF_ASYNC_TYPE_WQ,
-};
-
 static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
 
 static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
@@ -1219,6 +1235,13 @@ static void bpf_async_cb_rcu_free(struct rcu_head *rcu)
 {
 	struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
 
+	/*
+	 * Drop the last reference to prog only after RCU GP, as set_callback()
+	 * may race with cancel_and_free()
+	 */
+	if (cb->prog)
+		bpf_prog_put(cb->prog);
+
 	kfree_nolock(cb);
 }
 
@@ -1246,18 +1269,17 @@ static void bpf_timer_delete_work(struct work_struct *work)
 	call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
 }
 
+static void __bpf_async_cancel_and_free(struct bpf_async_kern *async);
+static void bpf_async_irq_worker(struct irq_work *work);
+
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
 			    enum bpf_async_type type)
 {
-	struct bpf_async_cb *cb;
+	struct bpf_async_cb *cb, *old_cb;
 	struct bpf_hrtimer *t;
 	struct bpf_work *w;
 	clockid_t clockid;
 	size_t size;
-	int ret = 0;
-
-	if (in_nmi())
-		return -EOPNOTSUPP;
 
 	switch (type) {
 	case BPF_ASYNC_TYPE_TIMER:
@@ -1270,18 +1292,13 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		return -EINVAL;
 	}
 
-	__bpf_spin_lock_irqsave(&async->lock);
-	t = async->timer;
-	if (t) {
-		ret = -EBUSY;
-		goto out;
-	}
+	old_cb = READ_ONCE(async->cb);
+	if (old_cb)
+		return -EBUSY;
 
 	cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
-	if (!cb) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!cb)
+		return -ENOMEM;
 
 	switch (type) {
 	case BPF_ASYNC_TYPE_TIMER:
@@ -1304,9 +1321,20 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 	cb->map = map;
 	cb->prog = NULL;
 	cb->flags = flags;
+	cb->worker = IRQ_WORK_INIT(bpf_async_irq_worker);
+	seqcount_latch_init(&cb->latch);
+	atomic_set(&cb->writer, 0);
+	refcount_set(&cb->refcnt, 1); /* map's reference */
+	atomic_set(&cb->last_seq, 0);
+	cb->type = type;
 	rcu_assign_pointer(cb->callback_fn, NULL);
 
-	WRITE_ONCE(async->cb, cb);
+	old_cb = cmpxchg(&async->cb, NULL, cb);
+	if (old_cb) {
+		/* Lost the race to initialize this bpf_async_kern, drop the allocated object */
+		kfree_nolock(cb);
+		return -EBUSY;
+	}
 	/* Guarantee the order between async->cb and map->usercnt. So
 	 * when there are concurrent uref release and bpf timer init, either
 	 * bpf_timer_cancel_and_free() called by uref release reads a no-NULL
@@ -1317,13 +1345,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		/* maps with timers must be either held by user space
 		 * or pinned in bpffs.
 		 */
-		WRITE_ONCE(async->cb, NULL);
-		kfree_nolock(cb);
-		ret = -EPERM;
+		__bpf_async_cancel_and_free(async);
+		return -EPERM;
 	}
-out:
-	__bpf_spin_unlock_irqrestore(&async->lock);
-	return ret;
+
+	return 0;
 }
 
 BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
@@ -1388,33 +1414,95 @@ static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, void *callbac
 	return 0;
 }
 
+/* Decrements bpf_async_cb refcnt, if it becomes 0 schedule cleanup irq_work */
+static void bpf_async_refcnt_dec_cleanup(struct bpf_async_cb *cb)
+{
+	if (!refcount_dec_and_test(&cb->refcnt))
+		return;
+
+	/*
+	 * At this point we took the last reference
+	 * Try to schedule cleanup, either:
+	 * - Set ref to 1 and succeed irq_work_queue
+	 * - See non-zero refcnt after decrement - other irq_work is going to cleanup
+	 */
+	do {
+		refcount_set(&cb->refcnt, 1);
+		if (irq_work_queue(&cb->worker))
+			break;
+	} while (refcount_dec_and_test(&cb->refcnt));
+}
+
+static int bpf_async_schedule_op(struct bpf_async_cb *cb, u32 op, u64 nsec, u32 timer_mode)
+{
+	/* Acquire active writer atomic*/
+	if (atomic_cmpxchg_acquire(&cb->writer, 0, 1))
+		return -EBUSY;
+
+	write_seqcount_latch_begin(&cb->latch);
+	cb->cmd[0].nsec = nsec;
+	cb->cmd[0].mode = timer_mode;
+	cb->cmd[0].op = op;
+	write_seqcount_latch(&cb->latch);
+	cb->cmd[1].nsec = nsec;
+	cb->cmd[1].mode = timer_mode;
+	cb->cmd[1].op = op;
+	write_seqcount_latch_end(&cb->latch);
+	atomic_set(&cb->writer, 0);
+
+	if (!refcount_inc_not_zero(&cb->refcnt))
+		return -EBUSY;
+
+	/* TODO: Run operation without irq_work if not in NMI */
+	if (!irq_work_queue(&cb->worker))
+		/* irq_work is already scheduled on this CPU */
+		bpf_async_refcnt_dec_cleanup(cb);
+
+	return 0;
+}
+
+static int bpf_async_read_op(struct bpf_async_cb *cb, enum bpf_async_op *op,
+			     u64 *nsec, u32 *flags)
+{
+	u32 seq, last_seq, idx;
+
+	while (true) {
+		last_seq = atomic_read_acquire(&cb->last_seq);
+
+		seq = raw_read_seqcount_latch(&cb->latch);
+
+		/* Return -EBUSY if current seq is consumed by another reader */
+		if (seq == last_seq)
+			return -EBUSY;
+
+		idx = seq & 1;
+		*nsec = cb->cmd[idx].nsec;
+		*flags = cb->cmd[idx].mode;
+		*op = cb->cmd[idx].op;
+
+		if (raw_read_seqcount_latch_retry(&cb->latch, seq))
+			continue;
+		/* Commit read sequence number, own snapshot exclusively */
+		if (atomic_cmpxchg_release(&cb->last_seq, last_seq, seq) == last_seq)
+			break;
+	}
+
+	return 0;
+}
+
 static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
 				    struct bpf_prog *prog)
 {
 	struct bpf_async_cb *cb;
-	int ret = 0;
 
-	if (in_nmi())
-		return -EOPNOTSUPP;
-	__bpf_spin_lock_irqsave(&async->lock);
-	cb = async->cb;
-	if (!cb) {
-		ret = -EINVAL;
-		goto out;
-	}
-	if (!atomic64_read(&cb->map->usercnt)) {
-		/* maps with timers must be either held by user space
-		 * or pinned in bpffs. Otherwise timer might still be
-		 * running even when bpf prog is detached and user space
-		 * is gone, since map_release_uref won't ever be called.
-		 */
-		ret = -EPERM;
-		goto out;
-	}
-	ret = bpf_async_update_prog_callback(cb, callback_fn, prog);
-out:
-	__bpf_spin_unlock_irqrestore(&async->lock);
-	return ret;
+	/* Make sure bpf_async_cb_rcu_free() is not called while here */
+	guard(rcu)();
+
+	cb = READ_ONCE(async->cb);
+	if (!cb)
+		return -EINVAL;
+
+	return bpf_async_update_prog_callback(cb, callback_fn, prog);
 }
 
 BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
@@ -1431,22 +1519,19 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = {
 	.arg2_type	= ARG_PTR_TO_FUNC,
 };
 
-BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
+BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags)
 {
 	struct bpf_hrtimer *t;
-	int ret = 0;
-	enum hrtimer_mode mode;
+	u32 mode;
 
-	if (in_nmi())
-		return -EOPNOTSUPP;
 	if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
 		return -EINVAL;
-	__bpf_spin_lock_irqsave(&timer->lock);
-	t = timer->timer;
-	if (!t || !t->cb.prog) {
-		ret = -EINVAL;
-		goto out;
-	}
+
+	guard(rcu)();
+
+	t = READ_ONCE(async->timer);
+	if (!t || !READ_ONCE(t->cb.prog))
+		return -EINVAL;
 
 	if (flags & BPF_F_TIMER_ABS)
 		mode = HRTIMER_MODE_ABS_SOFT;
@@ -1456,10 +1541,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla
 	if (flags & BPF_F_TIMER_CPU_PIN)
 		mode |= HRTIMER_MODE_PINNED;
 
-	hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
-out:
-	__bpf_spin_unlock_irqrestore(&timer->lock);
-	return ret;
+	return bpf_async_schedule_op(&t->cb, BPF_ASYNC_START, nsecs, mode);
 }
 
 static const struct bpf_func_proto bpf_timer_start_proto = {
@@ -1536,27 +1618,16 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = {
 	.arg1_type	= ARG_PTR_TO_TIMER,
 };
 
-static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async)
+static void __bpf_async_cancel_and_free(struct bpf_async_kern *async)
 {
 	struct bpf_async_cb *cb;
 
-	/* Performance optimization: read async->cb without lock first. */
-	if (!READ_ONCE(async->cb))
-		return NULL;
-
-	__bpf_spin_lock_irqsave(&async->lock);
-	/* re-read it under lock */
-	cb = async->cb;
+	cb = xchg(&async->cb, NULL);
 	if (!cb)
-		goto out;
-	bpf_async_update_prog_callback(cb, NULL, NULL);
-	/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
-	 * this timer, since it won't be initialized.
-	 */
-	WRITE_ONCE(async->cb, NULL);
-out:
-	__bpf_spin_unlock_irqrestore(&async->lock);
-	return cb;
+		return;
+
+	/* Consume map's own refcnt, schedule cleanup irq_work if this is the last ref */
+	bpf_async_refcnt_dec_cleanup(cb);
 }
 
 static void bpf_timer_delete(struct bpf_hrtimer *t)
@@ -1611,19 +1682,76 @@ static void bpf_timer_delete(struct bpf_hrtimer *t)
 	}
 }
 
+static void bpf_async_process_op(struct bpf_async_cb *cb, u32 op,
+				 u64 timer_nsec, u32 timer_mode)
+{
+	switch (cb->type) {
+	case BPF_ASYNC_TYPE_TIMER: {
+		struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
+
+		switch (op) {
+		case BPF_ASYNC_START:
+			hrtimer_start(&t->timer, ns_to_ktime(timer_nsec), timer_mode);
+			break;
+		case BPF_ASYNC_CANCEL:
+			hrtimer_try_to_cancel(&t->timer);
+			break;
+		case BPF_ASYNC_CANCEL_AND_FREE:
+			bpf_timer_delete(t);
+			break;
+		default:
+			break;
+		}
+		break;
+	}
+	case BPF_ASYNC_TYPE_WQ: {
+		struct bpf_work *w = container_of(cb, struct bpf_work, cb);
+
+		switch (op) {
+		case BPF_ASYNC_START:
+			schedule_work(&w->work);
+			break;
+		case BPF_ASYNC_CANCEL_AND_FREE:
+			/*
+			 * Trigger cancel of the sleepable work, but *do not* wait for
+			 * it to finish.
+			 * kfree will be called once the work has finished.
+			 */
+			schedule_work(&w->delete_work);
+			break;
+		default:
+			break;
+		}
+		break;
+	}
+	}
+}
+
+static void bpf_async_irq_worker(struct irq_work *work)
+{
+	struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker);
+	u32 op, timer_mode;
+	u64 nsec;
+	int err;
+
+	err = bpf_async_read_op(cb, &op, &nsec, &timer_mode);
+	if (err)
+		goto out;
+
+	bpf_async_process_op(cb, op, nsec, timer_mode);
+
+out:
+	if (refcount_dec_and_test(&cb->refcnt))
+		bpf_async_process_op(cb, BPF_ASYNC_CANCEL_AND_FREE, 0, 0);
+}
+
 /*
  * This function is called by map_delete/update_elem for individual element and
  * by ops->map_release_uref when the user space reference to a map reaches zero.
  */
 void bpf_timer_cancel_and_free(void *val)
 {
-	struct bpf_hrtimer *t;
-
-	t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val);
-	if (!t)
-		return;
-
-	bpf_timer_delete(t);
+	__bpf_async_cancel_and_free(val);
 }
 
 /* This function is called by map_delete/update_elem for individual element and
@@ -1631,19 +1759,7 @@ void bpf_timer_cancel_and_free(void *val)
  */
 void bpf_wq_cancel_and_free(void *val)
 {
-	struct bpf_work *work;
-
-	BTF_TYPE_EMIT(struct bpf_wq);
-
-	work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
-	if (!work)
-		return;
-	/* Trigger cancel of the sleepable work, but *do not* wait for
-	 * it to finish if it was running as we might not be in a
-	 * sleepable context.
-	 * kfree will be called once the work has finished.
-	 */
-	schedule_work(&work->delete_work);
+	__bpf_async_cancel_and_free(val);
 }
 
 BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
@@ -3116,16 +3232,15 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
 	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
 	struct bpf_work *w;
 
-	if (in_nmi())
-		return -EOPNOTSUPP;
 	if (flags)
 		return -EINVAL;
+
+	guard(rcu)();
 	w = READ_ONCE(async->work);
 	if (!w || !READ_ONCE(w->cb.prog))
 		return -EINVAL;
 
-	schedule_work(&w->work);
-	return 0;
+	return bpf_async_schedule_op(&w->cb, BPF_ASYNC_START, 0, 0);
 }
 
 __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
-- 
2.52.0
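
For illustration only, not part of the patch: a minimal, untested sketch of
the kind of program this series enables. A perf_event BPF program runs in
NMI context; before this change each of the timer helper calls below
returned -EOPNOTSUPP there. Map layout, section and callback names are
hypothetical.

/* SPDX-License-Identifier: GPL-2.0 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define CLOCK_MONOTONIC 1	/* uapi macro, not emitted into vmlinux.h */

char _license[] SEC("license") = "GPL";

struct elem {
	struct bpf_timer t;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct elem);
} timers SEC(".maps");

static int timer_cb(void *map, int *key, struct bpf_timer *timer)
{
	/* Deferred work: runs later from the hrtimer softirq, outside NMI. */
	return 0;
}

SEC("perf_event")
int handle_sample(void *ctx)
{
	int key = 0;
	struct elem *e = bpf_map_lookup_elem(&timers, &key);

	if (!e)
		return 0;

	/* All three helpers are now usable from this NMI-context hook. */
	bpf_timer_init(&e->t, &timers, CLOCK_MONOTONIC);
	bpf_timer_set_callback(&e->t, timer_cb);
	bpf_timer_start(&e->t, 100000 /* ns */, 0);
	return 0;
}

With this series, the NMI-side calls only record the operation via the
lock-free seqcount_latch command slot and queue the per-async irq_work;
the actual hrtimer_start() happens later from the irq_work handler.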