From: Mykyta Yatsenko Move the timer deletion logic into a dedicated bpf_timer_delete() helper so it can be reused by later patches. Acked-by: Eduard Zingerman Signed-off-by: Mykyta Yatsenko --- kernel/bpf/helpers.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9eaa4185e0a79b903c6fc2ccb310f521a4b14a1d..cbacddc7101a82b2f72278034bba4188829fecd6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1558,18 +1558,10 @@ static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *a return cb; } -/* This function is called by map_delete/update_elem for individual element and - * by ops->map_release_uref when the user space reference to a map reaches zero. - */ -void bpf_timer_cancel_and_free(void *val) +static void bpf_timer_delete(struct bpf_hrtimer *t) { - struct bpf_hrtimer *t; - - t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); - - if (!t) - return; - /* We check that bpf_map_delete/update_elem() was called from timer + /* + * We check that bpf_map_delete/update_elem() was called from timer * callback_fn. In such case we don't call hrtimer_cancel() (since it * will deadlock) and don't call hrtimer_try_to_cancel() (since it will * just return -1). Though callback_fn is still running on this cpu it's @@ -1618,6 +1610,21 @@ void bpf_timer_cancel_and_free(void *val) } } +/* + * This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_timer_cancel_and_free(void *val) +{ + struct bpf_hrtimer *t; + + t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); + if (!t) + return; + + bpf_timer_delete(t); +} + /* This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ -- 2.52.0 From: Mykyta Yatsenko Remove unused arguments from __bpf_async_set_callback(). 
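For reference, a before/after sketch of the signature change taken from the diff below (flags and type were never used, and aux only served to reach aux->prog, so callers now pass the prog pointer directly):

    /* before */
    static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
                                        struct bpf_prog_aux *aux, unsigned int flags,
                                        enum bpf_async_type type);
    /* after */
    static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
                                        struct bpf_prog *prog);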
Signed-off-by: Mykyta Yatsenko --- kernel/bpf/helpers.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index cbacddc7101a82b2f72278034bba4188829fecd6..962b7f1b81b05d663b79218d9d7eaa73679ce94f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1355,10 +1355,9 @@ static const struct bpf_func_proto bpf_timer_init_proto = { }; static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, - struct bpf_prog_aux *aux, unsigned int flags, - enum bpf_async_type type) + struct bpf_prog *prog) { - struct bpf_prog *prev, *prog = aux->prog; + struct bpf_prog *prev; struct bpf_async_cb *cb; int ret = 0; @@ -1403,7 +1402,7 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, struct bpf_prog_aux *, aux) { - return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER); + return __bpf_async_set_callback(timer, callback_fn, aux->prog); } static const struct bpf_func_proto bpf_timer_set_callback_proto = { @@ -3138,7 +3137,7 @@ __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, if (flags) return -EINVAL; - return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); + return __bpf_async_set_callback(async, callback_fn, aux->prog); } __bpf_kfunc void bpf_preempt_disable(void) -- 2.52.0 From: Mykyta Yatsenko Introduce bpf_async_update_prog_callback(): lock-free update of cb->prog and cb->callback_fn. This function allows updating prog and callback_fn fields of the struct bpf_async_cb without holding lock. For now use it under the lock from __bpf_async_set_callback(), in the next patches that lock will be removed. Lock-free algorithm: * Acquire a guard reference on prog to prevent it from being freed during the retry loop. * Retry loop: 1. Each iteration acquires a new prog reference and stores it in cb->prog via xchg. The previous prog is released. 2. The loop condition checks if both cb->prog and cb->callback_fn match what we just wrote. If either differs, a concurrent writer overwrote our value, and we must retry. 3. When we retry, our previously-stored prog was already released by the concurrent writer or will be released by us after overwriting. * Release guard reference. Signed-off-by: Mykyta Yatsenko --- kernel/bpf/helpers.c | 67 +++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 962b7f1b81b05d663b79218d9d7eaa73679ce94f..66424bc5b86137599990957ad2300110b4977df9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1354,10 +1354,43 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; +static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, void *callback_fn, + struct bpf_prog *prog) +{ + struct bpf_prog *prev; + + /* Acquire a guard reference on prog to prevent it from being freed during the loop */ + if (prog) { + prog = bpf_prog_inc_not_zero(prog); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + + do { + if (prog) + prog = bpf_prog_inc_not_zero(prog); + prev = xchg(&cb->prog, prog); + rcu_assign_pointer(cb->callback_fn, callback_fn); + + /* + * Release previous prog, make sure that if other CPU is contending, + * to set bpf_prog, references are not leaked as each iteration acquires and + * releases one reference. 
+ */ + if (prev) + bpf_prog_put(prev); + + } while (READ_ONCE(cb->prog) != prog || READ_ONCE(cb->callback_fn) != callback_fn); + + if (prog) + bpf_prog_put(prog); + + return 0; +} + static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, struct bpf_prog *prog) { - struct bpf_prog *prev; struct bpf_async_cb *cb; int ret = 0; @@ -1378,22 +1411,7 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback ret = -EPERM; goto out; } - prev = cb->prog; - if (prev != prog) { - /* Bump prog refcnt once. Every bpf_timer_set_callback() - * can pick different callback_fn-s within the same prog. - */ - prog = bpf_prog_inc_not_zero(prog); - if (IS_ERR(prog)) { - ret = PTR_ERR(prog); - goto out; - } - if (prev) - /* Drop prev prog refcnt when swapping with new prog */ - bpf_prog_put(prev); - cb->prog = prog; - } - rcu_assign_pointer(cb->callback_fn, callback_fn); + ret = bpf_async_update_prog_callback(cb, callback_fn, prog); out: __bpf_spin_unlock_irqrestore(&async->lock); return ret; @@ -1453,17 +1471,6 @@ static const struct bpf_func_proto bpf_timer_start_proto = { .arg3_type = ARG_ANYTHING, }; -static void drop_prog_refcnt(struct bpf_async_cb *async) -{ - struct bpf_prog *prog = async->prog; - - if (prog) { - bpf_prog_put(prog); - async->prog = NULL; - rcu_assign_pointer(async->callback_fn, NULL); - } -} - BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { struct bpf_hrtimer *t, *cur_t; @@ -1514,7 +1521,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) goto out; } drop: - drop_prog_refcnt(&t->cb); + bpf_async_update_prog_callback(&t->cb, NULL, NULL); out: __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish @@ -1547,7 +1554,7 @@ static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *a cb = async->cb; if (!cb) goto out; - drop_prog_refcnt(cb); + bpf_async_update_prog_callback(cb, NULL, NULL); /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ -- 2.52.0 From: Mykyta Yatsenko Remove lock from the bpf_timer_cancel() helper. The lock does not protect from concurrent modification of the bpf_async_cb data fields as those are modified in the callback without locking. Use guard(rcu)() instead of pair of explicit lock()/unlock(). 
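For reference, a minimal sketch of the scope-based RCU guard this patch switches to: guard(rcu)() (linux/cleanup.h) takes the RCU read lock and releases it automatically when the scope exits, so early returns need no goto/unlock path. The helper name below is made up for illustration:

    static int timer_lookup_sketch(struct bpf_async_kern *async)
    {
            struct bpf_hrtimer *t;

            guard(rcu)();           /* rcu_read_lock(), auto-unlocked at scope exit */

            t = READ_ONCE(async->timer);
            if (!t)
                    return -EINVAL; /* no explicit rcu_read_unlock() needed */

            /* ... operate on t under RCU protection ... */
            return 0;
    }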
Signed-off-by: Mykyta Yatsenko Acked-by: Kumar Kartikeya Dwivedi --- kernel/bpf/helpers.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 66424bc5b86137599990957ad2300110b4977df9..61ba4f6b741cc05b4a7a73a0322a23874bfd8e83 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1471,7 +1471,7 @@ static const struct bpf_func_proto bpf_timer_start_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) +BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async) { struct bpf_hrtimer *t, *cur_t; bool inc = false; @@ -1479,13 +1479,12 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) if (in_nmi()) return -EOPNOTSUPP; - rcu_read_lock(); - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; - if (!t) { - ret = -EINVAL; - goto out; - } + + guard(rcu)(); + + t = READ_ONCE(async->timer); + if (!t) + return -EINVAL; cur_t = this_cpu_read(hrtimer_running); if (cur_t == t) { @@ -1493,8 +1492,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) * its own timer the hrtimer_cancel() will deadlock * since it waits for callback_fn to finish. */ - ret = -EDEADLK; - goto out; + return -EDEADLK; } /* Only account in-flight cancellations when invoked from a timer @@ -1517,20 +1515,17 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) * cancelling and waiting for it synchronously, since it might * do the same. Bail! */ - ret = -EDEADLK; - goto out; + atomic_dec(&t->cancelling); + return -EDEADLK; } drop: bpf_async_update_prog_callback(&t->cb, NULL, NULL); -out: - __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish * if it was running. */ - ret = ret ?: hrtimer_cancel(&t->timer); + ret = hrtimer_cancel(&t->timer); if (inc) atomic_dec(&t->cancelling); - rcu_read_unlock(); return ret; } -- 2.52.0 From: Mykyta Yatsenko Refactor bpf timer and workqueue helpers to allow calling them from NMI context by making all operations lock-free and deferring NMI-unsafe work to irq_work. Previously, bpf_timer_start(), and bpf_wq_start() could not be called from NMI context because they acquired bpf_spin_lock and called hrtimer/schedule_work APIs directly. This patch removes these limitations. Key changes: * Remove bpf_spin_lock from struct bpf_async_kern. Replace locked operations with atomic cmpxchg() for initialization and xchg() for cancel and free. * Add per-async irq_work to defer NMI-unsafe operations (hrtimer_start, hrtimer_try_to_cancel, schedule_work) from NMI to softirq context. * Use the lock-free seqcount_latch_t to pass operation commands (start/cancel/free) along with their parameters (nsec, mode) from NMI-safe callers to the irq_work handler. * Add reference counting to bpf_async_cb to ensure the object stays alive until all scheduled irq_work completes and the timer/work callback finishes. * Move bpf_prog_put() to RCU callback to handle races between set_callback() and cancel_and_free(). * Refactor __bpf_async_set_callback() getting rid of locks. Each iteration acquires a new reference and stores it in cb->prog via xchg. The previous value is retrieved and released. The loop condition checks if both cb->prog and cb->callback_fn match what we just wrote. If either differs, a concurrent writer overwrote our value, and we must retry. When we retry, our previously-stored prog was already put() by the concurrent writer or we put() it after xchg(). 
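A condensed sketch of the NMI-to-irq_work hand-off summarized above, using the fields and helpers introduced by the diff below (error paths trimmed; not the exact patch code):

    /* NMI-safe side: latch the latest command, then kick irq_work */
    static int async_schedule_op_sketch(struct bpf_async_cb *cb, u32 op, u64 nsec, u32 mode)
    {
            if (atomic_cmpxchg_acquire(&cb->writer, 0, 1))  /* one writer at a time */
                    return -EBUSY;

            write_seqcount_latch_begin(&cb->latch);
            cb->cmd[0] = (struct bpf_async_cmd){ .nsec = nsec, .mode = mode, .op = op };
            write_seqcount_latch(&cb->latch);
            cb->cmd[1] = cb->cmd[0];
            write_seqcount_latch_end(&cb->latch);
            atomic_set(&cb->writer, 0);

            if (!refcount_inc_not_zero(&cb->refcnt))        /* keep cb alive for the handler */
                    return -EBUSY;
            if (!irq_work_queue(&cb->worker))               /* already pending on this CPU */
                    bpf_async_refcnt_dec_cleanup(cb);       /* drop the ref we just took */
            return 0;
    }

    /*
     * The irq_work handler (softirq context) reads the newest command through the
     * latch, performs the hrtimer_start()/hrtimer_try_to_cancel()/schedule_work()
     * call there, and drops the reference taken above.
     */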
This enables BPF programs attached to NMI-context hooks (perf events) to use timers and workqueues for deferred processing. Signed-off-by: Mykyta Yatsenko --- kernel/bpf/helpers.c | 335 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 225 insertions(+), 110 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 61ba4f6b741cc05b4a7a73a0322a23874bfd8e83..8c1ed75af072f64f2a63afd90ad40a962e8e46b0 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -1095,6 +1096,23 @@ static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) return (void *)value - round_up(map->key_size, 8); } +enum bpf_async_type { + BPF_ASYNC_TYPE_TIMER = 0, + BPF_ASYNC_TYPE_WQ, +}; + +enum bpf_async_op { + BPF_ASYNC_START, + BPF_ASYNC_CANCEL, + BPF_ASYNC_CANCEL_AND_FREE, +}; + +struct bpf_async_cmd { + u64 nsec; + u32 mode; + u32 op; +}; + struct bpf_async_cb { struct bpf_map *map; struct bpf_prog *prog; @@ -1105,6 +1123,13 @@ struct bpf_async_cb { struct work_struct delete_work; }; u64 flags; + struct irq_work worker; + atomic_t writer; + seqcount_latch_t latch; + struct bpf_async_cmd cmd[2]; + atomic_t last_seq; + refcount_t refcnt; + enum bpf_async_type type; }; /* BPF map elements can contain 'struct bpf_timer'. @@ -1142,18 +1167,9 @@ struct bpf_async_kern { struct bpf_hrtimer *timer; struct bpf_work *work; }; - /* bpf_spin_lock is used here instead of spinlock_t to make - * sure that it always fits into space reserved by struct bpf_timer - * regardless of LOCKDEP and spinlock debug flags. - */ - struct bpf_spin_lock lock; + u32 __opaque; } __attribute__((aligned(8))); -enum bpf_async_type { - BPF_ASYNC_TYPE_TIMER = 0, - BPF_ASYNC_TYPE_WQ, -}; - static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) @@ -1219,6 +1235,13 @@ static void bpf_async_cb_rcu_free(struct rcu_head *rcu) { struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); + /* + * Drop the last reference to prog only after RCU GP, as set_callback() + * may race with cancel_and_free() + */ + if (cb->prog) + bpf_prog_put(cb->prog); + kfree_nolock(cb); } @@ -1246,18 +1269,17 @@ static void bpf_timer_delete_work(struct work_struct *work) call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); } +static void __bpf_async_cancel_and_free(struct bpf_async_kern *async); +static void bpf_async_irq_worker(struct irq_work *work); + static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { - struct bpf_async_cb *cb; + struct bpf_async_cb *cb, *old_cb; struct bpf_hrtimer *t; struct bpf_work *w; clockid_t clockid; size_t size; - int ret = 0; - - if (in_nmi()) - return -EOPNOTSUPP; switch (type) { case BPF_ASYNC_TYPE_TIMER: @@ -1270,18 +1292,13 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u return -EINVAL; } - __bpf_spin_lock_irqsave(&async->lock); - t = async->timer; - if (t) { - ret = -EBUSY; - goto out; - } + old_cb = READ_ONCE(async->cb); + if (old_cb) + return -EBUSY; cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node); - if (!cb) { - ret = -ENOMEM; - goto out; - } + if (!cb) + return -ENOMEM; switch (type) { case BPF_ASYNC_TYPE_TIMER: @@ -1304,9 +1321,20 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u cb->map = map; cb->prog = NULL; cb->flags = flags; + cb->worker = 
IRQ_WORK_INIT(bpf_async_irq_worker); + seqcount_latch_init(&cb->latch); + atomic_set(&cb->writer, 0); + refcount_set(&cb->refcnt, 1); /* map's reference */ + atomic_set(&cb->last_seq, 0); + cb->type = type; rcu_assign_pointer(cb->callback_fn, NULL); - WRITE_ONCE(async->cb, cb); + old_cb = cmpxchg(&async->cb, NULL, cb); + if (old_cb) { + /* Lost the race to initialize this bpf_async_kern, drop the allocated object */ + kfree_nolock(cb); + return -EBUSY; + } /* Guarantee the order between async->cb and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL @@ -1317,13 +1345,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u /* maps with timers must be either held by user space * or pinned in bpffs. */ - WRITE_ONCE(async->cb, NULL); - kfree_nolock(cb); - ret = -EPERM; + __bpf_async_cancel_and_free(async); + return -EPERM; } -out: - __bpf_spin_unlock_irqrestore(&async->lock); - return ret; + + return 0; } BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, @@ -1388,33 +1414,95 @@ static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, void *callbac return 0; } +/* Decrements bpf_async_cb refcnt, if it becomes 0 schedule cleanup irq_work */ +static void bpf_async_refcnt_dec_cleanup(struct bpf_async_cb *cb) +{ + if (!refcount_dec_and_test(&cb->refcnt)) + return; + + /* + * At this point we took the last reference + * Try to schedule cleanup, either: + * - Set ref to 1 and succeed irq_work_queue + * - See non-zero refcnt after decrement - other irq_work is going to cleanup + */ + do { + refcount_set(&cb->refcnt, 1); + if (irq_work_queue(&cb->worker)) + break; + } while (refcount_dec_and_test(&cb->refcnt)); +} + +static int bpf_async_schedule_op(struct bpf_async_cb *cb, u32 op, u64 nsec, u32 timer_mode) +{ + /* Acquire active writer atomic*/ + if (atomic_cmpxchg_acquire(&cb->writer, 0, 1)) + return -EBUSY; + + write_seqcount_latch_begin(&cb->latch); + cb->cmd[0].nsec = nsec; + cb->cmd[0].mode = timer_mode; + cb->cmd[0].op = op; + write_seqcount_latch(&cb->latch); + cb->cmd[1].nsec = nsec; + cb->cmd[1].mode = timer_mode; + cb->cmd[1].op = op; + write_seqcount_latch_end(&cb->latch); + atomic_set(&cb->writer, 0); + + if (!refcount_inc_not_zero(&cb->refcnt)) + return -EBUSY; + + /* TODO: Run operation without irq_work if not in NMI */ + if (!irq_work_queue(&cb->worker)) + /* irq_work is already scheduled on this CPU */ + bpf_async_refcnt_dec_cleanup(cb); + + return 0; +} + +static int bpf_async_read_op(struct bpf_async_cb *cb, enum bpf_async_op *op, + u64 *nsec, u32 *flags) +{ + u32 seq, last_seq, idx; + + while (true) { + last_seq = atomic_read_acquire(&cb->last_seq); + + seq = raw_read_seqcount_latch(&cb->latch); + + /* Return -EBUSY if current seq is consumed by another reader */ + if (seq == last_seq) + return -EBUSY; + + idx = seq & 1; + *nsec = cb->cmd[idx].nsec; + *flags = cb->cmd[idx].mode; + *op = cb->cmd[idx].op; + + if (raw_read_seqcount_latch_retry(&cb->latch, seq)) + continue; + /* Commit read sequence number, own snapshot exclusively */ + if (atomic_cmpxchg_release(&cb->last_seq, last_seq, seq) == last_seq) + break; + } + + return 0; +} + static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, struct bpf_prog *prog) { struct bpf_async_cb *cb; - int ret = 0; - if (in_nmi()) - return -EOPNOTSUPP; - __bpf_spin_lock_irqsave(&async->lock); - cb = async->cb; - if (!cb) { - ret = -EINVAL; - 
goto out; - } - if (!atomic64_read(&cb->map->usercnt)) { - /* maps with timers must be either held by user space - * or pinned in bpffs. Otherwise timer might still be - * running even when bpf prog is detached and user space - * is gone, since map_release_uref won't ever be called. - */ - ret = -EPERM; - goto out; - } - ret = bpf_async_update_prog_callback(cb, callback_fn, prog); -out: - __bpf_spin_unlock_irqrestore(&async->lock); - return ret; + /* Make sure bpf_async_cb_rcu_free() is not called while here */ + guard(rcu)(); + + cb = READ_ONCE(async->cb); + if (!cb) + return -EINVAL; + + return bpf_async_update_prog_callback(cb, callback_fn, prog); } BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, @@ -1431,22 +1519,19 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = { .arg2_type = ARG_PTR_TO_FUNC, }; -BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags) +BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; - int ret = 0; - enum hrtimer_mode mode; + u32 mode; - if (in_nmi()) - return -EOPNOTSUPP; if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN)) return -EINVAL; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; - if (!t || !t->cb.prog) { - ret = -EINVAL; - goto out; - } + + guard(rcu)(); + + t = READ_ONCE(async->timer); + if (!t || !READ_ONCE(t->cb.prog)) + return -EINVAL; if (flags & BPF_F_TIMER_ABS) mode = HRTIMER_MODE_ABS_SOFT; @@ -1456,10 +1541,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla if (flags & BPF_F_TIMER_CPU_PIN) mode |= HRTIMER_MODE_PINNED; - hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); -out: - __bpf_spin_unlock_irqrestore(&timer->lock); - return ret; + return bpf_async_schedule_op(&t->cb, BPF_ASYNC_START, nsecs, mode); } static const struct bpf_func_proto bpf_timer_start_proto = { @@ -1536,27 +1618,16 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = { .arg1_type = ARG_PTR_TO_TIMER, }; -static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async) +static void __bpf_async_cancel_and_free(struct bpf_async_kern *async) { struct bpf_async_cb *cb; - /* Performance optimization: read async->cb without lock first. */ - if (!READ_ONCE(async->cb)) - return NULL; - - __bpf_spin_lock_irqsave(&async->lock); - /* re-read it under lock */ - cb = async->cb; + cb = xchg(&async->cb, NULL); if (!cb) - goto out; - bpf_async_update_prog_callback(cb, NULL, NULL); - /* The subsequent bpf_timer_start/cancel() helpers won't be able to use - * this timer, since it won't be initialized. 
- */ - WRITE_ONCE(async->cb, NULL); -out: - __bpf_spin_unlock_irqrestore(&async->lock); - return cb; + return; + + /* Consume map's own refcnt, schedule cleanup irq_work if this is the last ref */ + bpf_async_refcnt_dec_cleanup(cb); } static void bpf_timer_delete(struct bpf_hrtimer *t) @@ -1611,19 +1682,76 @@ static void bpf_timer_delete(struct bpf_hrtimer *t) } } +static void bpf_async_process_op(struct bpf_async_cb *cb, u32 op, + u64 timer_nsec, u32 timer_mode) +{ + switch (cb->type) { + case BPF_ASYNC_TYPE_TIMER: { + struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); + + switch (op) { + case BPF_ASYNC_START: + hrtimer_start(&t->timer, ns_to_ktime(timer_nsec), timer_mode); + break; + case BPF_ASYNC_CANCEL: + hrtimer_try_to_cancel(&t->timer); + break; + case BPF_ASYNC_CANCEL_AND_FREE: + bpf_timer_delete(t); + break; + default: + break; + } + break; + } + case BPF_ASYNC_TYPE_WQ: { + struct bpf_work *w = container_of(cb, struct bpf_work, cb); + + switch (op) { + case BPF_ASYNC_START: + schedule_work(&w->work); + break; + case BPF_ASYNC_CANCEL_AND_FREE: + /* + * Trigger cancel of the sleepable work, but *do not* wait for + * it to finish. + * kfree will be called once the work has finished. + */ + schedule_work(&w->delete_work); + break; + default: + break; + } + break; + } + } +} + +static void bpf_async_irq_worker(struct irq_work *work) +{ + struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker); + u32 op, timer_mode; + u64 nsec; + int err; + + err = bpf_async_read_op(cb, &op, &nsec, &timer_mode); + if (err) + goto out; + + bpf_async_process_op(cb, op, nsec, timer_mode); + +out: + if (refcount_dec_and_test(&cb->refcnt)) + bpf_async_process_op(cb, BPF_ASYNC_CANCEL_AND_FREE, 0, 0); +} + /* * This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ void bpf_timer_cancel_and_free(void *val) { - struct bpf_hrtimer *t; - - t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); - if (!t) - return; - - bpf_timer_delete(t); + __bpf_async_cancel_and_free(val); } /* This function is called by map_delete/update_elem for individual element and @@ -1631,19 +1759,7 @@ void bpf_timer_cancel_and_free(void *val) */ void bpf_wq_cancel_and_free(void *val) { - struct bpf_work *work; - - BTF_TYPE_EMIT(struct bpf_wq); - - work = (struct bpf_work *)__bpf_async_cancel_and_free(val); - if (!work) - return; - /* Trigger cancel of the sleepable work, but *do not* wait for - * it to finish if it was running as we might not be in a - * sleepable context. - * kfree will be called once the work has finished. - */ - schedule_work(&work->delete_work); + __bpf_async_cancel_and_free(val); } BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr) @@ -3116,16 +3232,15 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) struct bpf_async_kern *async = (struct bpf_async_kern *)wq; struct bpf_work *w; - if (in_nmi()) - return -EOPNOTSUPP; if (flags) return -EINVAL; + + guard(rcu)(); w = READ_ONCE(async->work); if (!w || !READ_ONCE(w->cb.prog)) return -EINVAL; - schedule_work(&w->work); - return 0; + return bpf_async_schedule_op(&w->cb, BPF_ASYNC_START, 0, 0); } __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, -- 2.52.0 From: Mykyta Yatsenko Extend the verifier to recognize struct bpf_timer as a valid kfunc argument type. Previously, bpf_timer was only supported in BPF helpers. This prepares for adding timer-related kfuncs in subsequent patches. 
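As an illustration of what this enables on the BPF program side, a sketch that passes a map-value bpf_timer to a kfunc. It uses the bpf_timer_cancel_async() kfunc added later in this series; the extern declaration, the 'timers' map and the attach point are illustrative assumptions, not part of this patch:

    #include <vmlinux.h>
    #include <bpf/bpf_helpers.h>

    char _license[] SEC("license") = "GPL";

    extern int bpf_timer_cancel_async(struct bpf_timer *timer) __ksym;

    struct elem {
            struct bpf_timer t;
    };

    struct {
            __uint(type, BPF_MAP_TYPE_ARRAY);
            __uint(max_entries, 1);
            __type(key, int);
            __type(value, struct elem);
    } timers SEC(".maps");

    SEC("perf_event")       /* may run in NMI context */
    int cancel_from_nmi(void *ctx)
    {
            int key = 0;
            struct elem *e = bpf_map_lookup_elem(&timers, &key);

            if (!e)
                    return 0;
            /* &e->t is a map-value bpf_timer, now accepted as a kfunc argument;
             * assumes the timer was initialized and armed elsewhere.
             */
            bpf_timer_cancel_async(&e->t);
            return 0;
    }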
Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko --- kernel/bpf/verifier.c | 59 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index faa1ecc1fe9d753325ac6b8c255c8885a77102d4..98cb8be76eb5da878b0e391cf52e20a8dd6f9b24 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8566,17 +8566,15 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, } static int process_timer_func(struct bpf_verifier_env *env, int regno, - struct bpf_call_arg_meta *meta) + struct bpf_map *map) { - struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_map *map = reg->map_ptr; int err; err = check_map_field_pointer(env, regno, BPF_TIMER); if (err) return err; - if (meta->map_ptr) { + if (map) { verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT; } @@ -8584,8 +8582,36 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } + return 0; +} + +static int process_timer_helper(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + int err; + + err = process_timer_func(env, regno, meta->map_ptr); + if (err) + return err; + meta->map_uid = reg->map_uid; - meta->map_ptr = map; + meta->map_ptr = reg->map_ptr; + return 0; +} + +static int process_timer_kfunc(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + int err; + + err = process_timer_func(env, regno, meta->map.ptr); + if (err) + return err; + + meta->map.uid = reg->map_uid; + meta->map.ptr = reg->map_ptr; return 0; } @@ -9908,7 +9934,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } break; case ARG_PTR_TO_TIMER: - err = process_timer_func(env, regno, meta); + err = process_timer_helper(env, regno, meta); if (err) return err; break; @@ -12161,6 +12187,7 @@ enum { KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, KF_ARG_TASK_WORK_ID, + KF_ARG_TIMER_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -12172,6 +12199,7 @@ BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) BTF_ID(struct, bpf_task_work) +BTF_ID(struct, bpf_timer) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -12215,6 +12243,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); } +static bool is_kfunc_arg_timer(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TIMER_ID); +} + static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); @@ -12309,6 +12342,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_NULL, KF_ARG_PTR_TO_CONST_STR, KF_ARG_PTR_TO_MAP, + KF_ARG_PTR_TO_TIMER, KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, KF_ARG_PTR_TO_RES_SPIN_LOCK, @@ -12554,6 +12588,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; + if (is_kfunc_arg_timer(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_TIMER; + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) return KF_ARG_PTR_TO_TASK_WORK; @@ -13340,6 +13377,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, 
struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: + case KF_ARG_PTR_TO_TIMER: case KF_ARG_PTR_TO_TASK_WORK: case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: @@ -13639,6 +13677,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_TIMER: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_timer_kfunc(env, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_TASK_WORK: if (reg->type != PTR_TO_MAP_VALUE) { verbose(env, "arg#%d doesn't point to a map value\n", i); -- 2.52.0 From: Mykyta Yatsenko Introduce a bpf_timer_cancel_async() kfunc that attempts to cancel a timer asynchronously and therefore can be called from NMI context. Signed-off-by: Mykyta Yatsenko --- kernel/bpf/helpers.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8c1ed75af072f64f2a63afd90ad40a962e8e46b0..e058102a6d4fc3d7b4bf507f92f876370bc32346 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4483,6 +4483,19 @@ __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) return 0; } +__bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer) +{ + struct bpf_async_cb *cb; + struct bpf_async_kern *async = (void *)timer; + + guard(rcu)(); + cb = READ_ONCE(async->cb); + if (!cb) + return -EINVAL; + + return bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); +} + __bpf_kfunc_end_defs(); static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) @@ -4664,6 +4677,7 @@ BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl) BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) +BTF_ID_FLAGS(func, bpf_timer_cancel_async) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { -- 2.52.0 From: Mykyta Yatsenko Refactor the timer selftests, extracting the stress test into a separate test. This makes it easier to debug test failures and to extend the tests.
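Once split out, the stress portion can be run and debugged on its own with the usual selftests runner, e.g. ./test_progs -t timer_stress (the test name comes from serial_test_timer_stress() in the diff below).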
Signed-off-by: Mykyta Yatsenko --- tools/testing/selftests/bpf/prog_tests/timer.c | 55 +++++++++++++++++--------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index 34f9ccce260293755980bcd6fcece491964f7929..4d853d1bd2a71b3d0f1ba0daa7a699945b4457fe 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -22,13 +22,35 @@ static void *spin_lock_thread(void *arg) pthread_exit(arg); } -static int timer(struct timer *timer_skel) + +static int timer_stress(struct timer *timer_skel) { - int i, err, prog_fd; + int i, err = 1, prog_fd; LIBBPF_OPTS(bpf_test_run_opts, topts); pthread_t thread_id[NUM_THR]; void *ret; + prog_fd = bpf_program__fd(timer_skel->progs.race); + for (i = 0; i < NUM_THR; i++) { + err = pthread_create(&thread_id[i], NULL, + &spin_lock_thread, &prog_fd); + if (!ASSERT_OK(err, "pthread_create")) + break; + } + + while (i) { + err = pthread_join(thread_id[--i], &ret); + if (ASSERT_OK(err, "pthread_join")) + ASSERT_EQ(ret, (void *)&prog_fd, "pthread_join"); + } + return err; +} + +static int timer(struct timer *timer_skel) +{ + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, topts); + err = timer__attach(timer_skel); if (!ASSERT_OK(err, "timer_attach")) return err; @@ -63,25 +85,10 @@ static int timer(struct timer *timer_skel) /* check that code paths completed */ ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok"); - prog_fd = bpf_program__fd(timer_skel->progs.race); - for (i = 0; i < NUM_THR; i++) { - err = pthread_create(&thread_id[i], NULL, - &spin_lock_thread, &prog_fd); - if (!ASSERT_OK(err, "pthread_create")) - break; - } - - while (i) { - err = pthread_join(thread_id[--i], &ret); - if (ASSERT_OK(err, "pthread_join")) - ASSERT_EQ(ret, (void *)&prog_fd, "pthread_join"); - } - return 0; } -/* TODO: use pid filtering */ -void serial_test_timer(void) +static void test_timer(int (*timer_test_fn)(struct timer *timer_skel)) { struct timer *timer_skel = NULL; int err; @@ -94,13 +101,23 @@ void serial_test_timer(void) if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) return; - err = timer(timer_skel); + err = timer_test_fn(timer_skel); ASSERT_OK(err, "timer"); timer__destroy(timer_skel); +} + +void serial_test_timer(void) +{ + test_timer(timer); RUN_TESTS(timer_failure); } +void serial_test_timer_stress(void) +{ + test_timer(timer_stress); +} + void test_timer_interrupt(void) { struct timer_interrupt *skel = NULL; -- 2.52.0 From: Mykyta Yatsenko Extend BPF timer selftest to run stress test for async cancel. 
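The existing race() program is reused: a global async_cancel flag, set by the prog_tests runner, makes it call bpf_timer_cancel_async() instead of bpf_timer_cancel(), so both stress variants share the same init/set_callback/start path.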
Signed-off-by: Mykyta Yatsenko --- tools/testing/selftests/bpf/prog_tests/timer.c | 18 +++++++++++++++++- tools/testing/selftests/bpf/progs/timer.c | 14 +++++++++++--- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index 4d853d1bd2a71b3d0f1ba0daa7a699945b4457fe..a157a2a699e638c9f21712b1e7194fc4b6382e71 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -23,13 +23,14 @@ static void *spin_lock_thread(void *arg) } -static int timer_stress(struct timer *timer_skel) +static int timer_stress_runner(struct timer *timer_skel, bool async_cancel) { int i, err = 1, prog_fd; LIBBPF_OPTS(bpf_test_run_opts, topts); pthread_t thread_id[NUM_THR]; void *ret; + timer_skel->bss->async_cancel = async_cancel; prog_fd = bpf_program__fd(timer_skel->progs.race); for (i = 0; i < NUM_THR; i++) { err = pthread_create(&thread_id[i], NULL, @@ -46,6 +47,16 @@ static int timer_stress(struct timer *timer_skel) return err; } +static int timer_stress(struct timer *timer_skel) +{ + return timer_stress_runner(timer_skel, false); +} + +static int timer_stress_async_cancel(struct timer *timer_skel) +{ + return timer_stress_runner(timer_skel, true); +} + static int timer(struct timer *timer_skel) { int err, prog_fd; @@ -118,6 +129,11 @@ void serial_test_timer_stress(void) test_timer(timer_stress); } +void serial_test_timer_stress_async_cancel(void) +{ + test_timer(timer_stress_async_cancel); +} + void test_timer_interrupt(void) { struct timer_interrupt *skel = NULL; diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index 4c677c001258a4c05cd570ec52363d49d8eea169..a81413514e4b07ef745f27eade71454234e731e8 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -1,13 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ -#include -#include + +#include #include #include #include #include +#define CLOCK_MONOTONIC 1 +#define CLOCK_BOOTTIME 7 + char _license[] SEC("license") = "GPL"; + struct hmap_elem { int counter; struct bpf_timer timer; @@ -63,6 +67,7 @@ __u64 callback_check = 52; __u64 callback2_check = 52; __u64 pinned_callback_check; __s32 pinned_cpu; +bool async_cancel = 0; #define ARRAY 1 #define HTAB 2 @@ -419,7 +424,10 @@ int race(void *ctx) bpf_timer_set_callback(timer, race_timer_callback); bpf_timer_start(timer, 0, 0); - bpf_timer_cancel(timer); + if (async_cancel) + bpf_timer_cancel_async(timer); + else + bpf_timer_cancel(timer); return 0; } -- 2.52.0 From: Mykyta Yatsenko Add test that verifies that bpf_timer_cancel_async works: can cancel callback successfully. 
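The callback used here (timer_error()) sets err to a non-zero value if it ever runs; the program arms the timer for 100us and immediately issues bpf_timer_cancel_async(), and userspace checks after a delay that err is still zero, i.e. the callback never fired.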
Signed-off-by: Mykyta Yatsenko --- tools/testing/selftests/bpf/prog_tests/timer.c | 25 +++++++++++++++++++++++++ tools/testing/selftests/bpf/progs/timer.c | 23 +++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index a157a2a699e638c9f21712b1e7194fc4b6382e71..2b932d4dfd436fd322bd07169f492e20e4ec7624 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -99,6 +99,26 @@ static int timer(struct timer *timer_skel) return 0; } +static int timer_cancel_async(struct timer *timer_skel) +{ + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, topts); + + prog_fd = bpf_program__fd(timer_skel->progs.test_async_cancel_succeed); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + usleep(500); + /* check that there were no errors in timer execution */ + ASSERT_EQ(timer_skel->bss->err, 0, "err"); + + /* check that code paths completed */ + ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok"); + + return 0; +} + static void test_timer(int (*timer_test_fn)(struct timer *timer_skel)) { struct timer *timer_skel = NULL; @@ -134,6 +154,11 @@ void serial_test_timer_stress_async_cancel(void) test_timer(timer_stress_async_cancel); } +void serial_test_timer_async_cancel(void) +{ + test_timer(timer_cancel_async); +} + void test_timer_interrupt(void) { struct timer_interrupt *skel = NULL; diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index a81413514e4b07ef745f27eade71454234e731e8..4b4ca781e7cdcf78015359cbd8f8d8ff591d6036 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -169,6 +169,29 @@ int BPF_PROG2(test1, int, a) return 0; } +static int timer_error(void *map, int *key, struct bpf_timer *timer) +{ + err = 42; + return 0; +} + +SEC("syscall") +int test_async_cancel_succeed(void *ctx) +{ + struct bpf_timer *arr_timer; + int array_key = ARRAY; + + arr_timer = bpf_map_lookup_elem(&array, &array_key); + if (!arr_timer) + return 0; + bpf_timer_init(arr_timer, &array, CLOCK_MONOTONIC); + bpf_timer_set_callback(arr_timer, timer_error); + bpf_timer_start(arr_timer, 100000 /* 100us */, 0); + bpf_timer_cancel_async(arr_timer); + ok = 7; + return 0; +} + /* callback for prealloc and non-prealloca hashtab timers */ static int timer_cb2(void *map, int *key, struct hmap_elem *val) { -- 2.52.0