From: Mykyta Yatsenko

Move the logic that updates the callback and the prog that owns it into
a separate function: bpf_async_update_callback(). This helps localize
the data race handling and will be reused in the next patches.

Signed-off-by: Mykyta Yatsenko
---
 kernel/bpf/helpers.c | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 930e132f440fcef15343087d0a0254905b0acca1..e60fea1330d40326459933170023ca3e6ffc5cee 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1351,20 +1351,17 @@ static const struct bpf_func_proto bpf_timer_init_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
-				    struct bpf_prog_aux *aux, unsigned int flags,
-				    enum bpf_async_type type)
+static int bpf_async_update_callback(struct bpf_async_kern *async, void *callback_fn,
+				     struct bpf_prog *prog)
 {
-	struct bpf_prog *prev, *prog = aux->prog;
+	struct bpf_prog *prev;
 	struct bpf_async_cb *cb;
-	int ret = 0;
+	int err = 0;
 
-	if (in_nmi())
-		return -EOPNOTSUPP;
 	__bpf_spin_lock_irqsave(&async->lock);
 	cb = async->cb;
 	if (!cb) {
-		ret = -EINVAL;
+		err = -EINVAL;
 		goto out;
 	}
 	if (!atomic64_read(&cb->map->usercnt)) {
@@ -1373,9 +1370,10 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback
 		 * running even when bpf prog is detached and user space
 		 * is gone, since map_release_uref won't ever be called.
 		 */
-		ret = -EPERM;
+		err = -EPERM;
 		goto out;
 	}
+
 	prev = cb->prog;
 	if (prev != prog) {
 		/* Bump prog refcnt once. Every bpf_timer_set_callback()
@@ -1383,7 +1381,7 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback
 		 */
 		prog = bpf_prog_inc_not_zero(prog);
 		if (IS_ERR(prog)) {
-			ret = PTR_ERR(prog);
+			err = PTR_ERR(prog);
 			goto out;
 		}
 		if (prev)
@@ -1394,7 +1392,19 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback
 	rcu_assign_pointer(cb->callback_fn, callback_fn);
 out:
 	__bpf_spin_unlock_irqrestore(&async->lock);
-	return ret;
+	return err;
+}
+
+static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
+				    struct bpf_prog_aux *aux, unsigned int flags,
+				    enum bpf_async_type type)
+{
+	struct bpf_prog *prog = aux->prog;
+
+	if (in_nmi())
+		return -EOPNOTSUPP;
+
+	return bpf_async_update_callback(async, callback_fn, prog);
 }
 
 BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
-- 
2.51.1

From: Mykyta Yatsenko

Move the logic that swaps the bpf_prog in struct bpf_async_cb into a
dedicated helper, bpf_async_swap_prog(), to make the follow-up patches
simpler.

Signed-off-by: Mykyta Yatsenko
---
 kernel/bpf/helpers.c | 45 ++++++++++++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e60fea1330d40326459933170023ca3e6ffc5cee..d09c2b8989a123d6fd5a3b59efd40018c81d0149 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1351,10 +1351,34 @@ static const struct bpf_func_proto bpf_timer_init_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+static int bpf_async_swap_prog(struct bpf_async_cb *cb, struct bpf_prog *prog)
+{
+	struct bpf_prog *prev;
+
+	prev = cb->prog;
+	if (prev == prog)
+		return 0;
+
+	if (prog) {
+		/*
+		 * Bump prog refcnt once. Every bpf_timer_set_callback()
+		 * can pick different callback_fn-s within the same prog.
+		 */
+		prog = bpf_prog_inc_not_zero(prog);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+	}
+	if (prev)
+		/* Drop prev prog refcnt when swapping with new prog */
+		bpf_prog_put(prev);
+
+	cb->prog = prog;
+	return 0;
+}
+
 static int bpf_async_update_callback(struct bpf_async_kern *async, void *callback_fn,
 				     struct bpf_prog *prog)
 {
-	struct bpf_prog *prev;
 	struct bpf_async_cb *cb;
 	int err = 0;
 
@@ -1374,22 +1398,9 @@ static int bpf_async_update_callback(struct bpf_async_kern *async, void *callbac
 		goto out;
 	}
 
-	prev = cb->prog;
-	if (prev != prog) {
-		/* Bump prog refcnt once. Every bpf_timer_set_callback()
-		 * can pick different callback_fn-s within the same prog.
-		 */
-		prog = bpf_prog_inc_not_zero(prog);
-		if (IS_ERR(prog)) {
-			err = PTR_ERR(prog);
-			goto out;
-		}
-		if (prev)
-			/* Drop prev prog refcnt when swapping with new prog */
-			bpf_prog_put(prev);
-		cb->prog = prog;
-	}
-	rcu_assign_pointer(cb->callback_fn, callback_fn);
+	err = bpf_async_swap_prog(cb, prog);
+	if (!err)
+		rcu_assign_pointer(cb->callback_fn, callback_fn);
 out:
 	__bpf_spin_unlock_irqrestore(&async->lock);
 	return err;
-- 
2.51.1

From: Mykyta Yatsenko

Move the timer deletion logic into a dedicated bpf_timer_delete() helper
so it can be reused by later patches.

Signed-off-by: Mykyta Yatsenko
---
 kernel/bpf/helpers.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index d09c2b8989a123d6fd5a3b59efd40018c81d0149..2eb2369cae3ad34fd218387aa237140003cc1853 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1153,6 +1153,8 @@ enum bpf_async_type {
 
 static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
 
+static void bpf_timer_delete(struct bpf_hrtimer *t);
+
 static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
 {
 	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
@@ -1576,18 +1578,10 @@ static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *a
 	return cb;
 }
 
-/* This function is called by map_delete/update_elem for individual element and
- * by ops->map_release_uref when the user space reference to a map reaches zero.
- */
-void bpf_timer_cancel_and_free(void *val)
+static void bpf_timer_delete(struct bpf_hrtimer *t)
 {
-	struct bpf_hrtimer *t;
-
-	t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val);
-
-	if (!t)
-		return;
-	/* We check that bpf_map_delete/update_elem() was called from timer
+	/*
+	 * We check that bpf_map_delete/update_elem() was called from timer
 	 * callback_fn. In such case we don't call hrtimer_cancel() (since it
 	 * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
 	 * just return -1). Though callback_fn is still running on this cpu it's
@@ -1636,6 +1630,21 @@ void bpf_timer_cancel_and_free(void *val)
 	}
 }
 
+/*
+ * This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+	struct bpf_hrtimer *t;
+
+	t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val);
+	if (!t)
+		return;
+
+	bpf_timer_delete(t);
+}
+
 /* This function is called by map_delete/update_elem for individual element and
  * by ops->map_release_uref when the user space reference to a map reaches zero.
  */
-- 
2.51.1

From: Mykyta Yatsenko

To provide lifetime guarantees for struct bpf_async_cb when no lock
serializes mutations, introduce a refcnt field into the struct.
Implement bpf_async_tryget() and bpf_async_put() to handle the refcnt.

Signed-off-by: Mykyta Yatsenko
---
 kernel/bpf/helpers.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 2eb2369cae3ad34fd218387aa237140003cc1853..3d9b370e47a1528e75cade3fe4a43c946200e63a 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1102,6 +1102,7 @@ struct bpf_async_cb {
 		struct work_struct delete_work;
 	};
 	u64 flags;
+	refcount_t refcnt;
 };
 
 /* BPF map elements can contain 'struct bpf_timer'.
@@ -1155,6 +1156,33 @@ static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
 
 static void bpf_timer_delete(struct bpf_hrtimer *t);
 
+static bool bpf_async_tryget(struct bpf_async_cb *cb)
+{
+	return refcount_inc_not_zero(&cb->refcnt);
+}
+
+static void bpf_async_put(struct bpf_async_cb *cb, enum bpf_async_type type)
+{
+	if (!refcount_dec_and_test(&cb->refcnt))
+		return;
+
+	switch (type) {
+	case BPF_ASYNC_TYPE_TIMER:
+		bpf_timer_delete((struct bpf_hrtimer *)cb);
+		break;
+	case BPF_ASYNC_TYPE_WQ: {
+		struct bpf_work *work = (void *)cb;
+		/* Trigger cancel of the sleepable work, but *do not* wait for
+		 * it to finish if it was running as we might not be in a
+		 * sleepable context.
+		 * kfree will be called once the work has finished.
+		 */
+		schedule_work(&work->delete_work);
+		break;
+	}
+	}
+}
+
 static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
 {
 	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
-- 
2.51.1

From: Mykyta Yatsenko

Remove the lock from struct bpf_async_kern and refactor the bpf_timer
and bpf_wq kfuncs and helpers to run without it.

The bpf_async_cb lifetime is now managed by the refcnt and RCU, so every
function that uses it has to do so under the RCU read lock. The
cancel_and_free() path detaches the bpf_async_cb from the map value
(struct bpf_async_kern) and atomically sets its state to the terminal
BPF_ASYNC_FREED; concurrent readers may still operate on the detached
bpf_async_cb safely under the RCU read lock.

Guarantee that the bpf_prog is dropped safely from the bpf_async_cb by
handling the BPF_ASYNC_FREED state in bpf_async_update_callback().

Signed-off-by: Mykyta Yatsenko
---
 kernel/bpf/helpers.c | 201 +++++++++++++++++++++++++++------------------------
 1 file changed, 106 insertions(+), 95 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3d9b370e47a1528e75cade3fe4a43c946200e63a..75834338558929cbd0b02a9823629d8be946fb18 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1092,6 +1092,12 @@ static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
 	return (void *)value - round_up(map->key_size, 8);
 }
 
+enum bpf_async_state {
+	BPF_ASYNC_READY,
+	BPF_ASYNC_BUSY,
+	BPF_ASYNC_FREED,
+};
+
 struct bpf_async_cb {
 	struct bpf_map *map;
 	struct bpf_prog *prog;
@@ -1103,6 +1109,7 @@ struct bpf_async_cb {
 	};
 	u64 flags;
 	refcount_t refcnt;
+	enum bpf_async_state state;
 };
 
 /* BPF map elements can contain 'struct bpf_timer'.
@@ -1140,11 +1147,6 @@ struct bpf_async_kern {
 		struct bpf_hrtimer *timer;
 		struct bpf_work *work;
 	};
-	/* bpf_spin_lock is used here instead of spinlock_t to make
-	 * sure that it always fits into space reserved by struct bpf_timer
-	 * regardless of LOCKDEP and spinlock debug flags.
-	 */
-	struct bpf_spin_lock lock;
 } __attribute__((aligned(8)));
 
 enum bpf_async_type {
@@ -1276,7 +1278,7 @@ static void bpf_timer_delete_work(struct work_struct *work)
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
 			    enum bpf_async_type type)
 {
-	struct bpf_async_cb *cb;
+	struct bpf_async_cb *cb, *old_cb;
 	struct bpf_hrtimer *t;
 	struct bpf_work *w;
 	clockid_t clockid;
@@ -1297,18 +1299,13 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		return -EINVAL;
 	}
 
-	__bpf_spin_lock_irqsave(&async->lock);
-	t = async->timer;
-	if (t) {
-		ret = -EBUSY;
-		goto out;
-	}
+	cb = READ_ONCE(async->cb);
+	if (cb)
+		return -EBUSY;
 
 	cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
-	if (!cb) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!cb)
+		return -ENOMEM;
 
 	switch (type) {
 	case BPF_ASYNC_TYPE_TIMER:
@@ -1331,9 +1328,16 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 	cb->map = map;
 	cb->prog = NULL;
 	cb->flags = flags;
+	cb->state = BPF_ASYNC_READY;
 	rcu_assign_pointer(cb->callback_fn, NULL);
+	refcount_set(&cb->refcnt, 1); /* map's own ref */
 
-	WRITE_ONCE(async->cb, cb);
+	old_cb = cmpxchg(&async->cb, NULL, cb);
+	if (old_cb) {
+		/* Lost the race to initialize this bpf_async_kern, drop the allocated object */
+		kfree_nolock(cb);
+		return -EBUSY;
+	}
 	/* Guarantee the order between async->cb and map->usercnt. So
 	 * when there are concurrent uref release and bpf timer init, either
 	 * bpf_timer_cancel_and_free() called by uref release reads a no-NULL
@@ -1344,12 +1348,17 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		/* maps with timers must be either held by user space
 		 * or pinned in bpffs.
 		 */
-		WRITE_ONCE(async->cb, NULL);
-		kfree_nolock(cb);
-		ret = -EPERM;
+		switch (type) {
+		case BPF_ASYNC_TYPE_TIMER:
+			bpf_timer_cancel_and_free(async);
+			break;
+		case BPF_ASYNC_TYPE_WQ:
+			bpf_wq_cancel_and_free(async);
+			break;
+		}
+		return -EPERM;
 	}
-out:
-	__bpf_spin_unlock_irqrestore(&async->lock);
+
 	return ret;
 }
 
@@ -1398,41 +1407,42 @@ static int bpf_async_swap_prog(struct bpf_async_cb *cb, struct bpf_prog *prog)
 		if (IS_ERR(prog))
 			return PTR_ERR(prog);
 	}
+	/* Make sure only one thread runs bpf_prog_put() */
+	prev = xchg(&cb->prog, prog);
 	if (prev)
 		/* Drop prev prog refcnt when swapping with new prog */
 		bpf_prog_put(prev);
 
-	cb->prog = prog;
 	return 0;
 }
 
-static int bpf_async_update_callback(struct bpf_async_kern *async, void *callback_fn,
+static int bpf_async_update_callback(struct bpf_async_cb *cb, void *callback_fn,
 				     struct bpf_prog *prog)
 {
-	struct bpf_async_cb *cb;
+	enum bpf_async_state state;
 	int err = 0;
 
-	__bpf_spin_lock_irqsave(&async->lock);
-	cb = async->cb;
-	if (!cb) {
-		err = -EINVAL;
-		goto out;
-	}
-	if (!atomic64_read(&cb->map->usercnt)) {
-		/* maps with timers must be either held by user space
-		 * or pinned in bpffs. Otherwise timer might still be
-		 * running even when bpf prog is detached and user space
-		 * is gone, since map_release_uref won't ever be called.
-		 */
-		err = -EPERM;
-		goto out;
-	}
+	state = cmpxchg(&cb->state, BPF_ASYNC_READY, BPF_ASYNC_BUSY);
+	if (state == BPF_ASYNC_BUSY)
+		return -EBUSY;
+	if (state == BPF_ASYNC_FREED)
+		goto drop;
 
 	err = bpf_async_swap_prog(cb, prog);
 	if (!err)
 		rcu_assign_pointer(cb->callback_fn, callback_fn);
-out:
-	__bpf_spin_unlock_irqrestore(&async->lock);
+
+	state = cmpxchg(&cb->state, BPF_ASYNC_BUSY, BPF_ASYNC_READY);
+	if (state == BPF_ASYNC_FREED) {
+		/*
+		 * cb was freed concurrently; we may have overwritten prog and
+		 * callback, so make sure to drop them.
+		 */
drop:
+		bpf_async_swap_prog(cb, NULL);
+		rcu_assign_pointer(cb->callback_fn, NULL);
+		return -EPERM;
+	}
 	return err;
 }
 
@@ -1441,11 +1451,18 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback
 			    enum bpf_async_type type)
 {
 	struct bpf_prog *prog = aux->prog;
+	struct bpf_async_cb *cb;
 
 	if (in_nmi())
 		return -EOPNOTSUPP;
 
-	return bpf_async_update_callback(async, callback_fn, prog);
+	guard(rcu)();
+
+	cb = READ_ONCE(async->cb);
+	if (!cb)
+		return -EINVAL;
+
+	return bpf_async_update_callback(cb, callback_fn, prog);
 }
 
 BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
@@ -1462,7 +1479,7 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = {
 	.arg2_type	= ARG_PTR_TO_FUNC,
 };
 
-BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
+BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags)
 {
 	struct bpf_hrtimer *t;
 	int ret = 0;
@@ -1472,12 +1489,19 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla
 		return -EOPNOTSUPP;
 	if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
 		return -EINVAL;
-	__bpf_spin_lock_irqsave(&timer->lock);
-	t = timer->timer;
-	if (!t || !t->cb.prog) {
-		ret = -EINVAL;
-		goto out;
-	}
+
+	guard(rcu)();
+
+	t = READ_ONCE(async->timer);
+	if (!t)
+		return -EINVAL;
+
+	/*
+	 * Hold a ref while scheduling the timer, to make sure we only cancel
+	 * and free it after hrtimer_start().
+	 */
+	if (!bpf_async_tryget(&t->cb))
+		return -EINVAL;
 
 	if (flags & BPF_F_TIMER_ABS)
 		mode = HRTIMER_MODE_ABS_SOFT;
@@ -1488,8 +1512,8 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla
 		mode |= HRTIMER_MODE_PINNED;
 
 	hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
-out:
-	__bpf_spin_unlock_irqrestore(&timer->lock);
+
+	bpf_async_put(&t->cb, BPF_ASYNC_TYPE_TIMER);
 	return ret;
 }
 
@@ -1502,18 +1526,7 @@ static const struct bpf_func_proto bpf_timer_start_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-static void drop_prog_refcnt(struct bpf_async_cb *async)
-{
-	struct bpf_prog *prog = async->prog;
-
-	if (prog) {
-		bpf_prog_put(prog);
-		async->prog = NULL;
-		rcu_assign_pointer(async->callback_fn, NULL);
-	}
-}
-
-BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
+BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async)
 {
 	struct bpf_hrtimer *t, *cur_t;
 	bool inc = false;
@@ -1521,13 +1534,12 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
 
 	if (in_nmi())
 		return -EOPNOTSUPP;
-	rcu_read_lock();
-	__bpf_spin_lock_irqsave(&timer->lock);
-	t = timer->timer;
-	if (!t) {
-		ret = -EINVAL;
-		goto out;
-	}
+
+	guard(rcu)();
+
+	t = READ_ONCE(async->timer);
+	if (!t)
+		return -EINVAL;
 
 	cur_t = this_cpu_read(hrtimer_running);
 	if (cur_t == t) {
@@ -1563,16 +1575,15 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
 		goto out;
 	}
 drop:
-	drop_prog_refcnt(&t->cb);
+	bpf_async_update_callback(&t->cb, NULL, NULL);
 out:
-	__bpf_spin_unlock_irqrestore(&timer->lock);
 	/* Cancel the timer and wait for associated callback to finish
 	 * if it was running.
 	 */
 	ret = ret ?: hrtimer_cancel(&t->timer);
 	if (inc)
 		atomic_dec(&t->cancelling);
-	rcu_read_unlock();
+
 	return ret;
 }
 
@@ -1587,22 +1598,17 @@ static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *a
 {
 	struct bpf_async_cb *cb;
 
-	/* Performance optimization: read async->cb without lock first. */
-	if (!READ_ONCE(async->cb))
-		return NULL;
-
-	__bpf_spin_lock_irqsave(&async->lock);
-	/* re-read it under lock */
-	cb = async->cb;
-	if (!cb)
-		goto out;
-	drop_prog_refcnt(cb);
-	/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
+	/*
+	 * The subsequent bpf_timer_start/cancel() helpers won't be able to use
 	 * this timer, since it won't be initialized.
 	 */
-	WRITE_ONCE(async->cb, NULL);
-out:
-	__bpf_spin_unlock_irqrestore(&async->lock);
+	cb = xchg(&async->cb, NULL);
+	if (!cb)
+		return NULL;
+
+	/* cb is detached, set state to FREED, so that concurrent users drop it */
+	xchg(&cb->state, BPF_ASYNC_FREED);
+	bpf_async_update_callback(cb, NULL, NULL);
 	return cb;
 }
 
@@ -1670,7 +1676,7 @@ void bpf_timer_cancel_and_free(void *val)
 	if (!t)
 		return;
 
-	bpf_timer_delete(t);
+	bpf_async_put(&t->cb, BPF_ASYNC_TYPE_TIMER); /* Put map's own reference */
 }
 
 /* This function is called by map_delete/update_elem for individual element and
@@ -1685,12 +1691,8 @@ void bpf_wq_cancel_and_free(void *val)
 	work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
 	if (!work)
 		return;
-	/* Trigger cancel of the sleepable work, but *do not* wait for
-	 * it to finish if it was running as we might not be in a
-	 * sleepable context.
-	 * kfree will be called once the work has finished.
-	 */
-	schedule_work(&work->delete_work);
+
+	bpf_async_put(&work->cb, BPF_ASYNC_TYPE_WQ); /* Put map's own reference */
 }
 
 BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
@@ -3169,11 +3171,20 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
 		return -EOPNOTSUPP;
 	if (flags)
 		return -EINVAL;
+
+	guard(rcu)();
+
 	w = READ_ONCE(async->work);
 	if (!w || !READ_ONCE(w->cb.prog))
 		return -EINVAL;
 
+	if (!bpf_async_tryget(&w->cb))
+		return -EINVAL;
+
 	schedule_work(&w->work);
+
+	bpf_async_put(&w->cb, BPF_ASYNC_TYPE_WQ);
+
 	return 0;
 }
-- 
2.51.1
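
Note (illustrative only, not part of the series): the refcount scheme that the
last two patches introduce can be modeled outside the kernel to reason about
the lifecycle without the locking details. The sketch below is a minimal
userspace C model under stated assumptions: model_cb, model_tryget() and
model_put() are hypothetical stand-ins for bpf_async_cb, bpf_async_tryget()
and bpf_async_put(), C11 atomics replace refcount_t and cmpxchg(), and a
plain free() replaces the deferred kfree/schedule_work paths. It only
demonstrates the tryget/put discipline; it is not kernel code and not the
series' implementation.

/* Illustrative-only userspace model of the refcnt + FREED-state lifecycle. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

enum model_state { MODEL_READY, MODEL_BUSY, MODEL_FREED };

struct model_cb {
	atomic_uint refcnt;	/* stands in for bpf_async_cb::refcnt */
	atomic_int  state;	/* stands in for bpf_async_cb::state  */
};

/* Analogue of bpf_async_tryget(): succeeds only while refcnt is non-zero. */
static bool model_tryget(struct model_cb *cb)
{
	unsigned int old = atomic_load(&cb->refcnt);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&cb->refcnt, &old, old + 1))
			return true;
	}
	return false;
}

/* Analogue of bpf_async_put(): whoever drops the last reference frees it. */
static void model_put(struct model_cb *cb)
{
	if (atomic_fetch_sub(&cb->refcnt, 1) == 1) {
		printf("last reference dropped, freeing cb\n");
		free(cb);
	}
}

int main(void)
{
	struct model_cb *cb = calloc(1, sizeof(*cb));

	if (!cb)
		return 1;
	atomic_store(&cb->refcnt, 1);		/* the map's own reference */
	atomic_store(&cb->state, MODEL_READY);

	/* A helper such as bpf_timer_start() takes a temporary reference. */
	if (model_tryget(cb))
		printf("helper pinned cb while arming the timer\n");

	/*
	 * cancel_and_free() detaches the object, marks it FREED and drops the
	 * map's reference; the helper's put below performs the actual free.
	 */
	atomic_store(&cb->state, MODEL_FREED);
	model_put(cb);	/* map's own reference */
	model_put(cb);	/* helper's temporary reference -> frees */
	return 0;
}

The property mirrored here is the one the series relies on: the last put does
the freeing, so helpers can pin the callback object across an arm or cancel
operation without any per-element lock, while RCU (in the kernel code) keeps
the pointer itself safe to dereference until the grace period ends.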