From: Mykyta Yatsenko

cancel_and_free() uses xchg(&ptr, NULL) to steal the cb/ctx pointer.
For non-preallocated maps, the element is freed via bpf_mem_cache_free()
making it immediately available for reuse. Between the xchg and the
free, a concurrent BPF program sees NULL, treats it as first use,
allocates a new cb/ctx, and stores it via cmpxchg. The allocation is
then orphaned when the element is freed, causing a memory leak.

Use BPF_PTR_POISON instead of NULL in the xchg so that concurrent
readers can distinguish "being destroyed" from "never initialized" and
refuse to allocate. For preallocated maps, clear the poison before
recycled elements are handed back to BPF programs.

Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context")
Signed-off-by: Mykyta Yatsenko
---
 kernel/bpf/hashtab.c | 39 +++++++++++++++++++++++++++++++++++++++
 kernel/bpf/helpers.c | 25 +++++++++++++++----------
 2 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3b9d297a53be..9117554f5133 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -279,6 +279,36 @@ static void htab_free_elems(struct bpf_htab *htab)
 	bpf_map_area_free(htab->elems);
 }
 
+/*
+ * Clear special fields (timer/wq/task_work) poisoned by cancel_and_free()
+ * in recycled preallocated map elements.
+ *
+ * Uses WRITE_ONCE to match the READ_ONCE/xchg/cmpxchg used by concurrent
+ * readers on the underlying pointer (bpf_async_kern.cb / bpf_task_work_kern.ctx).
+ */
+static void bpf_obj_init_special_fields(const struct btf_record *rec, void *obj)
+{
+	void **p;
+
+	if (IS_ERR_OR_NULL(rec))
+		return;
+
+	if (rec->timer_off >= 0) {
+		p = obj + rec->timer_off;
+		WRITE_ONCE(*p, NULL);
+	}
+
+	if (rec->wq_off >= 0) {
+		p = obj + rec->wq_off;
+		WRITE_ONCE(*p, NULL);
+	}
+
+	if (rec->task_work_off >= 0) {
+		p = obj + rec->task_work_off;
+		WRITE_ONCE(*p, NULL);
+	}
+}
+
 /* The LRU list has a lock (lru_lock). Each htab bucket has a lock
  * (bucket_lock). If both locks need to be acquired together, the lock
  * order is always lru_lock -> bucket_lock and this only happens in
@@ -300,6 +330,8 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 	bpf_map_inc_elem_count(&htab->map);
 	l = container_of(node, struct htab_elem, lru_node);
 	memcpy(l->key, key, htab->map.key_size);
+	bpf_obj_init_special_fields(htab->map.record,
+				    htab_elem_value(l, htab->map.key_size));
 	return l;
 }
 
@@ -1033,6 +1065,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 	}
 
 	memcpy(l_new->key, key, key_size);
+	/*
+	 * Clear async fields poisoned by cancel_and_free() in recycled elements.
+	 * copy_map_value() skips special fields, so zeroed fields survive the copy.
+	 */
+	bpf_obj_init_special_fields(htab->map.record,
+				    htab_elem_value(l_new, key_size));
+
 	if (percpu) {
 		if (prealloc) {
 			pptr = htab_elem_get_ptr(l_new, key_size);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 7ac32798eb04..da51ba35a898 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1477,7 +1477,7 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback
 	struct bpf_async_cb *cb;
 
 	cb = READ_ONCE(async->cb);
-	if (!cb)
+	if (!cb || cb == BPF_PTR_POISON)
 		return -EINVAL;
 
 	return bpf_async_update_prog_callback(cb, prog, callback_fn);
@@ -1511,7 +1511,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, fla
 		return -EINVAL;
 
 	t = READ_ONCE(async->timer);
-	if (!t || !READ_ONCE(t->cb.prog))
+	if (!t || (void *)t == BPF_PTR_POISON || !READ_ONCE(t->cb.prog))
 		return -EINVAL;
 
 	if (flags & BPF_F_TIMER_ABS)
@@ -1557,7 +1557,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async)
 		return -EOPNOTSUPP;
 
 	t = READ_ONCE(async->timer);
-	if (!t)
+	if (!t || (void *)t == BPF_PTR_POISON)
 		return -EINVAL;
 
 	cur_t = this_cpu_read(hrtimer_running);
@@ -1669,11 +1669,12 @@ static void bpf_async_cancel_and_free(struct bpf_async_kern *async)
 {
 	struct bpf_async_cb *cb;
 
-	if (!READ_ONCE(async->cb))
+	cb = READ_ONCE(async->cb);
+	if (!cb || cb == BPF_PTR_POISON)
 		return;
 
-	cb = xchg(&async->cb, NULL);
-	if (!cb)
+	cb = xchg(&async->cb, BPF_PTR_POISON);
+	if (!cb || cb == BPF_PTR_POISON)
 		return;
 
 	bpf_async_update_prog_callback(cb, NULL, NULL);
@@ -3183,7 +3184,7 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
 		return -EINVAL;
 
 	w = READ_ONCE(async->work);
-	if (!w || !READ_ONCE(w->cb.prog))
+	if (!w || (void *)w == BPF_PTR_POISON || !READ_ONCE(w->cb.prog))
 		return -EINVAL;
 
 	if (!refcount_inc_not_zero(&w->cb.refcnt))
@@ -4267,6 +4268,8 @@ static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *t
 	struct bpf_task_work_ctx *ctx, *old_ctx;
 
 	ctx = READ_ONCE(twk->ctx);
+	if (ctx == BPF_PTR_POISON)
+		return ERR_PTR(-EINVAL);
 	if (ctx)
 		return ctx;
 
@@ -4285,6 +4288,8 @@ static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *t
 		 * memory and try to reuse already set context.
 		 */
 		bpf_mem_free(&bpf_global_ma, ctx);
+		if (old_ctx == BPF_PTR_POISON)
+			return ERR_PTR(-EINVAL);
 		return old_ctx;
 	}
 
@@ -4476,7 +4481,7 @@ __bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer)
 	int ret;
 
 	cb = READ_ONCE(async->cb);
-	if (!cb)
+	if (!cb || cb == BPF_PTR_POISON)
 		return -EINVAL;
 
 	/*
@@ -4517,8 +4522,8 @@ void bpf_task_work_cancel_and_free(void *val)
 	struct bpf_task_work_ctx *ctx;
 	enum bpf_task_work_state state;
 
-	ctx = xchg(&twk->ctx, NULL);
-	if (!ctx)
+	ctx = xchg(&twk->ctx, BPF_PTR_POISON);
+	if (!ctx || ctx == BPF_PTR_POISON)
 		return;
 
 	state = xchg(&ctx->state, BPF_TW_FREED);
-- 
2.53.0
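
For readers who want the shape of the bug and the fix outside the kernel tree,
below is a minimal user-space sketch of the race described in the changelog. It
uses C11 atomics in place of the kernel's READ_ONCE/xchg/cmpxchg, and every name
in it (PTR_POISON, struct slot, slot_get_or_create, slot_cancel_and_free) is
made up for illustration; it is not the kernel implementation, only the pattern
the patch relies on.

	/* Sketch only: C11 atomics stand in for READ_ONCE/xchg/cmpxchg. */
	#include <stdatomic.h>
	#include <stdlib.h>

	#define PTR_POISON ((void *)-1L)	/* stand-in for BPF_PTR_POISON */

	struct slot {
		_Atomic(void *) cb;		/* mirrors bpf_async_kern.cb */
	};

	/* Lookup-or-create path, akin to timer/wq init or task_work fetch_ctx. */
	static void *slot_get_or_create(struct slot *s)
	{
		void *cur = atomic_load(&s->cb);

		if (cur == PTR_POISON)		/* teardown in progress: refuse to allocate */
			return NULL;
		if (cur)			/* already initialized */
			return cur;

		void *fresh = calloc(1, 64);
		void *expected = NULL;

		if (!fresh)
			return NULL;

		/* Publish only if the slot is still "never initialized" (NULL). */
		if (!atomic_compare_exchange_strong(&s->cb, &expected, fresh)) {
			free(fresh);		/* lost the race; never orphan the allocation */
			return expected == PTR_POISON ? NULL : expected;
		}
		return fresh;
	}

	/* Teardown path, akin to bpf_async_cancel_and_free(). */
	static void slot_cancel_and_free(struct slot *s)
	{
		/* Poison, don't NULL: a NULL here lets a racing creator succeed. */
		void *cb = atomic_exchange(&s->cb, PTR_POISON);

		if (!cb || cb == PTR_POISON)
			return;
		free(cb);
	}

With a NULL sentinel in slot_cancel_and_free(), the compare-exchange in
slot_get_or_create() can succeed against a slot that is about to be freed, and
the fresh allocation becomes unreachable. With the poison value, both the
fast-path check and the compare-exchange fail, the allocation is released, and
the caller sees the slot as unusable until it is re-initialized.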