Some iterators hold resources (like mmap_lock in task_vma) that prevent sleeping. To allow BPF programs to release such resources mid-iteration and call sleepable helpers, the verifier needs to track acquire/release semantics on iterator _next pointers. Repurpose the st->id field on STACK_ITER slots to track the ref_obj_id of the pointer returned by _next when the kfunc is annotated with KF_ACQUIRE. This is safe because st->id is initialized to 0 by __mark_reg_known_zero() in mark_stack_slots_iter() and is not compared in stacksafe() for STACK_ITER slots. The lifecycle is: _next (KF_ACQUIRE): - auto-release old ref if st->id != 0 - acquire new ref, store ref_obj_id in st->id - DRAINED branch: release via st->id, set st->id = 0 - ACTIVE branch: keeps ref, st->id tracks it _release (KF_RELEASE + __iter arg): - read st->id, release_reference(), set st->id = 0 _destroy: - release st->id if non-zero before releasing iterator's own ref Signed-off-by: Puranjay Mohan --- kernel/bpf/verifier.c | 71 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1153a828ce8d..af18cda06dc1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1038,6 +1038,8 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg); static bool in_rcu_cs(struct bpf_verifier_env *env); static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta); +static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta); +static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta); static int mark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, @@ -1083,6 +1085,22 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, return 0; } +/* + * Release the acquired reference tracked by iter_st->id, if any. + * Used during auto-release in _next, DRAINED handling, and _destroy. + */ +static int iter_release_acquired_ref(struct bpf_verifier_env *env, + struct bpf_reg_state *iter_st) +{ + int err; + + if (!iter_st->id) + return 0; + err = release_reference(env, iter_st->id); + iter_st->id = 0; + return err; +} + static int unmark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots) { @@ -1097,8 +1115,14 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_stack_state *slot = &state->stack[spi - i]; struct bpf_reg_state *st = &slot->spilled_ptr; - if (i == 0) + if (i == 0) { + /* + * Release any outstanding acquired ref tracked by st->id + * before releasing the iterator's own ref. + */ + WARN_ON_ONCE(iter_release_acquired_ref(env, st)); WARN_ON_ONCE(release_reference(env, st->ref_obj_id)); + } __mark_reg_not_init(env, st); @@ -8943,6 +8967,8 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id /* remember meta->iter info for process_iter_next_call() */ meta->iter.spi = spi; meta->iter.frameno = reg->frameno; + if (is_kfunc_release(meta)) + meta->release_regno = regno; meta->ref_obj_id = iter_ref_obj_id(env, reg, spi); if (is_iter_destroy_kfunc(meta)) { @@ -9178,6 +9204,12 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, /* mark current iter state as drained and assume returned NULL */ cur_iter->iter.state = BPF_ITER_STATE_DRAINED; __mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]); + /* + * If _next acquired a ref (KF_ACQUIRE), release it in the DRAINED branch since NULL + * was returned. + */ + if (is_kfunc_acquire(meta)) + return iter_release_acquired_ref(env, cur_iter); return 0; } @@ -14201,6 +14233,22 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.initialized_dynptr.ref_obj_id) { err = unmark_stack_slots_dynptr(env, reg); + } else if (base_type(reg->type) == PTR_TO_STACK) { + struct bpf_func_state *fstate; + struct bpf_reg_state *iter_st; + + fstate = env->cur_state->frame[meta.iter.frameno]; + if (fstate->stack[meta.iter.spi].slot_type[0] != STACK_ITER) { + verbose(env, "expected iterator on stack for release\n"); + return -EINVAL; + } + + iter_st = get_iter_from_state(env->cur_state, &meta); + if (!iter_st->id) { + verbose(env, "no acquired reference to release\n"); + return -EINVAL; + } + err = iter_release_acquired_ref(env, iter_st); } else { err = release_reference(env, reg->ref_obj_id); if (err) @@ -14278,6 +14326,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, __mark_reg_const_zero(env, ®s[BPF_REG_0]); mark_btf_func_reg_size(env, BPF_REG_0, t->size); } else if (btf_type_is_ptr(t)) { + struct bpf_reg_state *iter_acquire_st = NULL; + ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); err = check_special_kfunc(env, &meta, regs, insn_aux, ptr_type, desc_btf); if (err) { @@ -14361,7 +14411,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); if (is_kfunc_acquire(&meta)) { - int id = acquire_reference(env, insn_idx); + int id; + + /* + * For iterators with KF_ACQUIRE, auto-release the previous + * iteration's ref before acquiring a new one, and after + * acquisition track the new ref on the iter slot. + */ + if (is_iter_next_kfunc(&meta)) { + iter_acquire_st = get_iter_from_state(env->cur_state, &meta); + err = iter_release_acquired_ref(env, iter_acquire_st); + if (err) + return err; + } + + id = acquire_reference(env, insn_idx); if (id < 0) return id; @@ -14372,6 +14436,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, ref_set_non_owning(env, ®s[BPF_REG_0]); } + if (iter_acquire_st) + iter_acquire_st->id = regs[BPF_REG_0].ref_obj_id; + if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { -- 2.47.3 check_helper_call() prints the error message for every env->cur_state->active* element when calling a sleepable helper. Consolidate all of them into a single print statement. The check for env->cur_state->active_locks was not part of the removed print statements and will not be triggered with the consolidated print as well because it is checked in do_check() before check_helper_call() is even reached. Signed-off-by: Puranjay Mohan --- kernel/bpf/verifier.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index af18cda06dc1..ddfb88b8ddd2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11581,6 +11581,19 @@ static inline bool in_sleepable_context(struct bpf_verifier_env *env) in_sleepable(env); } +static const char *non_sleepable_context_description(struct bpf_verifier_env *env) +{ + if (env->cur_state->active_rcu_locks) + return "rcu_read_lock region"; + if (env->cur_state->active_preempt_locks) + return "non-preemptible region"; + if (env->cur_state->active_irq_id) + return "IRQ-disabled region"; + if (env->cur_state->active_locks) + return "lock region"; + return "non-sleepable context"; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -11641,28 +11654,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } - if (env->cur_state->active_rcu_locks) { - if (fn->might_sleep) { - verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n", - func_id_name(func_id), func_id); - return -EINVAL; - } - } - - if (env->cur_state->active_preempt_locks) { - if (fn->might_sleep) { - verbose(env, "sleepable helper %s#%d in non-preemptible region\n", - func_id_name(func_id), func_id); - return -EINVAL; - } - } - - if (env->cur_state->active_irq_id) { - if (fn->might_sleep) { - verbose(env, "sleepable helper %s#%d in IRQ-disabled region\n", - func_id_name(func_id), func_id); - return -EINVAL; - } + if (fn->might_sleep && !in_sleepable_context(env)) { + verbose(env, "sleepable helper %s#%d in %s\n", + func_id_name(func_id), func_id, + non_sleepable_context_description(env)); + return -EINVAL; } /* Track non-sleepable context for helpers. */ -- 2.47.3 check_kfunc_call() has multiple scattered checks that reject sleepable kfuncs in various non-sleepable contexts (RCU, preempt-disabled, IRQ- disabled). These are the same conditions already checked by in_sleepable_context(), so replace them with a single consolidated check. This also simplifies the preempt lock tracking by flattening the nested if/else structure into a linear chain: preempt_disable increments, preempt_enable checks for underflow and decrements, and the sleepable check uses in_sleepable_context() which covers all non-sleepable contexts uniformly. No functional change since in_sleepable_context() checks all the same state (active_rcu_locks, active_preempt_locks, active_locks, active_irq_id, in_sleepable). Signed-off-by: Puranjay Mohan --- kernel/bpf/verifier.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ddfb88b8ddd2..d54a04871c44 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14185,34 +14185,22 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } })); } - } else if (sleepable && env->cur_state->active_rcu_locks) { - verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); - return -EACCES; - } - - if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { - verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); - return -EACCES; - } - - if (env->cur_state->active_preempt_locks) { - if (preempt_disable) { - env->cur_state->active_preempt_locks++; - } else if (preempt_enable) { - env->cur_state->active_preempt_locks--; - } else if (sleepable) { - verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name); - return -EACCES; - } } else if (preempt_disable) { env->cur_state->active_preempt_locks++; } else if (preempt_enable) { - verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name); - return -EINVAL; + if (env->cur_state->active_preempt_locks == 0) { + verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name); + return -EINVAL; + } + env->cur_state->active_preempt_locks--; + } else if (sleepable && !in_sleepable_context(env)) { + verbose(env, "kernel func %s is sleepable within %s\n", + func_name, non_sleepable_context_description(env)); + return -EACCES; } - if (env->cur_state->active_irq_id && sleepable) { - verbose(env, "kernel func %s is sleepable within IRQ-disabled region\n", func_name); + if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { + verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); return -EACCES; } -- 2.47.3 With KF_ACQUIRE support for iterators in place, we need a way to tell the verifier that holding a particular acquired reference forbids sleeping. For example, task_vma's _next holds mmap_lock, so sleeping between _next and _release must be rejected. Add a KF_FORBID_SLEEP flag (1 << 17) that can be combined with KF_ACQUIRE. When acquire_reference() is called for such a kfunc, the reference is tagged with forbid_sleep=true and a per-state forbid_sleep_count counter is incremented. When the reference is released through release_reference_nomark(), the counter is decremented in the same loop that already scans the refs array. The counter is checked wherever the verifier decides if sleeping is allowed. This is generic and works for both iterator and non-iterator kfuncs. For iterators, the auto-release and explicit _release from the previous commit handle the counter decrement automatically via release_reference(). Signed-off-by: Puranjay Mohan --- include/linux/bpf_verifier.h | 2 ++ include/linux/btf.h | 1 + kernel/bpf/verifier.c | 41 +++++++++++++++++++++++++++--------- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c1e30096ea7b..39904401df3d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -260,6 +260,7 @@ struct bpf_reference_state { * it matches on unlock. */ void *ptr; + bool forbid_sleep; /* ref prevents sleeping while held */ }; struct bpf_retval_range { @@ -420,6 +421,7 @@ struct bpf_verifier_state { u32 active_lock_id; void *active_lock_ptr; u32 active_rcu_locks; + u32 forbid_sleep_count; bool speculative; bool in_sleepable; diff --git a/include/linux/btf.h b/include/linux/btf.h index 48108471c5b1..c326c5ba49cb 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -79,6 +79,7 @@ #define KF_ARENA_ARG1 (1 << 14) /* kfunc takes an arena pointer as its first argument */ #define KF_ARENA_ARG2 (1 << 15) /* kfunc takes an arena pointer as its second argument */ #define KF_IMPLICIT_ARGS (1 << 16) /* kfunc has implicit arguments supplied by the verifier */ +#define KF_FORBID_SLEEP (1 << 17) /* acquired reference forbids sleeping while held */ /* * Tag marking a kernel function as a kfunc. This is meant to minimize the diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d54a04871c44..3aa638b1163c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -202,7 +202,7 @@ struct bpf_verifier_stack_elem { #define BPF_PRIV_STACK_MIN_SIZE 64 -static int acquire_reference(struct bpf_verifier_env *env, int insn_idx); +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, bool forbid_sleep); static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); @@ -210,6 +210,7 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); static bool is_trusted_reg(const struct bpf_reg_state *reg); +static inline bool in_sleepable_context(struct bpf_verifier_env *env); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { @@ -805,7 +806,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (clone_ref_obj_id) id = clone_ref_obj_id; else - id = acquire_reference(env, insn_idx); + id = acquire_reference(env, insn_idx, false); if (id < 0) return id; @@ -1053,7 +1054,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, if (spi < 0) return spi; - id = acquire_reference(env, insn_idx); + id = acquire_reference(env, insn_idx, false); if (id < 0) return id; @@ -1474,6 +1475,7 @@ static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf dst->active_irq_id = src->active_irq_id; dst->active_lock_id = src->active_lock_id; dst->active_lock_ptr = src->active_lock_ptr; + dst->forbid_sleep_count = src->forbid_sleep_count; return 0; } @@ -1547,7 +1549,7 @@ static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_e return &state->refs[new_ofs]; } -static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, bool forbid_sleep) { struct bpf_reference_state *s; @@ -1556,6 +1558,9 @@ static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) return -ENOMEM; s->type = REF_TYPE_PTR; s->id = ++env->id_gen; + s->forbid_sleep = forbid_sleep; + if (forbid_sleep) + env->cur_state->forbid_sleep_count++; return s->id; } @@ -1598,6 +1603,9 @@ static void release_reference_state(struct bpf_verifier_state *state, int idx) int last_idx; size_t rem; + if (state->refs[idx].forbid_sleep) + state->forbid_sleep_count--; + /* IRQ state requires the relative ordering of elements remaining the * same, since it relies on the refs array to behave as a stack, so that * it can detect out-of-order IRQ restore. Hence use memmove to shift @@ -10852,9 +10860,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } - if (env->subprog_info[subprog].might_sleep && - (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks || - env->cur_state->active_irq_id || !in_sleepable(env))) { + if (env->subprog_info[subprog].might_sleep && !in_sleepable_context(env)) { verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" "a non-sleepable BPF program context\n"); @@ -11578,6 +11584,7 @@ static inline bool in_sleepable_context(struct bpf_verifier_env *env) !env->cur_state->active_preempt_locks && !env->cur_state->active_locks && !env->cur_state->active_irq_id && + !env->cur_state->forbid_sleep_count && in_sleepable(env); } @@ -11591,6 +11598,8 @@ static const char *non_sleepable_context_description(struct bpf_verifier_env *en return "IRQ-disabled region"; if (env->cur_state->active_locks) return "lock region"; + if (env->cur_state->forbid_sleep_count) + return "nosleep region"; return "non-sleepable context"; } @@ -12042,7 +12051,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; } else if (is_acquire_function(func_id, meta.map.ptr)) { - int id = acquire_reference(env, insn_idx); + int id = acquire_reference(env, insn_idx, false); if (id < 0) return id; @@ -12147,6 +12156,11 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RELEASE; } +static bool is_kfunc_forbid_sleep(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_FORBID_SLEEP; +} + static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_SLEEPABLE; @@ -14395,6 +14409,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); if (is_kfunc_acquire(&meta)) { + bool forbid_sleep = is_kfunc_forbid_sleep(&meta); int id; /* @@ -14409,7 +14424,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } - id = acquire_reference(env, insn_idx); + id = acquire_reference(env, insn_idx, forbid_sleep); if (id < 0) return id; @@ -20053,6 +20068,9 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c if (old->active_rcu_locks != cur->active_rcu_locks) return false; + if (old->forbid_sleep_count != cur->forbid_sleep_count) + return false; + if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) return false; @@ -20066,6 +20084,9 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c return false; switch (old->refs[i].type) { case REF_TYPE_PTR: + if (old->refs[i].forbid_sleep != cur->refs[i].forbid_sleep) + return false; + break; case REF_TYPE_IRQ: break; case REF_TYPE_LOCK: @@ -24589,7 +24610,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { for (i = 0; i < aux->ctx_arg_info_size; i++) aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ? - acquire_reference(env, 0) : 0; + acquire_reference(env, 0, false) : 0; } ret = do_check(env); -- 2.47.3 The current implementation of task_vma iterator takes the mmap_lock in the _new() function and holds it for the entire duration of the iterator. The next commits will allow releasing the lock in the middle of the iteration and it would mean that the _next() call should re-take the mmap_lock. Move the mmap_lock setup to bpf_iter_task_vma_next() Signed-off-by: Puranjay Mohan --- kernel/bpf/task_iter.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 98d9b4c0daff..a85115c191e4 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -799,6 +799,8 @@ struct bpf_iter_task_vma_kern_data { struct mm_struct *mm; struct mmap_unlock_irq_work *work; struct vma_iterator vmi; + u64 last_addr; + bool locked; }; struct bpf_iter_task_vma { @@ -819,7 +821,6 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, struct task_struct *task, u64 addr) { struct bpf_iter_task_vma_kern *kit = (void *)it; - bool irq_work_busy = false; int err; BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma)); @@ -840,14 +841,8 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, goto err_cleanup_iter; } - /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */ - irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work); - if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) { - err = -EBUSY; - goto err_cleanup_iter; - } - - vma_iter_init(&kit->data->vmi, kit->data->mm, addr); + kit->data->locked = false; + kit->data->last_addr = addr; return 0; err_cleanup_iter: @@ -862,10 +857,26 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, __bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) { struct bpf_iter_task_vma_kern *kit = (void *)it; + struct vm_area_struct *vma; if (!kit->data) /* bpf_iter_task_vma_new failed */ return NULL; - return vma_next(&kit->data->vmi); + + if (!kit->data->locked) { + bool irq_work_busy; + + irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work); + if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) + return NULL; + + kit->data->locked = true; + vma_iter_init(&kit->data->vmi, kit->data->mm, kit->data->last_addr); + } + + vma = vma_next(&kit->data->vmi); + if (vma) + kit->data->last_addr = vma->vm_end; + return vma; } __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) @@ -873,7 +884,8 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) struct bpf_iter_task_vma_kern *kit = (void *)it; if (kit->data) { - bpf_mmap_unlock_mm(kit->data->work, kit->data->mm); + if (kit->data->locked) + bpf_mmap_unlock_mm(kit->data->work, kit->data->mm); put_task_struct(kit->data->task); bpf_mem_free(&bpf_global_ma, kit->data); } -- 2.47.3 Wire up the task_vma iterator to use the KF_ACQUIRE/KF_RELEASE and KF_FORBID_SLEEP infrastructure from the previous commits. Annotate bpf_iter_task_vma_next with KF_ACQUIRE | KF_FORBID_SLEEP so each returned VMA pointer becomes a tracked reference that prevents sleeping. Add bpf_iter_task_vma_release(it__iter) with KF_RELEASE that releases the mmap read lock, drops the acquired reference, and re-enables sleeping. The next call to _next will re-acquire the lock via mmap_read_trylock(). This enables a split iteration pattern: bpf_for_each(task_vma, vma, task, 0) { u64 start = vma->vm_start; /* vma access allowed, sleeping forbidden */ bpf_iter_task_vma_release(&___it); /* vma access forbidden, sleeping allowed */ bpf_copy_from_user(&buf, sizeof(buf), (void *)start); } Signed-off-by: Puranjay Mohan --- kernel/bpf/helpers.c | 3 ++- kernel/bpf/task_iter.c | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6eb6c82ed2ee..2663280c92c6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4605,8 +4605,9 @@ BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) -BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL | KF_ACQUIRE | KF_FORBID_SLEEP) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_task_vma_release, KF_RELEASE) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index a85115c191e4..55b264d634db 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -879,6 +879,16 @@ __bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_v return vma; } +__bpf_kfunc void bpf_iter_task_vma_release(struct bpf_iter_task_vma *it__iter) +{ + struct bpf_iter_task_vma_kern *kit = (void *)it__iter; + + if (kit->data && kit->data->locked) { + bpf_mmap_unlock_mm(kit->data->work, kit->data->mm); + kit->data->locked = false; + } +} + __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) { struct bpf_iter_task_vma_kern *kit = (void *)it; -- 2.47.3 Add runtime and verifier tests for the split task_vma iteration model. The runtime tests are: - iter_task_vma_release_and_copy: iterates VMAs, saves vm_start, releases mmap_lock via bpf_iter_task_vma_release(), then calls bpf_copy_from_user() on the saved address. - iter_task_vma_nested: two nested task_vma iterators on the same task verify that mmap_read_trylock() succeeds for the second reader on the same mm. The verifier tests cover: - nosleep_iter_release_then_sleep: release then sleep is accepted - nosleep_iter_sleep_without_release: sleep without release is rejected with "in nosleep region" - nosleep_iter_vma_access_after_release: VMA access after release is rejected with "invalid mem access 'scalar'" - nosleep_iter_double_release: double release is rejected - nosleep_iter_release_without_acquire: release without prior _next is rejected - nosleep_iter_nested_release_inner: releasing inner iterator does not allow sleeping when outer still holds a nosleep reference Signed-off-by: Puranjay Mohan --- .../testing/selftests/bpf/bpf_experimental.h | 1 + .../testing/selftests/bpf/prog_tests/iters.c | 24 ++++ .../selftests/bpf/progs/iters_task_vma.c | 71 ++++++++++ .../bpf/progs/iters_task_vma_nosleep.c | 125 ++++++++++++++++++ 4 files changed, 221 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/iters_task_vma_nosleep.c diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 9df77e59d4f5..531d6c6aab45 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -165,6 +165,7 @@ extern int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, struct task_struct *task, __u64 addr) __ksym; extern struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) __ksym; +extern void bpf_iter_task_vma_release(struct bpf_iter_task_vma *it) __ksym; extern void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __ksym; /* Convenience macro to wrap over bpf_obj_drop_impl */ diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index a539980a2fbe..ab24871762c8 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -21,6 +21,7 @@ #include "iters_css_task.skel.h" #include "iters_css.skel.h" #include "iters_task_failure.skel.h" +#include "iters_task_vma_nosleep.skel.h" static void subtest_num_iters(void) { @@ -152,6 +153,28 @@ static void subtest_task_vma_iters(void) if (!ASSERT_EQ(skel->bss->vmas_seen, seen, "vmas_seen_eq")) goto cleanup; + /* Test release+sleepable: trigger the release_and_copy program */ + skel->bss->release_vmas_seen = 0; + err = iters_task_vma__attach(skel); + if (!ASSERT_OK(err, "skel_reattach")) + goto cleanup; + + getpgid(skel->bss->target_pid); + iters_task_vma__detach(skel); + + ASSERT_GT(skel->bss->release_vmas_seen, 0, "release_vmas_seen_gt_zero"); + + /* Test nested iterators on same task (same mmap_lock) */ + skel->bss->nested_vmas_seen = 0; + err = iters_task_vma__attach(skel); + if (!ASSERT_OK(err, "skel_reattach_nested")) + goto cleanup; + + getpgid(skel->bss->target_pid); + iters_task_vma__detach(skel); + + ASSERT_GT(skel->bss->nested_vmas_seen, 0, "nested_vmas_seen_gt_zero"); + cleanup: if (f) fclose(f); @@ -322,4 +345,5 @@ void test_iters(void) if (test__start_subtest("css")) subtest_css_iters(); RUN_TESTS(iters_task_failure); + RUN_TESTS(iters_task_vma_nosleep); } diff --git a/tools/testing/selftests/bpf/progs/iters_task_vma.c b/tools/testing/selftests/bpf/progs/iters_task_vma.c index dc0c3691dcc2..baecd79d2998 100644 --- a/tools/testing/selftests/bpf/progs/iters_task_vma.c +++ b/tools/testing/selftests/bpf/progs/iters_task_vma.c @@ -40,4 +40,75 @@ int iter_task_vma_for_each(const void *ctx) return 0; } +unsigned int release_vmas_seen = 0; + +SEC("fentry.s/" SYS_PREFIX "sys_getpgid") +int iter_task_vma_release_and_copy(const void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct vm_area_struct *vma; + unsigned int seen = 0; + + if (task->pid != target_pid) + return 0; + + if (release_vmas_seen) + return 0; + + bpf_for_each(task_vma, vma, task, 0) { + __u64 start; + char buf[8]; + + if (bpf_cmp_unlikely(seen, >=, 1000)) + break; + + /* Phase 1: mmap_lock held, read VMA data */ + start = vma->vm_start; + + /* Transition: release mmap_lock */ + bpf_iter_task_vma_release(&___it); + /* VMA pointer is now invalid; sleepable helpers allowed */ + + /* Phase 2: mmap_lock released, sleepable call */ + bpf_copy_from_user(&buf, sizeof(buf), (void *)start); + + seen++; + } + + release_vmas_seen = seen; + return 0; +} + +/* + * Test nested task_vma iterators on the same task. Both iterators take + * mmap_read_trylock() on the same mm; the rwsem should allow the second + * reader and the inner loop should observe at least one VMA. + */ +unsigned int nested_vmas_seen = 0; + +SEC("fentry.s/" SYS_PREFIX "sys_getpgid") +int iter_task_vma_nested(const void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct vm_area_struct *vma1, *vma2; + unsigned int seen = 0; + + if (task->pid != target_pid) + return 0; + + if (nested_vmas_seen) + return 0; + + bpf_for_each(task_vma, vma1, task, 0) { + bpf_for_each(task_vma, vma2, task, 0) { + seen++; + break; + } + break; + } + + nested_vmas_seen = seen; + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/iters_task_vma_nosleep.c b/tools/testing/selftests/bpf/progs/iters_task_vma_nosleep.c new file mode 100644 index 000000000000..ab607e29b36a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/iters_task_vma_nosleep.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include "bpf_experimental.h" +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +/* Negative test: sleepable call without release should be rejected */ +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("sleepable helper bpf_copy_from_user#148 in nosleep region") +int nosleep_iter_sleep_without_release(const void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct vm_area_struct *vma; + + bpf_for_each(task_vma, vma, task, 0) { + char buf[8]; + + /* Attempt to call sleepable helper without releasing mmap_lock. + * Verifier should reject this. + */ + bpf_copy_from_user(&buf, sizeof(buf), (void *)vma->vm_start); + break; + } + return 0; +} + +/* Negative test: VMA access after release should be rejected */ +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("invalid mem access 'scalar'") +int nosleep_iter_vma_access_after_release(const void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct vm_area_struct *vma; + __u64 val = 0; + + bpf_for_each(task_vma, vma, task, 0) { + bpf_iter_task_vma_release(&___it); + /* VMA pointer is now invalid. Accessing it should be rejected. */ + val = vma->vm_start; + break; + } + __sink(val); + return 0; +} + +/* Positive test: release then sleepable call should succeed */ +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__success +int nosleep_iter_release_then_sleep(const void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct vm_area_struct *vma; + + bpf_for_each(task_vma, vma, task, 0) { + __u64 start = vma->vm_start; + char buf[8]; + + bpf_iter_task_vma_release(&___it); + bpf_copy_from_user(&buf, sizeof(buf), (void *)start); + break; + } + return 0; +} + +/* Negative test: double release should be rejected */ +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("no acquired reference to release") +int nosleep_iter_double_release(const void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct vm_area_struct *vma; + + bpf_for_each(task_vma, vma, task, 0) { + bpf_iter_task_vma_release(&___it); + bpf_iter_task_vma_release(&___it); + break; + } + return 0; +} + +/* Negative test: release without any prior _next acquire */ +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("no acquired reference to release") +int nosleep_iter_release_without_acquire(const void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct bpf_iter_task_vma it; + + bpf_iter_task_vma_new(&it, task, 0); + /* No _next called, so no acquired reference exists */ + bpf_iter_task_vma_release(&it); + bpf_iter_task_vma_destroy(&it); + return 0; +} + +/* Negative test: nested iterators, releasing inner should not allow sleeping + * because the outer iterator still holds a nosleep reference. + */ +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("sleepable helper bpf_copy_from_user#148 in nosleep region") +int nosleep_iter_nested_release_inner(const void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct vm_area_struct *vma_outer, *vma_inner; + + bpf_for_each(task_vma, vma_outer, task, 0) { + bpf_for_each(task_vma, vma_inner, task, 0) { + __u64 start = vma_inner->vm_start; + char buf[8]; + + bpf_iter_task_vma_release(&___it); + /* Inner released, but outer still holds nosleep ref. + * Sleeping should still be forbidden. + */ + bpf_copy_from_user(&buf, sizeof(buf), (void *)start); + break; + } + break; + } + return 0; +} -- 2.47.3