Some iterators hold resources (like mmap_lock in task_vma) that prevent sleeping. To allow BPF programs to release such resources mid-iteration and call sleepable helpers, the verifier needs to track acquire/release semantics on iterator _next pointers. Repurpose the st->id field on STACK_ITER slots to track the ref_obj_id of the pointer returned by _next when the kfunc is annotated with KF_ACQUIRE. This is safe because st->id is initialized to 0 by __mark_reg_known_zero() in mark_stack_slots_iter() and is not compared in stacksafe() for STACK_ITER slots. The lifecycle is: _next (KF_ACQUIRE): - auto-release old ref if st->id != 0 - acquire new ref, store ref_obj_id in st->id - DRAINED branch: release via st->id, set st->id = 0 - ACTIVE branch: keeps ref, st->id tracks it _release (KF_RELEASE + __iter arg): - read st->id, release_reference(), set st->id = 0 _destroy: - release st->id if non-zero before releasing iterator's own ref Signed-off-by: Puranjay Mohan --- kernel/bpf/verifier.c | 67 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0162f946032f..aa48180b6073 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -355,6 +355,11 @@ struct bpf_kfunc_call_arg_meta { u8 spi; u8 frameno; } iter; + /* Set when a kfunc takes an __iter arg. Used by the KF_RELEASE + * path to release the reference tracked on the iterator slot + * (st->id) instead of requiring a refcounted PTR_TO_BTF_ID arg. + */ + bool has_iter_arg; struct bpf_map_desc map; u64 mem_size; }; @@ -1083,6 +1088,22 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, return 0; } +/* Release the acquired reference tracked by iter_st->id, if any. + * Used during auto-release in _next, DRAINED handling, and _destroy. 
+ */ +static int iter_release_acquired_ref(struct bpf_verifier_env *env, + struct bpf_reg_state *iter_st) +{ + int err; + + if (!iter_st->id) + return 0; + err = release_reference(env, iter_st->id); + if (!err) + iter_st->id = 0; + return err; +} + static int unmark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots) { @@ -1097,8 +1118,13 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_stack_state *slot = &state->stack[spi - i]; struct bpf_reg_state *st = &slot->spilled_ptr; - if (i == 0) + if (i == 0) { + /* Release any outstanding acquired ref tracked by + * st->id before releasing the iterator's own ref. + */ + WARN_ON_ONCE(iter_release_acquired_ref(env, st)); WARN_ON_ONCE(release_reference(env, st->ref_obj_id)); + } __mark_reg_not_init(env, st); @@ -8943,6 +8969,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id /* remember meta->iter info for process_iter_next_call() */ meta->iter.spi = spi; meta->iter.frameno = reg->frameno; + meta->has_iter_arg = true; meta->ref_obj_id = iter_ref_obj_id(env, reg, spi); if (is_iter_destroy_kfunc(meta)) { @@ -9178,8 +9205,10 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, /* mark current iter state as drained and assume returned NULL */ cur_iter->iter.state = BPF_ITER_STATE_DRAINED; __mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]); - - return 0; + /* If _next acquired a ref (KF_ACQUIRE), release it in the DRAINED + * branch since NULL was returned. 
+ */ + return iter_release_acquired_ref(env, cur_iter); } static bool arg_type_is_mem_size(enum bpf_arg_type type) @@ -13797,7 +13826,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } - if (is_kfunc_release(meta) && !meta->release_regno) { + if (is_kfunc_release(meta) && !meta->release_regno && !meta->has_iter_arg) { verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", func_name); return -EINVAL; @@ -14205,6 +14234,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (err) return err; + } else if (meta.has_iter_arg && is_kfunc_release(&meta)) { + /* For KF_RELEASE kfuncs taking an __iter arg, release the + * reference tracked by st->id on the iterator slot. + */ + struct bpf_reg_state *iter_st; + + iter_st = get_iter_from_state(env->cur_state, &meta); + if (!iter_st->id) { + verbose(env, "no acquired reference to release\n"); + return -EINVAL; + } + err = release_reference(env, iter_st->id); + if (err) + return err; + iter_st->id = 0; } if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || @@ -14356,6 +14400,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].id = ++env->id_gen; } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); + /* For iterators with KF_ACQUIRE, auto-release the previous + * iteration's ref before acquiring a new one, and after + * acquisition track the new ref on the iter slot. 
+ */ + struct bpf_reg_state *iter_acquire_st = NULL; + + if (is_iter_next_kfunc(&meta) && is_kfunc_acquire(&meta)) { + iter_acquire_st = get_iter_from_state(env->cur_state, &meta); + err = iter_release_acquired_ref(env, iter_acquire_st); + if (err) + return err; + } if (is_kfunc_acquire(&meta)) { int id = acquire_reference(env, insn_idx); @@ -14368,6 +14424,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, ref_set_non_owning(env, ®s[BPF_REG_0]); } + if (iter_acquire_st) + iter_acquire_st->id = regs[BPF_REG_0].ref_obj_id; + if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { -- 2.47.3 With KF_ACQUIRE support for iterators in place, we need a way to tell the verifier that holding a particular acquired reference forbids sleeping. For example, task_vma's _next holds mmap_lock, so sleeping between _next and _release must be rejected. Add a KF_FORBID_SLEEP flag (1 << 17) that can be combined with KF_ACQUIRE. When acquire_reference() is called for such a kfunc, the reference is tagged with forbid_sleep=true and a per-state forbid_sleep_count counter is incremented. When the reference is released through release_reference_nomark(), the counter is decremented in the same loop that already scans the refs array. The counter is checked wherever the verifier decides if sleeping is allowed. This is generic and works for both iterator and non-iterator kfuncs. For iterators, the auto-release and explicit _release from the previous commit handle the counter decrement automatically via release_reference(). 
Signed-off-by: Puranjay Mohan --- include/linux/bpf_verifier.h | 2 ++ include/linux/btf.h | 1 + kernel/bpf/verifier.c | 54 ++++++++++++++++++++++++++++++------ 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c1e30096ea7b..39904401df3d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -260,6 +260,7 @@ struct bpf_reference_state { * it matches on unlock. */ void *ptr; + bool forbid_sleep; /* ref prevents sleeping while held */ }; struct bpf_retval_range { @@ -420,6 +421,7 @@ struct bpf_verifier_state { u32 active_lock_id; void *active_lock_ptr; u32 active_rcu_locks; + u32 forbid_sleep_count; bool speculative; bool in_sleepable; diff --git a/include/linux/btf.h b/include/linux/btf.h index 48108471c5b1..c326c5ba49cb 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -79,6 +79,7 @@ #define KF_ARENA_ARG1 (1 << 14) /* kfunc takes an arena pointer as its first argument */ #define KF_ARENA_ARG2 (1 << 15) /* kfunc takes an arena pointer as its second argument */ #define KF_IMPLICIT_ARGS (1 << 16) /* kfunc has implicit arguments supplied by the verifier */ +#define KF_FORBID_SLEEP (1 << 17) /* acquired reference forbids sleeping while held */ /* * Tag marking a kernel function as a kfunc. 
This is meant to minimize the diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aa48180b6073..fd843dc93616 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -202,7 +202,7 @@ struct bpf_verifier_stack_elem { #define BPF_PRIV_STACK_MIN_SIZE 64 -static int acquire_reference(struct bpf_verifier_env *env, int insn_idx); +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, bool forbid_sleep); static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); @@ -810,7 +810,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (clone_ref_obj_id) id = clone_ref_obj_id; else - id = acquire_reference(env, insn_idx); + id = acquire_reference(env, insn_idx, false); if (id < 0) return id; @@ -1056,7 +1056,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, if (spi < 0) return spi; - id = acquire_reference(env, insn_idx); + id = acquire_reference(env, insn_idx, false); if (id < 0) return id; @@ -1476,6 +1476,7 @@ static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf dst->active_irq_id = src->active_irq_id; dst->active_lock_id = src->active_lock_id; dst->active_lock_ptr = src->active_lock_ptr; + dst->forbid_sleep_count = src->forbid_sleep_count; return 0; } @@ -1549,7 +1550,7 @@ static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_e return &state->refs[new_ofs]; } -static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, bool forbid_sleep) { struct bpf_reference_state *s; @@ -1558,6 +1559,9 @@ static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) return -ENOMEM; s->type = REF_TYPE_PTR; s->id = ++env->id_gen; + s->forbid_sleep = forbid_sleep; + if 
(forbid_sleep) + env->cur_state->forbid_sleep_count++; return s->id; } @@ -10510,6 +10514,8 @@ static int release_reference_nomark(struct bpf_verifier_state *state, int ref_ob if (state->refs[i].type != REF_TYPE_PTR) continue; if (state->refs[i].id == ref_obj_id) { + if (state->refs[i].forbid_sleep) + state->forbid_sleep_count--; release_reference_state(state, i); return 0; } @@ -10847,7 +10853,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (env->subprog_info[subprog].might_sleep && (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks || - env->cur_state->active_irq_id || !in_sleepable(env))) { + env->cur_state->active_irq_id || env->cur_state->forbid_sleep_count || + !in_sleepable(env))) { verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" "a non-sleepable BPF program context\n"); @@ -11435,6 +11442,11 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit return -EINVAL; } + if (check_lock && env->cur_state->forbid_sleep_count) { + verbose(env, "%s cannot be used inside nosleep region\n", prefix); + return -EINVAL; + } + if (check_lock && env->cur_state->active_preempt_locks) { verbose(env, "%s cannot be used inside bpf_preempt_disable-ed region\n", prefix); return -EINVAL; @@ -11571,6 +11583,7 @@ static inline bool in_sleepable_context(struct bpf_verifier_env *env) !env->cur_state->active_preempt_locks && !env->cur_state->active_locks && !env->cur_state->active_irq_id && + !env->cur_state->forbid_sleep_count && in_sleepable(env); } @@ -11658,6 +11671,14 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } + if (env->cur_state->forbid_sleep_count) { + if (fn->might_sleep) { + verbose(env, "sleepable helper %s#%d in nosleep region\n", + func_id_name(func_id), func_id); + return -EINVAL; + } + } + /* Track non-sleepable context for helpers. 
*/ if (!in_sleepable_context(env)) env->insn_aux_data[insn_idx].non_sleepable = true; @@ -12039,7 +12060,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; } else if (is_acquire_function(func_id, meta.map.ptr)) { - int id = acquire_reference(env, insn_idx); + int id = acquire_reference(env, insn_idx, false); if (id < 0) return id; @@ -12144,6 +12165,11 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RELEASE; } +static bool is_kfunc_forbid_sleep(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_FORBID_SLEEP; +} + static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_SLEEPABLE; @@ -14113,6 +14139,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } + if (sleepable && env->cur_state->forbid_sleep_count) { + verbose(env, "sleepable kfunc %s in nosleep region\n", func_name); + return -EACCES; + } + /* Track non-sleepable context for kfuncs, same as for helpers. 
*/ if (!in_sleepable_context(env)) insn_aux->non_sleepable = true; @@ -14413,7 +14444,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } if (is_kfunc_acquire(&meta)) { - int id = acquire_reference(env, insn_idx); + bool forbid_sleep = is_kfunc_forbid_sleep(&meta); + int id = acquire_reference(env, insn_idx, forbid_sleep); if (id < 0) return id; @@ -20054,6 +20086,9 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c if (old->active_rcu_locks != cur->active_rcu_locks) return false; + if (old->forbid_sleep_count != cur->forbid_sleep_count) + return false; + if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) return false; @@ -20067,6 +20102,9 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c return false; switch (old->refs[i].type) { case REF_TYPE_PTR: + if (old->refs[i].forbid_sleep != cur->refs[i].forbid_sleep) + return false; + break; case REF_TYPE_IRQ: break; case REF_TYPE_LOCK: @@ -24584,7 +24622,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { for (i = 0; i < aux->ctx_arg_info_size; i++) aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ? - acquire_reference(env, 0) : 0; + acquire_reference(env, 0, false) : 0; } ret = do_check(env); -- 2.47.3 The current implementation of task_vma iterator takes the mmap_lock in the _new() function and holds it for the entire duration of the iterator. The next commits will allow releasing the lock in the middle of the iteration and it would mean that the _next() call should re-take the mmap_lock. 
Move the mmap_lock setup to bpf_iter_task_vma_next() Signed-off-by: Puranjay Mohan --- kernel/bpf/task_iter.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 98d9b4c0daff..a85115c191e4 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -799,6 +799,8 @@ struct bpf_iter_task_vma_kern_data { struct mm_struct *mm; struct mmap_unlock_irq_work *work; struct vma_iterator vmi; + u64 last_addr; + bool locked; }; struct bpf_iter_task_vma { @@ -819,7 +821,6 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, struct task_struct *task, u64 addr) { struct bpf_iter_task_vma_kern *kit = (void *)it; - bool irq_work_busy = false; int err; BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma)); @@ -840,14 +841,8 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, goto err_cleanup_iter; } - /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */ - irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work); - if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) { - err = -EBUSY; - goto err_cleanup_iter; - } - - vma_iter_init(&kit->data->vmi, kit->data->mm, addr); + kit->data->locked = false; + kit->data->last_addr = addr; return 0; err_cleanup_iter: @@ -862,10 +857,26 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, __bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) { struct bpf_iter_task_vma_kern *kit = (void *)it; + struct vm_area_struct *vma; if (!kit->data) /* bpf_iter_task_vma_new failed */ return NULL; - return vma_next(&kit->data->vmi); + + if (!kit->data->locked) { + bool irq_work_busy; + + irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work); + if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) + return NULL; + + kit->data->locked = true; + vma_iter_init(&kit->data->vmi, kit->data->mm, 
kit->data->last_addr); + } + + vma = vma_next(&kit->data->vmi); + if (vma) + kit->data->last_addr = vma->vm_end; + return vma; } __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) @@ -873,7 +884,8 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) struct bpf_iter_task_vma_kern *kit = (void *)it; if (kit->data) { - bpf_mmap_unlock_mm(kit->data->work, kit->data->mm); + if (kit->data->locked) + bpf_mmap_unlock_mm(kit->data->work, kit->data->mm); put_task_struct(kit->data->task); bpf_mem_free(&bpf_global_ma, kit->data); } -- 2.47.3 Wire up the task_vma iterator to use the KF_ACQUIRE/KF_RELEASE and KF_FORBID_SLEEP infrastructure from the previous commits. Annotate bpf_iter_task_vma_next with KF_ACQUIRE | KF_FORBID_SLEEP so each returned VMA pointer becomes a tracked reference that prevents sleeping. Add bpf_iter_task_vma_release(it__iter) with KF_RELEASE that releases the mmap read lock, drops the acquired reference, and re-enables sleeping. The next call to _next will re-acquire the lock via mmap_read_trylock(). 
This enables a split iteration pattern: bpf_for_each(task_vma, vma, task, 0) { u64 start = vma->vm_start; /* vma access allowed, sleeping forbidden */ bpf_iter_task_vma_release(&___it); /* vma access forbidden, sleeping allowed */ bpf_copy_from_user(&buf, sizeof(buf), (void *)start); } Signed-off-by: Puranjay Mohan --- kernel/bpf/helpers.c | 3 ++- kernel/bpf/task_iter.c | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 7ac32798eb04..b3921c22810e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4605,8 +4605,9 @@ BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) -BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL | KF_ACQUIRE | KF_FORBID_SLEEP) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_task_vma_release, KF_RELEASE) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index a85115c191e4..55b264d634db 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -879,6 +879,16 @@ __bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_v return vma; } +__bpf_kfunc void bpf_iter_task_vma_release(struct bpf_iter_task_vma *it__iter) +{ + struct bpf_iter_task_vma_kern *kit = (void *)it__iter; + + if (kit->data && kit->data->locked) { + bpf_mmap_unlock_mm(kit->data->work, kit->data->mm); + kit->data->locked = false; + } +} + __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) { struct bpf_iter_task_vma_kern *kit = (void *)it; -- 2.47.3 Add runtime and 
verifier tests for the split task_vma iteration model. The runtime test iter_task_vma_release_and_copy iterates VMAs, saves vm_start, releases mmap_lock via bpf_iter_task_vma_release(), then calls bpf_copy_from_user() on the saved address. The verifier tests cover: - nosleep_iter_release_then_sleep: release then sleep is accepted - nosleep_iter_sleep_without_release: sleep without release is rejected with "in nosleep region" - nosleep_iter_vma_access_after_release: VMA access after release is rejected with "invalid mem access 'scalar'" Signed-off-by: Puranjay Mohan --- .../testing/selftests/bpf/bpf_experimental.h | 1 + .../testing/selftests/bpf/prog_tests/iters.c | 13 ++ .../selftests/bpf/progs/iters_task_vma.c | 39 ++++++ .../bpf/progs/iters_task_vma_nosleep.c | 125 ++++++++++++++++++ 4 files changed, 178 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/iters_task_vma_nosleep.c diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 9df77e59d4f5..531d6c6aab45 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -165,6 +165,7 @@ extern int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, struct task_struct *task, __u64 addr) __ksym; extern struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) __ksym; +extern void bpf_iter_task_vma_release(struct bpf_iter_task_vma *it) __ksym; extern void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __ksym; /* Convenience macro to wrap over bpf_obj_drop_impl */ diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index a539980a2fbe..977114f0e88f 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -21,6 +21,7 @@ #include "iters_css_task.skel.h" #include "iters_css.skel.h" #include "iters_task_failure.skel.h" +#include 
"iters_task_vma_nosleep.skel.h" static void subtest_num_iters(void) { @@ -152,6 +153,17 @@ static void subtest_task_vma_iters(void) if (!ASSERT_EQ(skel->bss->vmas_seen, seen, "vmas_seen_eq")) goto cleanup; + /* Test release+sleepable: trigger the release_and_copy program */ + skel->bss->release_vmas_seen = 0; + err = iters_task_vma__attach(skel); + if (!ASSERT_OK(err, "skel_reattach")) + goto cleanup; + + getpgid(skel->bss->target_pid); + iters_task_vma__detach(skel); + + ASSERT_GT(skel->bss->release_vmas_seen, 0, "release_vmas_seen_gt_zero"); + cleanup: if (f) fclose(f); @@ -322,4 +334,5 @@ void test_iters(void) if (test__start_subtest("css")) subtest_css_iters(); RUN_TESTS(iters_task_failure); + RUN_TESTS(iters_task_vma_nosleep); } diff --git a/tools/testing/selftests/bpf/progs/iters_task_vma.c b/tools/testing/selftests/bpf/progs/iters_task_vma.c index dc0c3691dcc2..59065f9da4f8 100644 --- a/tools/testing/selftests/bpf/progs/iters_task_vma.c +++ b/tools/testing/selftests/bpf/progs/iters_task_vma.c @@ -40,4 +40,43 @@ int iter_task_vma_for_each(const void *ctx) return 0; } +unsigned int release_vmas_seen = 0; + +SEC("fentry.s/" SYS_PREFIX "sys_getpgid") +int iter_task_vma_release_and_copy(const void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct vm_area_struct *vma; + unsigned int seen = 0; + + if (task->pid != target_pid) + return 0; + + if (release_vmas_seen) + return 0; + + bpf_for_each(task_vma, vma, task, 0) { + __u64 start; + char buf[8]; + + if (bpf_cmp_unlikely(seen, >=, 1000)) + break; + + /* Phase 1: mmap_lock held, read VMA data */ + start = vma->vm_start; + + /* Transition: release mmap_lock */ + bpf_iter_task_vma_release(&___it); + /* VMA pointer is now invalid; sleepable helpers allowed */ + + /* Phase 2: mmap_lock released, sleepable call */ + bpf_copy_from_user(&buf, sizeof(buf), (void *)start); + + seen++; + } + + release_vmas_seen = seen; + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git 
a/tools/testing/selftests/bpf/progs/iters_task_vma_nosleep.c b/tools/testing/selftests/bpf/progs/iters_task_vma_nosleep.c
new file mode 100644
index 000000000000..ab607e29b36a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/iters_task_vma_nosleep.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include "vmlinux.h"
+#include "bpf_experimental.h"
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* Negative test: sleepable call without release should be rejected */
+SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
+__failure __msg("sleepable helper bpf_copy_from_user#148 in nosleep region")
+int nosleep_iter_sleep_without_release(const void *ctx)
+{
+	struct task_struct *task = bpf_get_current_task_btf();
+	struct vm_area_struct *vma;
+
+	bpf_for_each(task_vma, vma, task, 0) {
+		char buf[8];
+
+		/* Attempt to call sleepable helper without releasing mmap_lock.
+		 * Verifier should reject this.
+		 */
+		bpf_copy_from_user(&buf, sizeof(buf), (void *)vma->vm_start);
+		break;
+	}
+	return 0;
+}
+
+/* Negative test: VMA access after release should be rejected */
+SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
+__failure __msg("invalid mem access 'scalar'")
+int nosleep_iter_vma_access_after_release(const void *ctx)
+{
+	struct task_struct *task = bpf_get_current_task_btf();
+	struct vm_area_struct *vma;
+	__u64 val = 0;
+
+	bpf_for_each(task_vma, vma, task, 0) {
+		bpf_iter_task_vma_release(&___it);
+		/* VMA pointer is now invalid. Accessing it should be rejected.
*/ + val = vma->vm_start; + break; + } + __sink(val); + return 0; +} + +/* Positive test: release then sleepable call should succeed */ +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__success +int nosleep_iter_release_then_sleep(const void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct vm_area_struct *vma; + + bpf_for_each(task_vma, vma, task, 0) { + __u64 start = vma->vm_start; + char buf[8]; + + bpf_iter_task_vma_release(&___it); + bpf_copy_from_user(&buf, sizeof(buf), (void *)start); + break; + } + return 0; +} + +/* Negative test: double release should be rejected */ +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("no acquired reference to release") +int nosleep_iter_double_release(const void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct vm_area_struct *vma; + + bpf_for_each(task_vma, vma, task, 0) { + bpf_iter_task_vma_release(&___it); + bpf_iter_task_vma_release(&___it); + break; + } + return 0; +} + +/* Negative test: release without any prior _next acquire */ +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("no acquired reference to release") +int nosleep_iter_release_without_acquire(const void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct bpf_iter_task_vma it; + + bpf_iter_task_vma_new(&it, task, 0); + /* No _next called, so no acquired reference exists */ + bpf_iter_task_vma_release(&it); + bpf_iter_task_vma_destroy(&it); + return 0; +} + +/* Negative test: nested iterators, releasing inner should not allow sleeping + * because the outer iterator still holds a nosleep reference. 
+ */ +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__failure __msg("sleepable helper bpf_copy_from_user#148 in nosleep region") +int nosleep_iter_nested_release_inner(const void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct vm_area_struct *vma_outer, *vma_inner; + + bpf_for_each(task_vma, vma_outer, task, 0) { + bpf_for_each(task_vma, vma_inner, task, 0) { + __u64 start = vma_inner->vm_start; + char buf[8]; + + bpf_iter_task_vma_release(&___it); + /* Inner released, but outer still holds nosleep ref. + * Sleeping should still be forbidden. + */ + bpf_copy_from_user(&buf, sizeof(buf), (void *)start); + break; + } + break; + } + return 0; +} -- 2.47.3