The open-coded task_vma iterator reads task->mm and acquires
mmap_read_trylock() but never calls mmget(). The mm can reach
mm_users == 0 if the task exits while the iterator holds the lock.

Add mmget_not_zero() before mmap_read_trylock(). Drop the mm reference
via mmput_async() in _destroy() and the error path.

Reject irqs-disabled contexts (including NMI) up front. Operations used
by _next() and _destroy() (mmap_read_unlock, mmput_async) take
spinlocks with IRQs disabled (pool->lock, pi_lock). Running from NMI or
from a tracepoint that fires with those locks held could deadlock.

Widen the mmput_async() #if guard to include CONFIG_BPF_SYSCALL,
following the same approach used for CONFIG_FUTEX_PRIVATE_HASH.

Fixes: 4ac454682158 ("bpf: Introduce task_vma open-coded iterator kfuncs")
Signed-off-by: Puranjay Mohan
---
 include/linux/sched/mm.h |  2 +-
 kernel/bpf/task_iter.c   | 31 ++++++++++++++++++++++++++++---
 kernel/fork.c            |  2 +-
 3 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 95d0040df584..5908de0c2f82 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -140,7 +140,7 @@ static inline bool mmget_not_zero(struct mm_struct *mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); -#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) +#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) || defined(CONFIG_BPF_SYSCALL) /* same as above but performs the slow path from the async context. 
Can * be called from the atomic context as well */ diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 98d9b4c0daff..718f0f9b6396 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "mmap_unlock_work.h" static const char * const iter_task_type_names[] = { @@ -825,6 +826,18 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma)); BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma)); + /* + * Reject irqs-disabled contexts including NMI. Operations used + * by _next() and _destroy() (mmap_read_unlock, mmput_async) + * can take spinlocks with IRQs disabled (pi_lock, pool->lock). + * Running from NMI or from a tracepoint that fires with those + * locks held could deadlock. + */ + if (irqs_disabled()) { + kit->data = NULL; + return -EBUSY; + } + /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized * before, so non-NULL kit->data doesn't point to previously * bpf_mem_alloc'd bpf_iter_task_vma_kern_data @@ -842,17 +855,28 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */ irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work); - if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) { + if (irq_work_busy) { err = -EBUSY; goto err_cleanup_iter; } + if (!mmget_not_zero(kit->data->mm)) { + err = -ENOENT; + goto err_cleanup_iter; + } + + if (!mmap_read_trylock(kit->data->mm)) { + err = -EBUSY; + goto err_cleanup_mmget; + } + vma_iter_init(&kit->data->vmi, kit->data->mm, addr); return 0; +err_cleanup_mmget: + mmput_async(kit->data->mm); err_cleanup_iter: - if (kit->data->task) - put_task_struct(kit->data->task); + put_task_struct(kit->data->task); bpf_mem_free(&bpf_global_ma, kit->data); /* NULL kit->data signals failed 
bpf_iter_task_vma initialization */ kit->data = NULL; @@ -875,6 +899,7 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) if (kit->data) { bpf_mmap_unlock_mm(kit->data->work, kit->data->mm); put_task_struct(kit->data->task); + mmput_async(kit->data->mm); bpf_mem_free(&bpf_global_ma, kit->data); } } diff --git a/kernel/fork.c b/kernel/fork.c index 65113a304518..d0411a63d4ab 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1198,7 +1198,7 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) +#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) || defined(CONFIG_BPF_SYSCALL) static void mmput_async_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, -- 2.52.0