From: Alexei Starovoitov zap_vma_range() requires the owning mm's mmap_lock to be held. Taking mmap_read_lock under arena->lock would AB-BA against arena_vm_close() and arena_map_mmap(), both of which run with mmap_write_lock held and then acquire arena->lock. Instead drop arena->lock, mmget_not_zero() the vma's mm, take mmap_read_lock, and re-resolve the vma via find_vma() since it may have been unmapped or replaced while waiting. Track processed vmls with a per-call generation in vml->zap_gen and serialize zap_pages() callers with a new arena->zap_mutex so concurrent callers on different uaddr ranges do not mark each other's vmls processed before the zap is done. Reported-by: David Hildenbrand Fixes: 317460317a02 ("bpf: Introduce bpf_arena.") Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 61 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 1727503b25d8..9b2dea229b38 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -60,6 +60,8 @@ struct bpf_arena { struct list_head vma_list; /* protects vma_list */ struct mutex lock; + u64 zap_gen; + struct mutex zap_mutex; struct irq_work free_irq; struct work_struct free_work; struct llist_head free_spans; @@ -289,6 +291,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) if (err) goto err_free_scratch; mutex_init(&arena->lock); + mutex_init(&arena->zap_mutex); raw_res_spin_lock_init(&arena->spinlock); err = populate_pgtable_except_pte(arena); if (err) @@ -391,6 +394,7 @@ struct vma_list { struct vm_area_struct *vma; struct list_head head; refcount_t mmap_count; + u64 zap_gen; }; static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) @@ -403,6 +407,7 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) refcount_set(&vml->mmap_count, 1); vma->vm_private_data = vml; vml->vma = vma; + vml->zap_gen = 0; list_add(&vml->head, &arena->vma_list); return 0; } @@ -746,12 +751,60 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt */ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) { + unsigned long size = (unsigned long)page_cnt << PAGE_SHIFT; + struct vm_area_struct *vma; + struct mm_struct *mm; struct vma_list *vml; + unsigned long vm_start; + u64 my_gen; - guard(mutex)(&arena->lock); - /* iterate link list under lock */ - list_for_each_entry(vml, &arena->vma_list, head) - zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt); + /* + * Taking mmap_read_lock() under arena->lock would deadlock against + * arena_vm_close(), which runs with mmap_write_lock held and then + * acquires arena->lock. Drop arena->lock for mmap_read_lock(). + * + * Use per-call my_gen, recorded in vml->zap_gen, to remember which + * vmls this invocation has already processed across the lock drop. + * Hold zap_mutex around the whole walk so concurrent zap_pages() + * callers cannot overwrite each other's marks on shared vmls -- + * otherwise call B's mark would make call A skip a vml that A has + * not yet zapped for A's uaddr range. + */ + mutex_lock(&arena->zap_mutex); + mutex_lock(&arena->lock); + my_gen = ++arena->zap_gen; + for (;;) { + mm = NULL; + list_for_each_entry(vml, &arena->vma_list, head) { + if (vml->zap_gen >= my_gen) + continue; + vml->zap_gen = my_gen; + if (!mmget_not_zero(vml->vma->vm_mm)) + continue; + mm = vml->vma->vm_mm; + vm_start = vml->vma->vm_start; + break; + } + if (!mm) + break; + mutex_unlock(&arena->lock); + + mmap_read_lock(mm); + /* + * Re-resolve: while we waited the VMA could have been unmapped + * and a different mapping installed at the same address. + */ + vma = find_vma(mm, vm_start); + if (vma && vma->vm_start == vm_start && + vma->vm_file && vma->vm_file->private_data == &arena->map) + zap_vma_range(vma, uaddr, size); + mmap_read_unlock(mm); + mmput(mm); + + mutex_lock(&arena->lock); + } + mutex_unlock(&arena->lock); + mutex_unlock(&arena->zap_mutex); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) -- 2.53.0-Meta