To prepare for changing bpf_local_storage_map_bucket::lock to rqspinlock,
convert bpf_selem_unlink_map() to be failable. It still always succeeds
and returns 0 for now.

Since some operations that update local storage must not fail partway
through, open code bpf_selem_unlink_map() to take b->lock before the
operation starts. There are three such locations:

- bpf_local_storage_alloc()

  The first selem is unlinked from smap if the cmpxchg of
  owner_storage_ptr fails, and that unlink itself must not fail.
  Therefore, hold b->lock from linking until the allocation completes.
  Helpers that assume b->lock is held by the caller are introduced:
  bpf_selem_link_map_nolock() and bpf_selem_unlink_map_nolock().

- bpf_local_storage_update()

  The three-step update process, link_map(new_selem),
  link_storage(new_selem), and unlink_map(old_selem), must not fail in
  the middle. Hence, take both b->locks before the update process
  starts. While locking two different buckets selected by the hash
  function introduces a varying locking order, this cannot cause an
  ABBA deadlock since it is always done under local_storage->lock (see
  the first sketch after the patch).

- bpf_selem_unlink()

  bpf_selem_unlink_map() and bpf_selem_unlink_storage() should either
  all succeed or all fail as a whole instead of failing in the middle.
  As the first step, open code bpf_selem_unlink_map(). A later patch
  will open code bpf_selem_unlink_storage(). Then, unlink_map and
  unlink_storage will be done only after both local_storage->lock and
  b->lock have been acquired successfully.

One caller of bpf_selem_unlink_map() cannot run recursively (e.g., it
cannot be re-entered through helpers called by tracing bpf programs)
and therefore cannot deadlock. Assert that this call cannot fail
instead of handling the error (the second sketch after the patch
illustrates the convention):

- bpf_local_storage_destroy()

  Called by the owner (e.g., task_struct, sk, ...). Will not recur and
  cause an AA deadlock.

Signed-off-by: Amery Hung
---
 kernel/bpf/bpf_local_storage.c | 75 ++++++++++++++++++++++++++++------
 1 file changed, 62 insertions(+), 13 deletions(-)

diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b931fbceb54d..7e39b88ef795 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -409,7 +409,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
 	hlist_add_head_rcu(&selem->snode, &local_storage->list);
 }
 
-static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
 {
 	struct bpf_local_storage_map *smap;
 	struct bpf_local_storage_map_bucket *b;
@@ -417,7 +417,7 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
 
 	if (unlikely(!selem_linked_to_map_lockless(selem)))
 		/* selem has already be unlinked from smap */
-		return;
+		return 0;
 
 	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
 	b = select_bucket(smap, selem);
@@ -425,6 +425,14 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
 	if (likely(selem_linked_to_map(selem)))
 		hlist_del_init_rcu(&selem->map_node);
 	raw_spin_unlock_irqrestore(&b->lock, flags);
+
+	return 0;
+}
+
+static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem)
+{
+	if (likely(selem_linked_to_map(selem)))
+		hlist_del_init_rcu(&selem->map_node);
 }
 
 void bpf_selem_link_map(struct bpf_local_storage_map *smap,
@@ -439,13 +447,33 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 	raw_spin_unlock_irqrestore(&b->lock, flags);
 }
 
+static void bpf_selem_link_map_nolock(struct bpf_local_storage_map *smap,
+				      struct bpf_local_storage_elem *selem,
+				      struct bpf_local_storage_map_bucket *b)
+{
+	RCU_INIT_POINTER(SDATA(selem)->smap, smap);
+	hlist_add_head_rcu(&selem->map_node, &b->list);
+}
+
 void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
 {
-	/* Always unlink from map before unlinking from local_storage
-	 * because selem will be freed after successfully unlinked from
-	 * the local_storage.
-	 */
-	bpf_selem_unlink_map(selem);
+	struct bpf_local_storage_map_bucket *b;
+	struct bpf_local_storage_map *smap;
+	unsigned long flags;
+
+	if (likely(selem_linked_to_map_lockless(selem))) {
+		smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+		b = select_bucket(smap, selem);
+		raw_spin_lock_irqsave(&b->lock, flags);
+
+		/* Always unlink from map before unlinking from local_storage
+		 * because selem will be freed after successfully unlinked from
+		 * the local_storage.
+		 */
+		bpf_selem_unlink_map_nolock(selem);
+		raw_spin_unlock_irqrestore(&b->lock, flags);
+	}
+
 	bpf_selem_unlink_storage(selem, reuse_now);
 }
 
@@ -487,6 +515,8 @@ int bpf_local_storage_alloc(void *owner,
 {
 	struct bpf_local_storage *prev_storage, *storage;
 	struct bpf_local_storage **owner_storage_ptr;
+	struct bpf_local_storage_map_bucket *b;
+	unsigned long flags;
 	int err;
 
 	err = mem_charge(smap, owner, sizeof(*storage));
@@ -509,7 +539,10 @@ int bpf_local_storage_alloc(void *owner,
 
 	storage->owner = owner;
 	bpf_selem_link_storage_nolock(storage, first_selem);
-	bpf_selem_link_map(smap, first_selem);
+
+	b = select_bucket(smap, first_selem);
+	raw_spin_lock_irqsave(&b->lock, flags);
+	bpf_selem_link_map_nolock(smap, first_selem, b);
 
 	owner_storage_ptr =
 		(struct bpf_local_storage **)owner_storage(smap, owner);
@@ -525,7 +558,8 @@
 	 */
 	prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
 	if (unlikely(prev_storage)) {
-		bpf_selem_unlink_map(first_selem);
+		bpf_selem_unlink_map_nolock(first_selem);
+		raw_spin_unlock_irqrestore(&b->lock, flags);
 		err = -EAGAIN;
 		goto uncharge;
 
@@ -539,6 +573,7 @@ int bpf_local_storage_alloc(void *owner,
 		 * bucket->list under rcu_read_lock().
 		 */
 	}
+	raw_spin_unlock_irqrestore(&b->lock, flags);
 
 	return 0;
 
@@ -560,8 +595,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	struct bpf_local_storage_data *old_sdata = NULL;
 	struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
 	struct bpf_local_storage *local_storage;
+	struct bpf_local_storage_map_bucket *b, *old_b;
 	HLIST_HEAD(old_selem_free_list);
-	unsigned long flags;
+	unsigned long flags, b_flags, old_b_flags;
 	int err;
 
 	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
@@ -645,20 +681,31 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		goto unlock;
 	}
 
+	b = select_bucket(smap, selem);
+	old_b = old_sdata ? select_bucket(smap, SELEM(old_sdata)) : b;
+
+	raw_spin_lock_irqsave(&b->lock, b_flags);
+	if (b != old_b)
+		raw_spin_lock_irqsave(&old_b->lock, old_b_flags);
+
 	alloc_selem = NULL;
 	/* First, link the new selem to the map */
-	bpf_selem_link_map(smap, selem);
+	bpf_selem_link_map_nolock(smap, selem, b);
 
 	/* Second, link (and publish) the new selem to local_storage */
 	bpf_selem_link_storage_nolock(local_storage, selem);
 
 	/* Third, remove old selem, SELEM(old_sdata) */
 	if (old_sdata) {
-		bpf_selem_unlink_map(SELEM(old_sdata));
+		bpf_selem_unlink_map_nolock(SELEM(old_sdata));
 		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
 						true, &old_selem_free_list);
 	}
 
+	if (b != old_b)
+		raw_spin_unlock_irqrestore(&old_b->lock, old_b_flags);
+	raw_spin_unlock_irqrestore(&b->lock, b_flags);
+
 unlock:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 	bpf_selem_free_list(&old_selem_free_list, false);
@@ -736,6 +783,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 	HLIST_HEAD(free_selem_list);
 	struct hlist_node *n;
 	unsigned long flags;
+	int err;
 
 	storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
 	bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
@@ -754,7 +802,8 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 		/* Always unlink from map before unlinking from
 		 * local_storage.
 		 */
-		bpf_selem_unlink_map(selem);
+		err = bpf_selem_unlink_map(selem);
+		WARN_ON(err);
 		/* If local_storage list has only one element, the
 		 * bpf_selem_unlink_storage_nolock() will return true.
 		 * Otherwise, it will return false. The current loop iteration
-- 
2.47.3
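
For reference, a minimal userspace sketch of the two-bucket locking
pattern in bpf_local_storage_update() above, for readers who want to
poke at the ordering argument. pthread mutexes stand in for the
kernel's raw spinlocks, and every name here is a simplified,
hypothetical model rather than kernel code:

#include <pthread.h>

struct bucket {
	pthread_mutex_t lock;	/* models bpf_local_storage_map_bucket::lock */
};

struct storage {
	pthread_mutex_t lock;	/* models local_storage->lock */
};

/* Models the update path: lock the new selem's bucket, then the old
 * selem's bucket if it hashed elsewhere, do the three-step update, and
 * unlock in reverse order. Which bucket is locked first can differ
 * between calls, but every caller already holds st->lock, so at most
 * one thread is ever inside the two-bucket window and the varying
 * order cannot produce an ABBA deadlock.
 */
static void update(struct storage *st, struct bucket *b, struct bucket *old_b)
{
	pthread_mutex_lock(&st->lock);	/* serializes all updaters */

	pthread_mutex_lock(&b->lock);
	if (old_b != b)
		pthread_mutex_lock(&old_b->lock);

	/* ... link new selem, publish it, unlink old selem ... */

	if (old_b != b)
		pthread_mutex_unlock(&old_b->lock);
	pthread_mutex_unlock(&b->lock);

	pthread_mutex_unlock(&st->lock);
}

int main(void)
{
	struct storage st = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct bucket b1 = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct bucket b2 = { .lock = PTHREAD_MUTEX_INITIALIZER };

	update(&st, &b1, &b2);	/* old and new selem in different buckets */
	update(&st, &b2, &b2);	/* both in the same bucket */
	return 0;
}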
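
And a second sketch, of the failable-unlink calling convention this
patch prepares for. It assumes, per the rqspinlock motivation above,
that lock acquisition itself can return an errno-style error;
pthread_mutex_trylock() is only a stand-in for that, and unlink_map()
is a hypothetical simplification of bpf_selem_unlink_map():

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

/* Models the converted bpf_selem_unlink_map(): once taking the bucket
 * lock can fail, the unlink must report that failure to its caller
 * instead of silently proceeding.
 */
static int unlink_map(void)
{
	if (pthread_mutex_trylock(&bucket_lock))
		return -EBUSY;	/* stand-in for a failed rqspinlock acquire */

	/* ... remove the element from the bucket list ... */

	pthread_mutex_unlock(&bucket_lock);
	return 0;
}

int main(void)
{
	int err;

	/* Callers that can recover propagate the error upward ... */
	err = unlink_map();
	if (err)
		fprintf(stderr, "unlink_map: %d\n", err);

	/* ... while callers that cannot run recursively, such as
	 * bpf_local_storage_destroy() in the patch, assert success
	 * instead; the kernel code expresses that as WARN_ON(err).
	 */
	err = unlink_map();
	if (err)
		fprintf(stderr, "unexpected: %d\n", err);	/* ~WARN_ON(err) */
	return 0;
}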