Introduce bpf_selem_unlink_lockless() to properly handle errors returned
from rqspinlock in bpf_local_storage_map_free() and
bpf_local_storage_destroy(), where the operation must succeed.

The idea of bpf_selem_unlink_lockless() is to allow an selem to be
partially linked and to use a refcount to determine when and by whom the
selem can be freed. An selem is initially fully linked to a map and a
local storage, and therefore selem->link_cnt is set to 2. Under normal
circumstances, bpf_selem_unlink_lockless() will be able to grab the locks
and unlink an selem from the map and the local storage in sequence, just
like bpf_selem_unlink(), and then add it to a local to_free list provided
by the caller. However, if any of the lock attempts fails, it will only
clear SDATA(selem)->smap or selem->local_storage depending on the caller
and decrement link_cnt to signal that the corresponding data structure
holding a reference to the selem is gone. Then, only when both the map
and the local storage are gone, the selem can be freed by the last caller
that turns link_cnt to 0.

To make sure bpf_obj_free_fields() is done only once and while the map is
still present, it is called when unlinking an selem from b->list under
b->lock.

To make sure uncharging memory is done only once and while the owner is
still present, only unlink the selem from local_storage->list in
bpf_local_storage_destroy() and return the amount of memory to uncharge
to the caller (i.e., the owner), since the map associated with an selem
may already be gone and map->ops->map_local_storage_uncharge can no
longer be referenced.

Finally, accesses of selem, SDATA(selem)->smap and selem->local_storage
are racy. Callers will protect these fields with RCU.

Co-developed-by: Martin KaFai Lau
Signed-off-by: Martin KaFai Lau
Signed-off-by: Amery Hung
---
 include/linux/bpf_local_storage.h |  2 +-
 kernel/bpf/bpf_local_storage.c    | 77 +++++++++++++++++++++++++++++--
 2 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 20918c31b7e5..1fd908c44fb6 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -80,9 +80,9 @@ struct bpf_local_storage_elem {
 					 * after raw_spin_unlock
 					 */
 	};
+	atomic_t link_cnt;
 	u16 size;
 	bool use_kmalloc_nolock;
-	/* 4 bytes hole */
 	/* The data is stored in another cacheline to minimize
 	 * the number of cachelines access during a cache hit.
 	 */
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 62201552dca6..4c682d5aef7f 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -97,6 +97,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 		if (swap_uptrs)
 			bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
 	}
+	atomic_set(&selem->link_cnt, 2);
 	selem->size = smap->elem_size;
 	selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;
 	return selem;
@@ -200,9 +201,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
 
 	/* The bpf_local_storage_map_free will wait for rcu_barrier */
 	smap = rcu_dereference_check(SDATA(selem)->smap, 1);
-	migrate_disable();
-	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-	migrate_enable();
+	if (smap) {
+		migrate_disable();
+		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+		migrate_enable();
+	}
 	kfree_nolock(selem);
 }
 
@@ -227,7 +230,8 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
 		 * is only supported in task local storage, where
		 * smap->use_kmalloc_nolock == true.
		 */
-		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+		if (smap)
+			bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
 		__bpf_selem_free(selem, reuse_now);
 		return;
 	}
@@ -419,6 +423,71 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
 	return err;
 }
 
+/* Callers of bpf_selem_unlink_lockless() */
+#define BPF_LOCAL_STORAGE_MAP_FREE	0
+#define BPF_LOCAL_STORAGE_DESTROY	1
+
+/*
+ * Unlink an selem from map and local storage with lockless fallback if callers
+ * are racing or rqspinlock returns error. It should only be called by
+ * bpf_local_storage_destroy() or bpf_local_storage_map_free().
+ */
+static void bpf_selem_unlink_lockless(struct bpf_local_storage_elem *selem,
+				      struct hlist_head *to_free, int caller)
+{
+	struct bpf_local_storage *local_storage;
+	struct bpf_local_storage_map_bucket *b;
+	struct bpf_local_storage_map *smap;
+	unsigned long flags;
+	int err, unlink = 0;
+
+	local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
+	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
+	/*
+	 * Free special fields immediately as SDATA(selem)->smap will be cleared.
+	 * No BPF program should be reading the selem.
+	 */
+	if (smap) {
+		b = select_bucket(smap, selem);
+		err = raw_res_spin_lock_irqsave(&b->lock, flags);
+		if (!err) {
+			if (likely(selem_linked_to_map(selem))) {
+				hlist_del_init_rcu(&selem->map_node);
+				bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+				RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
+				unlink++;
+			}
+			raw_res_spin_unlock_irqrestore(&b->lock, flags);
+		} else if (caller == BPF_LOCAL_STORAGE_MAP_FREE) {
+			RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
+		}
+	}
+
+	/*
+	 * Only let destroy() unlink from local_storage->list and do mem_uncharge
+	 * as owner is guaranteed to be valid in destroy().
+	 */
+	if (local_storage && caller == BPF_LOCAL_STORAGE_DESTROY) {
+		err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
+		if (!err) {
+			hlist_del_init_rcu(&selem->snode);
+			unlink++;
+			raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
+		}
+		RCU_INIT_POINTER(selem->local_storage, NULL);
+	}
+
+	/*
+	 * Normally, an selem can be unlinked under local_storage->lock and b->lock, and
+	 * then added to a local to_free list. However, if destroy() and map_free() are
+	 * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free
+	 * the selem only after both map_free() and destroy() drop the refcnt.
+	 */
+	if (unlink == 2 || atomic_dec_and_test(&selem->link_cnt))
+		hlist_add_head(&selem->free_node, to_free);
+}
+
 void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
 				      struct bpf_local_storage_map *smap,
 				      struct bpf_local_storage_elem *selem)
-- 
2.47.3
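---

As a reader aid, below is a minimal userspace model of the link_cnt scheme
described in the commit message; it is not kernel code, and the names
elem_new()/elem_put() are invented for this sketch. It only models the
fallback path in which bpf_local_storage_map_free() and
bpf_local_storage_destroy() each drop one reference and the last one frees
the element; in the common case (unlink == 2) the kernel code frees the
selem immediately without touching link_cnt.

	/* Userspace model of the link_cnt fallback path; not kernel code. */
	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct elem {
		atomic_int link_cnt;	/* 2 at creation: map and local storage each hold one link */
		int data;
	};

	static struct elem *elem_new(int data)
	{
		struct elem *e = malloc(sizeof(*e));

		if (!e)
			exit(1);
		atomic_init(&e->link_cnt, 2);
		e->data = data;
		return e;
	}

	/* Called once by the "map free" side and once by the "destroy" side. */
	static void elem_put(struct elem *e, const char *who)
	{
		/* atomic_fetch_sub() returns the old value: 1 means we dropped the last link. */
		if (atomic_fetch_sub(&e->link_cnt, 1) == 1) {
			printf("%s frees elem %d\n", who, e->data);
			free(e);
		} else {
			printf("%s only drops its link to elem %d\n", who, e->data);
		}
	}

	int main(void)
	{
		struct elem *e = elem_new(42);

		elem_put(e, "map_free");	/* map torn down first: only drops a link */
		elem_put(e, "destroy");		/* owner torn down last: frees the elem */
		return 0;
	}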