From: Mykyta Yatsenko Implement rhtab_map_get_next_key(), batch lookup/lookup-and-delete, and the seq_file BPF iterator for BPF_MAP_TYPE_RHASH. All three use rhashtable_next_key() with the previous key as cursor. This avoids the rhashtable walker state machine and its spinlock acquisition per enter/exit, and lets callers stay stateless across syscalls. ERR_PTR(-ENOENT) from rhashtable_next_key() (cursor was concurrently deleted) is handled per callsite: get_next_key falls back to the first key (matches BPF UAPI semantics of htab); the seq_file iterator and batch terminate iteration. Also implements rhtab_map_mem_usage() to report memory consumption. Signed-off-by: Mykyta Yatsenko --- kernel/bpf/hashtab.c | 290 ++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/bpf/map_iter.c | 3 +- 2 files changed, 284 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index da1754e1a525..61eb88cb9229 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -3021,68 +3021,342 @@ static void rhtab_map_free_internal_structs(struct bpf_map *map) } static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) + __must_hold_shared(RCU) { - return -EOPNOTSUPP; + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem; + + elem = rhashtable_next_key(&rhtab->ht, key, rhtab_params); + + /* if not found, return the first key */ + if (PTR_ERR(elem) == -ENOENT) + elem = rhashtable_next_key(&rhtab->ht, NULL, rhtab_params); + + if (!elem) + return -ENOENT; + + memcpy(next_key, elem->data, map->key_size); + return 0; } static void rhtab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { + void *value; + + /* Guarantee that hashtab value is not freed */ + guard(rcu)(); + + value = rhtab_map_lookup_elem(map, key); + if (!value) + return; + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); + seq_putc(m, '\n'); } static long bpf_each_rhash_elem(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags) { - return -EOPNOTSUPP; + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + void *prev_key = NULL; + struct rhtab_elem *elem; + int num_elems = 0; + void *key_buf = NULL; + u64 ret = 0; + + if (flags != 0) + return -EINVAL; + + /* + * Buffer for stashing the cursor across rcu_read_unlock() in the + * cond_resched path. + */ + key_buf = kmalloc_nolock(map->key_size, 0, NUMA_NO_NODE); + if (!key_buf) + return -ENOMEM; + + rcu_read_lock(); + /* + * Best-effort iteration: if rhashtable is concurrently resized or + * elements are deleted/inserted, there may be missed or duplicate + * elements visited. + */ + while ((elem = rhashtable_next_key(&rhtab->ht, prev_key, rhtab_params))) { + if (IS_ERR(elem)) + break; + num_elems++; + ret = callback_fn((u64)(long)map, + (u64)(long)elem->data, + (u64)(long)rhtab_elem_value(elem, map->key_size), + (u64)(long)callback_ctx, 0); + if (ret) + break; + + prev_key = elem->data; /* valid while RCU held */ + + if (need_resched()) { + memcpy(key_buf, elem->data, map->key_size); + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + prev_key = key_buf; + } + } + rcu_read_unlock(); + + kfree_nolock(key_buf); + return num_elems; } static u64 rhtab_map_mem_usage(const struct bpf_map *map) { - return 0; + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + u64 num_entries; + + /* Excludes rhashtable bucket overhead (~ nelems * sizeof(void *) at 75% load). */ + num_entries = atomic_read(&rhtab->ht.nelems); + return sizeof(struct bpf_rhtab) + rhtab->elem_size * num_entries; +} + +static int __rhtab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr, + bool do_delete) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + void __user *uvalues = u64_to_user_ptr(attr->batch.values); + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); + void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); + void *cursor = NULL, *keys = NULL, *values = NULL, *dst_key, *dst_val; + struct rhtab_elem **del_elems = NULL; + u32 max_count, total, key_size, value_size, i; + bool has_next_cursor = false; + struct rhtab_elem *elem; + u64 elem_map_flags, map_flags; + int ret = 0; + + elem_map_flags = attr->batch.elem_flags; + ret = bpf_map_check_op_flags(map, elem_map_flags, BPF_F_LOCK); + if (ret) + return ret; + + map_flags = attr->batch.flags; + if (map_flags) + return -EINVAL; + + max_count = attr->batch.count; + if (!max_count) + return 0; + + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + + key_size = map->key_size; + value_size = map->value_size; + + keys = kvmalloc_array(max_count, key_size, GFP_USER | __GFP_NOWARN); + values = kvmalloc_array(max_count, value_size, GFP_USER | __GFP_NOWARN); + if (do_delete) + del_elems = kvmalloc_array(max_count, sizeof(void *), + GFP_USER | __GFP_NOWARN); + cursor = kmalloc(key_size, GFP_USER | __GFP_NOWARN); + + if (!keys || !values || !cursor || (do_delete && !del_elems)) { + ret = -ENOMEM; + goto free; + } + + if (ubatch && copy_from_user(cursor, ubatch, key_size)) { + ret = -EFAULT; + goto free; + } + + dst_key = keys; + dst_val = values; + total = 0; + + rcu_read_lock(); + + /* + * If the cursor key was concurrently deleted, rhashtable_next_key() + * returns ERR_PTR(-ENOENT); the batch terminates with no elements + * and userspace must restart from a NULL cursor. + */ + elem = rhashtable_next_key(&rhtab->ht, ubatch ? cursor : NULL, + rhtab_params); + while (elem && !IS_ERR(elem) && total < max_count) { + memcpy(dst_key, elem->data, key_size); + rhtab_read_elem_value(map, dst_val, elem, elem_map_flags); + check_and_init_map_value(map, dst_val); + + if (do_delete) + del_elems[total] = elem; + + elem = rhashtable_next_key(&rhtab->ht, dst_key, rhtab_params); + dst_key += key_size; + dst_val += value_size; + total++; + + /* Bail to userspace to avoid stalls. */ + if (need_resched()) + break; + } + + if (elem && !IS_ERR(elem)) { + /* Stash next key as the cursor */ + memcpy(cursor, elem->data, key_size); + has_next_cursor = true; + } + + if (do_delete) { + for (i = 0; i < total; i++) + rhtab_delete_elem(rhtab, del_elems[i], NULL, 0); + } + + rcu_read_unlock(); + + if (total == 0) { + ret = -ENOENT; + goto free; + } + + /* No more elements after this batch. */ + if (!has_next_cursor) + ret = -ENOENT; + + if (copy_to_user(ukeys, keys, total * key_size) || + copy_to_user(uvalues, values, total * value_size) || + put_user(total, &uattr->batch.count) || + (has_next_cursor && + copy_to_user(u64_to_user_ptr(attr->batch.out_batch), + cursor, key_size))) { + ret = -EFAULT; + goto free; + } + +free: + kfree(cursor); + kvfree(keys); + kvfree(values); + kvfree(del_elems); + return ret; } static int rhtab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { - return -EOPNOTSUPP; + return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, false); } static int rhtab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { - return -EOPNOTSUPP; + return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, true); } struct bpf_iter_seq_rhash_map_info { struct bpf_map *map; struct bpf_rhtab *rhtab; + void *last_key; + bool last_key_valid; }; static void *bpf_rhash_map_seq_start(struct seq_file *seq, loff_t *pos) + __acquires_shared(RCU) { - return NULL; + struct bpf_iter_seq_rhash_map_info *info = seq->private; + void *key = *pos > 0 && info->last_key_valid ? info->last_key : NULL; + struct rhtab_elem *elem; + + rcu_read_lock(); + elem = rhashtable_next_key(&info->rhtab->ht, key, + rhtab_params); + if (IS_ERR_OR_NULL(elem)) + return NULL; + if (*pos == 0) + ++*pos; + return elem; } static void *bpf_rhash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - return NULL; + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct rhtab_elem *elem = v; + void *next; + + memcpy(info->last_key, elem->data, info->map->key_size); + info->last_key_valid = true; + ++*pos; + + next = rhashtable_next_key(&info->rhtab->ht, info->last_key, + rhtab_params); + if (IS_ERR(next)) + return NULL; + return next; +} + +static int __bpf_rhash_map_seq_show(struct seq_file *seq, + struct rhtab_elem *elem) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, elem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + ctx.key = elem->data; + ctx.value = rhtab_elem_value(elem, info->map->key_size); + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; } static int bpf_rhash_map_seq_show(struct seq_file *seq, void *v) { - return 0; + return __bpf_rhash_map_seq_show(seq, v); } static void bpf_rhash_map_seq_stop(struct seq_file *seq, void *v) + __releases_shared(RCU) { + if (!v) + (void)__bpf_rhash_map_seq_show(seq, NULL); + + rcu_read_unlock(); } static int bpf_iter_init_rhash_map(void *priv_data, struct bpf_iter_aux_info *aux) { + struct bpf_iter_seq_rhash_map_info *info = priv_data; + struct bpf_map *map = aux->map; + + info->last_key_valid = false; + info->last_key = kmalloc(map->key_size, GFP_USER); + if (!info->last_key) + return -ENOMEM; + + bpf_map_inc_with_uref(map); + info->map = map; + info->rhtab = container_of(map, struct bpf_rhtab, map); return 0; } static void bpf_iter_fini_rhash_map(void *priv_data) { + struct bpf_iter_seq_rhash_map_info *info = priv_data; + + kfree(info->last_key); + bpf_map_put_with_uref(info->map); } static const struct seq_operations bpf_rhash_map_seq_ops = { diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 261a03ea73d3..4a2aafbe28b4 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -119,7 +119,8 @@ static int bpf_iter_attach_map(struct bpf_prog *prog, is_percpu = true; else if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_RHASH) goto put_map; key_acc_size = prog->aux->max_rdonly_access; -- 2.53.0-Meta