From: Mykyta Yatsenko Use rhashtable_lookup_likely() for lookups, rhashtable_remove_fast() for deletes, and rhashtable_lookup_get_insert_fast() for inserts. Updates modify values in place under RCU rather than allocating a new element and swapping the pointer (as regular htab does). This trades read consistency for performance: concurrent readers may see partial updates. Users requiring consistent reads should use BPF_F_LOCK. Initialize rhashtable with bpf_mem_alloc element cache. Require BPF_F_NO_PREALLOC. Limit max_entries to 2^31. Free elements via rhashtable_free_and_destroy() callback to handle internal structs. Signed-off-by: Mykyta Yatsenko Reviewed-by: Emil Tsalapatis --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/hashtab.c | 381 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/map_iter.c | 3 +- kernel/bpf/syscall.c | 3 + kernel/bpf/verifier.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 7 files changed, 390 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b13de31e163f..56e4c3f983d3 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -134,6 +134,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_INSN_ARRAY, insn_array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_RHASH, rhtab_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 552bc5d9afbd..822582c04f22 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1046,6 +1046,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, BPF_MAP_TYPE_INSN_ARRAY, + BPF_MAP_TYPE_RHASH, __MAX_BPF_MAP_TYPE }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3dd9b4924ae4..a37bd2a7b30f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -2739,3 +2740,383 @@ const struct bpf_map_ops htab_of_maps_map_ops = { BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], }; + +struct rhtab_elem { + struct rhash_head node; + /* key bytes, then value bytes follow */ + u8 data[] __aligned(8); +}; + +struct bpf_rhtab { + struct bpf_map map; + struct rhashtable ht; + struct rhashtable_params params; + struct bpf_mem_alloc ma; + u32 elem_size; +}; + +static inline void *rhtab_elem_value(struct rhtab_elem *l, u32 key_size) +{ + return l->data + round_up(key_size, 8); +} + +static struct bpf_map *rhtab_map_alloc(union bpf_attr *attr) +{ + struct bpf_rhtab *rhtab; + int err = 0; + + rhtab = bpf_map_area_alloc(sizeof(*rhtab), NUMA_NO_NODE); + if (!rhtab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rhtab->map, attr); + + if (rhtab->map.max_entries > 1UL << 31) { + err = -E2BIG; + goto free_rhtab; + } + + rhtab->elem_size = sizeof(struct rhtab_elem) + round_up(rhtab->map.key_size, 8) + + round_up(rhtab->map.value_size, 8); + + rhtab->params.head_offset = offsetof(struct rhtab_elem, node); + rhtab->params.key_offset = offsetof(struct rhtab_elem, data); + rhtab->params.key_len = rhtab->map.key_size; + + err = rhashtable_init(&rhtab->ht, &rhtab->params); + if (err) + goto free_rhtab; + + /* Set max_elems after rhashtable_init() since init zeroes the struct */ + rhtab->ht.max_elems = rhtab->map.max_entries; + + err = bpf_mem_alloc_init(&rhtab->ma, rhtab->elem_size, false); + if (err) + goto destroy_rhtab; + + return &rhtab->map; + +destroy_rhtab: + rhashtable_destroy(&rhtab->ht); +free_rhtab: + bpf_map_area_free(rhtab); + return ERR_PTR(err); +} + +static int rhtab_map_alloc_check(union bpf_attr *attr) +{ + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) + return -EINVAL; + + if (attr->map_flags & BPF_F_ZERO_SEED) + return -EINVAL; + + return htab_map_alloc_check(attr); +} + +static void rhtab_check_and_free_fields(struct bpf_rhtab *rhtab, + struct rhtab_elem *elem) +{ + if (IS_ERR_OR_NULL(rhtab->map.record)) + return; + + bpf_obj_free_fields(rhtab->map.record, + rhtab_elem_value(elem, rhtab->map.key_size)); +} + +static void rhtab_mem_dtor(void *obj, void *ctx) +{ + struct htab_btf_record *hrec = ctx; + struct rhtab_elem *elem = obj; + + if (IS_ERR_OR_NULL(hrec->record)) + return; + + bpf_obj_free_fields(hrec->record, + rhtab_elem_value(elem, hrec->key_size)); +} + +static int rhtab_map_check_btf(struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + return bpf_ma_set_dtor(map, &rhtab->ma, rhtab_mem_dtor); +} + +static void rhtab_free_elem(void *ptr, void *arg) +{ + struct bpf_rhtab *rhtab = arg; + struct rhtab_elem *elem = ptr; + + bpf_map_free_internal_structs(&rhtab->map, rhtab_elem_value(elem, rhtab->map.key_size)); + bpf_mem_cache_free_rcu(&rhtab->ma, elem); +} + +static void rhtab_map_free(struct bpf_map *map) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + rhashtable_free_and_destroy(&rhtab->ht, rhtab_free_elem, rhtab); + bpf_mem_alloc_destroy(&rhtab->ma); + bpf_map_area_free(rhtab); +} + +static void *rhtab_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + /* Using constant zeroed params to force rhashtable use inlined hashfunc */ + static const struct rhashtable_params params = { 0 }; + + /* Hold RCU lock in case sleepable program calls via gen_lookup */ + guard(rcu)(); + + return rhashtable_lookup_likely(&rhtab->ht, key, params); +} + +static void *rhtab_map_lookup_elem(struct bpf_map *map, void *key) __must_hold(RCU) +{ + struct rhtab_elem *l; + + l = rhtab_lookup_elem(map, key); + return l ? rhtab_elem_value(l, map->key_size) : NULL; +} + +static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem) +{ + int err; + + err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab->params); + if (err) + return err; + + bpf_mem_cache_free_rcu(&rhtab->ma, elem); + return 0; +} + +static long rhtab_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *l; + + guard(rcu)(); + l = rhtab_lookup_elem(map, key); + return l ? rhtab_delete_elem(rhtab, l) : -ENOENT; +} + +static void rhtab_read_elem_value(struct bpf_map *map, void *dst, struct rhtab_elem *elem, + u64 flags) +{ + void *src = rhtab_elem_value(elem, map->key_size); + + if (flags & BPF_F_LOCK) + copy_map_value_locked(map, dst, src, true); + else + copy_map_value(map, dst, src); +} + +static int rhtab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *l; + int err; + + err = bpf_map_check_op_flags(map, flags, BPF_F_LOCK); + if (err) + return err; + + /* Make sure element is not deleted between lookup and copy */ + guard(rcu)(); + + l = rhtab_lookup_elem(map, key); + if (!l) + return -ENOENT; + + rhtab_read_elem_value(map, value, l, flags); + err = rhtab_delete_elem(rhtab, l); + if (err) + return err; + + check_and_init_map_value(map, value); + return 0; +} + +static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *elem, void *value, + u64 map_flags) +{ + void *old_val = rhtab_elem_value(elem, map->key_size); + + if (map_flags & BPF_NOEXIST) + return -EEXIST; + + if (map_flags & BPF_F_LOCK) + copy_map_value_locked(map, old_val, value, false); + else + copy_map_value(map, old_val, value); + return 0; +} + +static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem, *tmp; + + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) + return -EINVAL; + + if ((map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) + return -EINVAL; + + guard(rcu)(); + elem = rhtab_lookup_elem(map, key); + if (elem) + return rhtab_map_update_existing(map, elem, value, map_flags); + + if (map_flags & BPF_EXIST) + return -ENOENT; + + /* Check max_entries limit before inserting new element */ + if (atomic_read(&rhtab->ht.nelems) >= map->max_entries) + return -E2BIG; + + elem = bpf_mem_cache_alloc(&rhtab->ma); + if (!elem) + return -ENOMEM; + + memcpy(elem->data, key, map->key_size); + copy_map_value(map, rhtab_elem_value(elem, map->key_size), value); + + tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab->params); + if (tmp) { + bpf_mem_cache_free(&rhtab->ma, elem); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + return rhtab_map_update_existing(map, tmp, value, map_flags); + } + + return 0; +} + +static int rhtab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + const int ret = BPF_REG_0; + + BUILD_BUG_ON(!__same_type(&rhtab_lookup_elem, + (void *(*)(struct bpf_map *map, void *key)) NULL)); + *insn++ = BPF_EMIT_CALL(rhtab_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); + *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, + offsetof(struct rhtab_elem, data) + round_up(map->key_size, 8)); + + return insn - insn_buf; +} + +static void rhtab_map_free_internal_structs(struct bpf_map *map) +{ +} + +static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EOPNOTSUPP; +} + +static void rhtab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) +{ +} + +static long bpf_each_rhash_elem(struct bpf_map *map, bpf_callback_t callback_fn, + void *callback_ctx, u64 flags) +{ + return -EOPNOTSUPP; +} + +static u64 rhtab_map_mem_usage(const struct bpf_map *map) +{ + return 0; +} + +static int rhtab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EOPNOTSUPP; +} + +static int rhtab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EOPNOTSUPP; +} + +struct bpf_iter_seq_rhash_map_info { + struct bpf_map *map; + struct bpf_rhtab *rhtab; + struct rhashtable_iter iter; + bool iter_active; +}; + +static void *bpf_rhash_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + return NULL; +} + +static void *bpf_rhash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return NULL; +} + +static int bpf_rhash_map_seq_show(struct seq_file *seq, void *v) +{ + return 0; +} + +static void bpf_rhash_map_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int bpf_iter_init_rhash_map(void *priv_data, struct bpf_iter_aux_info *aux) +{ + return 0; +} + +static void bpf_iter_fini_rhash_map(void *priv_data) +{ +} + +static const struct seq_operations bpf_rhash_map_seq_ops = { + .start = bpf_rhash_map_seq_start, + .next = bpf_rhash_map_seq_next, + .stop = bpf_rhash_map_seq_stop, + .show = bpf_rhash_map_seq_show, +}; + +static const struct bpf_iter_seq_info rhash_iter_seq_info = { + .seq_ops = &bpf_rhash_map_seq_ops, + .init_seq_private = bpf_iter_init_rhash_map, + .fini_seq_private = bpf_iter_fini_rhash_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_rhash_map_info), +}; + +BTF_ID_LIST_SINGLE(rhtab_map_btf_ids, struct, bpf_rhtab) +const struct bpf_map_ops rhtab_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = rhtab_map_alloc_check, + .map_alloc = rhtab_map_alloc, + .map_free = rhtab_map_free, + .map_get_next_key = rhtab_map_get_next_key, + .map_release_uref = rhtab_map_free_internal_structs, + .map_lookup_elem = rhtab_map_lookup_elem, + .map_lookup_and_delete_elem = rhtab_map_lookup_and_delete_elem, + .map_update_elem = rhtab_map_update_elem, + .map_delete_elem = rhtab_map_delete_elem, + .map_gen_lookup = rhtab_map_gen_lookup, + .map_seq_show_elem = rhtab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_each_rhash_elem, + .map_mem_usage = rhtab_map_mem_usage, + BATCH_OPS(rhtab), + .map_btf_id = &rhtab_map_btf_ids[0], + .iter_seq_info = &rhash_iter_seq_info, +}; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 261a03ea73d3..4a2aafbe28b4 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -119,7 +119,8 @@ static int bpf_iter_attach_map(struct bpf_prog *prog, is_percpu = true; else if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_RHASH) goto put_map; key_acc_size = prog->aux->max_rdonly_access; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a3c0214ca934..6c4369691d87 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1280,6 +1280,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_RHASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && @@ -1457,6 +1458,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_MAP_TYPE_CGROUP_ARRAY: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_RHASH: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: @@ -2192,6 +2194,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_RHASH || map->map_type == BPF_MAP_TYPE_STACK_TRACE) { if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 185210b73385..633d29fe79b6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18070,6 +18070,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, if (prog->sleepable) switch (map->map_type) { case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_RHASH: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_HASH: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 677be9a47347..9d7df174770a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1046,6 +1046,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, BPF_MAP_TYPE_INSN_ARRAY, + BPF_MAP_TYPE_RHASH, __MAX_BPF_MAP_TYPE }; -- 2.52.0