When dumping sets, in order to create the proper order for restore, the
list type of sets must be dumped last. Therefore internally we run the
dumping loop twice: first over all non-list type sets, skipping the list
type ones, and then a second time over the list type sets only. Sashiko
noticed that there's a potential race between dump and destroy if in the
first loop the last set was a list type one: the variable still points
to the skipped set while no reference is held on it, so a concurrent
destroy can free it. Fix the issue by resetting the variable holding the
pointer.

Signed-off-by: Jozsef Kadlecsik
---
 net/netfilter/ipset/ip_set_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index c5a26236a0bb..0874029cb0f2 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1613,6 +1613,7 @@ ip_set_dump_do(struct sk_buff *skb, struct netlink_callback *cb)
 		    ((dump_type == DUMP_ALL) ==
 		     !!(set->type->features & IPSET_DUMP_LAST))) {
 			write_unlock_bh(&ip_set_ref_lock);
+			set = NULL;
 			continue;
 		}
 		pr_debug("List set: %s\n", set->name);
--
2.39.5

The "ipset list -terse" command is actually a dump operation which may
run in parallel with "ipset add" commands, and those can trigger an
internal resize of the hash type of sets just being dumped. However,
dumping just the header part of the set was not protected against an
underlying resize. Fix it by protecting the header dumping part as well.

Signed-off-by: Jozsef Kadlecsik
---
 net/netfilter/ipset/ip_set_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 0874029cb0f2..3706b4a85a0f 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1649,13 +1649,13 @@ ip_set_dump_do(struct sk_buff *skb, struct netlink_callback *cb)
 		if (cb->args[IPSET_CB_PROTO] > IPSET_PROTOCOL_MIN &&
 		    nla_put_net16(skb, IPSET_ATTR_INDEX, htons(index)))
 			goto nla_put_failure;
+		if (set->variant->uref)
+			set->variant->uref(set, cb, true);
 		ret = set->variant->head(set, skb);
 		if (ret < 0)
 			goto release_refcount;
 		if (dump_flags & IPSET_FLAG_LIST_HEADER)
 			goto next_set;
-		if (set->variant->uref)
-			set->variant->uref(set, cb, true);
 		fallthrough;
 	default:
 		ret = set->variant->list(set, skb, cb);
--
2.39.5
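The dump-vs-destroy window closed by the first patch above can be
modeled in user space. This is a minimal sketch, not the kernel code:
the structure, the pthread mutex standing in for ip_set_ref_lock and
all names are invented for illustration.

#include <pthread.h>
#include <stddef.h>

struct set_model { int is_list_type; };

static pthread_mutex_t ref_lock = PTHREAD_MUTEX_INITIALIZER;

/* Pass 0 skips list-type sets, pass 1 dumps only them. */
static void dump_pass(struct set_model **sets, int max, int list_pass)
{
	struct set_model *set = NULL;

	for (int i = 0; i < max; i++) {
		pthread_mutex_lock(&ref_lock);
		set = sets[i];
		if (!set || set->is_list_type != list_pass) {
			pthread_mutex_unlock(&ref_lock);
			/* Without this reset "set" keeps pointing at a
			 * set we hold no reference on, and a concurrent
			 * destroy may free it before a later error path
			 * looks at the pointer again.
			 */
			set = NULL;
			continue;
		}
		/* ... take a reference and dump the set ... */
		pthread_mutex_unlock(&ref_lock);
	}
}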
When adding a new entry to the next position in an existing hash
bucket, the position index was incremented too early: a parallel dump
could read the index before the entry was populated with its value.
Move the setting of the position index after populating the entry.

v2: Position counting fixed, noticed by Florian Westphal.

Reported-by: syzbot+786c889f046e8b003ca6@syzkaller.appspotmail.com
Reported-by: syzbot+1da17e4b41d795df059e@syzkaller.appspotmail.com
Reported-by: syzbot+421c5f3ff8e9493084d9@syzkaller.appspotmail.com
Signed-off-by: Jozsef Kadlecsik
---
 net/netfilter/ipset/ip_set_hash_gen.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index b79e5dd2af03..133ce4611eed 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -844,7 +844,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	const struct mtype_elem *d = value;
 	struct mtype_elem *data;
 	struct hbucket *n, *old = ERR_PTR(-ENOENT);
-	int i, j = -1, ret;
+	int i, j = -1, npos = 0, ret;
 	bool flag_exist = flags & IPSET_FLAG_EXIST;
 	bool deleted = false, forceadd = false, reuse = false;
 	u32 r, key, multi = 0, elements, maxelem;
@@ -889,6 +889,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 			ext_size(AHASH_INIT_SIZE, set->dsize);
 		goto copy_elem;
 	}
+	npos = n->pos;
 	for (i = 0; i < n->pos; i++) {
 		if (!test_bit(i, n->used)) {
 			/* Reuse first deleted entry */
@@ -962,7 +963,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	}

 copy_elem:
-	j = n->pos++;
+	j = npos;
+	npos = n->pos + 1;
 	data = ahash_data(n, j, set->dsize);
 copy_data:
 	t->hregion[r].elements++;
@@ -985,6 +987,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	if (SET_WITH_TIMEOUT(set))
 		ip_set_timeout_set(ext_timeout(data, set), ext->timeout);
 	smp_mb__before_atomic();
+	n->pos = npos;
 	set_bit(j, n->used);
 	if (old != ERR_PTR(-ENOENT)) {
 		rcu_assign_pointer(hbucket(t, key), n);
--
2.39.5
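The ordering requirement behind the position-index fix can be
demonstrated with a self-contained user-space model. Again only a
sketch: the bucket layout and the names are invented, C11 atomics
stand in for the kernel barriers, and the writer is assumed to be
serialized (by the region lock in the kernel), so it may read pos
relaxed.

#include <stdatomic.h>
#include <stdint.h>

#define BUCKET_SIZE 8

struct bucket {
	uint64_t slot[BUCKET_SIZE];	/* element payload */
	_Atomic uint8_t pos;		/* first never-used slot */
};

/* Writer: populate the slot first, then publish the new position. */
static void bucket_add(struct bucket *b, uint64_t value)
{
	uint8_t j = atomic_load_explicit(&b->pos, memory_order_relaxed);

	b->slot[j] = value;	/* fill before publishing */
	atomic_store_explicit(&b->pos, j + 1, memory_order_release);
}

/* Lockless reader: the acquire load pairs with the release store,
 * so every slot below the observed pos is fully populated.
 */
static uint64_t bucket_sum(struct bucket *b)
{
	uint8_t pos = atomic_load_explicit(&b->pos, memory_order_acquire);
	uint64_t sum = 0;

	for (uint8_t i = 0; i < pos; i++)
		sum += b->slot[i];
	return sum;
}

Bumping pos before the slot store would let bucket_sum() read an
uninitialized slot, which is exactly the window the patch closes.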
The "pos" structure member of struct hbucket stores the index of the
first free slot in the hash bucket of a hash type of set, and it has
concurrent readers and writers. Annotate the accesses properly.

Signed-off-by: Jozsef Kadlecsik
---
 net/netfilter/ipset/ip_set_hash_gen.h | 62 ++++++++++++++++-----------
 1 file changed, 38 insertions(+), 24 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 133ce4611eed..04e4627ddfc1 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -386,8 +386,9 @@ static void
 mtype_ext_cleanup(struct ip_set *set, struct hbucket *n)
 {
 	int i;
+	u8 pos = smp_load_acquire(&n->pos);

-	for (i = 0; i < n->pos; i++)
+	for (i = 0; i < pos; i++)
 		if (test_bit(i, n->used))
 			ip_set_ext_destroy(set, ahash_data(n, i, set->dsize));
 }
@@ -490,7 +491,7 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
 #ifdef IP_SET_HASH_WITH_NETS
 	u8 k;
 #endif
-	u8 htable_bits = t->htable_bits;
+	u8 pos, htable_bits = t->htable_bits;

 	spin_lock_bh(&t->hregion[r].lock);
 	for (i = ahash_bucket_start(r, htable_bits);
@@ -498,7 +499,8 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
 		n = __ipset_dereference(hbucket(t, i));
 		if (!n)
 			continue;
-		for (j = 0, d = 0; j < n->pos; j++) {
+		pos = smp_load_acquire(&n->pos);
+		for (j = 0, d = 0; j < pos; j++) {
 			if (!test_bit(j, n->used)) {
 				d++;
 				continue;
 			}
@@ -534,7 +536,7 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
 			/* Still try to delete expired elements. */
 			continue;
 		tmp->size = n->size - AHASH_INIT_SIZE;
-		for (j = 0, d = 0; j < n->pos; j++) {
+		for (j = 0, d = 0; j < pos; j++) {
 			if (!test_bit(j, n->used))
 				continue;
 			data = ahash_data(n, j, dsize);
@@ -623,7 +625,7 @@ mtype_resize(struct ip_set *set, bool retried)
 {
 	struct htype *h = set->data;
 	struct htable *t, *orig;
-	u8 htable_bits;
+	u8 pos, htable_bits;
 	size_t hsize, dsize = set->dsize;
 #ifdef IP_SET_HASH_WITH_NETS
 	u8 flags;
@@ -685,7 +687,8 @@ mtype_resize(struct ip_set *set, bool retried)
 		n = __ipset_dereference(hbucket(orig, i));
 		if (!n)
 			continue;
-		for (j = 0; j < n->pos; j++) {
+		pos = smp_load_acquire(&n->pos);
+		for (j = 0; j < pos; j++) {
 			if (!test_bit(j, n->used))
 				continue;
 			data = ahash_data(n, j, dsize);
@@ -809,9 +812,10 @@ mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size)
 {
 	struct htype *h = set->data;
 	const struct htable *t;
-	u32 i, j, r;
 	struct hbucket *n;
 	struct mtype_elem *data;
+	u32 i, j, r;
+	u8 pos;

 	t = rcu_dereference_bh(h->table);
 	for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
@@ -820,7 +824,8 @@ mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size)
 			n = rcu_dereference_bh(hbucket(t, i));
 			if (!n)
 				continue;
-			for (j = 0; j < n->pos; j++) {
+			pos = smp_load_acquire(&n->pos);
+			for (j = 0; j < pos; j++) {
 				if (!test_bit(j, n->used))
 					continue;
 				data = ahash_data(n, j, set->dsize);
@@ -844,10 +849,11 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	const struct mtype_elem *d = value;
 	struct mtype_elem *data;
 	struct hbucket *n, *old = ERR_PTR(-ENOENT);
-	int i, j = -1, npos = 0, ret;
+	int i, j = -1, ret;
 	bool flag_exist = flags & IPSET_FLAG_EXIST;
 	bool deleted = false, forceadd = false, reuse = false;
 	u32 r, key, multi = 0, elements, maxelem;
+	u8 npos = 0;

 	rcu_read_lock_bh();
 	t = rcu_dereference_bh(h->table);
@@ -889,8 +895,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 			ext_size(AHASH_INIT_SIZE, set->dsize);
 		goto copy_elem;
 	}
-	npos = n->pos;
-	for (i = 0; i < n->pos; i++) {
+	npos = smp_load_acquire(&n->pos);
+	for (i = 0; i < npos; i++) {
 		if (!test_bit(i, n->used)) {
 			/* Reuse first deleted entry */
 			if (j == -1) {
@@ -934,7 +940,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		if (elements >= maxelem)
 			goto set_full;
 		/* Create a new slot */
-		if (n->pos >= n->size) {
+		if (npos >= n->size) {
 #ifdef IP_SET_HASH_WITH_MULTI
 			if (h->bucketsize >= AHASH_MAX_TUNED)
 				goto set_full;
@@ -963,8 +969,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	}

 copy_elem:
-	j = npos;
-	npos = n->pos + 1;
+	j = npos++;
 	data = ahash_data(n, j, set->dsize);
 copy_data:
 	t->hregion[r].elements++;
@@ -987,7 +992,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	if (SET_WITH_TIMEOUT(set))
 		ip_set_timeout_set(ext_timeout(data, set), ext->timeout);
 	smp_mb__before_atomic();
-	n->pos = npos;
+	/* Ensure all data writes are visible before updating position */
+	smp_store_release(&n->pos, npos);
 	set_bit(j, n->used);
 	if (old != ERR_PTR(-ENOENT)) {
 		rcu_assign_pointer(hbucket(t, key), n);
@@ -1046,6 +1052,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	int i, j, k, r, ret = -IPSET_ERR_EXIST;
 	u32 key, multi = 0;
 	size_t dsize = set->dsize;
+	u8 pos;

 	/* Userspace add and resize is excluded by the mutex.
 	 * Kernespace add does not trigger resize.
@@ -1061,7 +1068,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	n = rcu_dereference_bh(hbucket(t, key));
 	if (!n)
 		goto out;
-	for (i = 0, k = 0; i < n->pos; i++) {
+	pos = smp_load_acquire(&n->pos);
+	for (i = 0, k = 0; i < pos; i++) {
 		if (!test_bit(i, n->used)) {
 			k++;
 			continue;
 		}
@@ -1075,8 +1083,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		ret = 0;
 		clear_bit(i, n->used);
 		smp_mb__after_atomic();
-		if (i + 1 == n->pos)
-			n->pos--;
+		if (i + 1 == pos)
+			smp_store_release(&n->pos, --pos);
 		t->hregion[r].elements--;
 #ifdef IP_SET_HASH_WITH_NETS
 		for (j = 0; j < IPSET_NET_COUNT; j++)
@@ -1097,11 +1105,11 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 				x->flags = flags;
 		}
 	}
-	for (; i < n->pos; i++) {
+	for (; i < pos; i++) {
 		if (!test_bit(i, n->used))
 			k++;
 	}
-	if (k == n->pos) {
+	if (k == pos) {
 		t->hregion[r].ext_size -= ext_size(n->size, dsize);
 		rcu_assign_pointer(hbucket(t, key), NULL);
 		kfree_rcu(n, rcu);
@@ -1112,7 +1120,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		if (!tmp)
 			goto out;
 		tmp->size = n->size - AHASH_INIT_SIZE;
-		for (j = 0, k = 0; j < n->pos; j++) {
+		for (j = 0, k = 0; j < pos; j++) {
 			if (!test_bit(j, n->used))
 				continue;
 			data = ahash_data(n, j, dsize);
@@ -1173,6 +1181,7 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
 	int ret, i, j = 0;
 #endif
 	u32 key, multi = 0;
+	u8 pos;

 	pr_debug("test by nets\n");
 	for (; j < NLEN && h->nets[j].cidr[0] && !multi; j++) {
@@ -1190,7 +1199,8 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
 		n = rcu_dereference_bh(hbucket(t, key));
 		if (!n)
 			continue;
-		for (i = 0; i < n->pos; i++) {
+		pos = smp_load_acquire(&n->pos);
+		for (i = 0; i < pos; i++) {
 			if (!test_bit(i, n->used))
 				continue;
 			data = ahash_data(n, i, set->dsize);
@@ -1224,6 +1234,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	struct mtype_elem *data;
 	int i, ret = 0;
 	u32 key, multi = 0;
+	u8 pos;

 	rcu_read_lock_bh();
 	t = rcu_dereference_bh(h->table);
@@ -1246,7 +1257,8 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 			ret = 0;
 			goto out;
 		}
-		for (i = 0; i < n->pos; i++) {
+		pos = smp_load_acquire(&n->pos);
+		for (i = 0; i < pos; i++) {
 			if (!test_bit(i, n->used))
 				continue;
 			data = ahash_data(n, i, set->dsize);
@@ -1363,6 +1375,7 @@ mtype_list(const struct ip_set *set,
 	/* We assume that one hash bucket fills into one page */
 	void *incomplete;
 	int i, ret = 0;
+	u8 pos;

 	atd = nla_nest_start(skb, IPSET_ATTR_ADT);
 	if (!atd)
@@ -1381,7 +1394,8 @@ mtype_list(const struct ip_set *set,
 			 cb->args[IPSET_CB_ARG0], t, n);
 		if (!n)
 			continue;
-		for (i = 0; i < n->pos; i++) {
+		pos = smp_load_acquire(&n->pos);
+		for (i = 0; i < pos; i++) {
 			if (!test_bit(i, n->used))
 				continue;
 			e = ahash_data(n, i, set->dsize);
--
2.39.5
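The delete side annotated above follows the same pairing. A hedged
sketch with an invented layout, C11 atomics instead of
smp_load_acquire()/smp_store_release(), and a single deleter assumed
(the region lock provides that in the kernel):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct bucket {
	_Atomic uint32_t used;	/* bitmap of occupied slots */
	_Atomic uint8_t pos;	/* first never-used slot */
};

/* Deleter: clear the used bit first, then shrink pos with a release
 * store when the last slot of the range was freed.
 */
static void slot_del(struct bucket *b, uint8_t i)
{
	uint8_t pos = atomic_load_explicit(&b->pos, memory_order_acquire);

	atomic_fetch_and_explicit(&b->used, ~(UINT32_C(1) << i),
				  memory_order_release);
	if (i + 1 == pos)
		atomic_store_explicit(&b->pos, pos - 1,
				      memory_order_release);
}

/* Lockless reader: never index beyond the acquired pos. */
static bool slot_in_use(struct bucket *b, uint8_t i)
{
	uint8_t pos = atomic_load_explicit(&b->pos, memory_order_acquire);

	return i < pos &&
	       (atomic_load_explicit(&b->used, memory_order_acquire) &
		(UINT32_C(1) << i));
}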
Sashiko pointed out that there are a few lockless RCU readers using
test_bit(), which is a relaxed atomic operation and provides no memory
barrier guarantees. Use test_bit_acquire() instead where the operation
may run in parallel with add/del/gc, i.e. where the access is not one
of the following cases:

- protected by the region lock
- in the set destroy phase
- in a new/temporary set creation phase

Also, add a missing smp_mb__after_atomic() call.
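Why a relaxed test_bit() is not enough for the lockless readers can be
seen in a user-space analogue of a single slot. The mapping is an
assumption made for illustration: the release store models the
barrier + set_bit() pair on the writer side, the acquire load models
test_bit_acquire().

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct slot {
	uint64_t data;
	_Atomic bool used;
};

/* Writer: populate first, then publish. */
static void slot_publish(struct slot *s, uint64_t value)
{
	s->data = value;
	atomic_store_explicit(&s->used, true, memory_order_release);
}

/* Lockless reader: with a relaxed load of "used" the read of "data"
 * could be reordered before the bit test; the acquire load forbids
 * that and pairs with the writer's release.
 */
static bool slot_read(struct slot *s, uint64_t *value)
{
	if (!atomic_load_explicit(&s->used, memory_order_acquire))
		return false;
	*value = s->data;
	return true;
}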
Signed-off-by: Jozsef Kadlecsik
---
 net/netfilter/ipset/ip_set_hash_gen.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 04e4627ddfc1..6a31f2db824a 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -689,7 +689,7 @@ mtype_resize(struct ip_set *set, bool retried)
 			continue;
 		pos = smp_load_acquire(&n->pos);
 		for (j = 0; j < pos; j++) {
-			if (!test_bit(j, n->used))
+			if (!test_bit_acquire(j, n->used))
 				continue;
 			data = ahash_data(n, j, dsize);
 			if (SET_ELEM_EXPIRED(set, data))
@@ -826,7 +826,7 @@ mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size)
 				continue;
 			pos = smp_load_acquire(&n->pos);
 			for (j = 0; j < pos; j++) {
-				if (!test_bit(j, n->used))
+				if (!test_bit_acquire(j, n->used))
 					continue;
 				data = ahash_data(n, j, set->dsize);
 				if (!SET_ELEM_EXPIRED(set, data))
@@ -995,6 +995,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	/* Ensure all data writes are visible before updating position */
 	smp_store_release(&n->pos, npos);
 	set_bit(j, n->used);
+	smp_mb__after_atomic();
 	if (old != ERR_PTR(-ENOENT)) {
 		rcu_assign_pointer(hbucket(t, key), n);
 		if (old)
@@ -1201,7 +1202,7 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
 			continue;
 		pos = smp_load_acquire(&n->pos);
 		for (i = 0; i < pos; i++) {
-			if (!test_bit(i, n->used))
+			if (!test_bit_acquire(i, n->used))
 				continue;
 			data = ahash_data(n, i, set->dsize);
 			if (!mtype_data_equal(data, d, &multi))
@@ -1259,7 +1260,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		}
 		pos = smp_load_acquire(&n->pos);
 		for (i = 0; i < pos; i++) {
-			if (!test_bit(i, n->used))
+			if (!test_bit_acquire(i, n->used))
 				continue;
 			data = ahash_data(n, i, set->dsize);
 			if (!mtype_data_equal(data, d, &multi))
@@ -1396,7 +1397,7 @@ mtype_list(const struct ip_set *set,
 			continue;
 		pos = smp_load_acquire(&n->pos);
 		for (i = 0; i < pos; i++) {
-			if (!test_bit(i, n->used))
+			if (!test_bit_acquire(i, n->used))
 				continue;
 			e = ahash_data(n, i, set->dsize);
 			if (SET_ELEM_EXPIRED(set, e))
--
2.39.5
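The torn-read case and the shape of the fix can be modeled the same
way. A minimal sketch assuming a single writer and an invented
16-byte payload; as in the patch, a reader that tests the bit during
the overwrite simply skips the slot:

#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

struct slot {
	uint8_t data[16];
	_Atomic _Bool used;
};

static void slot_reuse(struct slot *s, const uint8_t *new_data)
{
	/* Take the slot out of service first: readers that test the
	 * bit from now on skip it, instead of seeing a mix of old and
	 * new bytes while memcpy() is in flight.
	 */
	atomic_store_explicit(&s->used, 0, memory_order_release);
	memcpy(s->data, new_data, sizeof(s->data));
	atomic_store_explicit(&s->used, 1, memory_order_release);
}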
Zhengchuan Liang reported that because resize does not copy the
comment extension into the resized set but reuses its pointer, an
ongoing gc can free the extension in the original set, which then
leaves a stale pointer in the resized one. The proposed patch was to
recreate the extensions for every element in the resized set. That is
both expensive and wastes memory, so instead skip gc when a resize in
progress is detected: resizing destroys the original set anyway, so
running gc on it is unnecessary.

Reported-by: Zhengchuan Liang
Signed-off-by: Jozsef Kadlecsik
---
 net/netfilter/ipset/ip_set_hash_gen.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 377b4be9e4d5..71b57c731dcb 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -501,6 +501,8 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
 			continue;
 		pos = smp_load_acquire(&n->pos);
 		for (j = 0, d = 0; j < pos; j++) {
+			if (atomic_read(&t->ref))
+				goto resize_in_progress;
 			if (!test_bit(j, n->used)) {
 				d++;
 				continue;
@@ -552,6 +554,7 @@ mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
 			kfree_rcu(n, rcu);
 		}
 	}
+resize_in_progress:
 	spin_unlock_bh(&t->hregion[r].lock);
 }

@@ -672,7 +675,10 @@ mtype_resize(struct ip_set *set, bool retried)
 		spin_lock_init(&t->hregion[i].lock);

 	/* There can't be another parallel resizing,
-	 * but dumping, gc, kernel side add/del are possible
+	 * but dumping, kernel side add/del are possible.
+	 * gc must detect ongoing resize when comments are in use
+	 * in order not to free the comment extension area shared
+	 * between the original and resized sets.
 	 */
 	orig = ipset_dereference_bh_nfnl(h->table);
 	atomic_set(&orig->ref, 1);
--
2.39.5

Eulgyu Kim reported a slab-use-after-free when a set is resized and gc
runs in parallel. Resizing may run in parallel with an already running
gc, or gc can start and notice that resizing is in progress. The
operation which finishes last must destroy the original set. The logic
of the test is: "I was the last user of the set and it was resized."
However, the counters were set in resizing in the order "the set will
be resized" and then "I'm going to use the set", which created a small
race window at the testing phase. Fix the order in the resizing
function.

Reported-by: Eulgyu Kim
Signed-off-by: Jozsef Kadlecsik
---
 net/netfilter/ipset/ip_set_hash_gen.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 71b57c731dcb..023a3d7aeba0 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -681,8 +681,9 @@ mtype_resize(struct ip_set *set, bool retried)
 	 * between the original and resized sets.
 	 */
 	orig = ipset_dereference_bh_nfnl(h->table);
-	atomic_set(&orig->ref, 1);
 	atomic_inc(&orig->uref);
+	smp_mb__after_atomic();
+	atomic_set(&orig->ref, 1);
 	pr_debug("attempt to resize set %s from %u to %u, t %p\n",
 		 set->name, orig->htable_bits, htable_bits, orig);
 	for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) {
@@ -799,6 +800,7 @@ mtype_resize(struct ip_set *set, bool retried)
 cleanup:
 	rcu_read_unlock_bh();
 	atomic_set(&orig->ref, 0);
+	smp_mb__before_atomic();
 	atomic_dec(&orig->uref);
 	mtype_ahash_destroy(set, t, false);
 	if (ret == -EAGAIN)
--
2.39.5
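The race window fixed by the last patch is easiest to see in a reduced
two-counter model. Everything below is a simplification of the
description above: ref marks a resize in progress, uref counts users
of the original table, sequentially consistent C11 atomics stand in
for the kernel atomics and barriers, and the destroy test is condensed
into one helper.

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic int ref;		/* resize in progress */
static _Atomic int uref;	/* users of the original table */

/* Resizer: announce "I'm using the table" before announcing "it is
 * being resized". In the old order (ref first, uref second) another
 * thread could observe ref == 1 while uref was still 0 and wrongly
 * conclude it was the last user.
 */
static void resize_start(void)
{
	atomic_fetch_add(&uref, 1);
	atomic_store(&ref, 1);
}

static void resize_cleanup(void)
{
	atomic_store(&ref, 0);
	atomic_fetch_sub(&uref, 1);
}

/* Dump/gc side: only the last user of a resized table may destroy
 * the original one.
 */
static bool last_user_of_resized(void)
{
	return atomic_load(&ref) == 1 && atomic_fetch_sub(&uref, 1) == 1;
}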