If a transaction fails the final validation in the commit hook, the table
validation state is changed to NFT_VALIDATE_DO and a replay of the batch is
performed.  Every rule insert will then do a graph validation.  This is much
slower, but provides better error reporting to the user because we can point
at the rule that introduces the validation issue.

Without a reset back to NFT_VALIDATE_SKIP, the affected table(s) remain in
full validation mode, i.e. the next transaction starts in slow mode.  This
makes the next transaction after a failed incremental update very slow:

  # time iptables-restore < /tmp/ruleset
  real    0m0.496s
  [..]
  # time iptables -A CALLEE -j CALLER
  iptables v1.8.11 (nf_tables): RULE_APPEND failed (Too many links): rule in chain CALLEE
  real    0m0.022s
  [..]
  # time iptables-restore < /tmp/ruleset
  real    1m22.355s
  [..]

After this patch, the second iptables-restore is back to ~0.5s.

Fixes: 9a32e9850686 ("netfilter: nf_tables: don't write table validation state without mutex")
Signed-off-by: Florian Westphal
---
 net/netfilter/nf_tables_api.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 729a92781a1a..027bab30c238 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -11536,6 +11536,13 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb,
 	ret = __nf_tables_abort(net, action);
 	nft_gc_seq_end(nft_net, gc_seq);
 
+	if (action == NFNL_ABORT_NONE) {
+		struct nft_table *table;
+
+		list_for_each_entry(table, &nft_net->tables, list)
+			table->validate_state = NFT_VALIDATE_SKIP;
+	}
+
 	WARN_ON_ONCE(!list_empty(&nft_net->commit_list));
 
 	/* module autoload needs to happen after GC sequence update because it
--
2.52.0

From: Yuto Hamaguchi

The upstream commit 71d8c47fc653711c41bc3282e5b0e605b3727956 ("netfilter:
conntrack: introduce clash resolution on insertion race") sets
allow_clash=true in the UDP/UDPLITE protocol handler but does not set it in
the generic protocol handler.  As a result, packets whose layers all use
connectionless protocols, such as UDP over IP-in-IP, can still be dropped
due to clashes during conntrack insertion.  To resolve this, set allow_clash
in nf_conntrack_l4proto_generic.

Signed-off-by: Yuto Hamaguchi
Signed-off-by: Florian Westphal
---
 net/netfilter/nf_conntrack_proto_generic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index e831637bc8ca..cb260eb3d012 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -67,6 +67,7 @@ void nf_conntrack_generic_init_net(struct net *net)
 const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = {
 	.l4proto		= 255,
+	.allow_clash		= true,
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	.ctnl_timeout		= {
 		.nlattr_to_obj	= generic_timeout_nlattr_to_obj,
--
2.52.0

From: Fernando Fernandez Mancera

After the optimization to only perform one GC per jiffy, a new problem was
introduced: if more than 8 new connections are tracked per jiffy, the list
won't be cleaned up fast enough and the limit may be reached spuriously.

To prevent this, only skip the GC if it was already triggered during the
same jiffy and the list has grown by less than the clean-up limit since
then.  In addition, increase the clean-up limit to 64 connections so GC
triggers less often while each run is more effective.
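To make the new skip condition concrete, here is a minimal userspace sketch
of the intended logic (names and types simplified, not the actual kernel
code; CONNCOUNT_GC_MAX_COLLECT is the new 64-entry clean-up limit
introduced below):

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define CONNCOUNT_GC_MAX_COLLECT 64	/* new clean-up limit */

  struct conncount_list {
  	uint32_t last_gc;		/* jiffies at most recent gc */
  	unsigned int count;		/* current list length */
  	unsigned int last_gc_count;	/* list length at most recent gc */
  };

  /* GC may be skipped only if it already ran this jiffy AND the list has
   * grown by less than the clean-up limit since that run.  Otherwise a
   * burst of new connections within one jiffy could outpace the GC.
   */
  static bool can_skip_gc(const struct conncount_list *list, uint32_t now_jiffies)
  {
  	return now_jiffies == list->last_gc &&
  	       (list->count - list->last_gc_count) < CONNCOUNT_GC_MAX_COLLECT;
  }

  int main(void)
  {
  	struct conncount_list l = { .last_gc = 100, .count = 70, .last_gc_count = 0 };

  	/* same jiffy, but 70 new entries since the last gc -> must not skip */
  	printf("skip gc: %s\n", can_skip_gc(&l, 100) ? "yes" : "no");
  	return 0;
  }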
This has been tested using an HTTP server and several performance tools
while having nft_connlimit/xt_connlimit or OVS limit configured.  Output of
slowhttptest + OVS limit at 52000 connections:

  slow HTTP test status on 340th second:

  initializing:        0
  pending:             432
  connected:           51998
  error:               0
  closed:              0
  service available:   YES

Fixes: d265929930e2 ("netfilter: nf_conncount: reduce unnecessary GC")
Reported-by: Aleksandra Rukomoinikova
Closes: https://lore.kernel.org/netfilter/b2064e7b-0776-4e14-adb6-c68080987471@k2.cloud/
Signed-off-by: Fernando Fernandez Mancera
Signed-off-by: Florian Westphal
---
 include/net/netfilter/nf_conntrack_count.h |  1 +
 net/netfilter/nf_conncount.c               | 15 ++++++++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h
index 52a06de41aa0..cf0166520cf3 100644
--- a/include/net/netfilter/nf_conntrack_count.h
+++ b/include/net/netfilter/nf_conntrack_count.h
@@ -13,6 +13,7 @@ struct nf_conncount_list {
 	u32 last_gc;		/* jiffies at most recent gc */
 	struct list_head head;	/* connections with the same filtering key */
 	unsigned int count;	/* length of list */
+	unsigned int last_gc_count;	/* length of list at most recent gc */
 };
 
 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen);
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 8487808c8761..288936f5c1bf 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -34,8 +34,9 @@
 #define CONNCOUNT_SLOTS		256U
 
-#define CONNCOUNT_GC_MAX_NODES	8
-#define MAX_KEYLEN		5
+#define CONNCOUNT_GC_MAX_NODES	8
+#define CONNCOUNT_GC_MAX_COLLECT 64
+#define MAX_KEYLEN		5
 
 /* we will save the tuples of all connections we care about */
 struct nf_conncount_tuple {
@@ -182,12 +183,13 @@ static int __nf_conncount_add(struct net *net,
 		goto out_put;
 	}
 
-	if ((u32)jiffies == list->last_gc)
+	if ((u32)jiffies == list->last_gc &&
+	    (list->count - list->last_gc_count) < CONNCOUNT_GC_MAX_COLLECT)
 		goto add_new_node;
 
 	/* check the saved connections */
 	list_for_each_entry_safe(conn, conn_n, &list->head, node) {
-		if (collect > CONNCOUNT_GC_MAX_NODES)
+		if (collect > CONNCOUNT_GC_MAX_COLLECT)
 			break;
 
 		found = find_or_evict(net, list, conn);
@@ -230,6 +232,7 @@ static int __nf_conncount_add(struct net *net,
 		nf_ct_put(found_ct);
 	}
 	list->last_gc = (u32)jiffies;
+	list->last_gc_count = list->count;
 
 add_new_node:
 	if (WARN_ON_ONCE(list->count > INT_MAX)) {
@@ -277,6 +280,7 @@ void nf_conncount_list_init(struct nf_conncount_list *list)
 	spin_lock_init(&list->list_lock);
 	INIT_LIST_HEAD(&list->head);
 	list->count = 0;
+	list->last_gc_count = 0;
 	list->last_gc = (u32)jiffies;
 }
 EXPORT_SYMBOL_GPL(nf_conncount_list_init);
@@ -316,13 +320,14 @@ static bool __nf_conncount_gc_list(struct net *net,
 		}
 		nf_ct_put(found_ct);
 
-		if (collected > CONNCOUNT_GC_MAX_NODES)
+		if (collected > CONNCOUNT_GC_MAX_COLLECT)
 			break;
 	}
 
 	if (!list->count)
 		ret = true;
 	list->last_gc = (u32)jiffies;
+	list->last_gc_count = list->count;
 
 	return ret;
 }
--
2.52.0

Not strictly required, but should not be harmful either: this isn't a
stateful protocol, hence clash resolution should work fine.
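For background, the general shape of such a clash-resolution decision is
sketched below as plain userspace C with made-up types; the kernel's real
resolution path is more involved, this only illustrates why the protocol
must opt in via allow_clash:

  #include <stdbool.h>
  #include <stdio.h>

  /* simplified stand-ins for kernel structures */
  struct l4proto {
  	bool allow_clash;	/* protocol has no state to lose on a clash */
  };

  struct conntrack_entry {
  	const struct l4proto *proto;
  	bool dying;
  	int use;		/* reference count */
  };

  /* Two CPUs created an entry for the same flow; one won the insertion race.
   * The loser may simply reuse the winner's entry, but only for stateless
   * protocols that opted in via allow_clash.
   */
  static bool resolve_clash(const struct conntrack_entry *loser,
  			  struct conntrack_entry *winner)
  {
  	if (!loser->proto->allow_clash)
  		return false;	/* stateful protocol: packet is dropped instead */
  	if (winner->dying)
  		return false;	/* existing entry is already being removed */

  	winner->use++;		/* packet gets re-attached to the winner */
  	return true;
  }

  int main(void)
  {
  	struct l4proto generic = { .allow_clash = true };
  	struct conntrack_entry loser = { .proto = &generic };
  	struct conntrack_entry winner = { .proto = &generic, .use = 1 };

  	printf("clash resolved: %s\n", resolve_clash(&loser, &winner) ? "yes" : "no");
  	return 0;
  }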
Signed-off-by: Florian Westphal
---
 net/netfilter/nf_conntrack_proto_icmp.c   | 1 +
 net/netfilter/nf_conntrack_proto_icmpv6.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index b38b7164acd5..32148a3a8509 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -365,6 +365,7 @@ void nf_conntrack_icmp_init_net(struct net *net)
 const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = {
 	.l4proto		= IPPROTO_ICMP,
+	.allow_clash		= true,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= icmp_tuple_to_nlattr,
 	.nlattr_tuple_size	= icmp_nlattr_tuple_size,
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 327b8059025d..e508b3aa370a 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -343,6 +343,7 @@ void nf_conntrack_icmpv6_init_net(struct net *net)
 const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = {
 	.l4proto		= IPPROTO_ICMPV6,
+	.allow_clash		= true,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= icmpv6_tuple_to_nlattr,
 	.nlattr_tuple_size	= icmpv6_nlattr_tuple_size,
--
2.52.0

conntrack, xtables and nftables are distinct subsystems; don't use their
headers in other subsystems.

Signed-off-by: Florian Westphal
---
 include/linux/audit.h                      | 1 -
 include/net/netfilter/nf_conntrack_tuple.h | 2 +-
 include/net/netfilter/nf_tables.h          | 1 -
 net/bridge/netfilter/nf_conntrack_bridge.c | 3 +--
 net/netfilter/nf_conntrack_h323_main.c     | 1 +
 net/netfilter/nf_synproxy_core.c           | 1 +
 net/netfilter/nf_tables_api.c              | 1 +
 net/netfilter/nft_synproxy.c               | 1 +
 8 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 536f8ee8da81..14df25095e19 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -13,7 +13,6 @@
 #include
 #include
 #include
-#include
 #include
 
 #define AUDIT_INO_UNSET ((unsigned long)-1)
diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h
index f7dd950ff250..4d55b7325707 100644
--- a/include/net/netfilter/nf_conntrack_tuple.h
+++ b/include/net/netfilter/nf_conntrack_tuple.h
@@ -11,7 +11,7 @@
 #ifndef _NF_CONNTRACK_TUPLE_H
 #define _NF_CONNTRACK_TUPLE_H
 
-#include
+#include
 #include
 #include
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 0e266c2d0e7f..2597077442e5 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -6,7 +6,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
index 6482de4d8750..3b28b84191be 100644
--- a/net/bridge/netfilter/nf_conntrack_bridge.c
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -16,8 +16,7 @@
 #include
 #include
 
-#include
-#include
+#include
 
 #include "../br_private.h"
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 14f73872f647..17f1f453d481 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 3fa3f5dfb264..57f57e2fc80a 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -10,6 +10,7 @@
 #include
 #include
 
+#include
 #include
 #include
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 027bab30c238..e7247363c643 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
index 4d3e5a31b412..b71ef18b0e8c 100644
--- a/net/netfilter/nft_synproxy.c
+++ b/net/netfilter/nft_synproxy.c
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
--
2.52.0

Several netfilter compilation units rely on implicit includes coming from
nf_conntrack_proto_gre.h.  Clean this up and add the required dependencies
where needed.  nf_conntrack.h requires the net_generic() helper.  Place the
various gre/ppp/vlan includes where they are needed.

Signed-off-by: Florian Westphal
---
 include/linux/netfilter/nf_conntrack_proto_gre.h | 3 ---
 include/net/netfilter/nf_conntrack.h             | 1 +
 net/netfilter/nf_conntrack_bpf.c                 | 1 +
 net/netfilter/nf_conntrack_netlink.c             | 1 +
 net/netfilter/nf_conntrack_proto_gre.c           | 2 ++
 net/netfilter/nf_flow_table_ip.c                 | 2 ++
 net/netfilter/nf_flow_table_offload.c            | 1 +
 net/netfilter/nf_flow_table_path.c               | 1 +
 net/netfilter/nf_nat_ovs.c                       | 3 +++
 net/netfilter/nf_nat_proto.c                     | 1 +
 net/netfilter/nft_flow_offload.c                 | 1 +
 net/sched/act_ct.c                               | 2 ++
 net/sched/act_ctinfo.c                           | 1 +
 13 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h
index 34ce5d2f37a2..9ee7014400e8 100644
--- a/include/linux/netfilter/nf_conntrack_proto_gre.h
+++ b/include/linux/netfilter/nf_conntrack_proto_gre.h
@@ -1,9 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _CONNTRACK_PROTO_GRE_H
 #define _CONNTRACK_PROTO_GRE_H
-#include
-#include
-#include
 
 struct nf_ct_gre {
 	unsigned int stream_timeout;
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index aa0a7c82199e..bc42dd0e10e6 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -16,6 +16,7 @@
 #include
 #include
 
+#include
 #include
 #include
 #include
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 4a136fc3a9c0..4fe6d9d33329 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 3a04665adf99..662f6bbfa805 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -32,6 +32,7 @@
 #include
 #include
 
+#include
 #include
 #include
 #include
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index af369e686fc5..b894bb7a97ad 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -33,12 +33,14 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 78883343e5d6..11da560f38bf 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -8,6 +8,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index d8f7bfd60ac6..b1966b68c48a 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
index eb24fe2715dc..6bb9579dcc2a 100644
--- a/net/netfilter/nf_flow_table_path.c
+++ b/net/netfilter/nf_flow_table_path.c
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c
index 0f9a559f6207..31474e8c034a 100644
--- a/net/netfilter/nf_nat_ovs.c
+++ b/net/netfilter/nf_nat_ovs.c
@@ -2,6 +2,9 @@
 /* Support nat functions for openvswitch and used by OVS and TC conntrack. */
 #include
+#include
+#include
+#include
 
 /* Modelled after nf_nat_ipv[46]_fn().
  * range is only used for new, uninitialized NAT state.
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index b14a434b9561..97c0f841fc96 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index b8f76c9057fd..179d0e59e2b5 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
+#include
 #include
 #include
 #include
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 2b6ac7069dc1..81d488655793 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -13,9 +13,11 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 71efe04d00b5..d2c750bab1d3 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
--
2.52.0

From: Scott Mitchell

Currently, instance_create() uses GFP_ATOMIC because it is called while
holding the instances_lock spinlock.  This makes the allocation more likely
to fail under memory pressure.

Refactor nfqnl_recv_config() to drop the RCU lock after instance_lookup()
and the peer_portid verification.  A socket cannot simultaneously send a
message and close, so the queue owned by the sending socket cannot be
destroyed while its CONFIG message is being processed.  This allows
instance_create() to allocate with GFP_KERNEL_ACCOUNT before taking the
spinlock.
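The refactor follows a common pattern: allocate with a sleeping allocator
first, then take the lock only to check for a duplicate and link the new
object, freeing it again on collision.  A minimal userspace sketch of that
pattern (pthread mutex standing in for the spinlock, all names made up):

  #include <pthread.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct instance {
  	unsigned int queue_num;
  };

  static pthread_mutex_t instances_lock = PTHREAD_MUTEX_INITIALIZER;
  /* extremely simplified "hash table": one slot per bucket, no chaining */
  static struct instance *instances[256];

  static struct instance *lookup_locked(unsigned int queue_num)
  {
  	return instances[queue_num % 256];
  }

  static struct instance *instance_create(unsigned int queue_num)
  {
  	/* allocate before taking the lock, so a sleeping allocation is fine */
  	struct instance *inst = calloc(1, sizeof(*inst));

  	if (!inst)
  		return NULL;
  	inst->queue_num = queue_num;

  	pthread_mutex_lock(&instances_lock);
  	if (lookup_locked(queue_num)) {
  		/* someone else created it meanwhile: undo our allocation */
  		pthread_mutex_unlock(&instances_lock);
  		free(inst);
  		return NULL;
  	}
  	instances[queue_num % 256] = inst;
  	pthread_mutex_unlock(&instances_lock);

  	return inst;
  }

  int main(void)
  {
  	printf("created: %p\n", (void *)instance_create(1));
  	return 0;
  }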
Suggested-by: Florian Westphal
Signed-off-by: Scott Mitchell
Signed-off-by: Florian Westphal
---
 net/netfilter/nfnetlink_queue.c | 75 +++++++++++++++------------------
 1 file changed, 34 insertions(+), 41 deletions(-)

diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 8b7b39d8a109..8fa0807973c9 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -121,17 +121,9 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
 	unsigned int h;
 	int err;
 
-	spin_lock(&q->instances_lock);
-	if (instance_lookup(q, queue_num)) {
-		err = -EEXIST;
-		goto out_unlock;
-	}
-
-	inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
-	if (!inst) {
-		err = -ENOMEM;
-		goto out_unlock;
-	}
+	inst = kzalloc(sizeof(*inst), GFP_KERNEL_ACCOUNT);
+	if (!inst)
+		return ERR_PTR(-ENOMEM);
 
 	inst->queue_num = queue_num;
 	inst->peer_portid = portid;
@@ -141,9 +133,15 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
 	spin_lock_init(&inst->lock);
 	INIT_LIST_HEAD(&inst->queue_list);
 
+	spin_lock(&q->instances_lock);
+	if (instance_lookup(q, queue_num)) {
+		err = -EEXIST;
+		goto out_unlock;
+	}
+
 	if (!try_module_get(THIS_MODULE)) {
 		err = -EAGAIN;
-		goto out_free;
+		goto out_unlock;
 	}
 
 	h = instance_hashfn(queue_num);
@@ -153,10 +151,9 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
 
 	return inst;
 
-out_free:
-	kfree(inst);
 out_unlock:
 	spin_unlock(&q->instances_lock);
+	kfree(inst);
 	return ERR_PTR(err);
 }
 
@@ -1498,7 +1495,8 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
 	struct nfqnl_msg_config_cmd *cmd = NULL;
 	struct nfqnl_instance *queue;
 	__u32 flags = 0, mask = 0;
-	int ret = 0;
+
+	WARN_ON_ONCE(!lockdep_nfnl_is_held(NFNL_SUBSYS_QUEUE));
 
 	if (nfqa[NFQA_CFG_CMD]) {
 		cmd = nla_data(nfqa[NFQA_CFG_CMD]);
@@ -1544,47 +1542,44 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
 		}
 	}
 
+	/* Lookup queue under RCU. After peer_portid check (or for new queue
+	 * in BIND case), the queue is owned by the socket sending this message.
+	 * A socket cannot simultaneously send a message and close, so while
+	 * processing this CONFIG message, nfqnl_rcv_nl_event() (triggered by
+	 * socket close) cannot destroy this queue. Safe to use without RCU.
+	 */
 	rcu_read_lock();
 	queue = instance_lookup(q, queue_num);
 	if (queue && queue->peer_portid != NETLINK_CB(skb).portid) {
-		ret = -EPERM;
-		goto err_out_unlock;
+		rcu_read_unlock();
+		return -EPERM;
 	}
+	rcu_read_unlock();
 
 	if (cmd != NULL) {
 		switch (cmd->command) {
 		case NFQNL_CFG_CMD_BIND:
-			if (queue) {
-				ret = -EBUSY;
-				goto err_out_unlock;
-			}
-			queue = instance_create(q, queue_num,
-						NETLINK_CB(skb).portid);
-			if (IS_ERR(queue)) {
-				ret = PTR_ERR(queue);
-				goto err_out_unlock;
-			}
+			if (queue)
+				return -EBUSY;
+			queue = instance_create(q, queue_num, NETLINK_CB(skb).portid);
+			if (IS_ERR(queue))
+				return PTR_ERR(queue);
 			break;
 		case NFQNL_CFG_CMD_UNBIND:
-			if (!queue) {
-				ret = -ENODEV;
-				goto err_out_unlock;
-			}
+			if (!queue)
+				return -ENODEV;
 			instance_destroy(q, queue);
-			goto err_out_unlock;
+			return 0;
 		case NFQNL_CFG_CMD_PF_BIND:
 		case NFQNL_CFG_CMD_PF_UNBIND:
 			break;
 		default:
-			ret = -ENOTSUPP;
-			goto err_out_unlock;
+			return -EOPNOTSUPP;
 		}
 	}
 
-	if (!queue) {
-		ret = -ENODEV;
-		goto err_out_unlock;
-	}
+	if (!queue)
+		return -ENODEV;
 
 	if (nfqa[NFQA_CFG_PARAMS]) {
 		struct nfqnl_msg_config_params *params =
@@ -1609,9 +1604,7 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
 		spin_unlock_bh(&queue->lock);
 	}
 
-err_out_unlock:
-	rcu_read_unlock();
-	return ret;
+	return 0;
 }
 
 static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
--
2.52.0

As far as I can see nothing bad can happen when NFTA_TARGET/MATCH_NAME are
too large, because this calls x_tables helpers which check the length, but
it seems better to reject such attributes already during netlink parsing.

The rest of the changes avoid silent u8/u16 truncations.

For _TYPE, it's expected to be only 1 or 0.  In the x_tables world this
variable is set by the kernel: for IPT_SO_GET_REVISION_TARGET it's 1, for
all others it's set to 0.  As older versions of nf_tables permitted any
value except 1 to mean 'match', keep this as-is but sanitize the value for
consistency.

Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables")
Reviewed-by: Fernando Fernandez Mancera
Signed-off-by: Florian Westphal
---
 net/netfilter/nft_compat.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 72711d62fddf..08f620311b03 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -134,7 +134,8 @@ static void nft_target_eval_bridge(const struct nft_expr *expr,
 }
 
 static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = {
-	[NFTA_TARGET_NAME]	= { .type = NLA_NUL_STRING },
+	[NFTA_TARGET_NAME]	= { .type = NLA_NUL_STRING,
+				    .len = XT_EXTENSION_MAXNAMELEN, },
 	[NFTA_TARGET_REV]	= NLA_POLICY_MAX(NLA_BE32, 255),
 	[NFTA_TARGET_INFO]	= { .type = NLA_BINARY },
 };
@@ -434,7 +435,8 @@ static void nft_match_eval(const struct nft_expr *expr,
 }
 
 static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
-	[NFTA_MATCH_NAME]	= { .type = NLA_NUL_STRING },
+	[NFTA_MATCH_NAME]	= { .type = NLA_NUL_STRING,
+				    .len = XT_EXTENSION_MAXNAMELEN },
 	[NFTA_MATCH_REV]	= NLA_POLICY_MAX(NLA_BE32, 255),
 	[NFTA_MATCH_INFO]	= { .type = NLA_BINARY },
 };
@@ -693,7 +695,12 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb,
 	name = nla_data(tb[NFTA_COMPAT_NAME]);
 	rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV]));
-	target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE]));
+	/* x_tables api checks for 'target == 1' to mean target,
+	 * everything else means 'match'.
+	 * In x_tables world, the number is set by kernel, not
+	 * userspace.
+	 */
+	target = nla_get_be32(tb[NFTA_COMPAT_TYPE]) == htonl(1);
 
 	switch(family) {
 	case AF_INET:
--
2.52.0

From: Fernando Fernandez Mancera

Since commit be102eb6a0e7 ("netfilter: nf_conncount: rework API to use
sk_buff directly"), we skip the add and trigger a GC when the ct is already
confirmed.  For local-to-local connections this doesn't work, because the
connection is confirmed in POSTROUTING, so tracking on the INPUT hook is
always skipped.

To fix this, check whether the skb input ifindex is the loopback ifindex.
If it is, fall back to a GC plus track operation, skipping the
optimization.  This fallback is necessary to avoid duplicated tracking of a
packet train, e.g. 10 UDP datagrams sent in a burst when initiating the
connection.

Tested with xt_connlimit/nft_connlimit and OVS limit, and with an HTTP
server and iperf3 in UDP mode.

Fixes: be102eb6a0e7 ("netfilter: nf_conncount: rework API to use sk_buff directly")
Reported-by: Michal Slabihoudek
Closes: https://lore.kernel.org/netfilter/6989BD9F-8C24-4397-9AD7-4613B28BF0DB@gooddata.com/
Signed-off-by: Fernando Fernandez Mancera
Signed-off-by: Florian Westphal
---
 net/netfilter/nf_conncount.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 288936f5c1bf..14e62b3263cd 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -179,14 +179,25 @@ static int __nf_conncount_add(struct net *net,
 		return -ENOENT;
 
 	if (ct && nf_ct_is_confirmed(ct)) {
-		err = -EEXIST;
-		goto out_put;
+		/* local connections are confirmed in postrouting so confirmation
+		 * might have happened before hitting connlimit
+		 */
+		if (skb->skb_iif != LOOPBACK_IFINDEX) {
+			err = -EEXIST;
+			goto out_put;
+		}
+
+		/* this is likely a local connection, skip optimization to avoid
+		 * adding duplicates from a 'packet train'
+		 */
+		goto check_connections;
 	}
 
 	if ((u32)jiffies == list->last_gc &&
 	    (list->count - list->last_gc_count) < CONNCOUNT_GC_MAX_COLLECT)
 		goto add_new_node;
 
+check_connections:
 	/* check the saved connections */
 	list_for_each_entry_safe(conn, conn_n, &list->head, node) {
 		if (collect > CONNCOUNT_GC_MAX_COLLECT)
 			break;
--
2.52.0

Quoting the reporter:

  In net/netfilter/xt_tcpmss.c (lines 53-68), the TCP option parser reads
  op[i+1] directly without validating the remaining option length.  If the
  last byte of the option field is not EOL/NOP (0/1), the code attempts to
  index op[i+1].  In the case where i + 1 == optlen, this causes an
  out-of-bounds read, accessing memory past the optlen boundary (either
  reading beyond the stack buffer _opt or the following payload).

Reported-by: sungzii
Signed-off-by: Florian Westphal
---
 net/netfilter/xt_tcpmss.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
index 37704ab01799..0d32d4841cb3 100644
--- a/net/netfilter/xt_tcpmss.c
+++ b/net/netfilter/xt_tcpmss.c
@@ -61,7 +61,7 @@ tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		return (mssval >= info->mss_min &&
 			mssval <= info->mss_max) ^ info->invert;
 	}
-	if (op[i] < 2)
+	if (op[i] < 2 || i == optlen - 1)
 		i++;
 	else
 		i += op[i+1] ? : 1;
--
2.52.0
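To see why the extra "i == optlen - 1" check matters, here is a
self-contained userspace sketch of a TCP option walk with the bounds
handled explicitly (simplified, not the xt_tcpmss code itself):

  #include <stdint.h>
  #include <stdio.h>

  /* Walk TCP options: kind 0 (EOL) and 1 (NOP) are single bytes, everything
   * else is kind, length, data.  Reading op[i + 1] is only legal if another
   * byte actually remains, hence the i == optlen - 1 guard.
   */
  static int find_mss(const uint8_t *op, unsigned int optlen, uint16_t *mss)
  {
  	unsigned int i;

  	for (i = 0; i < optlen; ) {
  		if (op[i] == 2 && i + 3 < optlen && op[i + 1] == 4) {	/* MSS */
  			*mss = (op[i + 2] << 8) | op[i + 3];
  			return 1;
  		}
  		if (op[i] < 2 || i == optlen - 1)	/* EOL/NOP, or last byte */
  			i++;
  		else
  			i += op[i + 1] ? op[i + 1] : 1;	/* skip option, avoid 0-length loop */
  	}
  	return 0;
  }

  int main(void)
  {
  	/* well-formed MSS option: mss=1460 */
  	const uint8_t with_mss[] = { 2, 4, 0x05, 0xb4 };
  	/* last byte is a lone option kind >= 2: without the guard the walk
  	 * would read one byte past the buffer
  	 */
  	const uint8_t truncated[] = { 1, 1, 3, 3, 7, 8 };
  	uint16_t mss;

  	if (find_mss(with_mss, sizeof(with_mss), &mss))
  		printf("mss=%u\n", mss);
  	if (!find_mss(truncated, sizeof(truncated), &mss))
  		printf("no mss option, no out-of-bounds read\n");
  	return 0;
  }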