This file contains the path discovery that is run from the forward chain for the packet offloading the flow into the flowtable. This consists of a series of calls to dev_fill_forward_path() for each device stack. More topologies may be supported in the future, so move this code to its own file to separate it from the nftables flow_offload expression. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 6 + net/netfilter/Makefile | 1 + net/netfilter/nf_flow_table_path.c | 267 ++++++++++++++++++++++++++ net/netfilter/nft_flow_offload.c | 252 ------------------------ 4 files changed, 274 insertions(+), 252 deletions(-) create mode 100644 net/netfilter/nf_flow_table_path.c diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index c003cd194fa2..e9f72d2558e9 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -222,6 +222,12 @@ struct nf_flow_route { struct flow_offload *flow_offload_alloc(struct nf_conn *ct); void flow_offload_free(struct flow_offload *flow); +struct nft_flowtable; +struct nft_pktinfo; +int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, + struct nf_flow_route *route, enum ip_conntrack_dir dir, + struct nft_flowtable *ft); + static inline int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, flow_setup_cb_t *cb, void *cb_priv) diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index e43e20f529f8..6bfc250e474f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -141,6 +141,7 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ + nf_flow_table_path.o \ nf_flow_table_offload.o nf_flow_table_xdp.o nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o ifeq ($(CONFIG_NF_FLOW_TABLE),m) diff --git a/net/netfilter/nf_flow_table_path.c 
b/net/netfilter/nf_flow_table_path.c new file mode 100644 index 000000000000..159aa5c8da60 --- /dev/null +++ b/net/netfilter/nf_flow_table_path.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) +{ + if (dst_xfrm(dst)) + return FLOW_OFFLOAD_XMIT_XFRM; + + return FLOW_OFFLOAD_XMIT_NEIGH; +} + +static void nft_default_forward_path(struct nf_flow_route *route, + struct dst_entry *dst_cache, + enum ip_conntrack_dir dir) +{ + route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; + route->tuple[dir].dst = dst_cache; + route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); +} + +static bool nft_is_valid_ether_device(const struct net_device *dev) +{ + if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || + dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) + return false; + + return true; +} + +static int nft_dev_fill_forward_path(const struct nf_flow_route *route, + const struct dst_entry *dst_cache, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, u8 *ha, + struct net_device_path_stack *stack) +{ + const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; + struct net_device *dev = dst_cache->dev; + struct neighbour *n; + u8 nud_state; + + if (!nft_is_valid_ether_device(dev)) + goto out; + + n = dst_neigh_lookup(dst_cache, daddr); + if (!n) + return -1; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); + neigh_release(n); + + if (!(nud_state & NUD_VALID)) + return -1; + +out: + return dev_fill_forward_path(dev, ha, stack); +} + +struct nft_forward_info { + const struct net_device *indev; + const struct net_device *outdev; + const struct net_device *hw_outdev; + struct id { + __u16 id; + __be16 proto; + } encap[NF_FLOW_TABLE_ENCAP_MAX]; 
+ u8 num_encaps; + u8 ingress_vlans; + u8 h_source[ETH_ALEN]; + u8 h_dest[ETH_ALEN]; + enum flow_offload_xmit_type xmit_type; +}; + +static void nft_dev_path_info(const struct net_device_path_stack *stack, + struct nft_forward_info *info, + unsigned char *ha, struct nf_flowtable *flowtable) +{ + const struct net_device_path *path; + int i; + + memcpy(info->h_dest, ha, ETH_ALEN); + + for (i = 0; i < stack->num_paths; i++) { + path = &stack->path[i]; + switch (path->type) { + case DEV_PATH_ETHERNET: + case DEV_PATH_DSA: + case DEV_PATH_VLAN: + case DEV_PATH_PPPOE: + info->indev = path->dev; + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + if (path->type == DEV_PATH_ETHERNET) + break; + if (path->type == DEV_PATH_DSA) { + i = stack->num_paths; + break; + } + + /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + if (!info->outdev) + info->outdev = path->dev; + info->encap[info->num_encaps].id = path->encap.id; + info->encap[info->num_encaps].proto = path->encap.proto; + info->num_encaps++; + if (path->type == DEV_PATH_PPPOE) + memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); + break; + case DEV_PATH_BRIDGE: + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + switch (path->bridge.vlan_mode) { + case DEV_PATH_BR_VLAN_UNTAG_HW: + info->ingress_vlans |= BIT(info->num_encaps - 1); + break; + case DEV_PATH_BR_VLAN_TAG: + info->encap[info->num_encaps].id = path->bridge.vlan_id; + info->encap[info->num_encaps].proto = path->bridge.vlan_proto; + info->num_encaps++; + break; + case DEV_PATH_BR_VLAN_UNTAG: + info->num_encaps--; + break; + case DEV_PATH_BR_VLAN_KEEP: + break; + } + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; + break; + default: + info->indev = NULL; + break; + } + } + if (!info->outdev) + info->outdev = info->indev; + + info->hw_outdev = info->indev; + + if 
(nf_flowtable_hw_offload(flowtable) && + nft_is_valid_ether_device(info->indev)) + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; +} + +static bool nft_flowtable_find_dev(const struct net_device *dev, + struct nft_flowtable *ft) +{ + struct nft_hook *hook; + bool found = false; + + list_for_each_entry_rcu(hook, &ft->hook_list, list) { + if (!nft_hook_find_ops_rcu(hook, dev)) + continue; + + found = true; + break; + } + + return found; +} + +static void nft_dev_forward_path(struct nf_flow_route *route, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + const struct dst_entry *dst = route->tuple[dir].dst; + struct net_device_path_stack stack; + struct nft_forward_info info = {}; + unsigned char ha[ETH_ALEN]; + int i; + + if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) + nft_dev_path_info(&stack, &info, ha, &ft->data); + + if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) + return; + + route->tuple[!dir].in.ifindex = info.indev->ifindex; + for (i = 0; i < info.num_encaps; i++) { + route->tuple[!dir].in.encap[i].id = info.encap[i].id; + route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; + } + route->tuple[!dir].in.num_encaps = info.num_encaps; + route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; + + if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { + memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); + memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); + route->tuple[dir].out.ifindex = info.outdev->ifindex; + route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; + route->tuple[dir].xmit_type = info.xmit_type; + } +} + +int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, + struct nf_flow_route *route, enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + struct dst_entry *this_dst = skb_dst(pkt->skb); + struct dst_entry *other_dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + switch (nft_pf(pkt)) { + case 
NFPROTO_IPV4: + fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; + fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; + fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; + fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); + fl.u.ip4.flowi4_mark = pkt->skb->mark; + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; + fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; + fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; + fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); + fl.u.ip6.flowi6_mark = pkt->skb->mark; + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; + break; + } + + if (!dst_hold_safe(this_dst)) + return -ENOENT; + + nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); + if (!other_dst) { + dst_release(this_dst); + return -ENOENT; + } + + nft_default_forward_path(route, this_dst, dir); + nft_default_forward_path(route, other_dst, !dir); + + if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && + route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { + nft_dev_forward_path(route, ct, dir, ft); + nft_dev_forward_path(route, ct, !dir, ft); + } + + return 0; +} +EXPORT_SYMBOL_GPL(nft_flow_route); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 14dd1c0698c3..b8f76c9057fd 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -20,258 +20,6 @@ struct nft_flow_offload { struct nft_flowtable *flowtable; }; -static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) -{ - if (dst_xfrm(dst)) - return FLOW_OFFLOAD_XMIT_XFRM; - - return FLOW_OFFLOAD_XMIT_NEIGH; -} - -static void nft_default_forward_path(struct nf_flow_route *route, - struct dst_entry *dst_cache, - enum ip_conntrack_dir dir) -{ - route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; - route->tuple[dir].dst = dst_cache; - 
route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); -} - -static bool nft_is_valid_ether_device(const struct net_device *dev) -{ - if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || - dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) - return false; - - return true; -} - -static int nft_dev_fill_forward_path(const struct nf_flow_route *route, - const struct dst_entry *dst_cache, - const struct nf_conn *ct, - enum ip_conntrack_dir dir, u8 *ha, - struct net_device_path_stack *stack) -{ - const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; - struct net_device *dev = dst_cache->dev; - struct neighbour *n; - u8 nud_state; - - if (!nft_is_valid_ether_device(dev)) - goto out; - - n = dst_neigh_lookup(dst_cache, daddr); - if (!n) - return -1; - - read_lock_bh(&n->lock); - nud_state = n->nud_state; - ether_addr_copy(ha, n->ha); - read_unlock_bh(&n->lock); - neigh_release(n); - - if (!(nud_state & NUD_VALID)) - return -1; - -out: - return dev_fill_forward_path(dev, ha, stack); -} - -struct nft_forward_info { - const struct net_device *indev; - const struct net_device *outdev; - const struct net_device *hw_outdev; - struct id { - __u16 id; - __be16 proto; - } encap[NF_FLOW_TABLE_ENCAP_MAX]; - u8 num_encaps; - u8 ingress_vlans; - u8 h_source[ETH_ALEN]; - u8 h_dest[ETH_ALEN]; - enum flow_offload_xmit_type xmit_type; -}; - -static void nft_dev_path_info(const struct net_device_path_stack *stack, - struct nft_forward_info *info, - unsigned char *ha, struct nf_flowtable *flowtable) -{ - const struct net_device_path *path; - int i; - - memcpy(info->h_dest, ha, ETH_ALEN); - - for (i = 0; i < stack->num_paths; i++) { - path = &stack->path[i]; - switch (path->type) { - case DEV_PATH_ETHERNET: - case DEV_PATH_DSA: - case DEV_PATH_VLAN: - case DEV_PATH_PPPOE: - info->indev = path->dev; - if (is_zero_ether_addr(info->h_source)) - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); - - if (path->type == DEV_PATH_ETHERNET) - break; - if 
(path->type == DEV_PATH_DSA) { - i = stack->num_paths; - break; - } - - /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ - if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { - info->indev = NULL; - break; - } - if (!info->outdev) - info->outdev = path->dev; - info->encap[info->num_encaps].id = path->encap.id; - info->encap[info->num_encaps].proto = path->encap.proto; - info->num_encaps++; - if (path->type == DEV_PATH_PPPOE) - memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); - break; - case DEV_PATH_BRIDGE: - if (is_zero_ether_addr(info->h_source)) - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); - - switch (path->bridge.vlan_mode) { - case DEV_PATH_BR_VLAN_UNTAG_HW: - info->ingress_vlans |= BIT(info->num_encaps - 1); - break; - case DEV_PATH_BR_VLAN_TAG: - info->encap[info->num_encaps].id = path->bridge.vlan_id; - info->encap[info->num_encaps].proto = path->bridge.vlan_proto; - info->num_encaps++; - break; - case DEV_PATH_BR_VLAN_UNTAG: - info->num_encaps--; - break; - case DEV_PATH_BR_VLAN_KEEP: - break; - } - info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; - break; - default: - info->indev = NULL; - break; - } - } - if (!info->outdev) - info->outdev = info->indev; - - info->hw_outdev = info->indev; - - if (nf_flowtable_hw_offload(flowtable) && - nft_is_valid_ether_device(info->indev)) - info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; -} - -static bool nft_flowtable_find_dev(const struct net_device *dev, - struct nft_flowtable *ft) -{ - struct nft_hook *hook; - bool found = false; - - list_for_each_entry_rcu(hook, &ft->hook_list, list) { - if (!nft_hook_find_ops_rcu(hook, dev)) - continue; - - found = true; - break; - } - - return found; -} - -static void nft_dev_forward_path(struct nf_flow_route *route, - const struct nf_conn *ct, - enum ip_conntrack_dir dir, - struct nft_flowtable *ft) -{ - const struct dst_entry *dst = route->tuple[dir].dst; - struct net_device_path_stack stack; - struct nft_forward_info info = {}; - unsigned char ha[ETH_ALEN]; - int i; - - if 
(nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) - nft_dev_path_info(&stack, &info, ha, &ft->data); - - if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) - return; - - route->tuple[!dir].in.ifindex = info.indev->ifindex; - for (i = 0; i < info.num_encaps; i++) { - route->tuple[!dir].in.encap[i].id = info.encap[i].id; - route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; - } - route->tuple[!dir].in.num_encaps = info.num_encaps; - route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; - - if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { - memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); - memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); - route->tuple[dir].out.ifindex = info.outdev->ifindex; - route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; - route->tuple[dir].xmit_type = info.xmit_type; - } -} - -static int nft_flow_route(const struct nft_pktinfo *pkt, - const struct nf_conn *ct, - struct nf_flow_route *route, - enum ip_conntrack_dir dir, - struct nft_flowtable *ft) -{ - struct dst_entry *this_dst = skb_dst(pkt->skb); - struct dst_entry *other_dst = NULL; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; - fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; - fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; - fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; - fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); - fl.u.ip4.flowi4_mark = pkt->skb->mark; - fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; - break; - case NFPROTO_IPV6: - fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; - fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; - fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; - fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; - fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); - fl.u.ip6.flowi6_mark = pkt->skb->mark; - fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; - break; - } - - if 
(!dst_hold_safe(this_dst)) - return -ENOENT; - - nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); - if (!other_dst) { - dst_release(this_dst); - return -ENOENT; - } - - nft_default_forward_path(route, this_dst, dir); - nft_default_forward_path(route, other_dst, !dir); - - if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && - route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { - nft_dev_forward_path(route, ct, dir, ft); - nft_dev_forward_path(route, ct, !dir, ft); - } - - return 0; -} - static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { if (skb_sec_path(skb)) -- 2.47.3 Use dev_queue_xmit() for the XMIT_NEIGH case. Store the interface index of the real device behind the vlan/pppoe device, this introduces an extra lookup for the real device in the xmit path because rt->dst.dev provides the vlan/pppoe device. XMIT_NEIGH now looks more similar to XMIT_DIRECT but the check for stale dst and the neighbour lookup still remain in place which is convenient to deal with network topology changes. Note that nft_flow_route() needs to relax the check for _XMIT_NEIGH so the existing basic xfrm offload (which only works in one direction) does not break. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 + net/netfilter/nf_flow_table_core.c | 1 + net/netfilter/nf_flow_table_ip.c | 87 ++++++++++++++++----------- net/netfilter/nf_flow_table_path.c | 7 +-- 4 files changed, 57 insertions(+), 39 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index e9f72d2558e9..efede742106c 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -140,6 +140,7 @@ struct flow_offload_tuple { u16 mtu; union { struct { + u32 ifidx; struct dst_entry *dst_cache; u32 dst_cookie; }; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 9441ac3d8c1a..98d7b3708602 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -132,6 +132,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, break; case FLOW_OFFLOAD_XMIT_XFRM: case FLOW_OFFLOAD_XMIT_NEIGH: + flow_tuple->ifidx = route->tuple[dir].out.ifindex; flow_tuple->dst_cache = dst; flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); break; diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 8cd4cf7ae211..8b74fb34998e 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -333,19 +333,18 @@ static void nf_flow_encap_pop(struct sk_buff *skb, } } +struct nf_flow_xmit { + const void *dest; + const void *source; + struct net_device *outdev; +}; + static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, - const struct flow_offload_tuple_rhash *tuplehash, - unsigned short type) + struct nf_flow_xmit *xmit) { - struct net_device *outdev; - - outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx); - if (!outdev) - return NF_DROP; - - skb->dev = outdev; - dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest, - tuplehash->tuple.out.h_source, skb->len); + skb->dev = xmit->outdev; + 
dev_hard_header(skb, skb->dev, ntohs(skb->protocol), + xmit->dest, xmit->source, skb->len); dev_queue_xmit(skb); return NF_STOLEN; @@ -424,10 +423,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, struct nf_flowtable_ctx ctx = { .in = state->in, }; + struct nf_flow_xmit xmit = {}; struct flow_offload *flow; - struct net_device *outdev; + struct neighbour *neigh; struct rtable *rt; - __be32 nexthop; int ret; tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb); @@ -454,25 +453,34 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rtable(tuplehash->tuple.dst_cache); - outdev = rt->dst.dev; - skb->dev = outdev; - nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); + if (!xmit.outdev) { + flow_offload_teardown(flow); + return NF_DROP; + } + neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr)); + if (IS_ERR(neigh)) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = neigh->ha; skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); - ret = NF_STOLEN; break; case FLOW_OFFLOAD_XMIT_DIRECT: - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP); - if (ret == NF_DROP) + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); + if (!xmit.outdev) { flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = tuplehash->tuple.out.h_dest; + xmit.source = tuplehash->tuple.out.h_source; break; default: WARN_ON_ONCE(1); - ret = NF_DROP; - break; + return NF_DROP; } - return ret; + return nf_flow_queue_xmit(state->net, skb, &xmit); } EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); @@ -719,9 +727,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, struct nf_flowtable_ctx ctx = { .in = state->in, }; - const struct in6_addr *nexthop; + struct nf_flow_xmit xmit = 
{}; struct flow_offload *flow; - struct net_device *outdev; + struct neighbour *neigh; struct rt6_info *rt; int ret; @@ -749,24 +757,33 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rt6_info(tuplehash->tuple.dst_cache); - outdev = rt->dst.dev; - skb->dev = outdev; - nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); + if (!xmit.outdev) { + flow_offload_teardown(flow); + return NF_DROP; + } + neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6)); + if (IS_ERR(neigh)) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = neigh->ha; skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); - ret = NF_STOLEN; break; case FLOW_OFFLOAD_XMIT_DIRECT: - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6); - if (ret == NF_DROP) + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); + if (!xmit.outdev) { flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = tuplehash->tuple.out.h_dest; + xmit.source = tuplehash->tuple.out.h_source; break; default: WARN_ON_ONCE(1); - ret = NF_DROP; - break; + return NF_DROP; } - return ret; + return nf_flow_queue_xmit(state->net, skb, &xmit); } EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c index 159aa5c8da60..15c042cab9fb 100644 --- a/net/netfilter/nf_flow_table_path.c +++ b/net/netfilter/nf_flow_table_path.c @@ -204,11 +204,11 @@ static void nft_dev_forward_path(struct nf_flow_route *route, } route->tuple[!dir].in.num_encaps = info.num_encaps; route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; + route->tuple[dir].out.ifindex = info.outdev->ifindex; if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { memcpy(route->tuple[dir].out.h_source, info.h_source, 
ETH_ALEN); memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); - route->tuple[dir].out.ifindex = info.outdev->ifindex; route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; route->tuple[dir].xmit_type = info.xmit_type; } @@ -256,11 +256,10 @@ int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, nft_default_forward_path(route, this_dst, dir); nft_default_forward_path(route, other_dst, !dir); - if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && - route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { + if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) nft_dev_forward_path(route, ct, dir, ft); + if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) nft_dev_forward_path(route, ct, !dir, ft); - } return 0; } -- 2.47.3 Push the vlan header from the flowtable xmit path, instead of passing the packet to the vlan device. This is based on a patch originally written by wenxu. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 25 +++++++++++++++++++++++++ net/netfilter/nf_flow_table_path.c | 7 ++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 8b74fb34998e..a4229fb65444 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -413,6 +413,25 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, return 1; } +static int nf_flow_encap_push(struct sk_buff *skb, + struct flow_offload_tuple *tuple) +{ + int i; + + for (i = 0; i < tuple->encap_num; i++) { + switch (tuple->encap[i].proto) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + if (skb_vlan_push(skb, tuple->encap[i].proto, + tuple->encap[i].id) < 0) + return -1; + break; + } + } + + return 0; +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -450,6 +469,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, dir = 
tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + if (nf_flow_encap_push(skb, &flow->tuplehash[!dir].tuple) < 0) + return NF_DROP; + switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rtable(tuplehash->tuple.dst_cache); @@ -754,6 +776,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + if (nf_flow_encap_push(skb, &flow->tuplehash[!dir].tuple) < 0) + return NF_DROP; + switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rt6_info(tuplehash->tuple.dst_cache); diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c index 15c042cab9fb..1cb04c3e6dde 100644 --- a/net/netfilter/nf_flow_table_path.c +++ b/net/netfilter/nf_flow_table_path.c @@ -119,13 +119,14 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, info->indev = NULL; break; } - if (!info->outdev) - info->outdev = path->dev; info->encap[info->num_encaps].id = path->encap.id; info->encap[info->num_encaps].proto = path->encap.proto; info->num_encaps++; - if (path->type == DEV_PATH_PPPOE) + if (path->type == DEV_PATH_PPPOE) { + if (!info->outdev) + info->outdev = path->dev; memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); + } break; case DEV_PATH_BRIDGE: if (is_zero_ether_addr(info->h_source)) -- 2.47.3 Push the pppoe header from the flowtable xmit path, instead of passing the packet to the pppoe device which delivers the packet to the userspace pppd daemon for encapsulation. This is based on a patch originally written by wenxu. 
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 42 ++++++++++++++++++++++++++++++ net/netfilter/nf_flow_table_path.c | 9 ++----- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index a4229fb65444..61ab3102c8ec 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -413,6 +413,44 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, return 1; } +static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) +{ + int data_len = skb->len + sizeof(__be16); + struct ppp_hdr { + struct pppoe_hdr hdr; + __be16 proto; + } *ph; + __be16 proto; + + if (skb_cow_head(skb, PPPOE_SES_HLEN)) + return -1; + + switch (skb->protocol) { + case htons(ETH_P_IP): + proto = htons(PPP_IP); + break; + case htons(ETH_P_IPV6): + proto = htons(PPP_IPV6); + break; + default: + return -1; + } + + __skb_push(skb, PPPOE_SES_HLEN); + skb_reset_network_header(skb); + + ph = (struct ppp_hdr *)(skb->data); + ph->hdr.ver = 1; + ph->hdr.type = 1; + ph->hdr.code = 0; + ph->hdr.sid = htons(id); + ph->hdr.length = htons(data_len); + ph->proto = proto; + skb->protocol = htons(ETH_P_PPP_SES); + + return 0; +} + static int nf_flow_encap_push(struct sk_buff *skb, struct flow_offload_tuple *tuple) { @@ -426,6 +464,10 @@ static int nf_flow_encap_push(struct sk_buff *skb, tuple->encap[i].id) < 0) return -1; break; + case htons(ETH_P_PPP_SES): + if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0) + return -1; + break; } } diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c index 1cb04c3e6dde..7ba6a0c4e5d8 100644 --- a/net/netfilter/nf_flow_table_path.c +++ b/net/netfilter/nf_flow_table_path.c @@ -122,11 +122,8 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, info->encap[info->num_encaps].id = path->encap.id; info->encap[info->num_encaps].proto = path->encap.proto; info->num_encaps++; - if (path->type == 
DEV_PATH_PPPOE) { - if (!info->outdev) - info->outdev = path->dev; + if (path->type == DEV_PATH_PPPOE) memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); - } break; case DEV_PATH_BRIDGE: if (is_zero_ether_addr(info->h_source)) @@ -154,9 +151,7 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, break; } } - if (!info->outdev) - info->outdev = info->indev; - + info->outdev = info->indev; info->hw_outdev = info->indev; if (nf_flowtable_hw_offload(flowtable) && -- 2.47.3 hw_ifidx was originally introduced to store the real netdevice as a requirement for the hardware offload support in: 73f97025a972 ("netfilter: nft_flow_offload: use direct xmit if hardware offload is enabled") Since ("netfilter: flowtable: consolidate xmit path"), ifidx and hw_ifidx points to the real device in the xmit path, remove it. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 - net/netfilter/nf_flow_table_core.c | 1 - net/netfilter/nf_flow_table_offload.c | 2 +- net/netfilter/nf_flow_table_path.c | 3 --- 4 files changed, 1 insertion(+), 6 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index efede742106c..89cfe7228398 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -146,7 +146,6 @@ struct flow_offload_tuple { }; struct { u32 ifidx; - u32 hw_ifidx; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; } out; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 98d7b3708602..6c6a5165f993 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -127,7 +127,6 @@ static int flow_offload_fill_route(struct flow_offload *flow, memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, ETH_ALEN); flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; - flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex; dst_release(dst); break; case FLOW_OFFLOAD_XMIT_XFRM: diff --git 
a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index e06bc36f49fe..d8f7bfd60ac6 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -555,7 +555,7 @@ static void flow_offload_redirect(struct net *net, switch (this_tuple->xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: this_tuple = &flow->tuplehash[dir].tuple; - ifindex = this_tuple->out.hw_ifidx; + ifindex = this_tuple->out.ifidx; break; case FLOW_OFFLOAD_XMIT_NEIGH: other_tuple = &flow->tuplehash[!dir].tuple; diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c index 7ba6a0c4e5d8..50b2b7d0c579 100644 --- a/net/netfilter/nf_flow_table_path.c +++ b/net/netfilter/nf_flow_table_path.c @@ -75,7 +75,6 @@ static int nft_dev_fill_forward_path(const struct nf_flow_route *route, struct nft_forward_info { const struct net_device *indev; const struct net_device *outdev; - const struct net_device *hw_outdev; struct id { __u16 id; __be16 proto; @@ -152,7 +151,6 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, } } info->outdev = info->indev; - info->hw_outdev = info->indev; if (nf_flowtable_hw_offload(flowtable) && nft_is_valid_ether_device(info->indev)) @@ -205,7 +203,6 @@ static void nft_dev_forward_path(struct nf_flow_route *route, if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); - route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; route->tuple[dir].xmit_type = info.xmit_type; } } -- 2.47.3 This simplifies IPIP tunnel support coming in follow up patches. No functional changes are intended. 
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 61ab3102c8ec..ac6641a866e0 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -480,6 +480,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, { struct flow_offload_tuple_rhash *tuplehash; struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple *other_tuple; enum flow_offload_tuple_dir dir; struct nf_flowtable_ctx ctx = { .in = state->in, @@ -488,6 +489,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, struct flow_offload *flow; struct neighbour *neigh; struct rtable *rt; + __be32 ip_daddr; int ret; tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb); @@ -510,8 +512,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + other_tuple = &flow->tuplehash[!dir].tuple; + ip_daddr = other_tuple->src_v4.s_addr; - if (nf_flow_encap_push(skb, &flow->tuplehash[!dir].tuple) < 0) + if (nf_flow_encap_push(skb, other_tuple) < 0) return NF_DROP; switch (tuplehash->tuple.xmit_type) { @@ -522,7 +526,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, flow_offload_teardown(flow); return NF_DROP; } - neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr)); + neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, ip_daddr)); if (IS_ERR(neigh)) { flow_offload_teardown(flow); return NF_DROP; @@ -787,11 +791,13 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, { struct flow_offload_tuple_rhash *tuplehash; struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple *other_tuple; enum flow_offload_tuple_dir dir; struct nf_flowtable_ctx ctx = { .in = state->in, }; struct nf_flow_xmit xmit = {}; + struct in6_addr *ip6_daddr; 
struct flow_offload *flow; struct neighbour *neigh; struct rt6_info *rt; @@ -817,8 +823,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + other_tuple = &flow->tuplehash[!dir].tuple; + ip6_daddr = &other_tuple->src_v6; - if (nf_flow_encap_push(skb, &flow->tuplehash[!dir].tuple) < 0) + if (nf_flow_encap_push(skb, other_tuple) < 0) return NF_DROP; switch (tuplehash->tuple.xmit_type) { @@ -829,7 +837,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, flow_offload_teardown(flow); return NF_DROP; } - neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6)); + neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, ip6_daddr)); if (IS_ERR(neigh)) { flow_offload_teardown(flow); return NF_DROP; -- 2.47.3 From: Lorenzo Bianconi Introduce sw acceleration for rx path of IPIP tunnels relying on the netfilter flowtable infrastructure. Subsequent patches will add sw acceleration for IPIP tunnels tx path. This series introduces basic infrastructure to accelerate other tunnel types (e.g. IP6IP6). 
IPIP rx sw acceleration can be tested running the following scenario where the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP tunnel is used to access a remote site (using eth1 as the underlay device): ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2) $ip addr show 6: eth0: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.0.2/24 scope global eth0 valid_lft forever preferred_lft forever 7: eth1: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.1.1/24 scope global eth1 valid_lft forever preferred_lft forever 8: tun0@NONE: mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000 link/ipip 192.168.1.1 peer 192.168.1.2 inet 192.168.100.1/24 scope global tun0 valid_lft forever preferred_lft forever $ip route show default via 192.168.100.2 dev tun0 192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2 192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1 192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1 $nft list ruleset table inet filter { flowtable ft { hook ingress priority filter devices = { eth0, eth1 } } chain forward { type filter hook forward priority filter; policy accept; meta l4proto { tcp, udp } flow add @ft } } Reproducing the scenario described above using veths I got the following results: - TCP stream received from the IPIP tunnel: - net-next: (baseline) ~ 71Gbps - net-next + IPIP flowtable support: ~101Gbps Signed-off-by: Lorenzo Bianconi Signed-off-by: Pablo Neira Ayuso --- include/linux/netdevice.h | 13 +++++ include/net/netfilter/nf_flow_table.h | 18 +++++++ net/ipv4/ipip.c | 25 ++++++++++ net/netfilter/nf_flow_table_core.c | 3 ++ net/netfilter/nf_flow_table_ip.c | 69 ++++++++++++++++++++++++--- net/netfilter/nf_flow_table_path.c | 38 ++++++++++++--- 6 files changed, 153 insertions(+), 13 deletions(-) diff --git a/include/linux/netdevice.h 
b/include/linux/netdevice.h index e808071dbb7d..bf99fe8622da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -877,6 +877,7 @@ enum net_device_path_type { DEV_PATH_PPPOE, DEV_PATH_DSA, DEV_PATH_MTK_WDMA, + DEV_PATH_TUN, }; struct net_device_path { @@ -888,6 +889,18 @@ struct net_device_path { __be16 proto; u8 h_dest[ETH_ALEN]; } encap; + struct { + union { + struct in_addr src_v4; + struct in6_addr src_v6; + }; + union { + struct in_addr dst_v4; + struct in6_addr dst_v6; + }; + + u8 l3_proto; + } tun; struct { enum { DEV_PATH_BR_VLAN_KEEP, diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 89cfe7228398..497f5179b3da 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -107,6 +107,19 @@ enum flow_offload_xmit_type { #define NF_FLOW_TABLE_ENCAP_MAX 2 +struct flow_offload_tunnel { + union { + struct in_addr src_v4; + struct in6_addr src_v6; + }; + union { + struct in_addr dst_v4; + struct in6_addr dst_v6; + }; + + u8 l3_proto; +}; + struct flow_offload_tuple { union { struct in_addr src_v4; @@ -130,12 +143,15 @@ struct flow_offload_tuple { __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; + struct flow_offload_tunnel tun; + /* All members above are keys for lookups, see flow_offload_hash(). 
*/ struct { } __hash; u8 dir:2, xmit_type:3, encap_num:2, + tun_num:2, in_vlan_ingress:2; u16 mtu; union { @@ -206,7 +222,9 @@ struct nf_flow_route { u16 id; __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; + struct flow_offload_tunnel tun; u8 num_encaps:2, + num_tuns:2, ingress_vlans:2; } in; struct { diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 3e03af073a1c..ff95b1b9908e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -353,6 +353,30 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) return ip_tunnel_ctl(dev, p, cmd); } +static int ipip_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct ip_tunnel *tunnel = netdev_priv(ctx->dev); + const struct iphdr *tiph = &tunnel->parms.iph; + struct rtable *rt; + + rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0, + RT_SCOPE_UNIVERSE); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + path->type = DEV_PATH_TUN; + path->tun.src_v4.s_addr = tiph->saddr; + path->tun.dst_v4.s_addr = tiph->daddr; + path->tun.l3_proto = IPPROTO_IPIP; + path->dev = ctx->dev; + + ctx->dev = rt->dst.dev; + ip_rt_put(rt); + + return 0; +} + static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, @@ -362,6 +386,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl, + .ndo_fill_forward_path = ipip_fill_forward_path, }; #define IPIP_FEATURES (NETIF_F_SG | \ diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 6c6a5165f993..06e8251a6644 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -118,7 +118,10 @@ static int flow_offload_fill_route(struct flow_offload *flow, flow_tuple->in_vlan_ingress |= BIT(j); j++; } + + flow_tuple->tun = route->tuple[dir].in.tun; flow_tuple->encap_num = route->tuple[dir].in.num_encaps; 
+ flow_tuple->tun_num = route->tuple[dir].in.num_tuns; switch (route->tuple[dir].xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index ac6641a866e0..2214e858e096 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -145,8 +145,11 @@ static bool ip_has_options(unsigned int thoff) static void nf_flow_tuple_encap(struct sk_buff *skb, struct flow_offload_tuple *tuple) { + __be16 inner_proto = skb->protocol; struct vlan_ethhdr *veth; struct pppoe_hdr *phdr; + struct iphdr *iph; + u16 offset = 0; int i = 0; if (skb_vlan_tag_present(skb)) { @@ -159,13 +162,26 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, veth = (struct vlan_ethhdr *)skb_mac_header(skb); tuple->encap[i].id = ntohs(veth->h_vlan_TCI); tuple->encap[i].proto = skb->protocol; + inner_proto = veth->h_vlan_encapsulated_proto; + offset += VLAN_HLEN; break; case htons(ETH_P_PPP_SES): phdr = (struct pppoe_hdr *)skb_network_header(skb); tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; + inner_proto = *((__be16 *)(phdr + 1)); + offset += PPPOE_SES_HLEN; break; } + + if (inner_proto == htons(ETH_P_IP)) { + iph = (struct iphdr *)(skb_network_header(skb) + offset); + if (iph->protocol == IPPROTO_IPIP) { + tuple->tun.dst_v4.s_addr = iph->daddr; + tuple->tun.src_v4.s_addr = iph->saddr; + tuple->tun.l3_proto = IPPROTO_IPIP; + } + } } struct nf_flowtable_ctx { @@ -277,11 +293,46 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } +static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize) +{ + struct iphdr *iph; + u16 size; + + if (!pskb_may_pull(skb, sizeof(*iph) + *psize)) + return false; + + iph = (struct iphdr *)(skb_network_header(skb) + *psize); + size = iph->ihl << 2; + + if (ip_is_fragment(iph) || unlikely(ip_has_options(size))) + return false; + + if (iph->ttl <= 1) + return false; + + if (iph->protocol == IPPROTO_IPIP) + 
*psize += size; + + return true; +} + +static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb) +{ + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); + + if (iph->protocol != IPPROTO_IPIP) + return; + + skb_pull(skb, iph->ihl << 2); + skb_reset_network_header(skb); +} + static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { + __be16 inner_proto = skb->protocol; struct vlan_ethhdr *veth; - __be16 inner_proto; + bool ret = false; switch (skb->protocol) { case htons(ETH_P_8021Q): @@ -291,19 +342,23 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, veth = (struct vlan_ethhdr *)skb_mac_header(skb); if (veth->h_vlan_encapsulated_proto == proto) { *offset += VLAN_HLEN; - return true; + inner_proto = proto; + ret = true; } break; case htons(ETH_P_PPP_SES): if (nf_flow_pppoe_proto(skb, &inner_proto) && inner_proto == proto) { *offset += PPPOE_SES_HLEN; - return true; + ret = true; } break; } - return false; + if (inner_proto == htons(ETH_P_IP)) + ret = nf_flow_ip4_tunnel_proto(skb, offset); + + return ret; } static void nf_flow_encap_pop(struct sk_buff *skb, @@ -331,6 +386,9 @@ static void nf_flow_encap_pop(struct sk_buff *skb, break; } } + + if (skb->protocol == htons(ETH_P_IP)) + nf_flow_ip4_tunnel_pop(skb); } struct nf_flow_xmit { @@ -356,8 +414,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, { struct flow_offload_tuple tuple = {}; - if (skb->protocol != htons(ETH_P_IP) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) + if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) return NULL; if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0) diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c index 50b2b7d0c579..64e2672a3c33 100644 --- a/net/netfilter/nf_flow_table_path.c +++ b/net/netfilter/nf_flow_table_path.c @@ -80,6 +80,8 @@ struct nft_forward_info { __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; u8 num_encaps; + struct 
flow_offload_tunnel tun; + u8 num_tuns; u8 ingress_vlans; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; @@ -102,6 +104,7 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, case DEV_PATH_DSA: case DEV_PATH_VLAN: case DEV_PATH_PPPOE: + case DEV_PATH_TUN: info->indev = path->dev; if (is_zero_ether_addr(info->h_source)) memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); @@ -113,14 +116,27 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, break; } - /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ - if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { - info->indev = NULL; - break; + /* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */ + if (path->type == DEV_PATH_TUN) { + if (info->num_tuns) { + info->indev = NULL; + break; + } + info->tun.src_v6 = path->tun.src_v6; + info->tun.dst_v6 = path->tun.dst_v6; + info->tun.l3_proto = path->tun.l3_proto; + info->num_tuns++; + } else { + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + info->encap[info->num_encaps].id = + path->encap.id; + info->encap[info->num_encaps].proto = + path->encap.proto; + info->num_encaps++; } - info->encap[info->num_encaps].id = path->encap.id; - info->encap[info->num_encaps].proto = path->encap.proto; - info->num_encaps++; if (path->type == DEV_PATH_PPPOE) memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); break; @@ -196,6 +212,14 @@ static void nft_dev_forward_path(struct nf_flow_route *route, route->tuple[!dir].in.encap[i].id = info.encap[i].id; route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; } + + if (info.num_tuns) { + route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6; + route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6; + route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto; + route->tuple[!dir].in.num_tuns = info.num_tuns; + } + route->tuple[!dir].in.num_encaps = info.num_encaps; route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; route->tuple[dir].out.ifindex = info.outdev->ifindex; -- 
2.47.3 From: Lorenzo Bianconi Introduce sw acceleration for tx path of IPIP tunnels relying on the netfilter flowtable infrastructure. This patch introduces basic infrastructure to accelerate other tunnel types (e.g. IP6IP6). IPIP sw tx acceleration can be tested running the following scenario where the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP tunnel is used to access a remote site (using eth1 as the underlay device): ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2) $ip addr show 6: eth0: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.0.2/24 scope global eth0 valid_lft forever preferred_lft forever 7: eth1: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.1.1/24 scope global eth1 valid_lft forever preferred_lft forever 8: tun0@NONE: mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000 link/ipip 192.168.1.1 peer 192.168.1.2 inet 192.168.100.1/24 scope global tun0 valid_lft forever preferred_lft forever $ip route show default via 192.168.100.2 dev tun0 192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2 192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1 192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1 $nft list ruleset table inet filter { flowtable ft { hook ingress priority filter devices = { eth0, eth1 } } chain forward { type filter hook forward priority filter; policy accept; meta l4proto { tcp, udp } flow add @ft } } Reproducing the scenario described above using veths I got the following results: - TCP stream transmitted into the IPIP tunnel: - net-next: (baseline) ~ 85Gbps - net-next + IPIP flowtable support: ~102Gbps Co-developed-by: Pablo Neira Ayuso Signed-off-by: Lorenzo Bianconi Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 62 ++++++++++++++++++++++++++++++ net/netfilter/nf_flow_table_path.c | 48 
+++++++++++++++++++++-- 2 files changed, 106 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 2214e858e096..e128b0fe9a7b 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -437,6 +437,9 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; + if (flow->tuplehash[!dir].tuple.tun_num) + mtu -= sizeof(*iph); + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; @@ -508,6 +511,62 @@ static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) return 0; } +static int nf_flow_tunnel_ipip_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + __be32 *ip_daddr) +{ + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); + struct rtable *rt = dst_rtable(tuple->dst_cache); + u8 tos = iph->tos, ttl = iph->ttl; + __be16 frag_off = iph->frag_off; + u32 headroom = sizeof(*iph); + int err; + + err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4); + if (err) + return err; + + skb_set_inner_ipproto(skb, IPPROTO_IPIP); + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; + err = skb_cow_head(skb, headroom); + if (err) + return err; + + skb_scrub_packet(skb, true); + skb_clear_hash_if_not_l4(skb); + + /* Push down and install the IP header. */ + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(*iph) >> 2; + iph->frag_off = ip_mtu_locked(&rt->dst) ? 
0 : frag_off; + iph->protocol = tuple->tun.l3_proto; + iph->tos = tos; + iph->daddr = tuple->tun.src_v4.s_addr; + iph->saddr = tuple->tun.dst_v4.s_addr; + iph->ttl = ttl; + iph->tot_len = htons(skb->len); + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); + ip_send_check(iph); + + *ip_daddr = tuple->tun.src_v4.s_addr; + + return 0; +} + +static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + __be32 *ip_daddr) +{ + if (tuple->tun_num) + return nf_flow_tunnel_ipip_push(net, skb, tuple, ip_daddr); + + return 0; +} + static int nf_flow_encap_push(struct sk_buff *skb, struct flow_offload_tuple *tuple) { @@ -572,6 +631,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, other_tuple = &flow->tuplehash[!dir].tuple; ip_daddr = other_tuple->src_v4.s_addr; + if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0) + return NF_DROP; + if (nf_flow_encap_push(skb, other_tuple) < 0) return NF_DROP; diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c index 64e2672a3c33..7d6668e4d424 100644 --- a/net/netfilter/nf_flow_table_path.c +++ b/net/netfilter/nf_flow_table_path.c @@ -190,7 +190,46 @@ static bool nft_flowtable_find_dev(const struct net_device *dev, return found; } -static void nft_dev_forward_path(struct nf_flow_route *route, +static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt, + struct flow_offload_tunnel *tun, + struct nf_flow_route *route, + enum ip_conntrack_dir dir) +{ + struct dst_entry *cur_dst = route->tuple[dir].dst; + struct dst_entry *tun_dst = NULL; + struct flowi fl = {}; + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = tun->dst_v4.s_addr; + fl.u.ip4.saddr = tun->src_v4.s_addr; + fl.u.ip4.flowi4_iif = nft_in(pkt)->ifindex; + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); + fl.u.ip4.flowi4_mark = pkt->skb->mark; + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; + break; + case NFPROTO_IPV6: + 
fl.u.ip6.daddr = tun->dst_v6; + fl.u.ip6.saddr = tun->src_v6; + fl.u.ip6.flowi6_iif = nft_in(pkt)->ifindex; + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); + fl.u.ip6.flowi6_mark = pkt->skb->mark; + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; + break; + } + + nf_route(nft_net(pkt), &tun_dst, &fl, false, nft_pf(pkt)); + if (!tun_dst) + return -ENOENT; + + route->tuple[dir].dst = tun_dst; + dst_release(cur_dst); + + return 0; +} + +static void nft_dev_forward_path(const struct nft_pktinfo *pkt, + struct nf_flow_route *route, const struct nf_conn *ct, enum ip_conntrack_dir dir, struct nft_flowtable *ft) @@ -213,7 +252,8 @@ static void nft_dev_forward_path(struct nf_flow_route *route, route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; } - if (info.num_tuns) { + if (info.num_tuns && + !nft_flow_tunnel_update_route(pkt, &info.tun, route, dir)) { route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6; route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6; route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto; @@ -274,9 +314,9 @@ int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, nft_default_forward_path(route, other_dst, !dir); if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) - nft_dev_forward_path(route, ct, dir, ft); + nft_dev_forward_path(pkt, route, ct, dir, ft); if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) - nft_dev_forward_path(route, ct, !dir, ft); + nft_dev_forward_path(pkt, route, ct, !dir, ft); return 0; } -- 2.47.3 From: Lorenzo Bianconi Introduce specific selftest for IPIP flowtable SW acceleration in nft_flowtable.sh Signed-off-by: Lorenzo Bianconi Signed-off-by: Pablo Neira Ayuso --- .../selftests/net/netfilter/nft_flowtable.sh | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index 45832df98295..1fbfc8ad8dcd 100755 --- 
a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -558,6 +558,73 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then ip netns exec "$nsr1" nft list ruleset fi +# IPIP tunnel test: +# Add IPIP tunnel interfaces and check flowtable acceleration. +test_ipip() { +if ! ip -net "$nsr1" link add name tun0 type ipip \ + local 192.168.10.1 remote 192.168.10.2 >/dev/null;then + echo "SKIP: could not add ipip tunnel" + [ "$ret" -eq 0 ] && ret=$ksft_skip + return +fi +ip -net "$nsr1" link set tun0 up +ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0 +ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null + +ip -net "$nsr2" link add name tun0 type ipip local 192.168.10.2 remote 192.168.10.1 +ip -net "$nsr2" link set tun0 up +ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0 +ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null + +ip -net "$nsr1" route change default via 192.168.100.2 +ip -net "$nsr2" route change default via 192.168.100.1 +ip -net "$ns2" route add default via 10.0.2.1 + +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0 accept' +ip netns exec "$nsr1" nft -a insert rule inet filter forward \ + 'meta oif "veth0" tcp sport 12345 ct mark set 1 flow add @f1 counter name routed_repl accept' + +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel"; then + echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + +# Create vlan tagged devices for IPIP traffic. 
+ip -net "$nsr1" link add link veth1 name veth1.10 type vlan id 10 +ip -net "$nsr1" link set veth1.10 up +ip -net "$nsr1" addr add 192.168.20.1/24 dev veth1.10 +ip netns exec "$nsr1" sysctl net.ipv4.conf.veth1/10.forwarding=1 > /dev/null +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif veth1.10 accept' +ip -net "$nsr1" link add name tun1 type ipip local 192.168.20.1 remote 192.168.20.2 +ip -net "$nsr1" link set tun1 up +ip -net "$nsr1" addr add 192.168.200.1/24 dev tun1 +ip -net "$nsr1" route change default via 192.168.200.2 +ip netns exec "$nsr1" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun1 accept' + +ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10 +ip -net "$nsr2" link set veth0.10 up +ip -net "$nsr2" addr add 192.168.20.2/24 dev veth0.10 +ip netns exec "$nsr2" sysctl net.ipv4.conf.veth0/10.forwarding=1 > /dev/null +ip -net "$nsr2" link add name tun1 type ipip local 192.168.20.2 remote 192.168.20.1 +ip -net "$nsr2" link set tun1 up +ip -net "$nsr2" addr add 192.168.200.2/24 dev tun1 +ip -net "$nsr2" route change default via 192.168.200.1 +ip netns exec "$nsr2" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null + +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then + echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + +# Restore the previous configuration +ip -net "$nsr1" route change default via 192.168.10.2 +ip -net "$nsr2" route change default via 192.168.10.1 +ip -net "$ns2" route del default via 10.0.2.1 +} + # Another test: # Add bridge interface br0 to Router1, with NAT enabled. 
test_bridge() { @@ -643,6 +710,8 @@ ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad ip -net "$nsr1" link set up dev veth0 } +test_ipip + test_bridge KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) -- 2.47.3 From: Fernando Fernandez Mancera When using nf_conncount infrastructure for non-confirmed connections a duplicated track is possible due to an optimization introduced since commit d265929930e2 ("netfilter: nf_conncount: reduce unnecessary GC"). In order to fix this introduce a new conncount API that receives directly an sk_buff struct. It fetches the tuple and zone and the corresponding ct from it. It comes with both existing conncount variants nf_conncount_count_skb() and nf_conncount_add_skb(). In addition remove the old API and adjust all the users to use the new one. This way, for each sk_buff struct it is possible to check if there is a ct present and already confirmed. If so, skip the add operation. Fixes: d265929930e2 ("netfilter: nf_conncount: reduce unnecessary GC") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 17 ++- net/netfilter/nf_conncount.c | 159 ++++++++++++++------- net/netfilter/nft_connlimit.c | 21 +-- net/netfilter/xt_connlimit.c | 14 +- net/openvswitch/conntrack.c | 16 +-- 5 files changed, 133 insertions(+), 94 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 1b58b5b91ff6..52a06de41aa0 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -18,15 +18,14 @@ struct nf_conncount_list { struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen); void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data); -unsigned int nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); - -int 
nf_conncount_add(struct net *net, struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); +unsigned int nf_conncount_count_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_data *data, + const u32 *key); + +int nf_conncount_add_skb(struct net *net, const struct sk_buff *skb, + u16 l3num, struct nf_conncount_list *list); void nf_conncount_list_init(struct nf_conncount_list *list); diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 913ede2f57f9..dbaa3051577c 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -122,15 +122,65 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, return ERR_PTR(-EAGAIN); } +static bool get_ct_or_tuple_from_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conn **ct, + struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone **zone, + bool *refcounted) +{ + const struct nf_conntrack_tuple_hash *h; + enum ip_conntrack_info ctinfo; + struct nf_conn *found_ct; + + found_ct = nf_ct_get(skb, &ctinfo); + if (found_ct && !nf_ct_is_template(found_ct)) { + *tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + *zone = nf_ct_zone(found_ct); + *ct = found_ct; + return true; + } + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple)) + return false; + + if (found_ct) + *zone = nf_ct_zone(found_ct); + + h = nf_conntrack_find_get(net, *zone, tuple); + if (!h) + return true; + + found_ct = nf_ct_tuplehash_to_ctrack(h); + *refcounted = true; + *ct = found_ct; + + return true; +} + static int __nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_list *list) { + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; const struct nf_conntrack_tuple_hash *found; struct 
nf_conncount_tuple *conn, *conn_n; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct = NULL; struct nf_conn *found_ct; unsigned int collect = 0; + bool refcounted = false; + + if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) + return -ENOENT; + + if (ct && nf_ct_is_confirmed(ct)) { + if (refcounted) + nf_ct_put(ct); + return 0; + } if ((u32)jiffies == list->last_gc) goto add_new_node; @@ -144,10 +194,10 @@ static int __nf_conncount_add(struct net *net, if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { - if (nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) - return 0; /* already exists */ + goto out_put; /* already exists */ } else { collect++; } @@ -156,7 +206,7 @@ static int __nf_conncount_add(struct net *net, found_ct = nf_ct_tuplehash_to_ctrack(found); - if (nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * We should not see tuples twice unless someone hooks @@ -165,7 +215,7 @@ static int __nf_conncount_add(struct net *net, * Attempt to avoid a re-add in this case. 
*/ nf_ct_put(found_ct); - return 0; + goto out_put; } else if (already_closed(found_ct)) { /* * we do not care about connections which are @@ -188,31 +238,35 @@ static int __nf_conncount_add(struct net *net, if (conn == NULL) return -ENOMEM; - conn->tuple = *tuple; + conn->tuple = tuple; conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; list_add_tail(&conn->node, &list->head); list->count++; list->last_gc = (u32)jiffies; + +out_put: + if (refcounted) + nf_ct_put(ct); return 0; } -int nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +int nf_conncount_add_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_list *list) { int ret; /* check the saved connections */ spin_lock_bh(&list->list_lock); - ret = __nf_conncount_add(net, list, tuple, zone); + ret = __nf_conncount_add(net, skb, l3num, list); spin_unlock_bh(&list->list_lock); return ret; } -EXPORT_SYMBOL_GPL(nf_conncount_add); +EXPORT_SYMBOL_GPL(nf_conncount_add_skb); void nf_conncount_list_init(struct nf_conncount_list *list) { @@ -309,19 +363,22 @@ static void schedule_gc_worker(struct nf_conncount_data *data, int tree) static unsigned int insert_tree(struct net *net, + const struct sk_buff *skb, + u16 l3num, struct nf_conncount_data *data, struct rb_root *root, unsigned int hash, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const u32 *key) { struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + bool do_gc = true, refcounted = false; + unsigned int count = 0, gc_count = 0; struct rb_node **rbnode, *parent; - struct nf_conncount_rb *rbconn; + struct nf_conntrack_tuple tuple; struct nf_conncount_tuple *conn; - unsigned int count = 0, gc_count = 0; - bool do_gc = true; + struct nf_conncount_rb *rbconn; + struct nf_conn *ct = 
NULL; spin_lock_bh(&nf_conncount_locks[hash]); restart: @@ -340,7 +397,7 @@ insert_tree(struct net *net, } else { int ret; - ret = nf_conncount_add(net, &rbconn->list, tuple, zone); + ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list); if (ret) count = 0; /* hotdrop */ else @@ -375,19 +432,24 @@ insert_tree(struct net *net, goto out_unlock; } - conn->tuple = *tuple; - conn->zone = *zone; - conn->cpu = raw_smp_processor_id(); - conn->jiffies32 = (u32)jiffies; - memcpy(rbconn->key, key, sizeof(u32) * data->keylen); + if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) { + conn->tuple = tuple; + conn->zone = *zone; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; + memcpy(rbconn->key, key, sizeof(u32) * data->keylen); - nf_conncount_list_init(&rbconn->list); - list_add(&conn->node, &rbconn->list.head); - count = 1; - rbconn->list.count = count; + nf_conncount_list_init(&rbconn->list); + list_add(&conn->node, &rbconn->list.head); + count = 1; + rbconn->list.count = count; - rb_link_node_rcu(&rbconn->node, parent, rbnode); - rb_insert_color(&rbconn->node, root); + rb_link_node_rcu(&rbconn->node, parent, rbnode); + rb_insert_color(&rbconn->node, root); + + if (refcounted) + nf_ct_put(ct); + } out_unlock: spin_unlock_bh(&nf_conncount_locks[hash]); return count; @@ -395,10 +457,10 @@ insert_tree(struct net *net, static unsigned int count_tree(struct net *net, + const struct sk_buff *skb, + u16 l3num, struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const u32 *key) { struct rb_root *root; struct rb_node *parent; @@ -422,7 +484,7 @@ count_tree(struct net *net, } else { int ret; - if (!tuple) { + if (!skb) { nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; } @@ -437,7 +499,7 @@ count_tree(struct net *net, } /* same source network -> be counted! 
*/ - ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); + ret = __nf_conncount_add(net, skb, l3num, &rbconn->list); spin_unlock_bh(&rbconn->list.list_lock); if (ret) return 0; /* hotdrop */ @@ -446,10 +508,10 @@ count_tree(struct net *net, } } - if (!tuple) + if (!skb) return 0; - return insert_tree(net, data, root, hash, key, tuple, zone); + return insert_tree(net, skb, l3num, data, root, hash, key); } static void tree_gc_worker(struct work_struct *work) @@ -511,18 +573,19 @@ static void tree_gc_worker(struct work_struct *work) } /* Count and return number of conntrack entries in 'net' with particular 'key'. - * If 'tuple' is not null, insert it into the accounting data structure. - * Call with RCU read lock. + * If 'skb' is not null, insert the corresponding tuple into the accounting + * data structure. Call with RCU read lock. */ -unsigned int nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +unsigned int nf_conncount_count_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_data *data, + const u32 *key) { - return count_tree(net, data, key, tuple, zone); + return count_tree(net, skb, l3num, data, key); + } -EXPORT_SYMBOL_GPL(nf_conncount_count); +EXPORT_SYMBOL_GPL(nf_conncount_count_skb); struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen) { diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index fc35a11cdca2..5df7134131d2 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -24,26 +24,11 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, const struct nft_pktinfo *pkt, const struct nft_set_ext *ext) { - const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; - const struct nf_conntrack_tuple *tuple_ptr; - struct nf_conntrack_tuple tuple; - enum ip_conntrack_info ctinfo; - const struct nf_conn *ct; 
unsigned int count; + int err; - tuple_ptr = &tuple; - - ct = nf_ct_get(pkt->skb, &ctinfo); - if (ct != NULL) { - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - zone = nf_ct_zone(ct); - } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb), - nft_pf(pkt), nft_net(pkt), &tuple)) { - regs->verdict.code = NF_DROP; - return; - } - - if (nf_conncount_add(nft_net(pkt), priv->list, tuple_ptr, zone)) { + err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list); + if (err) { regs->verdict.code = NF_DROP; return; } diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 0189f8b6b0bd..848287ab79cf 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -31,8 +31,6 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = xt_net(par); const struct xt_connlimit_info *info = par->matchinfo; - struct nf_conntrack_tuple tuple; - const struct nf_conntrack_tuple *tuple_ptr = &tuple; const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; @@ -40,13 +38,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) u32 key[5]; ct = nf_ct_get(skb, &ctinfo); - if (ct != NULL) { - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + if (ct) zone = nf_ct_zone(ct); - } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - xt_family(par), net, &tuple)) { - goto hotdrop; - } if (xt_family(par) == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); @@ -69,10 +62,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) key[1] = zone->id; } - connections = nf_conncount_count(net, info->data, key, tuple_ptr, - zone); + connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key); if (connections == 0) - /* kmalloc failed, drop it entirely */ + /* kmalloc failed or tuple couldn't be found, drop it entirely */ goto hotdrop; return (connections > 
info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e573e9221302..a0811e1fba65 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -928,8 +928,8 @@ static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone) } static int ovs_ct_check_limit(struct net *net, - const struct ovs_conntrack_info *info, - const struct nf_conntrack_tuple *tuple) + const struct sk_buff *skb, + const struct ovs_conntrack_info *info) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; @@ -942,8 +942,9 @@ static int ovs_ct_check_limit(struct net *net, if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED) return 0; - connections = nf_conncount_count(net, ct_limit_info->data, - &conncount_key, tuple, &info->zone); + connections = nf_conncount_count_skb(net, skb, info->family, + ct_limit_info->data, + &conncount_key); if (connections > per_zone_limit) return -ENOMEM; @@ -972,8 +973,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) if (static_branch_unlikely(&ovs_ct_limit_enabled)) { if (!nf_ct_is_confirmed(ct)) { - err = ovs_ct_check_limit(net, info, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + err = ovs_ct_check_limit(net, skb, info); if (err) { net_warn_ratelimited("openvswitch: zone: %u " "exceeds conntrack limit\n", @@ -1770,8 +1770,8 @@ static int __ovs_ct_limit_get_zone_limit(struct net *net, zone_limit.limit = limit; nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0); - zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL, - &ct_zone); + zone_limit.count = nf_conncount_count_skb(net, NULL, 0, data, + &conncount_key); return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); } -- 2.47.3 From: Fernando Fernandez Mancera For convenience when performing GC over the connection list, make nf_conncount_gc_list()
disable BH. This unifies the behavior with nf_conncount_add() and nf_conncount_count(). Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 24 +++++++++++++++++------- net/netfilter/nft_connlimit.c | 7 +------ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index dbaa3051577c..eabce7e141f8 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -278,8 +278,8 @@ void nf_conncount_list_init(struct nf_conncount_list *list) EXPORT_SYMBOL_GPL(nf_conncount_list_init); /* Return true if the list is empty. Must be called with BH disabled. */ -bool nf_conncount_gc_list(struct net *net, - struct nf_conncount_list *list) +static bool __nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; @@ -291,10 +291,6 @@ bool nf_conncount_gc_list(struct net *net, if ((u32)jiffies == READ_ONCE(list->last_gc)) return false; - /* don't bother if other cpu is already doing GC */ - if (!spin_trylock(&list->list_lock)) - return false; - list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = find_or_evict(net, list, conn); if (IS_ERR(found)) { @@ -323,7 +319,21 @@ bool nf_conncount_gc_list(struct net *net, if (!list->count) ret = true; list->last_gc = (u32)jiffies; - spin_unlock(&list->list_lock); + + return ret; +} + +bool nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) +{ + bool ret; + + /* don't bother if other cpu is already doing GC */ + if (!spin_trylock_bh(&list->list_lock)) + return false; + + ret = __nf_conncount_gc_list(net, list); + spin_unlock_bh(&list->list_lock); return ret; } diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 5df7134131d2..41770bde39d3 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -223,13 +223,8 @@ 
static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); - bool ret; - local_bh_disable(); - ret = nf_conncount_gc_list(net, priv->list); - local_bh_enable(); - - return ret; + return nf_conncount_gc_list(net, priv->list); } static struct nft_expr_type nft_connlimit_type; -- 2.47.3 From: Fernando Fernandez Mancera Connlimit expression can be used for all kinds of packets and not only for packets with connection state new. See this ruleset as an example: table ip filter { chain input { type filter hook input priority filter; policy accept; tcp dport 22 ct count over 4 counter } } Currently, if the connection count goes over the limit the counter will count the packets. When a connection is closed, the connection count won't decrement as it should because it is only updated for new connections due to an optimization in __nf_conncount_add() that prevents updating the list if the connection is duplicated. To solve this problem, check whether the connection was skipped and if so, update the list. Adjust count_tree() too so the same fix is applied for xt_connlimit.
Fixes: 976afca1ceba ("netfilter: nf_conncount: Early exit in nf_conncount_lookup() and cleanup") Closes: https://lore.kernel.org/netfilter/trinity-85c72a88-d762-46c3-be97-36f10e5d9796-1761173693813@3c-app-mailcom-bs12/ Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 12 ++++++++---- net/netfilter/nft_connlimit.c | 13 +++++++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index eabce7e141f8..81915ef99a83 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -179,7 +179,7 @@ static int __nf_conncount_add(struct net *net, if (ct && nf_ct_is_confirmed(ct)) { if (refcounted) nf_ct_put(ct); - return 0; + return -EEXIST; } if ((u32)jiffies == list->last_gc) @@ -408,7 +408,7 @@ insert_tree(struct net *net, int ret; ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list); - if (ret) + if (ret && ret != -EEXIST) count = 0; /* hotdrop */ else count = rbconn->list.count; @@ -511,10 +511,14 @@ count_tree(struct net *net, /* same source network -> be counted! */ ret = __nf_conncount_add(net, skb, l3num, &rbconn->list); spin_unlock_bh(&rbconn->list.list_lock); - if (ret) + if (ret && ret != -EEXIST) { return 0; /* hotdrop */ - else + } else { + /* -EEXIST means add was skipped, update the list */ + if (ret == -EEXIST) + nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; + } } } diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 41770bde39d3..714a59485935 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -29,8 +29,17 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list); if (err) { - regs->verdict.code = NF_DROP; - return; + if (err == -EEXIST) { + /* Call gc to update the list count if any connection has + * been closed already. 
This is useful for softlimit + * connections like limiting bandwidth based on a number + * of open connections. + */ + nf_conncount_gc_list(nft_net(pkt), priv->list); + } else { + regs->verdict.code = NF_DROP; + return; + } } count = READ_ONCE(priv->list->count); -- 2.47.3 From: Fernando Fernandez Mancera This is useful to update the limit or flags without clearing the connections tracked. Use READ_ONCE() on packetpath as it can be modified on controlplane. Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_connlimit.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 714a59485935..4a7aef1674bc 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -44,7 +44,7 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, count = READ_ONCE(priv->list->count); - if ((count > priv->limit) ^ priv->invert) { + if ((count > READ_ONCE(priv->limit)) ^ READ_ONCE(priv->invert)) { regs->verdict.code = NFT_BREAK; return; } @@ -131,6 +131,16 @@ static int nft_connlimit_obj_init(const struct nft_ctx *ctx, return nft_connlimit_do_init(ctx, tb, priv); } +static void nft_connlimit_obj_update(struct nft_object *obj, + struct nft_object *newobj) +{ + struct nft_connlimit *newpriv = nft_obj_data(newobj); + struct nft_connlimit *priv = nft_obj_data(obj); + + priv->limit = newpriv->limit; + priv->invert = newpriv->invert; +} + static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { @@ -160,6 +170,7 @@ static const struct nft_object_ops nft_connlimit_obj_ops = { .init = nft_connlimit_obj_init, .destroy = nft_connlimit_obj_destroy, .dump = nft_connlimit_obj_dump, + .update = nft_connlimit_obj_update, }; static struct nft_object_type nft_connlimit_obj_type __read_mostly = { -- 2.47.3 From: Lorenzo Bianconi Introduce the capability to send TCP traffic over IPv6 to nft_flowtable 
netfilter selftest. Signed-off-by: Lorenzo Bianconi Signed-off-by: Pablo Neira Ayuso --- .../selftests/net/netfilter/nft_flowtable.sh | 47 +++++++++++++------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index 1fbfc8ad8dcd..24b4e60b9145 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -127,6 +127,8 @@ ip -net "$nsr1" addr add fee1:2::1/64 dev veth1 nodad ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0 ip -net "$nsr2" addr add fee1:2::2/64 dev veth0 nodad +ip netns exec "$nsr1" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$nsr2" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null for i in 0 1; do ip netns exec "$nsr1" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null ip netns exec "$nsr2" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null @@ -153,7 +155,9 @@ ip -net "$ns1" route add default via dead:1::1 ip -net "$ns2" route add default via dead:2::1 ip -net "$nsr1" route add default via 192.168.10.2 +ip -6 -net "$nsr1" route add default via fee1:2::2 ip -net "$nsr2" route add default via 192.168.10.1 +ip -6 -net "$nsr2" route add default via fee1:2::1 ip netns exec "$nsr1" nft -f - < "$ns2out" & + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -${proto} TCP${proto}-LISTEN:12345,reuseaddr STDIO < "$infile" > "$ns2out" & lpid=$! busywait 1000 listener_ready - timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$infile" > "$ns1out" + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -${proto} TCP${proto}:"$dstip":"$dstport" STDIO < "$infile" > "$ns1out" socatc=$? 
wait $lpid @@ -394,8 +399,11 @@ test_tcp_forwarding_ip() test_tcp_forwarding() { local pmtu="$3" + local proto="$4" + local dstip="$5" + local dstport="$6" - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" return $? } @@ -403,6 +411,9 @@ test_tcp_forwarding() test_tcp_forwarding_set_dscp() { local pmtu="$3" + local proto="$4" + local dstip="$5" + local dstport="$6" ip netns exec "$nsr1" nft -f - <&2 @@ -495,6 +506,14 @@ else ret=1 fi +if test_tcp_forwarding "$ns1" "$ns2" 0 6 "[dead:2::99]" 12345; then + echo "PASS: IPv6 flow offloaded for ns1/ns2" +else + echo "FAIL: IPv6 flow offload for ns1/ns2:" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + # delete default route, i.e. ns2 won't be able to reach ns1 and # will depend on ns1 being masqueraded in nsr1. # expect ns1 has nsr1 address. @@ -520,7 +539,7 @@ table ip nat { EOF check_dscp "dscp_none" "0" -if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 ""; then +if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 4 10.0.2.99 12345; then echo "FAIL: flow offload for ns1/ns2 with dscp update and no pmtu discovery" 1>&2 exit 0 fi @@ -546,7 +565,7 @@ ip netns exec "$ns2" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null ip netns exec "$ns2" nft reset counters table inet filter >/dev/null -if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 1 ""; then +if ! 
test_tcp_forwarding_set_dscp "$ns1" "$ns2" 1 4 10.0.2.99 12345; then echo "FAIL: flow offload for ns1/ns2 with dscp update and pmtu discovery" 1>&2 exit 0 fi @@ -752,7 +771,7 @@ ip -net "$ns2" route del 192.168.10.1 via 10.0.2.1 ip -net "$ns2" route add default via 10.0.2.1 ip -net "$ns2" route add default via dead:2::1 -if test_tcp_forwarding "$ns1" "$ns2" 1; then +if test_tcp_forwarding "$ns1" "$ns2" 1 4 10.0.2.99 12345; then check_counters "ipsec tunnel mode for ns1/ns2" else echo "FAIL: ipsec tunnel mode for ns1/ns2" -- 2.47.3 From: Randy Dunlap Fix the kernel-doc format for struct members to be "@member" instead of "@ member" to avoid kernel-doc warnings. Warning: ip6t_srh.h:60 struct member 'next_hdr' not described in 'ip6t_srh' Warning: ip6t_srh.h:60 struct member 'hdr_len' not described in 'ip6t_srh' Warning: ip6t_srh.h:60 struct member 'segs_left' not described in 'ip6t_srh' Warning: ip6t_srh.h:60 struct member 'last_entry' not described in 'ip6t_srh' Warning: ip6t_srh.h:60 struct member 'tag' not described in 'ip6t_srh' Warning: ip6t_srh.h:60 struct member 'mt_flags' not described in 'ip6t_srh' Warning: ip6t_srh.h:60 struct member 'mt_invflags' not described in 'ip6t_srh' Warning: ip6t_srh.h:93 struct member 'next_hdr' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'hdr_len' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'segs_left' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'last_entry' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'tag' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'psid_addr' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'nsid_addr' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'lsid_addr' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'psid_msk' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'nsid_msk' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 
struct member 'lsid_msk' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'mt_flags' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'mt_invflags' not described in 'ip6t_srh1' Signed-off-by: Randy Dunlap Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_ipv6/ip6t_srh.h | 40 ++++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h index 54ed83360dac..80c66c8ece82 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h +++ b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h @@ -41,13 +41,13 @@ /** * struct ip6t_srh - SRH match options - * @ next_hdr: Next header field of SRH - * @ hdr_len: Extension header length field of SRH - * @ segs_left: Segments left field of SRH - * @ last_entry: Last entry field of SRH - * @ tag: Tag field of SRH - * @ mt_flags: match options - * @ mt_invflags: Invert the sense of match options + * @next_hdr: Next header field of SRH + * @hdr_len: Extension header length field of SRH + * @segs_left: Segments left field of SRH + * @last_entry: Last entry field of SRH + * @tag: Tag field of SRH + * @mt_flags: match options + * @mt_invflags: Invert the sense of match options */ struct ip6t_srh { @@ -62,19 +62,19 @@ struct ip6t_srh { /** * struct ip6t_srh1 - SRH match options (revision 1) - * @ next_hdr: Next header field of SRH - * @ hdr_len: Extension header length field of SRH - * @ segs_left: Segments left field of SRH - * @ last_entry: Last entry field of SRH - * @ tag: Tag field of SRH - * @ psid_addr: Address of previous SID in SRH SID list - * @ nsid_addr: Address of NEXT SID in SRH SID list - * @ lsid_addr: Address of LAST SID in SRH SID list - * @ psid_msk: Mask of previous SID in SRH SID list - * @ nsid_msk: Mask of next SID in SRH SID list - * @ lsid_msk: MAsk of last SID in SRH SID list - * @ mt_flags: match options - * @ mt_invflags: 
Invert the sense of match options + * @next_hdr: Next header field of SRH + * @hdr_len: Extension header length field of SRH + * @segs_left: Segments left field of SRH + * @last_entry: Last entry field of SRH + * @tag: Tag field of SRH + * @psid_addr: Address of previous SID in SRH SID list + * @nsid_addr: Address of NEXT SID in SRH SID list + * @lsid_addr: Address of LAST SID in SRH SID list + * @psid_msk: Mask of previous SID in SRH SID list + * @nsid_msk: Mask of next SID in SRH SID list + * @lsid_msk: MAsk of last SID in SRH SID list + * @mt_flags: match options + * @mt_invflags: Invert the sense of match options */ struct ip6t_srh1 { -- 2.47.3 From: Randy Dunlap In include/uapi/linux/netfilter/nf_tables.h, correct the kernel-doc comments for mistyped enum names and enum values to avoid these kernel-doc warnings and improve the documentation: nf_tables.h:896: warning: Enum value 'NFT_EXTHDR_OP_TCPOPT' not described in enum 'nft_exthdr_op' nf_tables.h:896: warning: Excess enum value 'NFT_EXTHDR_OP_TCP' description in 'nft_exthdr_op' nf_tables.h:1210: warning: expecting prototype for enum nft_flow_attributes. Prototype was for enum nft_offload_attributes instead nf_tables.h:1428: warning: expecting prototype for enum nft_reject_code. Prototype was for enum nft_reject_inet_code instead (add beginning '@' to each enum value description:) nf_tables.h:1493: warning: Enum value 'NFTA_TPROXY_FAMILY' not described in enum 'nft_tproxy_attributes' nf_tables.h:1493: warning: Enum value 'NFTA_TPROXY_REG_ADDR' not described in enum 'nft_tproxy_attributes' nf_tables.h:1493: warning: Enum value 'NFTA_TPROXY_REG_PORT' not described in enum 'nft_tproxy_attributes' nf_tables.h:1796: warning: expecting prototype for enum nft_device_attributes. 
Prototype was for enum nft_devices_attributes instead Signed-off-by: Randy Dunlap Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 7c0c915f0306..45c71f7d21c2 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -881,7 +881,7 @@ enum nft_exthdr_flags { * enum nft_exthdr_op - nf_tables match options * * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers - * @NFT_EXTHDR_OP_TCP: match against tcp options + * @NFT_EXTHDR_OP_TCPOPT: match against tcp options * @NFT_EXTHDR_OP_IPV4: match against ipv4 options * @NFT_EXTHDR_OP_SCTP: match against sctp chunks * @NFT_EXTHDR_OP_DCCP: match against dccp otions @@ -1200,7 +1200,7 @@ enum nft_ct_attributes { #define NFTA_CT_MAX (__NFTA_CT_MAX - 1) /** - * enum nft_flow_attributes - ct offload expression attributes + * enum nft_offload_attributes - ct offload expression attributes * @NFTA_FLOW_TABLE_NAME: flow table name (NLA_STRING) */ enum nft_offload_attributes { @@ -1410,7 +1410,7 @@ enum nft_reject_types { }; /** - * enum nft_reject_code - Generic reject codes for IPv4/IPv6 + * enum nft_reject_inet_code - Generic reject codes for IPv4/IPv6 * * @NFT_REJECT_ICMPX_NO_ROUTE: no route to host / network unreachable * @NFT_REJECT_ICMPX_PORT_UNREACH: port unreachable @@ -1480,9 +1480,9 @@ enum nft_nat_attributes { /** * enum nft_tproxy_attributes - nf_tables tproxy expression netlink attributes * - * NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers) - * NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers) - * NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers) + * @NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers) + * @NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: 
nft_registers) + * @NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers) */ enum nft_tproxy_attributes { NFTA_TPROXY_UNSPEC, @@ -1783,7 +1783,7 @@ enum nft_synproxy_attributes { #define NFTA_SYNPROXY_MAX (__NFTA_SYNPROXY_MAX - 1) /** - * enum nft_device_attributes - nf_tables device netlink attributes + * enum nft_devices_attributes - nf_tables device netlink attributes * * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) * @NFTA_DEVICE_PREFIX: device name prefix, a simple wildcard (NLA_STRING) -- 2.47.3