Implement noref variants for existing dst_cache helpers interacting with dst_entry. This is required for implementing noref flows, which avoid redundant atomic operations. Signed-off-by: Marek Mietus --- include/net/dst_cache.h | 71 +++++++++++++++++++++ net/core/dst_cache.c | 133 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 194 insertions(+), 10 deletions(-) diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h index 1961699598e2..8d425cd75fd3 100644 --- a/include/net/dst_cache.h +++ b/include/net/dst_cache.h @@ -23,6 +23,23 @@ struct dst_cache { */ struct dst_entry *dst_cache_get(struct dst_cache *dst_cache); +/** + * dst_cache_get_rcu - perform cache lookup under RCU + * @dst_cache: the cache + * + * Perform cache lookup without taking a reference on the dst. + * Must be called with local BH disabled, and within an rcu read side + * critical section. + * + * The caller should use dst_cache_get_ip4_rcu() if it needs to retrieve the + * source address to be used when xmitting to the cached dst. + * local BH must be disabled. + * + * Return: Pointer to retrieved dst_entry if cache is initialized and + * cached dst is valid, NULL otherwise. + */ +struct dst_entry *dst_cache_get_rcu(struct dst_cache *dst_cache); + /** * dst_cache_get_ip4 - perform cache lookup and fetch ipv4 source address * @dst_cache: the cache @@ -32,6 +49,21 @@ struct dst_entry *dst_cache_get(struct dst_cache *dst_cache); struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr); +/** + * dst_cache_get_ip4_rcu - lookup cache and ipv4 source under RCU + * @dst_cache: the cache + * @saddr: return value for the retrieved source address + * + * Perform cache lookup and fetch ipv4 source without taking a + * reference on the dst. + * Must be called with local BH disabled, and within an rcu read side + * critical section. + * + * Return: Pointer to retrieved rtable if cache is initialized and + * cached dst is valid, NULL otherwise. 
+ */ +struct rtable *dst_cache_get_ip4_rcu(struct dst_cache *dst_cache, __be32 *saddr); + /** * dst_cache_set_ip4 - store the ipv4 dst into the cache * @dst_cache: the cache @@ -43,6 +75,17 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr); void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, __be32 saddr); +/** + * dst_cache_steal_ip4 - store the ipv4 dst into the cache and steal its + * reference + * @dst_cache: the cache + * @dst: the entry to be cached whose reference will be stolen + * @saddr: the source address to be stored inside the cache + * + * local BH must be disabled + */ +void dst_cache_steal_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr); #if IS_ENABLED(CONFIG_IPV6) /** @@ -56,6 +99,18 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, const struct in6_addr *saddr); +/** + * dst_cache_steal_ip6 - store the ipv6 dst into the cache and steal its + * reference + * @dst_cache: the cache + * @dst: the entry to be cached whose reference will be stolen + * @saddr: the source address to be stored inside the cache + * + * local BH must be disabled + */ +void dst_cache_steal_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, + const struct in6_addr *saddr); + /** * dst_cache_get_ip6 - perform cache lookup and fetch ipv6 source address * @dst_cache: the cache @@ -65,6 +120,22 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, */ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, struct in6_addr *saddr); + +/** + * dst_cache_get_ip6_rcu - lookup cache and ipv6 source under RCU + * @dst_cache: the cache + * @saddr: return value for the retrieved source address + * + * Perform cache lookup and fetch ipv6 source without taking a + * reference on the dst. 
+ * Must be called with local BH disabled, and within an rcu read side + * critical section. + * + * Return: Pointer to retrieved dst_entry if cache is initialized and + * cached dst is valid, NULL otherwise. + */ +struct dst_entry *dst_cache_get_ip6_rcu(struct dst_cache *dst_cache, + struct in6_addr *saddr); #endif /** diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 9ab4902324e1..52418cfb9b8a 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -25,20 +25,27 @@ struct dst_cache_pcpu { }; }; -static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, - struct dst_entry *dst, u32 cookie) +static void __dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, + struct dst_entry *dst, u32 cookie) { DEBUG_NET_WARN_ON_ONCE(!in_softirq()); dst_release(dst_cache->dst); - if (dst) - dst_hold(dst); dst_cache->cookie = cookie; dst_cache->dst = dst; } -static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, - struct dst_cache_pcpu *idst) +static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, + struct dst_entry *dst, u32 cookie) +{ + if (dst) + dst_hold(dst); + + __dst_cache_per_cpu_dst_set(dst_cache, dst, cookie); +} + +static struct dst_entry *__dst_cache_per_cpu_get(struct dst_cache *dst_cache, + struct dst_cache_pcpu *idst) { struct dst_entry *dst; @@ -47,14 +54,10 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, if (!dst) goto fail; - /* the cache already hold a dst reference; it can't go away */ - dst_hold(dst); - if (unlikely(!time_after(idst->refresh_ts, READ_ONCE(dst_cache->reset_ts)) || (READ_ONCE(dst->obsolete) && !dst->ops->check(dst, idst->cookie)))) { dst_cache_per_cpu_dst_set(idst, NULL, 0); - dst_release(dst); goto fail; } return dst; @@ -64,6 +67,18 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, return NULL; } +static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, + struct dst_cache_pcpu *idst) +{ + 
struct dst_entry *dst; + + dst = __dst_cache_per_cpu_get(dst_cache, idst); + if (dst) + /* the cache already holds a dst reference; it can't go away */ + dst_hold(dst); + return dst; +} + struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) { struct dst_entry *dst; @@ -78,6 +93,20 @@ struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) } EXPORT_SYMBOL_GPL(dst_cache_get); +struct dst_entry *dst_cache_get_rcu(struct dst_cache *dst_cache) +{ + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + local_lock_nested_bh(&dst_cache->cache->bh_lock); + dst = __dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst; +} +EXPORT_SYMBOL_GPL(dst_cache_get_rcu); + struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) { struct dst_cache_pcpu *idst; @@ -100,6 +129,28 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); +struct rtable *dst_cache_get_ip4_rcu(struct dst_cache *dst_cache, __be32 *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); + dst = __dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return NULL; + } + + *saddr = idst->in_saddr.s_addr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst_rtable(dst); +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip4_rcu); + void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, __be32 saddr) { @@ -116,6 +167,24 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, } EXPORT_SYMBOL_GPL(dst_cache_set_ip4); +void dst_cache_steal_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) { + dst_release(dst); + 
return; + } + + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); + __dst_cache_per_cpu_dst_set(idst, dst, 0); + idst->in_saddr.s_addr = saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); +} +EXPORT_SYMBOL_GPL(dst_cache_steal_ip4); + #if IS_ENABLED(CONFIG_IPV6) void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, const struct in6_addr *saddr) @@ -135,6 +204,26 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); +void dst_cache_steal_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, + const struct in6_addr *saddr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) { + dst_release(dst); + return; + } + + local_lock_nested_bh(&dst_cache->cache->bh_lock); + + idst = this_cpu_ptr(dst_cache->cache); + __dst_cache_per_cpu_dst_set(idst, dst, + rt6_get_cookie(dst_rt6_info(dst))); + idst->in6_saddr = *saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); +} +EXPORT_SYMBOL_GPL(dst_cache_steal_ip6); + struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, struct in6_addr *saddr) { @@ -158,6 +247,30 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, return dst; } EXPORT_SYMBOL_GPL(dst_cache_get_ip6); + +struct dst_entry *dst_cache_get_ip6_rcu(struct dst_cache *dst_cache, + struct in6_addr *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + local_lock_nested_bh(&dst_cache->cache->bh_lock); + + idst = this_cpu_ptr(dst_cache->cache); + dst = __dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return NULL; + } + + *saddr = idst->in6_saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst; +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip6_rcu); #endif int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) -- 2.51.0 iptunnel_xmit assumes that a reference was taken on 
the dst passed to it, and uses that reference. This forces callers to reference the dst, preventing noref optimizations. Convert iptunnel_xmit to be noref and drop the requirement that a ref be taken on the dst. Signed-off-by: Marek Mietus --- net/ipv4/ip_tunnel.c | 2 ++ net/ipv4/ip_tunnel_core.c | 2 +- net/ipv4/udp_tunnel_core.c | 1 + net/ipv6/sit.c | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 158a30ae7c5f..8a0c611ab1bf 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -655,6 +655,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)), 0); + ip_rt_put(rt); return; tx_error: DEV_STATS_INC(dev, tx_errors); @@ -844,6 +845,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)), 0); + ip_rt_put(rt); return; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 2e61ac137128..70f0f123b0ba 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -61,7 +61,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, skb_scrub_packet(skb, xnet); skb_clear_hash_if_not_l4(skb); - skb_dst_set(skb, &rt->dst); + skb_dst_set_noref(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); IPCB(skb)->flags = ipcb_flags; diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index b1f667c52cb2..a34066d91375 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -192,6 +192,7 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet, ipcb_flags); + ip_rt_put(rt); } EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); diff --git a/net/ipv6/sit.c 
b/net/ipv6/sit.c index cf37ad9686e6..a0d699082747 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1028,6 +1028,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)), 0); + ip_rt_put(rt); return NETDEV_TX_OK; tx_error_icmp: -- 2.51.0 udp_tunnel{6,}_xmit_skb assume that a reference was taken on the dst passed to them, and use that reference. This forces callers to reference the dst, preventing noref optimizations. Convert udp_tunnel{6,}_xmit_skb to be noref and drop the requirement that a ref be taken on the dst. Signed-off-by: Marek Mietus --- drivers/net/amt.c | 3 +++ drivers/net/bareudp.c | 2 ++ drivers/net/geneve.c | 2 ++ drivers/net/gtp.c | 5 +++++ drivers/net/ovpn/udp.c | 2 ++ drivers/net/vxlan/vxlan_core.c | 2 ++ drivers/net/wireguard/socket.c | 2 ++ net/ipv4/udp_tunnel_core.c | 1 - net/ipv6/ip6_udp_tunnel.c | 2 +- net/sctp/ipv6.c | 1 + net/sctp/protocol.c | 1 + net/tipc/udp_media.c | 2 ++ 12 files changed, 23 insertions(+), 2 deletions(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 902c817a0dea..e9eeaa7b6fe7 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -1050,6 +1050,7 @@ static bool amt_send_membership_update(struct amt_dev *amt, false, false, 0); + ip_rt_put(rt); amt_update_gw_status(amt, AMT_STATUS_SENT_UPDATE, true); return false; } @@ -1108,6 +1109,7 @@ static void amt_send_multicast_data(struct amt_dev *amt, false, false, 0); + ip_rt_put(rt); } static bool amt_send_membership_query(struct amt_dev *amt, @@ -1167,6 +1169,7 @@ static bool amt_send_membership_query(struct amt_dev *amt, false, false, 0); + ip_rt_put(rt); amt_update_relay_status(tunnel, AMT_STATUS_SENT_QUERY, true); return false; } diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 0df3208783ad..92ee4a36f86f 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -364,6 +364,7 @@ static int bareudp_xmit_skb(struct sk_buff 
*skb, struct net_device *dev, !net_eq(bareudp->net, dev_net(bareudp->dev)), !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), 0); + ip_rt_put(rt); return 0; free_dst: @@ -433,6 +434,7 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), 0); + dst_release(dst); return 0; free_dst: diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 77b0c3d52041..169a2b7d83e0 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -926,6 +926,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, !net_eq(geneve->net, dev_net(geneve->dev)), !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), 0); + ip_rt_put(rt); return 0; } @@ -1019,6 +1020,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), 0); + dst_release(dst); return 0; } #endif diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 4213c3b2d532..3cd1f16136a3 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -449,6 +449,7 @@ static int gtp0_send_echo_resp_ip(struct gtp_dev *gtp, struct sk_buff *skb) dev_net(gtp->dev)), false, 0); + ip_rt_put(rt); return 0; } @@ -708,6 +709,7 @@ static int gtp1u_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) dev_net(gtp->dev)), false, 0); + ip_rt_put(rt); return 0; } @@ -1308,6 +1310,7 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) !net_eq(sock_net(pktinfo.pctx->sk), dev_net(dev)), false, 0); + ip_rt_put(pktinfo.rt); break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) @@ -1318,6 +1321,7 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) 0, pktinfo.gtph_port, pktinfo.gtph_port, false, 0); + dst_release(&pktinfo.rt6->dst); #else goto tx_err; #endif @@ -2409,6 +2413,7 @@ static int gtp_genl_send_echo_req(struct sk_buff *skb, struct genl_info *info) !net_eq(sock_net(sk), dev_net(gtp->dev)), false, 0); + ip_rt_put(rt); return 0; } 
diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c index d6a0f7a0b75d..c82ba71b6aff 100644 --- a/drivers/net/ovpn/udp.c +++ b/drivers/net/ovpn/udp.c @@ -200,6 +200,7 @@ static int ovpn_udp4_output(struct ovpn_peer *peer, struct ovpn_bind *bind, udp_tunnel_xmit_skb(rt, sk, skb, fl.saddr, fl.daddr, 0, ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport, fl.fl4_dport, false, sk->sk_no_check_tx, 0); + ip_rt_put(rt); ret = 0; err: local_bh_enable(); @@ -275,6 +276,7 @@ static int ovpn_udp6_output(struct ovpn_peer *peer, struct ovpn_bind *bind, udp_tunnel6_xmit_skb(dst, sk, skb, skb->dev, &fl.saddr, &fl.daddr, 0, ip6_dst_hoplimit(dst), 0, fl.fl6_sport, fl.fl6_dport, udp_get_no_check6_tx(sk), 0); + dst_release(dst); ret = 0; err: local_bh_enable(); diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index e957aa12a8a4..09ddf0586176 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2538,6 +2538,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, pkey->u.ipv4.dst, tos, ttl, df, src_port, dst_port, xnet, !udp_sum, ipcb_flags); + ip_rt_put(rt); #if IS_ENABLED(CONFIG_IPV6) } else { struct vxlan_sock *sock6; @@ -2613,6 +2614,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, &saddr, &pkey->u.ipv6.dst, tos, ttl, pkey->label, src_port, dst_port, !udp_sum, ip6cb_flags); + dst_release(ndst); #endif } vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len); diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 253488f8c00f..ee7d9c675909 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -85,6 +85,7 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, udp_tunnel_xmit_skb(rt, sock, skb, fl.saddr, fl.daddr, ds, ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport, fl.fl4_dport, false, false, 0); + ip_rt_put(rt); goto out; err: @@ -152,6 +153,7 @@ static int send6(struct wg_device *wg, struct sk_buff *skb, 
udp_tunnel6_xmit_skb(dst, sock, skb, skb->dev, &fl.saddr, &fl.daddr, ds, ip6_dst_hoplimit(dst), 0, fl.fl6_sport, fl.fl6_dport, false, 0); + dst_release(dst); goto out; err: diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index a34066d91375..b1f667c52cb2 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -192,7 +192,6 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet, ipcb_flags); - ip_rt_put(rt); } EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index cef3e0210744..d58815db8182 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -95,7 +95,7 @@ void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, uh->len = htons(skb->len); - skb_dst_set(skb, dst); + skb_dst_set_noref(skb, dst); udp6_set_csum(nocheck, skb, saddr, daddr, skb->len); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 531cb0690007..38fd1cf3148f 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -264,6 +264,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr, tclass, ip6_dst_hoplimit(dst), label, sctp_sk(sk)->udp_port, t->encap_port, false, 0); + dst_release(dst); return 0; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 2c3398f75d76..ff18ed0a65ff 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1074,6 +1074,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) fl4->daddr, dscp, ip4_dst_hoplimit(dst), df, sctp_sk(sk)->udp_port, t->encap_port, false, false, 0); + dst_release(dst); return 0; } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index b85ab0fb3b8c..ba4ff5b3354f 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -198,6 +198,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff 
*skb, udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, dst->ipv4.s_addr, 0, ttl, 0, src->port, dst->port, false, true, 0); + ip_rt_put(rt); #if IS_ENABLED(CONFIG_IPV6) } else { if (!ndst) { @@ -220,6 +221,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, &src->ipv6, &dst->ipv6, 0, ttl, 0, src->port, dst->port, false, 0); + dst_release(ndst); #endif } local_bh_enable(); -- 2.51.0 Update udp_tunnel{,6}_dst_lookup to return noref dsts when possible. This is done using a new boolean which indicates whether the returned dst is noref. When the returned dst is noref, the dst is only valid inside the RCU read-side critical section in which it was queried. Update all callers to properly use the new noref argument and convert all tunnels that use udp_tunnel{,6}_dst_lookup to noref. This affects bareudp, geneve and vxlan tunnels. Signed-off-by: Marek Mietus --- drivers/net/bareudp.c | 30 ++++++++++++----- drivers/net/geneve.c | 61 +++++++++++++++++++++++----------- drivers/net/vxlan/vxlan_core.c | 41 +++++++++++++++-------- include/net/udp_tunnel.h | 6 ++-- net/ipv4/udp_tunnel_core.c | 16 ++++++--- net/ipv6/ip6_udp_tunnel.c | 17 +++++++--- 6 files changed, 116 insertions(+), 55 deletions(-) diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 92ee4a36f86f..1aa3d5d74a84 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -315,6 +315,7 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev, int min_headroom; __u8 tos, ttl; __be32 saddr; + bool noref; int err; if (skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) @@ -329,7 +330,8 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev, rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, 0, &saddr, &info->key, sport, bareudp->port, key->tos, use_cache ? 
- (struct dst_cache *)&info->dst_cache : NULL); + (struct dst_cache *)&info->dst_cache : NULL, + &noref); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -364,7 +366,8 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev, !net_eq(bareudp->net, dev_net(bareudp->dev)), !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), 0); - ip_rt_put(rt); + if (!noref) + ip_rt_put(rt); return 0; free_dst: @@ -386,6 +389,7 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, int min_headroom; __u8 prio, ttl; __be16 sport; + bool noref; int err; if (skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) @@ -400,7 +404,8 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock, 0, &saddr, key, sport, bareudp->port, key->tos, use_cache ? - (struct dst_cache *) &info->dst_cache : NULL); + (struct dst_cache *)&info->dst_cache : NULL, + &noref); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -434,11 +439,13 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), 0); - dst_release(dst); + if (!noref) + dst_release(dst); return 0; free_dst: - dst_release(dst); + if (!noref) + dst_release(dst); return err; } @@ -507,6 +514,7 @@ static int bareudp_fill_metadata_dst(struct net_device *dev, struct bareudp_dev *bareudp = netdev_priv(dev); bool use_cache; __be16 sport; + bool noref; use_cache = ip_tunnel_dst_cache_usable(skb, info); sport = udp_flow_src_port(bareudp->net, skb, @@ -520,11 +528,13 @@ static int bareudp_fill_metadata_dst(struct net_device *dev, rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, 0, &saddr, &info->key, sport, bareudp->port, info->key.tos, - use_cache ? &info->dst_cache : NULL); + use_cache ? 
&info->dst_cache : NULL, + &noref); if (IS_ERR(rt)) return PTR_ERR(rt); - ip_rt_put(rt); + if (!noref) + ip_rt_put(rt); info->key.u.ipv4.src = saddr; } else if (ip_tunnel_info_af(info) == AF_INET6) { struct dst_entry *dst; @@ -534,11 +544,13 @@ static int bareudp_fill_metadata_dst(struct net_device *dev, dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock, 0, &saddr, &info->key, sport, bareudp->port, info->key.tos, - use_cache ? &info->dst_cache : NULL); + use_cache ? &info->dst_cache : NULL, + &noref); if (IS_ERR(dst)) return PTR_ERR(dst); - dst_release(dst); + if (!noref) + dst_release(dst); info->key.u.ipv6.src = saddr; } else { return -EINVAL; diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 169a2b7d83e0..4d9c7ec29d40 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -766,7 +766,8 @@ static void geneve_build_header(struct genevehdr *geneveh, ip_tunnel_info_opts_get(geneveh->options, info); } -static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, +static int geneve_build_skb(struct dst_entry *dst, bool noref, + struct sk_buff *skb, const struct ip_tunnel_info *info, bool xnet, int ip_hdr_len, bool inner_proto_inherit) @@ -797,7 +798,8 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, return 0; free_dst: - dst_release(dst); + if (!noref) + dst_release(dst); return err; } @@ -831,6 +833,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 df = 0; __be32 saddr; __be16 sport; + bool noref; int err; if (skb_vlan_inet_prepare(skb, inner_proto_inherit)) @@ -849,7 +852,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, &info->key, sport, geneve->cfg.info.key.tp_dst, tos, use_cache ? 
- (struct dst_cache *)&info->dst_cache : NULL); + (struct dst_cache *)&info->dst_cache : NULL, + &noref); if (IS_ERR(rt)) return PTR_ERR(rt); @@ -857,7 +861,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, GENEVE_IPV4_HLEN + info->options_len, netif_is_any_bridge_port(dev)); if (err < 0) { - dst_release(&rt->dst); + if (!noref) + dst_release(&rt->dst); return err; } else if (err) { struct ip_tunnel_info *info; @@ -868,7 +873,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) { - dst_release(&rt->dst); + if (!noref) + dst_release(&rt->dst); return -ENOMEM; } @@ -877,13 +883,15 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, } if (!pskb_may_pull(skb, ETH_HLEN)) { - dst_release(&rt->dst); + if (!noref) + dst_release(&rt->dst); return -EINVAL; } skb->protocol = eth_type_trans(skb, geneve->dev); __netif_rx(skb); - dst_release(&rt->dst); + if (!noref) + dst_release(&rt->dst); return -EMSGSIZE; } @@ -916,7 +924,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, } } - err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr), + err = geneve_build_skb(&rt->dst, noref, skb, info, xnet, sizeof(struct iphdr), inner_proto_inherit); if (unlikely(err)) return err; @@ -926,7 +934,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, !net_eq(geneve->net, dev_net(geneve->dev)), !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), 0); - ip_rt_put(rt); + if (!noref) + ip_rt_put(rt); return 0; } @@ -944,6 +953,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool use_cache; __u8 prio, ttl; __be16 sport; + bool noref; int err; if (skb_vlan_inet_prepare(skb, inner_proto_inherit)) @@ -962,7 +972,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, &saddr, key, sport, geneve->cfg.info.key.tp_dst, prio, use_cache ? 
- (struct dst_cache *)&info->dst_cache : NULL); + (struct dst_cache *)&info->dst_cache : NULL, + &noref); if (IS_ERR(dst)) return PTR_ERR(dst); @@ -970,7 +981,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, GENEVE_IPV6_HLEN + info->options_len, netif_is_any_bridge_port(dev)); if (err < 0) { - dst_release(dst); + if (!noref) + dst_release(dst); return err; } else if (err) { struct ip_tunnel_info *info = skb_tunnel_info(skb); @@ -980,7 +992,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) { - dst_release(dst); + if (!noref) + dst_release(dst); return -ENOMEM; } @@ -989,13 +1002,15 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, } if (!pskb_may_pull(skb, ETH_HLEN)) { - dst_release(dst); + if (!noref) + dst_release(dst); return -EINVAL; } skb->protocol = eth_type_trans(skb, geneve->dev); __netif_rx(skb); - dst_release(dst); + if (!noref) + dst_release(dst); return -EMSGSIZE; } @@ -1009,7 +1024,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, ttl = key->ttl; ttl = ttl ? 
: ip6_dst_hoplimit(dst); } - err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr), + err = geneve_build_skb(dst, noref, skb, info, xnet, sizeof(struct ipv6hdr), inner_proto_inherit); if (unlikely(err)) return err; @@ -1020,7 +1035,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), 0); - dst_release(dst); + if (!noref) + dst_release(dst); return 0; } #endif @@ -1083,6 +1099,7 @@ static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) struct ip_tunnel_info *info = skb_tunnel_info(skb); struct geneve_dev *geneve = netdev_priv(dev); __be16 sport; + bool noref; if (ip_tunnel_info_af(info) == AF_INET) { struct rtable *rt; @@ -1104,11 +1121,13 @@ static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) &info->key, sport, geneve->cfg.info.key.tp_dst, tos, - use_cache ? &info->dst_cache : NULL); + use_cache ? &info->dst_cache : NULL, + &noref); if (IS_ERR(rt)) return PTR_ERR(rt); - ip_rt_put(rt); + if (!noref) + ip_rt_put(rt); info->key.u.ipv4.src = saddr; #if IS_ENABLED(CONFIG_IPV6) } else if (ip_tunnel_info_af(info) == AF_INET6) { @@ -1130,11 +1149,13 @@ static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0, &saddr, &info->key, sport, geneve->cfg.info.key.tp_dst, prio, - use_cache ? &info->dst_cache : NULL); + use_cache ? 
&info->dst_cache : NULL, + &noref); if (IS_ERR(dst)) return PTR_ERR(dst); - dst_release(dst); + if (!noref) + dst_release(dst); info->key.u.ipv6.src = saddr; #endif } else { diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 09ddf0586176..f01340b99e08 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2298,6 +2298,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, int addr_family, __be16 dst_port, int dst_ifindex, __be32 vni, struct dst_entry *dst, + bool noref, u32 rt_flags) { #if IS_ENABLED(CONFIG_IPV6) @@ -2313,7 +2314,8 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, vxlan->cfg.flags & VXLAN_F_LOCALBYPASS) { struct vxlan_dev *dst_vxlan; - dst_release(dst); + if (!noref) + dst_release(dst); dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni, addr_family, dst_port, vxlan->cfg.flags); @@ -2346,6 +2348,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, unsigned int pkt_len = skb->len; __be16 src_port = 0, dst_port; struct dst_entry *ndst = NULL; + bool noref = false; int addr_family; __u8 tos, ttl; int ifindex; @@ -2471,7 +2474,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, ifindex, &saddr, pkey, src_port, dst_port, - tos, use_cache ? dst_cache : NULL); + tos, use_cache ? 
dst_cache : NULL, + &noref); if (IS_ERR(rt)) { err = PTR_ERR(rt); reason = SKB_DROP_REASON_IP_OUTNOROUTES; @@ -2485,7 +2489,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, /* Bypass encapsulation if the destination is local */ err = encap_bypass_if_local(skb, dev, vxlan, AF_INET, dst_port, ifindex, vni, - &rt->dst, rt->rt_flags); + &rt->dst, noref, rt->rt_flags); if (err) goto out_unlock; @@ -2521,7 +2525,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, unclone->key.u.ipv4.dst = saddr; } vxlan_encap_bypass(skb, vxlan, vxlan, vni, false); - dst_release(ndst); + if (!noref) + dst_release(ndst); goto out_unlock; } @@ -2538,7 +2543,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, pkey->u.ipv4.dst, tos, ttl, df, src_port, dst_port, xnet, !udp_sum, ipcb_flags); - ip_rt_put(rt); + if (!noref) + ip_rt_put(rt); #if IS_ENABLED(CONFIG_IPV6) } else { struct vxlan_sock *sock6; @@ -2557,7 +2563,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock, ifindex, &saddr, pkey, src_port, dst_port, tos, - use_cache ? dst_cache : NULL); + use_cache ? 
dst_cache : NULL, + &noref); if (IS_ERR(ndst)) { err = PTR_ERR(ndst); ndst = NULL; @@ -2573,7 +2580,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, err = encap_bypass_if_local(skb, dev, vxlan, AF_INET6, dst_port, ifindex, vni, - ndst, rt6i_flags); + ndst, noref, rt6i_flags); if (err) goto out_unlock; } @@ -2596,7 +2603,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } vxlan_encap_bypass(skb, vxlan, vxlan, vni, false); - dst_release(ndst); + if (!noref) + dst_release(ndst); goto out_unlock; } @@ -2614,7 +2622,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, &saddr, &pkey->u.ipv6.dst, tos, ttl, pkey->label, src_port, dst_port, !udp_sum, ip6cb_flags); - dst_release(ndst); + if (!noref) + dst_release(ndst); #endif } vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len); @@ -2634,7 +2643,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) DEV_STATS_INC(dev, tx_carrier_errors); - dst_release(ndst); + if (!noref) + dst_release(ndst); DEV_STATS_INC(dev, tx_errors); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb_reason(skb, reason); @@ -3222,6 +3232,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) struct vxlan_dev *vxlan = netdev_priv(dev); struct ip_tunnel_info *info = skb_tunnel_info(skb); __be16 sport, dport; + bool noref; sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, vxlan->cfg.port_max, true); @@ -3238,10 +3249,11 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) &info->key.u.ipv4.src, &info->key, sport, dport, info->key.tos, - &info->dst_cache); + &info->dst_cache, &noref); if (IS_ERR(rt)) return PTR_ERR(rt); - ip_rt_put(rt); + if (!noref) + ip_rt_put(rt); } else { #if IS_ENABLED(CONFIG_IPV6) struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); @@ -3254,10 +3266,11 @@ static int 
vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) 0, &info->key.u.ipv6.src, &info->key, sport, dport, info->key.tos, - &info->dst_cache); + &info->dst_cache, &noref); if (IS_ERR(ndst)) return PTR_ERR(ndst); - dst_release(ndst); + if (!noref) + dst_release(ndst); #else /* !CONFIG_IPV6 */ return -EPFNOSUPPORT; #endif diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 9acef2fbd2fd..ba53a71b90ec 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -153,7 +153,8 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, __be32 *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 tos, - struct dst_cache *dst_cache); + struct dst_cache *dst_cache, + bool *noref); struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, struct net_device *dev, struct net *net, @@ -161,7 +162,8 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, struct in6_addr *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 dsfield, - struct dst_cache *dst_cache); + struct dst_cache *dst_cache, + bool *noref); struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, const unsigned long *flags, diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index b1f667c52cb2..978cd59281f6 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -233,16 +233,19 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, __be32 *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 tos, - struct dst_cache *dst_cache) + struct dst_cache *dst_cache, + bool *noref) { struct rtable *rt = NULL; struct flowi4 fl4; #ifdef CONFIG_DST_CACHE if (dst_cache) { - rt = dst_cache_get_ip4(dst_cache, saddr); - if (rt) + rt = dst_cache_get_ip4_rcu(dst_cache, saddr); + if (rt) { + *noref = true; return rt; + } } #endif @@ -267,9 +270,12 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, ip_rt_put(rt); return ERR_PTR(-ELOOP); } + 
*noref = false; #ifdef CONFIG_DST_CACHE - if (dst_cache) - dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr); + if (dst_cache) { + dst_cache_steal_ip4(dst_cache, &rt->dst, fl4.saddr); + *noref = true; + } #endif *saddr = fl4.saddr; return rt; diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index d58815db8182..b166ba225551 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -126,6 +126,7 @@ EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); * @dport: UDP destination port * @dsfield: The traffic class field * @dst_cache: The dst cache to use for lookup + * @noref: Is the returned dst noref? * This function performs a route lookup on a UDP tunnel * * It returns a valid dst pointer and stores src address to be used in @@ -140,16 +141,19 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, struct in6_addr *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 dsfield, - struct dst_cache *dst_cache) + struct dst_cache *dst_cache, + bool *noref) { struct dst_entry *dst = NULL; struct flowi6 fl6; #ifdef CONFIG_DST_CACHE if (dst_cache) { - dst = dst_cache_get_ip6(dst_cache, saddr); - if (dst) + dst = dst_cache_get_ip6_rcu(dst_cache, saddr); + if (dst) { + *noref = true; return dst; + } } #endif memset(&fl6, 0, sizeof(fl6)); @@ -173,9 +177,12 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, dst_release(dst); return ERR_PTR(-ELOOP); } + *noref = false; #ifdef CONFIG_DST_CACHE - if (dst_cache) - dst_cache_set_ip6(dst_cache, dst, &fl6.saddr); + if (dst_cache) { + dst_cache_steal_ip6(dst_cache, dst, &fl6.saddr); + *noref = true; + } #endif *saddr = fl6.saddr; return dst; -- 2.51.0 ovpn_udp{4,6}_output unnecessarily reference the dst_entry from the dst_cache when interacting with the cache. Reduce this overhead by avoiding the redundant refcount increments. These changes are safe as both ipv4 and ip6 support noref xmit under RCU which is already the case for ovpn. 
Signed-off-by: Marek Mietus --- drivers/net/ovpn/udp.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c index c82ba71b6aff..31827a2ab6ec 100644 --- a/drivers/net/ovpn/udp.c +++ b/drivers/net/ovpn/udp.c @@ -158,7 +158,7 @@ static int ovpn_udp4_output(struct ovpn_peer *peer, struct ovpn_bind *bind, int ret; local_bh_disable(); - rt = dst_cache_get_ip4(cache, &fl.saddr); + rt = dst_cache_get_ip4_rcu(cache, &fl.saddr); if (rt) goto transmit; @@ -194,13 +194,12 @@ static int ovpn_udp4_output(struct ovpn_peer *peer, struct ovpn_bind *bind, ret); goto err; } - dst_cache_set_ip4(cache, &rt->dst, fl.saddr); + dst_cache_steal_ip4(cache, &rt->dst, fl.saddr); transmit: udp_tunnel_xmit_skb(rt, sk, skb, fl.saddr, fl.daddr, 0, ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport, fl.fl4_dport, false, sk->sk_no_check_tx, 0); - ip_rt_put(rt); ret = 0; err: local_bh_enable(); @@ -236,7 +235,7 @@ static int ovpn_udp6_output(struct ovpn_peer *peer, struct ovpn_bind *bind, }; local_bh_disable(); - dst = dst_cache_get_ip6(cache, &fl.saddr); + dst = dst_cache_get_ip6_rcu(cache, &fl.saddr); if (dst) goto transmit; @@ -260,7 +259,7 @@ static int ovpn_udp6_output(struct ovpn_peer *peer, struct ovpn_bind *bind, &bind->remote.in6, ret); goto err; } - dst_cache_set_ip6(cache, dst, &fl.saddr); + dst_cache_steal_ip6(cache, dst, &fl.saddr); transmit: /* user IPv6 packets may be larger than the transport interface @@ -276,7 +275,6 @@ static int ovpn_udp6_output(struct ovpn_peer *peer, struct ovpn_bind *bind, udp_tunnel6_xmit_skb(dst, sk, skb, skb->dev, &fl.saddr, &fl.daddr, 0, ip6_dst_hoplimit(dst), 0, fl.fl6_sport, fl.fl6_dport, udp_get_no_check6_tx(sk), 0); - dst_release(dst); ret = 0; err: local_bh_enable(); -- 2.51.0 send{4,6} unnecessarily reference the dst_entry from the dst_cache when interacting with the cache. Reduce this overhead by avoiding the redundant refcount increments. 
This is only possible in flows where the cache is used. Otherwise, we fall-back to a referenced dst. These changes are safe as both ipv4 and ip6 support noref xmit under RCU which is already the case for the wireguard send{4,6} functions. Signed-off-by: Marek Mietus --- drivers/net/wireguard/socket.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index ee7d9c675909..b311965269a1 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -46,7 +46,7 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, fl.fl4_sport = inet_sk(sock)->inet_sport; if (cache) - rt = dst_cache_get_ip4(cache, &fl.saddr); + rt = dst_cache_get_ip4_rcu(cache, &fl.saddr); if (!rt) { security_sk_classify_flow(sock, flowi4_to_flowi_common(&fl)); @@ -78,14 +78,15 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, goto err; } if (cache) - dst_cache_set_ip4(cache, &rt->dst, fl.saddr); + dst_cache_steal_ip4(cache, &rt->dst, fl.saddr); } skb->ignore_df = 1; udp_tunnel_xmit_skb(rt, sock, skb, fl.saddr, fl.daddr, ds, ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport, fl.fl4_dport, false, false, 0); - ip_rt_put(rt); + if (!cache) + ip_rt_put(rt); goto out; err: @@ -127,7 +128,7 @@ static int send6(struct wg_device *wg, struct sk_buff *skb, fl.fl6_sport = inet_sk(sock)->inet_sport; if (cache) - dst = dst_cache_get_ip6(cache, &fl.saddr); + dst = dst_cache_get_ip6_rcu(cache, &fl.saddr); if (!dst) { security_sk_classify_flow(sock, flowi6_to_flowi_common(&fl)); @@ -146,14 +147,15 @@ static int send6(struct wg_device *wg, struct sk_buff *skb, goto err; } if (cache) - dst_cache_set_ip6(cache, dst, &fl.saddr); + dst_cache_steal_ip6(cache, dst, &fl.saddr); } skb->ignore_df = 1; udp_tunnel6_xmit_skb(dst, sock, skb, skb->dev, &fl.saddr, &fl.daddr, ds, ip6_dst_hoplimit(dst), 0, fl.fl6_sport, fl.fl6_dport, false, 0); - dst_release(dst); + if (!cache) + dst_release(dst); goto out; 
err: -- 2.51.0 ip_md_tunnel_xmit unnecessarily references the dst_entry from the dst_cache when interacting with the cache. Reduce this overhead by avoiding the redundant refcount increments. This is only possible in flows where the cache is used. Otherwise, we fall-back to a referenced dst. This change is safe since ipv4 supports noref xmit under RCU which is already the case for ip_md_tunnel_xmit. Signed-off-by: Marek Mietus --- net/ipv4/ip_tunnel.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 8a0c611ab1bf..ab10759dd2e4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -609,7 +609,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); if (use_cache) - rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); + rt = dst_cache_get_ip4_rcu(&tun_info->dst_cache, &fl4.saddr); if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); if (IS_ERR(rt)) { @@ -617,11 +617,12 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } if (use_cache) - dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, - fl4.saddr); + dst_cache_steal_ip4(&tun_info->dst_cache, &rt->dst, + fl4.saddr); } if (rt->dst.dev == dev) { - ip_rt_put(rt); + if (!use_cache) + ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } @@ -630,7 +631,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, df = htons(IP_DF); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen, key->u.ipv4.dst, true)) { - ip_rt_put(rt); + if (!use_cache) + ip_rt_put(rt); goto tx_error; } @@ -647,7 +649,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; if (skb_cow_head(skb, headroom)) { - ip_rt_put(rt); + if (!use_cache) + ip_rt_put(rt); goto tx_dropped; } @@ -655,7 +658,8 @@ void ip_md_tunnel_xmit(struct 
sk_buff *skb, struct net_device *dev, iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)), 0); - ip_rt_put(rt); + if (!use_cache) + ip_rt_put(rt); return; tx_error: DEV_STATS_INC(dev, tx_errors); -- 2.51.0 ip_tunnel_xmit unnecessarily references the dst_entry from the dst_cache when interacting with the cache. Reduce this overhead by avoiding the redundant refcount increments. This is only possible in flows where the cache is used. Otherwise, we fall-back to a referenced dst. This change is safe since ipv4 supports noref xmit under RCU which is already the case for ip_tunnel_xmit. Signed-off-by: Marek Mietus --- net/ipv4/ip_tunnel.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index ab10759dd2e4..fa34e6cfbe35 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -681,6 +681,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, struct rtable *rt = NULL; /* Route to the other host */ __be16 payload_protocol; bool use_cache = false; + bool noref = true; struct flowi4 fl4; bool md = false; bool connected; @@ -775,11 +776,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, if (connected && md) { use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); if (use_cache) - rt = dst_cache_get_ip4(&tun_info->dst_cache, - &fl4.saddr); + rt = dst_cache_get_ip4_rcu(&tun_info->dst_cache, + &fl4.saddr); } else { - rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, - &fl4.saddr) : NULL; + rt = connected ? 
dst_cache_get_ip4_rcu(&tunnel->dst_cache, + &fl4.saddr) : NULL; } if (!rt) { @@ -790,15 +791,18 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } if (use_cache) - dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, - fl4.saddr); + dst_cache_steal_ip4(&tun_info->dst_cache, &rt->dst, + fl4.saddr); else if (!md && connected) - dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, - fl4.saddr); + dst_cache_steal_ip4(&tunnel->dst_cache, &rt->dst, + fl4.saddr); + else + noref = false; } if (rt->dst.dev == dev) { - ip_rt_put(rt); + if (!noref) + ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } @@ -808,7 +812,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, df |= (inner_iph->frag_off & htons(IP_DF)); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) { - ip_rt_put(rt); + if (!noref) + ip_rt_put(rt); goto tx_error; } @@ -839,7 +844,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); if (skb_cow_head(skb, max_headroom)) { - ip_rt_put(rt); + if (!noref) + ip_rt_put(rt); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return; @@ -849,7 +855,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)), 0); - ip_rt_put(rt); + if (!noref) + ip_rt_put(rt); return; #if IS_ENABLED(CONFIG_IPV6) -- 2.51.0 sctp_v{4,6}_xmit unnecessarily clone the dst from the transport when sending an encapsulated skb. Reduce this overhead by avoiding the refcount increment introduced by cloning the dst. Since t->dst is already assumed to be valid throughout both functions, it's safe to use the dst without incrementing the refcount. 
Signed-off-by: Marek Mietus --- net/sctp/ipv6.c | 5 ++--- net/sctp/protocol.c | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 38fd1cf3148f..8c28441009fa 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -219,7 +219,7 @@ int sctp_udp_v6_err(struct sock *sk, struct sk_buff *skb) static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) { - struct dst_entry *dst = dst_clone(t->dst); + struct dst_entry *dst = t->dst; struct flowi6 *fl6 = &t->fl.u.ip6; struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); @@ -243,7 +243,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) if (!t->encap_port || !sctp_sk(sk)->udp_port) { int res; - skb_dst_set(skb, dst); + skb_dst_set(skb, dst_clone(dst)); rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), @@ -264,7 +264,6 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr, tclass, ip6_dst_hoplimit(dst), label, sctp_sk(sk)->udp_port, t->encap_port, false, 0); - dst_release(dst); return 0; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index ff18ed0a65ff..8a00bb0a8ae5 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1038,7 +1038,7 @@ static int sctp_inet_supported_addrs(const struct sctp_sock *opt, /* Wrapper routine that calls the ip transmit routine. 
*/ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) { - struct dst_entry *dst = dst_clone(t->dst); + struct dst_entry *dst = t->dst; struct flowi4 *fl4 = &t->fl.u.ip4; struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); @@ -1056,7 +1056,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); if (!t->encap_port || !sctp_sk(sk)->udp_port) { - skb_dst_set(skb, dst); + skb_dst_set(skb, dst_clone(dst)); return __ip_queue_xmit(sk, skb, &t->fl, dscp); } @@ -1074,7 +1074,6 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) fl4->daddr, dscp, ip4_dst_hoplimit(dst), df, sctp_sk(sk)->udp_port, t->encap_port, false, false, 0); - dst_release(dst); return 0; } -- 2.51.0 ipip6_tunnel_xmit unnecessarily references the dst_entry from the dst_cache when interacting with the cache. Reduce this overhead by avoiding the redundant refcount increments. This change is safe since ipv4 supports noref xmit under RCU which is already the case for ipip6_tunnel_xmit. 
Signed-off-by: Marek Mietus --- net/ipv6/sit.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a0d699082747..e9183e502242 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -933,14 +933,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, IPPROTO_IPV6, 0, dst, tiph->saddr, 0, 0, sock_net_uid(tunnel->net, NULL)); - rt = dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr); + rt = dst_cache_get_ip4_rcu(&tunnel->dst_cache, &fl4.saddr); if (!rt) { rt = ip_route_output_flow(tunnel->net, &fl4, NULL); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } - dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); + dst_cache_steal_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); } if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { @@ -951,13 +951,11 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, tdev = rt->dst.dev; if (tdev == dev) { - ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) { - ip_rt_put(rt); goto tx_error; } @@ -966,7 +964,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, if (mtu < IPV4_MIN_MTU) { DEV_STATS_INC(dev, collisions); - ip_rt_put(rt); goto tx_error; } @@ -980,7 +977,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, if (skb->len > mtu && !skb_is_gso(skb)) { icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - ip_rt_put(rt); goto tx_error; } } @@ -1003,7 +999,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { - ip_rt_put(rt); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; @@ -1019,16 +1014,13 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, ttl = iph6->hop_limit; tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); - if (ip_tunnel_encap(skb, 
&tunnel->encap, &protocol, &fl4) < 0) { - ip_rt_put(rt); + if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) goto tx_error; - } skb_set_inner_ipproto(skb, IPPROTO_IPV6); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)), 0); - ip_rt_put(rt); return NETDEV_TX_OK; tx_error_icmp: -- 2.51.0 tipc_udp_xmit unnecessarily references the dst_entry from the dst_cache when interacting with the cache. Reduce this overhead by avoiding the redundant refcount increments. This change is safe as both ipv4 and ip6 support noref xmit under RCU which is already the case for tipc_udp_xmit. Signed-off-by: Marek Mietus --- net/tipc/udp_media.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index ba4ff5b3354f..cc1ef043aaf0 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -175,7 +175,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, int ttl, err; local_bh_disable(); - ndst = dst_cache_get(cache); + ndst = dst_cache_get_rcu(cache); if (dst->proto == htons(ETH_P_IP)) { struct rtable *rt = dst_rtable(ndst); @@ -191,14 +191,13 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, err = PTR_ERR(rt); goto tx_error; } - dst_cache_set_ip4(cache, &rt->dst, fl.saddr); + dst_cache_steal_ip4(cache, &rt->dst, fl.saddr); } ttl = ip4_dst_hoplimit(&rt->dst); udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, dst->ipv4.s_addr, 0, ttl, 0, src->port, dst->port, false, true, 0); - ip_rt_put(rt); #if IS_ENABLED(CONFIG_IPV6) } else { if (!ndst) { @@ -215,13 +214,12 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, err = PTR_ERR(ndst); goto tx_error; } - dst_cache_set_ip6(cache, ndst, &fl6.saddr); + dst_cache_steal_ip6(cache, ndst, &fl6.saddr); } ttl = ip6_dst_hoplimit(ndst); udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, &src->ipv6, &dst->ipv6, 0, ttl, 0, src->port, dst->port, false, 0); - 
dst_release(ndst); #endif } local_bh_enable(); -- 2.51.0