ipv6_pinfo.saddr_cache is either NULL or &np->saddr. We do not need 8 bytes, a boolean is enough. Signed-off-by: Eric Dumazet --- include/linux/ipv6.h | 4 ++-- include/net/ip6_route.h | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 3 ++- net/ipv6/route.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 7 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index f43314517396777105cc20ba30cac9c651b7dbf9..55c4d1e4dd7df803440e3a3cf18245a495ad949b 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -216,10 +216,10 @@ struct inet6_cork { struct ipv6_pinfo { struct in6_addr saddr; struct in6_pktinfo sticky_pktinfo; - const struct in6_addr *daddr_cache; #ifdef CONFIG_IPV6_SUBTREES - const struct in6_addr *saddr_cache; + bool saddr_cache; #endif + const struct in6_addr *daddr_cache; __be32 flow_label; __u32 frag_size; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 59f48ca3abdf5a8aef6b4ece13f9a1774fc04f38..223c02d4268858cd3f1c83f949877dabc17efbc8 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -230,7 +230,7 @@ static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, const struct in6_addr *daddr, - const struct in6_addr *saddr) + bool saddr_set) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -238,7 +238,7 @@ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, sk_setup_caps(sk, dst); np->daddr_cache = daddr; #ifdef CONFIG_IPV6_SUBTREES - np->saddr_cache = saddr; + np->saddr_cache = saddr_set; #endif } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 1992621e3f3f4b5b5c63e857b7b1c90576d3766e..c342f8daea7fa9469fa7f3a2d1f0a78572b9ae9a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,7 +857,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - ip6_dst_store(sk, dst, NULL, 
NULL); + ip6_dst_store(sk, dst, NULL, false); } return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 333e43434dd78d73f960708a327c704a185e88d3..1947ccdb00df2301be1a8ce651d635dafd08c3b4 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -91,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); } return dst; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9d64c13bab5eacb4cc05c78cccd86a7aeb36d37e..82ff6e1293d04dc9d69a661080cd0ae965cf766c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1102,7 +1102,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, */ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || #ifdef CONFIG_IPV6_SUBTREES - ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, + np->saddr_cache ? &np->saddr : NULL) || #endif (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { dst_release(dst); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3371f16b7a3e615bbb41ee0d1a7c9187a761fc0c..e1b0aebf8bf92b711581ddb5cde8d9a840e33036 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3036,9 +3036,9 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, &sk->sk_v6_daddr : NULL, #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 
- &np->saddr : + true : #endif - NULL); + false); } static bool ip6_redirect_nh_match(const struct fib6_result *res, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 08dabc47a6e7334b89b306af3a1e1c89c9935bb6..3e41ac94beb7d6fdfb6743ea5dbd609140234219 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -299,7 +299,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); icsk->icsk_ext_hdr_len = 0; if (opt) @@ -1458,7 +1458,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, false); newnp->saddr = ireq->ir_v6_loc_addr; -- 2.51.0.384.g4c02a37b29-goog ipv6_pinfo.daddr_cache is either NULL or &sk->sk_v6_daddr We do not need 8 bytes, a boolean is enough. Signed-off-by: Eric Dumazet --- include/linux/ipv6.h | 2 +- include/net/ip6_route.h | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 3 ++- net/ipv6/route.c | 3 +-- net/ipv6/tcp_ipv6.c | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 55c4d1e4dd7df803440e3a3cf18245a495ad949b..8e6d9f8b3dc80c3904ff13e1d218b9527a554e35 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -219,7 +219,7 @@ struct ipv6_pinfo { #ifdef CONFIG_IPV6_SUBTREES bool saddr_cache; #endif - const struct in6_addr *daddr_cache; + bool daddr_cache; __be32 flow_label; __u32 frag_size; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 223c02d4268858cd3f1c83f949877dabc17efbc8..7c5512baa4b2b7503494b1ae02756df29ef93666 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -229,14 +229,14 @@ static inline const struct rt6_info *skb_rt6_info(const struct sk_buff 
*skb) * Store a destination cache entry in a socket */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, + bool daddr_set, bool saddr_set) { struct ipv6_pinfo *np = inet6_sk(sk); np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); sk_setup_caps(sk, dst); - np->daddr_cache = daddr; + np->daddr_cache = daddr_set; #ifdef CONFIG_IPV6_SUBTREES np->saddr_cache = saddr_set; #endif diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c342f8daea7fa9469fa7f3a2d1f0a78572b9ae9a..1b0314644e0ccce137158160945b11511588c1df 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,7 +857,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); } return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 1947ccdb00df2301be1a8ce651d635dafd08c3b4..ea5cf3fdfdd648e43f4c53611f61509ce06d4cf8 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -91,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); } return dst; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 82ff6e1293d04dc9d69a661080cd0ae965cf766c..f904739e99b907a5704c32452ff585479e369727 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1100,7 +1100,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, * sockets. * 2. oif also should be the same. */ - if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, + np->daddr_cache ? &sk->sk_v6_daddr : NULL) || #ifdef CONFIG_IPV6_SUBTREES ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache ? 
&np->saddr : NULL) || diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e1b0aebf8bf92b711581ddb5cde8d9a840e33036..aee6a10b112aac6b17a2ca241b7ecc42ab883a2f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3032,8 +3032,7 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, #endif ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? - &sk->sk_v6_daddr : NULL, + ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr), #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? true : diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3e41ac94beb7d6fdfb6743ea5dbd609140234219..b76504eebcfa9272fed909655cdc695e82e721dc 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -299,7 +299,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); icsk->icsk_ext_hdr_len = 0; if (opt) @@ -1458,7 +1458,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ip6_dst_store(newsk, dst, NULL, false); + ip6_dst_store(newsk, dst, false, false); newnp->saddr = ireq->ir_v6_loc_addr; -- 2.51.0.384.g4c02a37b29-goog Add READ_ONCE() annotations because np->rxpmtu can be changed while udpv6_recvmsg() and rawv6_recvmsg() read it. Since this is a very rarely used feature, and that udpv6_recvmsg() and rawv6_recvmsg() read np->rxopt anyway, change the test order so that np->rxpmtu does not need to be in a hot cache line. 
Signed-off-by: Eric Dumazet --- net/ipv6/raw.c | 2 +- net/ipv6/udp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4ae07a67b4d4f1be6730c252d246e79ff9c73d4c..e369f54844dd9456a819db77435eaef33d162932 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -445,7 +445,7 @@ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); - if (np->rxpmtu && np->rxopt.bits.rxpmtu) + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b70369f3cd3223cfde07556b1cb1636e8bc78d49..e87d0ef861f88af3ff7bf9dd5045c4d4601036e3 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -479,7 +479,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); - if (np->rxpmtu && np->rxopt.bits.rxpmtu) + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: -- 2.51.0.384.g4c02a37b29-goog Move fields used in tx fast path at the beginning of the structure, and seldom used ones at the end. Note that rxopt is also in the first cache line. 
Signed-off-by: Eric Dumazet --- include/linux/ipv6.h | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 8e6d9f8b3dc80c3904ff13e1d218b9527a554e35..43b7bb82873881b38a461031b784f55c740a0741 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -214,18 +214,21 @@ struct inet6_cork { /* struct ipv6_pinfo - ipv6 private area */ struct ipv6_pinfo { + /* Used in tx path (inet6_csk_route_socket(), ip6_xmit()) */ struct in6_addr saddr; - struct in6_pktinfo sticky_pktinfo; + __be32 flow_label; + u32 dst_cookie; + struct ipv6_txoptions __rcu *opt; + s16 hop_limit; + u8 pmtudisc; + u8 tclass; #ifdef CONFIG_IPV6_SUBTREES bool saddr_cache; #endif bool daddr_cache; - __be32 flow_label; - __u32 frag_size; - - s16 hop_limit; u8 mcast_hops; + u32 frag_size; int ucast_oif; int mcast_oif; @@ -233,7 +236,7 @@ struct ipv6_pinfo { /* pktoption flags */ union { struct { - __u16 srcrt:1, + u16 srcrt:1, osrcrt:1, rxinfo:1, rxoinfo:1, @@ -250,29 +253,25 @@ struct ipv6_pinfo { recvfragsize:1; /* 1 bits hole */ } bits; - __u16 all; + u16 all; } rxopt; /* sockopt flags */ - __u8 srcprefs; /* 001: prefer temporary address + u8 srcprefs; /* 001: prefer temporary address * 010: prefer public address * 100: prefer care-of address */ - __u8 pmtudisc; - __u8 min_hopcount; - __u8 tclass; + u8 min_hopcount; __be32 rcv_flowinfo; + struct in6_pktinfo sticky_pktinfo; - __u32 dst_cookie; + struct sk_buff *pktoptions; + struct sk_buff *rxpmtu; + struct inet6_cork cork; struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; struct ipv6_fl_socklist __rcu *ipv6_fl_list; - - struct ipv6_txoptions __rcu *opt; - struct sk_buff *pktoptions; - struct sk_buff *rxpmtu; - struct inet6_cork cork; }; /* We currently use available bits from inet_sk(sk)->inet_flags, -- 2.51.0.384.g4c02a37b29-goog Commit 5a465a0da13e ("udp: Fix multiple wraparounds of sk->sk_rmem_alloc.") allowed to 
slightly overshoot sk->sk_rmem_alloc, when many cpus are trying to feed packets to a common UDP socket. This patch, combined with the following one reduces false sharing on the victim socket under DDOS. Signed-off-by: Eric Dumazet --- net/ipv4/udp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cca41c569f37621404829e096306ba7d78ce4d43..edd846fee90ff7850356a5cb3400ce96856e5429 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1739,8 +1739,8 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (rcvbuf > INT_MAX >> 1) goto drop; - /* Always allow at least one packet for small buffer. */ - if (rmem > rcvbuf) + /* Accept the packet if queue is empty. */ + if (rmem) goto drop; } -- 2.51.0.384.g4c02a37b29-goog Avoid piling too many producers on the busylock by updating sk_rmem_alloc before busylock acquisition. Signed-off-by: Eric Dumazet --- net/ipv4/udp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index edd846fee90ff7850356a5cb3400ce96856e5429..658ae87827991a78c25c2172d52e772c94ea217f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1753,13 +1753,16 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (rmem > (rcvbuf >> 1)) { skb_condense(skb); size = skb->truesize; + rmem = atomic_add_return(size, &sk->sk_rmem_alloc); + if (rmem > rcvbuf) + goto uncharge_drop; busy = busylock_acquire(sk); + } else { + atomic_add(size, &sk->sk_rmem_alloc); } udp_set_dev_scratch(skb); - atomic_add(size, &sk->sk_rmem_alloc); - spin_lock(&list->lock); err = udp_rmem_schedule(sk, size); if (err) { -- 2.51.0.384.g4c02a37b29-goog UDP receivers suffer from sk_rmem_alloc updates, currently sharing a cache line with fields that need to be read-mostly (sock_read_rx group): 1) RFS enabled hosts read sk_napi_id from __udpv6_queue_rcv_skb(). 
2) sk->sk_rcvbuf is read from __udp_enqueue_schedule_skb() /* --- cacheline 3 boundary (192 bytes) --- */ struct { atomic_t rmem_alloc; /* 0xc0 0x4 */ // Oops int len; /* 0xc4 0x4 */ struct sk_buff * head; /* 0xc8 0x8 */ struct sk_buff * tail; /* 0xd0 0x8 */ } sk_backlog; /* 0xc0 0x18 */ __u8 __cacheline_group_end__sock_write_rx[0]; /* 0xd8 0 */ __u8 __cacheline_group_begin__sock_read_rx[0]; /* 0xd8 0 */ struct dst_entry * sk_rx_dst; /* 0xd8 0x8 */ int sk_rx_dst_ifindex;/* 0xe0 0x4 */ u32 sk_rx_dst_cookie; /* 0xe4 0x4 */ unsigned int sk_ll_usec; /* 0xe8 0x4 */ unsigned int sk_napi_id; /* 0xec 0x4 */ u16 sk_busy_poll_budget;/* 0xf0 0x2 */ u8 sk_prefer_busy_poll;/* 0xf2 0x1 */ u8 sk_userlocks; /* 0xf3 0x1 */ int sk_rcvbuf; /* 0xf4 0x4 */ struct sk_filter * sk_filter; /* 0xf8 0x8 */ Move sk_error_queue (which is less often dirtied) there. An alternative would be to cache-align sock_read_rx, but this has more implications/risks. Signed-off-by: Eric Dumazet --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/sock.h b/include/net/sock.h index 0fd465935334160eeda7c1ea608f5d6161f02cb1..867dc44140d4c1b56ecfab1220c81133fe0394a0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -394,7 +394,6 @@ struct sock { atomic_t sk_drops; __s32 sk_peek_off; - struct sk_buff_head sk_error_queue; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with @@ -412,6 +411,7 @@ struct sock { } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc + struct sk_buff_head sk_error_queue; __cacheline_group_end(sock_write_rx); __cacheline_group_begin(sock_read_rx); -- 2.51.0.384.g4c02a37b29-goog Generic sk_drops_inc() reads sk->sk_drop_counters. We know the precise location for UDP sockets. Move sk_drop_counters out of sock_read_rxtx so that sock_write_rxtx starts at a cache line boundary. 
Signed-off-by: Eric Dumazet --- include/net/sock.h | 2 +- include/net/udp.h | 5 +++++ net/core/sock.c | 1 - net/ipv4/udp.c | 12 ++++++------ net/ipv6/udp.c | 6 +++--- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 867dc44140d4c1b56ecfab1220c81133fe0394a0..82bcdb7d7e6779de41ace0dde3a8b54e6adb0c14 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -451,7 +451,6 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif - struct numa_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -568,6 +567,7 @@ struct sock { #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif + struct numa_drop_counters *sk_drop_counters; struct rcu_head sk_rcu; netns_tracker ns_tracker; struct xarray sk_user_frags; diff --git a/include/net/udp.h b/include/net/udp.h index 93b159f30e884ce7d30e2d2240b846441c5e135b..a08822e294b038c0d00d4a5f5cac62286a207926 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -295,6 +295,11 @@ static inline void udp_lib_init_sock(struct sock *sk) set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } +static inline void udp_drops_inc(struct sock *sk) +{ + numa_drop_add(&udp_sk(sk)->drop_counters, 1); +} + /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline int udp_lib_hash(struct sock *sk) { diff --git a/net/core/sock.c b/net/core/sock.c index 1f8ef4d8bcd9e8084eda82cad44c010071ceb171..21742da19e45bbe53e84b8a87d5a23bc2d2275f8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4444,7 +4444,6 @@ static int __init sock_struct_check(void) #ifdef CONFIG_MEMCG CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); #endif - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_drop_counters); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, 
sk_reserved_mem); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 658ae87827991a78c25c2172d52e772c94ea217f..25143f932447df2a84dd113ca33e1ccf15b3503c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1790,7 +1790,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) atomic_sub(skb->truesize, &sk->sk_rmem_alloc); drop: - sk_drops_inc(sk); + udp_drops_inc(sk); busylock_release(busy); return err; } @@ -1855,7 +1855,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, IS_UDPLITE(sk)); __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); - sk_drops_inc(sk); + udp_drops_inc(sk); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); @@ -2011,7 +2011,7 @@ int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); - sk_drops_inc(sk); + udp_drops_inc(sk); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } @@ -2081,7 +2081,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, if (unlikely(err)) { if (!peeking) { - sk_drops_inc(sk); + udp_drops_inc(sk); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } @@ -2452,7 +2452,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - sk_drops_inc(sk); + udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2537,7 +2537,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - sk_drops_inc(sk); + udp_drops_inc(sk); __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(net, UDP_MIB_INERRORS, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 
e87d0ef861f88af3ff7bf9dd5045c4d4601036e3..9f4d340d1e3a63d38f80138ef9f6aac4a33afa05 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -524,7 +524,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } if (unlikely(err)) { if (!peeking) { - sk_drops_inc(sk); + udp_drops_inc(sk); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -908,7 +908,7 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - sk_drops_inc(sk); + udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -1013,7 +1013,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - sk_drops_inc(sk); + udp_drops_inc(sk); __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP6_INC_STATS(net, UDP_MIB_INERRORS, -- 2.51.0.384.g4c02a37b29-goog While having all spinlocks packed into an array was a space saver, this also caused NUMA imbalance and hash collisions. UDPv6 socket size becomes 1600 after this patch. 
Signed-off-by: Eric Dumazet --- include/linux/udp.h | 1 + include/net/udp.h | 1 + net/ipv4/udp.c | 20 ++------------------ 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 6ed008ab166557e868c1918daaaa5d551b7989a7..e554890c4415b411f35007d3ece9e6042db7a544 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -109,6 +109,7 @@ struct udp_sock { */ struct hlist_node tunnel_list; struct numa_drop_counters drop_counters; + spinlock_t busylock ____cacheline_aligned_in_smp; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/udp.h b/include/net/udp.h index a08822e294b038c0d00d4a5f5cac62286a207926..eecd64097f91196897f45530540b9c9b68c5ba4e 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -289,6 +289,7 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); sk->sk_drop_counters = &up->drop_counters; + spin_lock_init(&up->busylock); skb_queue_head_init(&up->reader_queue); INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 25143f932447df2a84dd113ca33e1ccf15b3503c..7d1444821ee51a19cd5fd0dd5b8d096104c9283c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1689,17 +1689,11 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) * to relieve pressure on the receive_queue spinlock shared by consumer. * Under flood, this means that only one producer can be in line * trying to acquire the receive_queue spinlock. 
- * These busylock can be allocated on a per cpu manner, instead of a - * per socket one (that would consume a cache line per socket) */ -static int udp_busylocks_log __read_mostly; -static spinlock_t *udp_busylocks __read_mostly; - -static spinlock_t *busylock_acquire(void *ptr) +static spinlock_t *busylock_acquire(struct sock *sk) { - spinlock_t *busy; + spinlock_t *busy = &udp_sk(sk)->busylock; - busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log); spin_lock(busy); return busy; } @@ -3997,7 +3991,6 @@ static void __init bpf_iter_register(void) void __init udp_init(void) { unsigned long limit; - unsigned int i; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; @@ -4006,15 +3999,6 @@ void __init udp_init(void) sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; - /* 16 spinlocks per cpu */ - udp_busylocks_log = ilog2(nr_cpu_ids) + 4; - udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, - GFP_KERNEL); - if (!udp_busylocks) - panic("UDP: failed to alloc udp_busylocks\n"); - for (i = 0; i < (1U << udp_busylocks_log); i++) - spin_lock_init(udp_busylocks + i); - if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); -- 2.51.0.384.g4c02a37b29-goog Move skb freeing from udp recvmsg() path to the cpu which allocated/received it, as TCP did in linux-5.17. This increases max throughput by 20% to 30%, depending on number of BH producers. 
Signed-off-by: Eric Dumazet --- net/ipv4/udp.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7d1444821ee51a19cd5fd0dd5b8d096104c9283c..0c40426628eb2306b609881341a51307c4993871 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1825,6 +1825,13 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset))) sk_peek_offset_bwd(sk, len); + if (!skb_shared(skb)) { + if (unlikely(udp_skb_has_head_state(skb))) + skb_release_head_state(skb); + skb_attempt_defer_free(skb); + return; + } + if (!skb_unref(skb)) return; -- 2.51.0.384.g4c02a37b29-goog