Some sockets suffer from heavy false sharing on sk->sk_drops and on
fields in the same cache line.

Add sk->sk_drop_counters to:

- move the drop counter(s) to dedicated cache lines.

- add basic NUMA awareness to these drop counter(s).

Following patches will use this infrastructure for UDP and RAW sockets.

sk_clone_lock() is not yet ready; it would need to properly set
newsk->sk_drop_counters if we plan to use this for TCP sockets.

v2: used Paolo's suggestion from
    https://lore.kernel.org/netdev/8f09830a-d83d-43c9-b36b-88ba0a23e9b2@redhat.com/

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/sock.h | 32 +++++++++++++++++++++++++++++++-
 net/core/sock.c    |  2 ++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 9edb42ff06224cb8a1dd4f84af25bc22d1803ca9..73cd3316e288bde912dd96637e52d226575c2ffd 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -102,6 +102,11 @@ struct net;
 typedef __u32 __bitwise __portpair;
 typedef __u64 __bitwise __addrpair;
 
+struct socket_drop_counters {
+	atomic_t drops0 ____cacheline_aligned_in_smp;
+	atomic_t drops1 ____cacheline_aligned_in_smp;
+};
+
 /**
  * struct sock_common - minimal network layer representation of sockets
  * @skc_daddr: Foreign IPv4 addr
@@ -282,6 +287,7 @@ struct sk_filter;
  *	@sk_err_soft: errors that don't cause failure but are the cause of a
  *			persistent failure not just 'timed out'
  *	@sk_drops: raw/udp drops counter
+ *	@sk_drop_counters: optional pointer to socket_drop_counters
  *	@sk_ack_backlog: current listen backlog
  *	@sk_max_ack_backlog: listen backlog set in listen()
  *	@sk_uid: user id of owner
@@ -449,6 +455,7 @@ struct sock {
 #ifdef CONFIG_XFRM
 	struct xfrm_policy __rcu *sk_policy[2];
 #endif
+	struct socket_drop_counters *sk_drop_counters;
 	__cacheline_group_end(sock_read_rxtx);
 
 	__cacheline_group_begin(sock_write_rxtx);
@@ -2684,7 +2691,18 @@ struct sock_skb_cb {
 
 static inline void sk_drops_add(struct sock *sk, int segs)
 {
-	atomic_add(segs, &sk->sk_drops);
+	struct socket_drop_counters *sdc = sk->sk_drop_counters;
+
+	if (sdc) {
+		int n = numa_node_id() % 2;
+
+		if (n)
+			atomic_add(segs, &sdc->drops1);
+		else
+			atomic_add(segs, &sdc->drops0);
+	} else {
+		atomic_add(segs, &sk->sk_drops);
+	}
 }
 
 static inline void sk_drops_inc(struct sock *sk)
@@ -2694,11 +2712,23 @@ static inline void sk_drops_inc(struct sock *sk)
 
 static inline int sk_drops_read(const struct sock *sk)
 {
+	const struct socket_drop_counters *sdc = sk->sk_drop_counters;
+
+	if (sdc) {
+		DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops));
+		return atomic_read(&sdc->drops0) + atomic_read(&sdc->drops1);
+	}
 	return atomic_read(&sk->sk_drops);
 }
 
 static inline void sk_drops_reset(struct sock *sk)
 {
+	struct socket_drop_counters *sdc = sk->sk_drop_counters;
+
+	if (sdc) {
+		atomic_set(&sdc->drops0, 0);
+		atomic_set(&sdc->drops1, 0);
+	}
 	atomic_set(&sk->sk_drops, 0);
 }
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 75368823969a7992a55a6f40d87ffb8886de2f39..e66ad1ec3a2d969b71835a492806563519459749 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2505,6 +2505,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_wmem_queued	= 0;
 		newsk->sk_forward_alloc = 0;
 		newsk->sk_reserved_mem	= 0;
+		DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
 		sk_drops_reset(newsk);
 		newsk->sk_send_head	= NULL;
 		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
@@ -4457,6 +4458,7 @@ static int __init sock_struct_check(void)
 #ifdef CONFIG_MEMCG
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
 #endif
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_drop_counters);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
-- 
2.51.0.261.g7ce5a0a67e-goog
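Note, not part of the patch: a minimal sketch of how a protocol could
hook into this infrastructure, assuming a follow-up embeds the counters
in its protocol-private socket structure and wires the pointer at init
time. The foo_sock/foo_init_sock names below are purely illustrative;
the actual UDP and RAW wiring is left to the follow-up patches
mentioned in the changelog.

#include <net/sock.h>

/* Hypothetical protocol socket: the drop counters get their own cache
 * lines, away from the hot sock_read_rxtx/sock_write_rxtx fields of
 * struct sock.
 */
struct foo_sock {
	struct sock			sk;
	struct socket_drop_counters	drop_counters;
};

static int foo_init_sock(struct sock *sk)
{
	struct foo_sock *fp = (struct foo_sock *)sk;

	/* Once this pointer is set, sk_drops_add()/sk_drops_read()/
	 * sk_drops_reset() operate on drops0/drops1 (selected by NUMA
	 * node parity) instead of sk->sk_drops.
	 */
	sk->sk_drop_counters = &fp->drop_counters;
	return 0;
}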