Currently, only skb dst is cleared (thanks to skb_dst_drop()) Make sure skb->destructor, conntrack and extensions are cleared. Signed-off-by: Eric Dumazet --- net/core/skbuff.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5b4bc8b1c7d5674c19b64f8b15685d74632048fe..eeddb9e737ff28e47c77739db7b25ea68e5aa735 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1149,11 +1149,10 @@ void skb_release_head_state(struct sk_buff *skb) skb); #endif + skb->destructor = NULL; } -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_conntrack_put(skb_nfct(skb)); -#endif - skb_ext_put(skb); + nf_reset_ct(skb); + skb_ext_reset(skb); } /* Free everything but the sk_buff shell. */ -- 2.51.2.1041.gc1ab5b90ca-goog There is a lack of NUMA awareness and more generally lack of slab caches affinity on TX completion path. Modern drivers are using napi_consume_skb(), hoping to cache sk_buff in per-cpu caches so that they can be recycled in RX path. Only use this if the skb was allocated on the same cpu, otherwise use skb_attempt_defer_free() so that the skb is freed on the original cpu. This removes contention on SLUB spinlocks and data structures. After this patch, I get ~50% improvement for an UDP tx workload on an AMD EPYC 9B45 (IDPF 200Gbit NIC with 32 TX queues). 80 Mpps -> 120 Mpps. Profiling one of the 32 cpus servicing NIC interrupts : Before: mpstat -P 511 1 1 Average: CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle Average: 511 0.00 0.00 0.00 0.00 0.00 98.00 0.00 0.00 0.00 2.00 31.01% ksoftirqd/511 [kernel.kallsyms] [k] queued_spin_lock_slowpath 12.45% swapper [kernel.kallsyms] [k] queued_spin_lock_slowpath 5.60% ksoftirqd/511 [kernel.kallsyms] [k] __slab_free 3.31% ksoftirqd/511 [kernel.kallsyms] [k] idpf_tx_clean_buf_ring 3.27% ksoftirqd/511 [kernel.kallsyms] [k] idpf_tx_splitq_clean_all 2.95% ksoftirqd/511 [kernel.kallsyms] [k] idpf_tx_splitq_start 2.52% ksoftirqd/511 [kernel.kallsyms] [k] fq_dequeue 2.32% ksoftirqd/511 [kernel.kallsyms] [k] read_tsc 2.25% ksoftirqd/511 [kernel.kallsyms] [k] build_detached_freelist 2.15% ksoftirqd/511 [kernel.kallsyms] [k] kmem_cache_free 2.11% swapper [kernel.kallsyms] [k] __slab_free 2.06% ksoftirqd/511 [kernel.kallsyms] [k] idpf_features_check 2.01% ksoftirqd/511 [kernel.kallsyms] [k] idpf_tx_splitq_clean_hdr 1.97% ksoftirqd/511 [kernel.kallsyms] [k] skb_release_data 1.52% ksoftirqd/511 [kernel.kallsyms] [k] sock_wfree 1.34% swapper [kernel.kallsyms] [k] idpf_tx_clean_buf_ring 1.23% swapper [kernel.kallsyms] [k] idpf_tx_splitq_clean_all 1.15% ksoftirqd/511 [kernel.kallsyms] [k] dma_unmap_page_attrs 1.11% swapper [kernel.kallsyms] [k] idpf_tx_splitq_start 1.03% swapper [kernel.kallsyms] [k] fq_dequeue 0.94% swapper [kernel.kallsyms] [k] kmem_cache_free 0.93% swapper [kernel.kallsyms] [k] read_tsc 0.81% ksoftirqd/511 [kernel.kallsyms] [k] napi_consume_skb 0.79% swapper [kernel.kallsyms] [k] idpf_tx_splitq_clean_hdr 0.77% ksoftirqd/511 [kernel.kallsyms] [k] skb_free_head 0.76% swapper [kernel.kallsyms] [k] idpf_features_check 0.72% swapper [kernel.kallsyms] [k] skb_release_data 0.69% swapper [kernel.kallsyms] [k] build_detached_freelist 0.58% ksoftirqd/511 [kernel.kallsyms] [k] skb_release_head_state 0.56% ksoftirqd/511 [kernel.kallsyms] [k] __put_partials 0.55% ksoftirqd/511 [kernel.kallsyms] [k] kmem_cache_free_bulk 0.48% swapper [kernel.kallsyms] [k] sock_wfree After: mpstat -P 511 1 1 Average: CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle Average: 511 0.00 0.00 0.00 0.00 0.00 51.49 0.00 0.00 0.00 48.51 19.10% swapper [kernel.kallsyms] [k] idpf_tx_splitq_clean_hdr 13.86% swapper [kernel.kallsyms] [k] idpf_tx_clean_buf_ring 10.80% swapper [kernel.kallsyms] [k] skb_attempt_defer_free 10.57% swapper [kernel.kallsyms] [k] idpf_tx_splitq_clean_all 7.18% swapper [kernel.kallsyms] [k] queued_spin_lock_slowpath 6.69% swapper [kernel.kallsyms] [k] sock_wfree 5.55% swapper [kernel.kallsyms] [k] dma_unmap_page_attrs 3.10% swapper [kernel.kallsyms] [k] fq_dequeue 3.00% swapper [kernel.kallsyms] [k] skb_release_head_state 2.73% swapper [kernel.kallsyms] [k] read_tsc 2.48% swapper [kernel.kallsyms] [k] idpf_tx_splitq_start 1.20% swapper [kernel.kallsyms] [k] idpf_features_check 1.13% swapper [kernel.kallsyms] [k] napi_consume_skb 0.93% swapper [kernel.kallsyms] [k] idpf_vport_splitq_napi_poll 0.64% swapper [kernel.kallsyms] [k] native_send_call_func_single_ipi 0.60% swapper [kernel.kallsyms] [k] acpi_processor_ffh_cstate_enter 0.53% swapper [kernel.kallsyms] [k] io_idle 0.43% swapper [kernel.kallsyms] [k] netif_skb_features 0.41% swapper [kernel.kallsyms] [k] __direct_call_cpuidle_state_enter2 0.40% swapper [kernel.kallsyms] [k] native_irq_return_iret 0.40% swapper [kernel.kallsyms] [k] idpf_tx_buf_hw_update 0.36% swapper [kernel.kallsyms] [k] sched_clock_noinstr 0.34% swapper [kernel.kallsyms] [k] handle_softirqs 0.32% swapper [kernel.kallsyms] [k] net_rx_action 0.32% swapper [kernel.kallsyms] [k] dql_completed 0.32% swapper [kernel.kallsyms] [k] validate_xmit_skb 0.31% swapper [kernel.kallsyms] [k] skb_network_protocol 0.29% swapper [kernel.kallsyms] [k] skb_csum_hwoffload_help 0.29% swapper [kernel.kallsyms] [k] x2apic_send_IPI 0.28% swapper [kernel.kallsyms] [k] ktime_get 0.24% swapper [kernel.kallsyms] [k] __qdisc_run Signed-off-by: Eric Dumazet --- net/core/skbuff.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index eeddb9e737ff28e47c77739db7b25ea68e5aa735..7ac5f8aa1235a55db02b40b5a0f51bb3fa53fa03 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1476,6 +1476,11 @@ void napi_consume_skb(struct sk_buff *skb, int budget) DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + if (skb->alloc_cpu != smp_processor_id() && !skb_shared(skb)) { + skb_release_head_state(skb); + return skb_attempt_defer_free(skb); + } + if (!skb_unref(skb)) return; -- 2.51.2.1041.gc1ab5b90ca-goog skb_defer_max value is very conservative, and can be increased to avoid too many calls to kick_defer_list_purge(). Signed-off-by: Eric Dumazet --- Documentation/admin-guide/sysctl/net.rst | 4 ++-- net/core/hotdata.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 991773dcb9cfe57f64bffabc018549b712aed9b0..369a738a68193e897d880eeb2c5a22cd90833938 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -355,9 +355,9 @@ skb_defer_max ------------- Max size (in skbs) of the per-cpu list of skbs being freed -by the cpu which allocated them. Used by TCP stack so far. +by the cpu which allocated them. -Default: 64 +Default: 128 optmem_max ---------- diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 95d0a4df10069e4529fb9e5b58e8391574085cf1..dddd5c287cf08ba75aec1cc546fd1bc48c0f7b26 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -20,7 +20,7 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .dev_tx_weight = 64, .dev_rx_weight = 64, .sysctl_max_skb_frags = MAX_SKB_FRAGS, - .sysctl_skb_defer_max = 64, + .sysctl_skb_defer_max = 128, .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE }; EXPORT_SYMBOL(net_hotdata); -- 2.51.2.1041.gc1ab5b90ca-goog