This is a followup of commit e20dfbad8aab ("net: fix napi_consume_skb() with alien skbs"). Now that the per-cpu napi_skb_cache is populated from the TX completion path, we can make use of this cache, especially for cpus not used by a driver NAPI poll (the primary user of napi_cache). We can use the napi_skb_cache only if the current context is not hard irq. With this patch, I consistently reach 130 Mpps on my UDP tx stress test and reduce SLUB spinlock contention to smaller values. Note there is still some SLUB contention for skb->head allocations. I had to tune /sys/kernel/slab/skbuff_small_head/cpu_partial and /sys/kernel/slab/skbuff_small_head/min_partial depending on the platform taxonomy. Signed-off-by: Eric Dumazet --- net/core/skbuff.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c6b065c0a2af265159ee6188469936767a295729..bda7e2196060f97c9fb7f8effd5276a7f5db3a74 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -662,8 +662,13 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, if (unlikely(node != NUMA_NO_NODE && node != numa_mem_id())) goto fallback; - if (flags & SKB_ALLOC_NAPI) + if (flags & SKB_ALLOC_NAPI) { skb = napi_skb_cache_get(true); + } else if (!in_hardirq() && !irqs_disabled()) { + local_bh_disable(); + skb = napi_skb_cache_get(false); + local_bh_enable(); + } if (!skb) { fallback: -- 2.52.0.rc1.455.g30608eb744-goog