Make all previous jumps to the out label simply return directly, as the
out label only contained 'return ret;'.

Signed-off-by: Jon Kohler
---
 drivers/net/tun.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8192740357a0..dcce49a26800 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2435,8 +2435,7 @@ static int tun_xdp_one(struct tun_struct *tun,
 build:
 	skb = build_skb(xdp->data_hard_start, buflen);
 	if (!skb) {
-		ret = -ENOMEM;
-		goto out;
+		return -ENOMEM;
 	}
 
 	skb_reserve(skb, xdp->data - xdp->data_hard_start);
@@ -2455,8 +2454,7 @@ static int tun_xdp_one(struct tun_struct *tun,
 	if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, tnl_hdr)) {
 		atomic_long_inc(&tun->rx_frame_errors);
 		kfree_skb(skb);
-		ret = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 
 	skb->protocol = eth_type_trans(skb, tun->dev);
@@ -2467,8 +2465,7 @@ static int tun_xdp_one(struct tun_struct *tun,
 	if (skb_xdp) {
 		ret = do_xdp_generic(xdp_prog, &skb);
 		if (ret != XDP_PASS) {
-			ret = 0;
-			goto out;
+			return 0;
 		}
 	}
 
@@ -2502,7 +2499,6 @@ static int tun_xdp_one(struct tun_struct *tun,
 	if (rxhash)
 		tun_flow_update(tun, rxhash, tfile);
 
-out:
 	return ret;
 }
 
-- 
2.43.0

Add a drop reason to the kfree_skb() calls in tun_xdp_one() and ensure
that all failing paths properly increment the drop counter.

Signed-off-by: Jon Kohler
---
 drivers/net/tun.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index dcce49a26800..68ad46ab04a4 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2392,8 +2392,10 @@ static int tun_xdp_one(struct tun_struct *tun,
 	bool skb_xdp = false;
 	struct page *page;
 
-	if (unlikely(datasize < ETH_HLEN))
+	if (unlikely(datasize < ETH_HLEN)) {
+		dev_core_stats_rx_dropped_inc(tun->dev);
 		return -EINVAL;
+	}
 
 	xdp_prog = rcu_dereference(tun->xdp_prog);
 	if (xdp_prog) {
@@ -2407,6 +2409,7 @@ static int tun_xdp_one(struct tun_struct *tun,
 		act = bpf_prog_run_xdp(xdp_prog, xdp);
 		ret = tun_xdp_act(tun, xdp_prog, xdp, act);
 		if (ret < 0) {
+			/* tun_xdp_act already handles drop statistics */
 			put_page(virt_to_head_page(xdp->data));
 			return ret;
 		}
@@ -2435,6 +2438,7 @@ static int tun_xdp_one(struct tun_struct *tun,
 build:
 	skb = build_skb(xdp->data_hard_start, buflen);
 	if (!skb) {
+		dev_core_stats_rx_dropped_inc(tun->dev);
 		return -ENOMEM;
 	}
 
@@ -2453,7 +2457,8 @@ static int tun_xdp_one(struct tun_struct *tun,
 	tnl_hdr = (struct virtio_net_hdr_v1_hash_tunnel *)gso;
 	if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, tnl_hdr)) {
 		atomic_long_inc(&tun->rx_frame_errors);
-		kfree_skb(skb);
+		kfree_skb_reason(skb, SKB_DROP_REASON_DEV_HDR);
+		dev_core_stats_rx_dropped_inc(tun->dev);
 		return -EINVAL;
 	}
 
@@ -2479,7 +2484,8 @@ static int tun_xdp_one(struct tun_struct *tun,
 
 	if (unlikely(tfile->detached)) {
 		spin_unlock(&queue->lock);
-		kfree_skb(skb);
+		kfree_skb_reason(skb, SKB_DROP_REASON_DEV_READY);
+		dev_core_stats_rx_dropped_inc(tun->dev);
 		return -EBUSY;
 	}
 
-- 
2.43.0

Fold the kfree_skb()/consume_skb() calls that follow tun_put_user()
into tun_put_user() itself, and rework kfree_skb() into
kfree_skb_reason() so that a drop reason is recorded. Add a drop reason
to all drop sites and ensure that all failing paths properly increment
the drop counter.
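The resulting contract can be sketched roughly as below (illustrative
only, not the exact tun.c code; the function name and arguments are
placeholders): the helper now owns the skb on every path, consuming it
on success and freeing it with a specific reason, plus a tx drop
counter bump, on failure, so the read path no longer needs its own
kfree_skb()/consume_skb() pair:

	static ssize_t put_user_sketch(struct net_device *dev,
				       struct sk_buff *skb,
				       struct iov_iter *iter)
	{
		enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
		ssize_t total = skb->len;

		if (iov_iter_count(iter) < skb->len) {
			reason = SKB_DROP_REASON_PKT_TOO_SMALL;
			goto drop;
		}
		if (skb_copy_datagram_iter(skb, 0, iter, skb->len)) {
			reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
			goto drop;
		}

		consume_skb(skb);	/* success: skb consumed here */
		return total;

	drop:
		dev_core_stats_tx_dropped_inc(dev);	/* every error path counts */
		kfree_skb_reason(skb, reason);
		return -EFAULT;
	}
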
Signed-off-by: Jon Kohler
---
 drivers/net/tun.c | 51 +++++++++++++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 68ad46ab04a4..e0f5e1fe4bd0 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2035,6 +2035,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 			    struct sk_buff *skb,
 			    struct iov_iter *iter)
 {
+	enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
 	struct tun_pi pi = { 0, skb->protocol };
 	ssize_t total;
 	int vlan_offset = 0;
@@ -2051,8 +2052,11 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	total = skb->len + vlan_hlen + vnet_hdr_sz;
 
 	if (!(tun->flags & IFF_NO_PI)) {
-		if (iov_iter_count(iter) < sizeof(pi))
-			return -EINVAL;
+		if (iov_iter_count(iter) < sizeof(pi)) {
+			ret = -EINVAL;
+			drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
+			goto drop;
+		}
 
 		total += sizeof(pi);
 		if (iov_iter_count(iter) < total) {
@@ -2060,8 +2064,11 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 			pi.flags |= TUN_PKT_STRIP;
 		}
 
-		if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi))
-			return -EFAULT;
+		if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi)) {
+			ret = -EFAULT;
+			drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
+			goto drop;
+		}
 	}
 
 	if (vnet_hdr_sz) {
@@ -2070,8 +2077,10 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 
 		ret = tun_vnet_hdr_tnl_from_skb(tun->flags, tun->dev, skb,
 						&hdr);
-		if (ret)
-			return ret;
+		if (ret) {
+			drop_reason = SKB_DROP_REASON_DEV_HDR;
+			goto drop;
+		}
 
 		/*
 		 * Drop the packet if the configured header size is too small
@@ -2080,8 +2089,10 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 		gso = (struct virtio_net_hdr *)&hdr;
 		ret = __tun_vnet_hdr_put(vnet_hdr_sz, tun->dev->features,
 					 iter, gso);
-		if (ret)
-			return ret;
+		if (ret) {
+			drop_reason = SKB_DROP_REASON_DEV_HDR;
+			goto drop;
+		}
 	}
 
 	if (vlan_hlen) {
@@ -2094,23 +2105,33 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
 
 		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
-		if (ret || !iov_iter_count(iter))
-			goto done;
+		if (ret || !iov_iter_count(iter)) {
+			drop_reason = SKB_DROP_REASON_DEV_HDR;
+			goto drop;
+		}
 
 		ret = copy_to_iter(&veth, sizeof(veth), iter);
-		if (ret != sizeof(veth) || !iov_iter_count(iter))
-			goto done;
+		if (ret != sizeof(veth) || !iov_iter_count(iter)) {
+			drop_reason = SKB_DROP_REASON_DEV_HDR;
+			goto drop;
+		}
 	}
 
 	skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
 
-done:
 	/* caller is in process context, */
 	preempt_disable();
 	dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen);
 	preempt_enable();
 
+	consume_skb(skb);
+
 	return total;
+
+drop:
+	dev_core_stats_tx_dropped_inc(tun->dev);
+	kfree_skb_reason(skb, drop_reason);
+	return ret;
 }
 
 static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
@@ -2182,10 +2203,6 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 		struct sk_buff *skb = ptr;
 
 		ret = tun_put_user(tun, tfile, skb, to);
-		if (unlikely(ret < 0))
-			kfree_skb(skb);
-		else
-			consume_skb(skb);
 	}
 
 	return ret;
 }
-- 
2.43.0

Improve on commit 4b4f052e2d89 ("net: tun: track dropped skb via
kfree_skb_reason()") and commit ab00af85d2f8 ("net: tun: rebuild error
handling in tun_get_user") by updating all potential drop sites in
tun_get_user with appropriate drop reasons. Rework goto free_skb to
goto drop, so that drop statistics will be updated.

Redirect early failures to a new drop_stats_only label, which does not
need to deal with the skb, as it has not been allocated yet at that
point.

Cc: Chuang Wang
Signed-off-by: Jon Kohler
---
 drivers/net/tun.c | 53 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index e0f5e1fe4bd0..97f130bc5fed 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1657,6 +1657,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 		}
 		err = tun_xdp_act(tun, xdp_prog, &xdp, act);
 		if (err < 0) {
+			/* tun_xdp_act already handles drop statistics */
 			if (act == XDP_REDIRECT || act == XDP_TX)
 				put_page(alloc_frag->page);
 			goto out;
@@ -1720,12 +1721,17 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	gso = (struct virtio_net_hdr *)&hdr;
 
 	if (!(tun->flags & IFF_NO_PI)) {
-		if (len < sizeof(pi))
-			return -EINVAL;
+		if (len < sizeof(pi)) {
+			err = -EINVAL;
+			goto drop_stats_only;
+		}
+
 		len -= sizeof(pi);
 
-		if (!copy_from_iter_full(&pi, sizeof(pi), from))
-			return -EFAULT;
+		if (!copy_from_iter_full(&pi, sizeof(pi), from)) {
+			err = -EFAULT;
+			goto drop_stats_only;
+		}
 	}
 
 	if (tun->flags & IFF_VNET_HDR) {
@@ -1734,16 +1740,20 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		features = tun_vnet_hdr_guest_features(vnet_hdr_sz);
 		hdr_len = __tun_vnet_hdr_get(vnet_hdr_sz, tun->flags,
 					     features, from, gso);
-		if (hdr_len < 0)
-			return hdr_len;
+		if (hdr_len < 0) {
+			err = hdr_len;
+			goto drop_stats_only;
+		}
 
 		len -= vnet_hdr_sz;
 	}
 
 	if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) {
 		align += NET_IP_ALIGN;
-		if (unlikely(len < ETH_HLEN || (hdr_len && hdr_len < ETH_HLEN)))
-			return -EINVAL;
+		if (unlikely(len < ETH_HLEN || (hdr_len && hdr_len < ETH_HLEN))) {
+			err = -EINVAL;
+			goto drop_stats_only;
+		}
 	}
 
 	good_linear = SKB_MAX_HEAD(align);
@@ -1769,9 +1779,18 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		 */
 		skb = tun_build_skb(tun, tfile, from, gso, len, &skb_xdp);
 		err = PTR_ERR_OR_ZERO(skb);
-		if (err)
+		if (err) {
+			drop_reason = err == -ENOMEM ?
+				      SKB_DROP_REASON_NOMEM :
+				      SKB_DROP_REASON_SKB_UCOPY_FAULT;
 			goto drop;
+		}
 		if (!skb)
+			/* tun_build_skb can return null with no err ptr
+			 * from XDP paths, return total_len and always
+			 * appear successful to caller, as drop statistics
+			 * are already handled.
+			 */
 			return total_len;
 	} else {
 		if (!zerocopy) {
@@ -1796,8 +1815,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		}
 
 		err = PTR_ERR_OR_ZERO(skb);
-		if (err)
+		if (err) {
+			drop_reason = SKB_DROP_REASON_NOMEM;
 			goto drop;
+		}
 
 		if (zerocopy)
 			err = zerocopy_sg_from_iter(skb, from);
@@ -1814,7 +1835,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, &hdr)) {
 		atomic_long_inc(&tun->rx_frame_errors);
 		err = -EINVAL;
-		goto free_skb;
+		drop_reason = SKB_DROP_REASON_DEV_HDR;
+		goto drop;
 	}
 
 	switch (tun->flags & TUN_TYPE_MASK) {
@@ -1831,6 +1853,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			break;
 		default:
 			err = -EINVAL;
+			drop_reason = SKB_DROP_REASON_INVALID_PROTO;
 			goto drop;
 		}
 	}
@@ -1938,7 +1961,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			spin_unlock_bh(&queue->lock);
 			rcu_read_unlock();
 			err = -EBUSY;
-			goto free_skb;
+			drop_reason = SKB_DROP_REASON_DEV_READY;
+			goto drop;
 		}
 
 		__skb_queue_tail(queue, skb);
@@ -1969,7 +1993,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		if (err != -EAGAIN)
 			dev_core_stats_rx_dropped_inc(tun->dev);
 
-free_skb:
 		if (!IS_ERR_OR_NULL(skb))
 			kfree_skb_reason(skb, drop_reason);
 
@@ -1980,6 +2003,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	}
 
 	return err ?: total_len;
+
+drop_stats_only:
+	dev_core_stats_rx_dropped_inc(tun->dev);
+	return err;
 }
 
 static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
-- 
2.43.0

Optimize TUN_MSG_PTR batch processing by allocating sk_buff structures
in bulk from the per-CPU NAPI cache using napi_skb_cache_get_bulk. This
reduces allocation overhead and improves efficiency, especially when
IFF_NAPI is enabled and GRO is feeding entries back to the cache.

If bulk allocation cannot fully satisfy the batch, gracefully drop only
the uncovered portion and allow the rest of the batch to proceed,
mirroring what already happened previously when build_skb() would fail
and return -ENOMEM.
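In other words, only the leading part of the batch that actually
received an skb from the cache is processed, and the uncovered tail is
counted and released up front. A rough sketch of the intended flow
(illustrative only; process_one_xdp(), xdps, n and dev are placeholder
names, and the real code also threads tfile/flush/tpage state through):

	void *skbs[64];	/* sized for the maximum batch (VHOST_NET_BATCH) */
	u32 got, i;

	got = napi_skb_cache_get_bulk(skbs, n);

	/* entries [0, got) have a preallocated skb and are processed */
	for (i = 0; i < got; i++)
		process_one_xdp(&xdps[i], skbs[i]);

	/* entries [got, n) are dropped: count them and release their pages */
	for (i = got; i < n; i++) {
		dev_core_stats_rx_dropped_inc(dev);
		put_page(virt_to_head_page(xdps[i].data));
	}
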
Signed-off-by: Jon Kohler
---
 drivers/net/tun.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 97f130bc5fed..64f944cce517 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2420,13 +2420,13 @@ static void tun_put_page(struct tun_page *tpage)
 static int tun_xdp_one(struct tun_struct *tun,
 		       struct tun_file *tfile,
 		       struct xdp_buff *xdp, int *flush,
-		       struct tun_page *tpage)
+		       struct tun_page *tpage,
+		       struct sk_buff *skb)
 {
 	unsigned int datasize = xdp->data_end - xdp->data;
 	struct virtio_net_hdr *gso = xdp->data_hard_start;
 	struct virtio_net_hdr_v1_hash_tunnel *tnl_hdr;
 	struct bpf_prog *xdp_prog;
-	struct sk_buff *skb = NULL;
 	struct sk_buff_head *queue;
 	netdev_features_t features;
 	u32 rxhash = 0, act;
@@ -2437,6 +2437,7 @@ static int tun_xdp_one(struct tun_struct *tun,
 	struct page *page;
 
 	if (unlikely(datasize < ETH_HLEN)) {
+		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_SMALL);
 		dev_core_stats_rx_dropped_inc(tun->dev);
 		return -EINVAL;
 	}
@@ -2454,6 +2455,7 @@ static int tun_xdp_one(struct tun_struct *tun,
 		ret = tun_xdp_act(tun, xdp_prog, xdp, act);
 		if (ret < 0) {
 			/* tun_xdp_act already handles drop statistics */
+			kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
 			put_page(virt_to_head_page(xdp->data));
 			return ret;
 		}
@@ -2463,6 +2465,7 @@ static int tun_xdp_one(struct tun_struct *tun,
 			*flush = true;
 			fallthrough;
 		case XDP_TX:
+			napi_consume_skb(skb, 1);
 			return 0;
 		case XDP_PASS:
 			break;
@@ -2475,13 +2478,15 @@ static int tun_xdp_one(struct tun_struct *tun,
 				tpage->page = page;
 				tpage->count = 1;
 			}
+			napi_consume_skb(skb, 1);
 			return 0;
 		}
 	}
 
 build:
-	skb = build_skb(xdp->data_hard_start, buflen);
+	skb = build_skb_around(skb, xdp->data_hard_start, buflen);
 	if (!skb) {
+		kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM);
 		dev_core_stats_rx_dropped_inc(tun->dev);
 		return -ENOMEM;
 	}
@@ -2566,9 +2571,11 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 	if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl &&
 	    ctl->type == TUN_MSG_PTR) {
 		struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+		int flush = 0, queued = 0, num_skbs = 0;
 		struct tun_page tpage;
 		int n = ctl->num;
-		int flush = 0, queued = 0;
+		/* Max size of VHOST_NET_BATCH */
+		void *skbs[64];
 
 		memset(&tpage, 0, sizeof(tpage));
 
@@ -2576,13 +2583,24 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 		rcu_read_lock();
 		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 
-		for (i = 0; i < n; i++) {
+		num_skbs = napi_skb_cache_get_bulk(skbs, n);
+
+		for (i = 0; i < num_skbs; i++) {
+			struct sk_buff *skb = skbs[i];
 			xdp = &((struct xdp_buff *)ctl->ptr)[i];
-			ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage);
+			ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage,
+					  skb);
 			if (ret > 0)
 				queued += ret;
 		}
 
+		/* Handle remaining xdp_buff entries if num_skbs < ctl->num */
+		for (i = num_skbs; i < ctl->num; i++) {
+			xdp = &((struct xdp_buff *)ctl->ptr)[i];
+			dev_core_stats_rx_dropped_inc(tun->dev);
+			put_page(virt_to_head_page(xdp->data));
+		}
+
 		if (flush)
 			xdp_do_flush();
-- 
2.43.0

Use napi_build_skb for small payload SKBs that end up using the
tun_build_skb path.
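For context: napi_build_skb() takes its sk_buff head from the per-CPU
NAPI skb cache, which is only safe to touch from softirq context or
with bottom halves disabled. Since __tun_build_skb() runs in process
context (e.g. a vhost worker writing into the tun fd), the call has to
be wrapped accordingly, as the hunk below does (shown here as a plain
sketch):

	local_bh_disable();		/* NAPI skb cache is per-CPU, BH-only */
	skb = napi_build_skb(buf, buflen);
	local_bh_enable();
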
Signed-off-by: Jon Kohler
---
 drivers/net/tun.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 64f944cce517..27c502786a04 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1538,7 +1538,11 @@ static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
 					   int buflen, int len, int pad,
 					   int metasize)
 {
-	struct sk_buff *skb = build_skb(buf, buflen);
+	struct sk_buff *skb;
+
+	local_bh_disable();
+	skb = napi_build_skb(buf, buflen);
+	local_bh_enable();
 
 	if (!skb)
 		return ERR_PTR(-ENOMEM);
-- 
2.43.0

With build_skb paths now utilizing the local NAPI SKB cache, switch to
using napi_consume_skb() in tun_put_user. This ensures the local SKB
cache is refilled on read, improving memory locality and performance,
especially when RX and TX run on the same worker thread (e.g., vhost
workers). As napi_consume_skb() also performs deferred SKB freeing,
SKBs allocated on different CPUs benefit as well.

Cc: Eric Dumazet
Cc: Nick Hudson
Signed-off-by: Jon Kohler
---
 drivers/net/tun.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 27c502786a04..b48a66b39e0a 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2155,7 +2155,9 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen);
 	preempt_enable();
 
-	consume_skb(skb);
+	local_bh_disable();
+	napi_consume_skb(skb, 1);
+	local_bh_enable();
 
 	return total;
 
-- 
2.43.0

Export skb_defer_free_flush so that it can be invoked in other modules,
which is helpful in situations where processing the deferred backlog at
specific cache refill points may be preferred.

Signed-off-by: Jon Kohler
---
 include/linux/skbuff.h | 1 +
 net/core/dev.c         | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ff90281ddf90..daa2d7480fbd 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1365,6 +1365,7 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size);
 struct sk_buff *build_skb_around(struct sk_buff *skb,
 				 void *data, unsigned int frag_size);
 void skb_attempt_defer_free(struct sk_buff *skb);
+void skb_defer_free_flush(void);
 u32 napi_skb_cache_get_bulk(void **skbs, u32 n);
 
 struct sk_buff *napi_build_skb(void *data, unsigned int frag_size);
diff --git a/net/core/dev.c b/net/core/dev.c
index 9094c0fb8c68..c4f535be7b6d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6774,7 +6774,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 }
 EXPORT_SYMBOL(napi_complete_done);
 
-static void skb_defer_free_flush(void)
+void skb_defer_free_flush(void)
 {
 	struct llist_node *free_list;
 	struct sk_buff *skb, *next;
@@ -6795,6 +6795,7 @@ static void skb_defer_free_flush(void)
 		}
 	}
 }
+EXPORT_SYMBOL_GPL(skb_defer_free_flush);
 
 #if defined(CONFIG_NET_RX_BUSY_POLL)
-- 
2.43.0

Call skb_defer_free_flush() immediately before invoking
napi_skb_cache_get_bulk() in the XDP batch path. This ensures any
deferred skb frees are processed so that the NAPI skb cache is refilled
just in time for use. Keeping the cache warm helps reduce unnecessary
IPIs during heavy transmit workloads.
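The intended ordering, roughly (illustrative sketch only; skbs/n are
the batch-local variables introduced earlier in the series): first
drain this CPU's deferred-free list so those skbs land back in the
per-CPU NAPI cache, then immediately bulk-allocate from that cache, so
the batch is served locally rather than falling back to the slab, and
the defer list stays short enough that remote CPUs have less reason to
IPI this core to flush it:

	skb_defer_free_flush();				/* refill per-CPU NAPI cache */
	num_skbs = napi_skb_cache_get_bulk(skbs, n);	/* then allocate from it */
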
Signed-off-by: Jon Kohler
---
 drivers/net/tun.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b48a66b39e0a..7d7f1ddcb707 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2589,6 +2589,12 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 		rcu_read_lock();
 		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 
+		/* Attempt to flush deferred free list immediately
+		 * prior to bulk get, which will help repopulate the local
+		 * cache and help reduce the amount of IPIs a TX hot core
+		 * will receive when the defer list grows high.
+		 */
+		skb_defer_free_flush();
 		num_skbs = napi_skb_cache_get_bulk(skbs, n);
 
 		for (i = 0; i < num_skbs; i++) {
-- 
2.43.0