Switch to {tun|tap}_ring_{consume|produce} in both tun/tap as well as vhost_net to avoid ptr_ring tail drops. For tun, disable dev->lltx to ensure that tun_net_xmit is not called even though the netdev queue is stopped (it can happen due to unconsume or queue resize). Consequently, the update of trans_start in tun_net_xmit is also removed. Instead of the rx_ring, the virtqueue now saves the interface type IF_TAP, IF_TUN, or IF_NONE to call tun/tap wrappers. +--------------------------------+-----------+----------+ | pktgen benchmarks to Debian VM | Stock | Patched | | i5 6300HQ, 20M packets | | | +-----------------+--------------+-----------+----------+ | TAP | Transmitted | 195 Kpps | 183 Kpps | | +--------------+-----------+----------+ | | Lost | 1615 Kpps | 0 pps | +-----------------+--------------+-----------+----------+ | TAP+vhost_net | Transmitted | 589 Kpps | 588 Kpps | | +--------------+-----------+----------+ | | Lost | 1164 Kpps | 0 pps | +-----------------+--------------+-----------+----------+ Co-developed-by: Tim Gebauer Signed-off-by: Tim Gebauer Co-developed-by: Jon Kohler Signed-off-by: Jon Kohler Signed-off-by: Simon Schippers --- drivers/net/tap.c | 4 +- drivers/net/tun.c | 20 ++++------ drivers/vhost/net.c | 92 ++++++++++++++++++++++++++++++--------------- 3 files changed, 71 insertions(+), 45 deletions(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 56b8fe376e4a..2847db4e3cc7 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -805,7 +805,7 @@ static void *__tap_ring_consume(struct tap_queue *q) return ptr; } -static __always_unused void *tap_ring_consume(struct tap_queue *q) +static void *tap_ring_consume(struct tap_queue *q) { void *ptr; @@ -868,7 +868,7 @@ static ssize_t tap_do_read(struct tap_queue *q, TASK_INTERRUPTIBLE); /* Read frames from the queue */ - skb = ptr_ring_consume(&q->ring); + skb = tap_ring_consume(q); if (skb) break; if (noblock) { diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 
dc2d267d30d7..9da6e794a80f 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -931,7 +931,6 @@ static int tun_net_init(struct net_device *dev) dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); - dev->lltx = true; tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); @@ -1002,9 +1001,9 @@ static unsigned int run_ebpf_filter(struct tun_struct *tun, /* Produce a packet into the transmit ring. If the ring becomes full, the * netdev queue is stopped until the consumer wakes it again. */ -static __always_unused int tun_ring_produce(struct ptr_ring *ring, - struct netdev_queue *queue, - struct sk_buff *skb) +static int tun_ring_produce(struct ptr_ring *ring, + struct netdev_queue *queue, + struct sk_buff *skb) { int ret; @@ -1089,7 +1088,7 @@ static void *__tun_ring_consume(struct tun_file *tfile) return ptr; } -static void __always_unused *tun_ring_consume(struct tun_file *tfile) +static void *tun_ring_consume(struct tun_file *tfile) { void *ptr; @@ -1161,15 +1160,12 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset_ct(skb); - if (ptr_ring_produce(&tfile->tx_ring, skb)) { + queue = netdev_get_tx_queue(dev, txq); + if (unlikely(tun_ring_produce(&tfile->tx_ring, queue, skb))) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } - /* dev->lltx requires to do our own update of trans_start */ - queue = netdev_get_tx_queue(dev, txq); - txq_trans_cond_update(queue); - /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); @@ -2220,7 +2216,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) void *ptr = NULL; int error = 0; - ptr = ptr_ring_consume(&tfile->tx_ring); + ptr = tun_ring_consume(tfile); if (ptr) goto out; if (noblock) { @@ -2232,7 +2228,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) while (1) { set_current_state(TASK_INTERRUPTIBLE); - ptr 
= ptr_ring_consume(&tfile->tx_ring); + ptr = tun_ring_consume(tfile); if (ptr) break; if (signal_pending(current)) { diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 35ded4330431..022efca1d4af 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -90,6 +90,12 @@ enum { VHOST_NET_VQ_MAX = 2, }; +enum if_type { + IF_NONE = 0, + IF_TUN = 1, + IF_TAP = 2, +}; + struct vhost_net_ubuf_ref { /* refcount follows semantics similar to kref: * 0: object is released @@ -131,6 +137,8 @@ struct vhost_net_virtqueue { struct vhost_net_buf rxq; /* Batched XDP buffs */ struct xdp_buff *xdp; + /* Interface type */ + enum if_type type; }; struct vhost_net { @@ -176,24 +184,50 @@ static void *vhost_net_buf_consume(struct vhost_net_buf *rxq) return ret; } -static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq) +static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq, + struct sock *sk) { + struct file *file = sk->sk_socket->file; struct vhost_net_buf *rxq = &nvq->rxq; rxq->head = 0; - rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue, - VHOST_NET_BATCH); + switch (nvq->type) { + case IF_TUN: + rxq->tail = tun_ring_consume_batched(file, rxq->queue, + VHOST_NET_BATCH); + break; + case IF_TAP: + rxq->tail = tap_ring_consume_batched(file, rxq->queue, + VHOST_NET_BATCH); + break; + case IF_NONE: + return 0; + } return rxq->tail; } -static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) +static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq, + struct socket *sk) { struct vhost_net_buf *rxq = &nvq->rxq; - - if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) { - ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head, - vhost_net_buf_get_size(rxq), - tun_ptr_free); + struct file *file; + + if (sk && !vhost_net_buf_is_empty(rxq)) { + file = sk->file; + switch (nvq->type) { + case IF_TUN: + tun_ring_unconsume(file, rxq->queue + rxq->head, + vhost_net_buf_get_size(rxq), + tun_ptr_free); + break; + case IF_TAP: + 
tap_ring_unconsume(file, rxq->queue + rxq->head, vhost_net_buf_get_size(rxq), tun_ptr_free); break; case IF_NONE: return; } rxq->head = rxq->tail = 0; } } @@ -209,14 +243,15 @@ static int vhost_net_buf_peek_len(void *ptr) return __skb_array_len_with_tag(ptr); } -static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq) +static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq, + struct sock *sk) { struct vhost_net_buf *rxq = &nvq->rxq; if (!vhost_net_buf_is_empty(rxq)) goto out; - if (!vhost_net_buf_produce(nvq)) + if (!vhost_net_buf_produce(nvq, sk)) return 0; out: @@ -991,8 +1026,8 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) int len = 0; unsigned long flags; - if (rvq->rx_ring) - return vhost_net_buf_peek(rvq); + if (rvq->type) + return vhost_net_buf_peek(rvq, sk); spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); head = skb_peek(&sk->sk_receive_queue); @@ -1201,7 +1236,7 @@ static void handle_rx(struct vhost_net *net) goto out; } busyloop_intr = false; - if (nvq->rx_ring) + if (nvq->type) msg.msg_control = vhost_net_buf_consume(&nvq->rxq); /* On overrun, truncate and discard */ if (unlikely(headcount > UIO_MAXIOV)) { @@ -1357,7 +1392,7 @@ static int vhost_net_open(struct inode *inode, struct file *f) n->vqs[i].batched_xdp = 0; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; - n->vqs[i].rx_ring = NULL; + n->vqs[i].type = IF_NONE; vhost_net_buf_init(&n->vqs[i].rxq); } vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, @@ -1387,8 +1422,8 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n, sock = vhost_vq_get_backend(vq); vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, NULL); - vhost_net_buf_unproduce(nvq); - nvq->rx_ring = NULL; + vhost_net_buf_unproduce(nvq, sock); + nvq->type = IF_NONE; mutex_unlock(&vq->mutex); return sock; } @@ -1468,18 +1503,13 @@ static struct socket *get_raw_socket(int fd) return ERR_PTR(r); } -static struct ptr_ring *get_tap_ptr_ring(struct file *file) +static enum 
if_type get_if_type(struct file *file) { - struct ptr_ring *ring; - ring = tun_get_tx_ring(file); - if (!IS_ERR(ring)) - goto out; - ring = tap_get_ptr_ring(file); - if (!IS_ERR(ring)) - goto out; - ring = NULL; -out: - return ring; + if (tap_is_tap_file(file)) + return IF_TAP; + if (tun_is_tun_file(file)) + return IF_TUN; + return IF_NONE; } static struct socket *get_tap_socket(int fd) @@ -1561,7 +1591,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) vhost_net_disable_vq(n, vq); vhost_vq_set_backend(vq, sock); - vhost_net_buf_unproduce(nvq); + vhost_net_buf_unproduce(nvq, sock); r = vhost_vq_init_access(vq); if (r) goto err_used; @@ -1570,9 +1600,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) goto err_used; if (index == VHOST_NET_VQ_RX) { if (sock) - nvq->rx_ring = get_tap_ptr_ring(sock->file); + nvq->type = get_if_type(sock->file); else - nvq->rx_ring = NULL; + nvq->type = IF_NONE; } oldubufs = nvq->ubufs; -- 2.43.0