The implementation is inspired by ptr_ring_empty.

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Signed-off-by: Simon Schippers
---
 include/linux/ptr_ring.h | 71 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 551329220e4f..6b8cfaecf478 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -243,6 +243,77 @@ static inline bool ptr_ring_empty_bh(struct ptr_ring *r)
 	return ret;
 }
 
+/*
+ * Check if a spare capacity of cnt is available without taking any locks.
+ *
+ * If cnt==0 or cnt > r->size it acts the same as __ptr_ring_empty.
+ *
+ * The same requirements apply as described for __ptr_ring_empty.
+ */
+static inline bool __ptr_ring_spare(struct ptr_ring *r, int cnt)
+{
+	int size = r->size;
+	int to_check;
+
+	if (unlikely(!size || cnt < 0))
+		return true;
+
+	if (cnt > size)
+		cnt = 0;
+
+	to_check = READ_ONCE(r->consumer_head) - cnt;
+
+	if (to_check < 0)
+		to_check += size;
+
+	return !r->queue[to_check];
+}
+
+static inline bool ptr_ring_spare(struct ptr_ring *r, int cnt)
+{
+	bool ret;
+
+	spin_lock(&r->consumer_lock);
+	ret = __ptr_ring_spare(r, cnt);
+	spin_unlock(&r->consumer_lock);
+
+	return ret;
+}
+
+static inline bool ptr_ring_spare_irq(struct ptr_ring *r, int cnt)
+{
+	bool ret;
+
+	spin_lock_irq(&r->consumer_lock);
+	ret = __ptr_ring_spare(r, cnt);
+	spin_unlock_irq(&r->consumer_lock);
+
+	return ret;
+}
+
+static inline bool ptr_ring_spare_any(struct ptr_ring *r, int cnt)
+{
+	unsigned long flags;
+	bool ret;
+
+	spin_lock_irqsave(&r->consumer_lock, flags);
+	ret = __ptr_ring_spare(r, cnt);
+	spin_unlock_irqrestore(&r->consumer_lock, flags);
+
+	return ret;
+}
+
+static inline bool ptr_ring_spare_bh(struct ptr_ring *r, int cnt)
+{
+	bool ret;
+
+	spin_lock_bh(&r->consumer_lock);
+	ret = __ptr_ring_spare(r, cnt);
+	spin_unlock_bh(&r->consumer_lock);
+
+	return ret;
+}
+
 /* Must only be called after __ptr_ring_peek returned !NULL */
 static inline void __ptr_ring_discard_one(struct ptr_ring *r)
 {
-- 
2.43.0
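For illustration, the wrap-around index arithmetic of __ptr_ring_spare can be checked in plain C outside the kernel. The sketch below only mirrors the index math; spare_slot_to_check is a hypothetical name, not part of the patch. With size == 8, consumer_head == 2 and cnt == 3 it yields slot 7: the producer fills slots forward, so if the slot cnt entries behind the consumer is still NULL, the producer has not wrapped around to within cnt slots of the consumer and at least cnt slots are spare.

#include <stdio.h>

/* Mirrors the index computation in __ptr_ring_spare(). */
static int spare_slot_to_check(int consumer_head, int cnt, int size)
{
	int to_check = consumer_head - cnt;

	if (to_check < 0)
		to_check += size;	/* wrap around, as in the patch */
	return to_check;
}

int main(void)
{
	/* size 8, consumer at 2, asking for 3 spare slots -> slot 7 */
	printf("%d\n", spare_slot_to_check(2, 3, 8));
	return 0;
}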
The netdev queue is stopped in tun_net_xmit after inserting an SKB into the ring buffer if the ring buffer became full because of that insertion. If the insertion into the ptr_ring fails, the netdev queue is also stopped and the SKB is dropped; this case, however, never occurred in my testing. To ensure that the ptr_ring change is visible to the consumer before the netdev queue is stopped, an smp_wmb() is used.

Then in tun_ring_recv, the new helper wake_netdev_queue is called in the blocking wait loop and after consuming an SKB from the ptr_ring. This helper first checks whether the netdev queue has been stopped. With the paired smp_rmb() it is then known that tun_net_xmit will not produce any more SKBs. With that knowledge, the helper can wake the netdev queue if there is at least a single spare slot in the ptr_ring, which it checks by calling ptr_ring_spare with cnt=1.

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Signed-off-by: Simon Schippers
---
 drivers/net/tun.c | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index cc6c50180663..735498e221d8 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1060,13 +1060,21 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	nf_reset_ct(skb);
 
-	if (ptr_ring_produce(&tfile->tx_ring, skb)) {
+	queue = netdev_get_tx_queue(dev, txq);
+	if (unlikely(ptr_ring_produce(&tfile->tx_ring, skb))) {
+		/* Paired with smp_rmb() in wake_netdev_queue. */
+		smp_wmb();
+		netif_tx_stop_queue(queue);
 		drop_reason = SKB_DROP_REASON_FULL_RING;
 		goto drop;
 	}
+	if (ptr_ring_full(&tfile->tx_ring)) {
+		/* Paired with smp_rmb() in wake_netdev_queue. */
+		smp_wmb();
+		netif_tx_stop_queue(queue);
+	}
 
 	/* dev->lltx requires to do our own update of trans_start */
-	queue = netdev_get_tx_queue(dev, txq);
 	txq_trans_cond_update(queue);
 
 	/* Notify and wake up reader process */
@@ -2110,6 +2118,24 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	return total;
 }
 
+static inline void wake_netdev_queue(struct tun_file *tfile)
+{
+	struct netdev_queue *txq;
+	struct net_device *dev;
+
+	rcu_read_lock();
+	dev = rcu_dereference(tfile->tun)->dev;
+	txq = netdev_get_tx_queue(dev, tfile->queue_index);
+
+	if (netif_tx_queue_stopped(txq)) {
+		/* Paired with smp_wmb() in tun_net_xmit. */
+		smp_rmb();
+		if (ptr_ring_spare(&tfile->tx_ring, 1))
+			netif_tx_wake_queue(txq);
+	}
+	rcu_read_unlock();
+}
+
 static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 {
 	DECLARE_WAITQUEUE(wait, current);
@@ -2139,7 +2165,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 			error = -EFAULT;
 			break;
 		}
-
+		wake_netdev_queue(tfile);
 		schedule();
 	}
 
@@ -2147,6 +2173,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 	remove_wait_queue(&tfile->socket.wq.wait, &wait);
 
 out:
+	wake_netdev_queue(tfile);
 	*err = error;
 	return ptr;
 }
-- 
2.43.0
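The ordering that the two smp_wmb() calls establish can be condensed into one sketch; my_xmit_tail below is a hypothetical distillation of the tail of tun_net_xmit, not the driver code itself, and it assumes the ptr_ring helpers named in the patches.

#include <linux/ptr_ring.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical condensation of the stop logic in tun_net_xmit(). */
static void my_xmit_tail(struct ptr_ring *ring, struct netdev_queue *txq,
			 struct sk_buff *skb)
{
	bool failed = ptr_ring_produce(ring, skb);

	if (failed || ptr_ring_full(ring)) {
		/* Make the ring update visible before the stopped state:
		 * a reader that observes the stopped queue after its
		 * smp_rmb() is then guaranteed to also observe the ring
		 * contents written before the stop, so its spare-slot
		 * check cannot race with a late produce. */
		smp_wmb();
		netif_tx_stop_queue(txq);
	}
	if (failed)
		kfree_skb(skb);	/* mirrors the FULL_RING drop path */
}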
Stopping the netdev queue is done in tun_net_xmit, as TAP uses that function as its ndo_start_xmit. To wake the queue, the new helper wake_netdev_queue is called in tap_do_read, both in the blocking wait loop and after consuming an SKB from the ptr_ring. This helper first checks whether the netdev queue has been stopped. With the smp_rmb(), which is paired with the smp_wmb() of tun_net_xmit, it is then known that tun_net_xmit will not produce any more SKBs. With that knowledge, the helper can wake the netdev queue if there is at least a single spare slot, which it checks by calling ptr_ring_spare.

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Signed-off-by: Simon Schippers
---
 drivers/net/tap.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 1197f245e873..4d874672bcd7 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -753,6 +753,24 @@ static ssize_t tap_put_user(struct tap_queue *q,
 	return ret ? ret : total;
 }
 
+static inline void wake_netdev_queue(struct tap_queue *q)
+{
+	struct netdev_queue *txq;
+	struct net_device *dev;
+
+	rcu_read_lock();
+	dev = rcu_dereference(q->tap)->dev;
+	txq = netdev_get_tx_queue(dev, q->queue_index);
+
+	if (netif_tx_queue_stopped(txq)) {
+		/* Paired with smp_wmb() in tun_net_xmit. */
+		smp_rmb();
+		if (ptr_ring_spare(&q->ring, 1))
+			netif_tx_wake_queue(txq);
+	}
+	rcu_read_unlock();
+}
+
 static ssize_t tap_do_read(struct tap_queue *q,
 			   struct iov_iter *to,
 			   int noblock, struct sk_buff *skb)
@@ -785,12 +803,16 @@ static ssize_t tap_do_read(struct tap_queue *q,
 			ret = -ERESTARTSYS;
 			break;
 		}
+		wake_netdev_queue(q);
+
 		/* Nothing to read, let's sleep */
 		schedule();
 	}
 	if (!noblock)
 		finish_wait(sk_sleep(&q->sk), &wait);
 
+	wake_netdev_queue(q);
+
 put:
 	if (skb) {
 		ret = tap_put_user(q, skb, to);
-- 
2.43.0
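Both drivers now share the same reader-side pattern, and it runs at two points: right before sleeping, so a reader cannot block forever on a queue the producer just stopped, and after every consume, so a freshly freed slot is reported. A hypothetical condensation, my_recv_wake, assuming the ptr_ring_spare helper from the first patch:

#include <linux/ptr_ring.h>
#include <linux/netdevice.h>

/* Hypothetical condensation of wake_netdev_queue() in tun and tap. */
static void my_recv_wake(struct ptr_ring *ring, struct netdev_queue *txq)
{
	if (netif_tx_queue_stopped(txq)) {
		/* Pairs with the smp_wmb() in tun_net_xmit(): after this
		 * barrier the producer is known to be stopped, so a spare
		 * slot found below cannot be refilled concurrently. */
		smp_rmb();
		if (ptr_ring_spare(ring, 1))
			netif_tx_wake_queue(txq);
	}
}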
Stopping the queue is done in tun_net_xmit. Waking the queue is done by calling one of the helpers tun_wake_netdev_queue or tap_wake_netdev_queue. For that, get_wake_netdev_queue determines the correct helper and saves it in the function pointer wake_netdev_queue of the vhost_net_virtqueue. Then, each time after a batch has been consumed in vhost_net_buf_produce, wake_netdev_queue is called.

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Signed-off-by: Simon Schippers
---
 drivers/net/tap.c      |  6 ++++++
 drivers/net/tun.c      |  6 ++++++
 drivers/vhost/net.c    | 34 ++++++++++++++++++++++++++++------
 include/linux/if_tap.h |  2 ++
 include/linux/if_tun.h |  3 +++
 5 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 4d874672bcd7..0bad9e3d59af 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1198,6 +1198,12 @@ struct socket *tap_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tap_get_socket);
 
+void tap_wake_netdev_queue(struct file *file)
+{
+	wake_netdev_queue(file->private_data);
+}
+EXPORT_SYMBOL_GPL(tap_wake_netdev_queue);
+
 struct ptr_ring *tap_get_ptr_ring(struct file *file)
 {
 	struct tap_queue *q;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 735498e221d8..e85589b596ac 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3739,6 +3739,12 @@ struct socket *tun_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tun_get_socket);
 
+void tun_wake_netdev_queue(struct file *file)
+{
+	wake_netdev_queue(file->private_data);
+}
+EXPORT_SYMBOL_GPL(tun_wake_netdev_queue);
+
 struct ptr_ring *tun_get_tx_ring(struct file *file)
 {
 	struct tun_file *tfile;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6edac0c1ba9b..e837d3a334f1 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -130,6 +130,7 @@ struct vhost_net_virtqueue {
 	struct vhost_net_buf rxq;
 	/* Batched XDP buffs */
 	struct xdp_buff *xdp;
+	void (*wake_netdev_queue)(struct file *f);
 };
 
 struct vhost_net {
@@ -175,13 +176,16 @@ static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
 	return ret;
 }
 
-static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
+static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq,
+				 struct sock *sk)
 {
+	struct file *file = sk->sk_socket->file;
 	struct vhost_net_buf *rxq = &nvq->rxq;
 
 	rxq->head = 0;
 	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
 					      VHOST_NET_BATCH);
+	nvq->wake_netdev_queue(file);
 	return rxq->tail;
 }
 
@@ -208,14 +212,15 @@ static int vhost_net_buf_peek_len(void *ptr)
 	return __skb_array_len_with_tag(ptr);
 }
 
-static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
+static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq,
+			      struct sock *sk)
 {
 	struct vhost_net_buf *rxq = &nvq->rxq;
 
 	if (!vhost_net_buf_is_empty(rxq))
 		goto out;
 
-	if (!vhost_net_buf_produce(nvq))
+	if (!vhost_net_buf_produce(nvq, sk))
 		return 0;
 
 out:
@@ -994,7 +999,7 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 	unsigned long flags;
 
 	if (rvq->rx_ring)
-		return vhost_net_buf_peek(rvq);
+		return vhost_net_buf_peek(rvq, sk);
 
 	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
 	head = skb_peek(&sk->sk_receive_queue);
@@ -1499,6 +1504,19 @@ static struct socket *get_tap_socket(int fd)
 	return sock;
 }
 
+static void (*get_wake_netdev_queue(struct file *file))(struct file *file)
+{
+	struct ptr_ring *ring;
+
+	ring = tun_get_tx_ring(file);
+	if (!IS_ERR(ring))
+		return tun_wake_netdev_queue;
+	ring = tap_get_ptr_ring(file);
+	if (!IS_ERR(ring))
+		return tap_wake_netdev_queue;
+	return NULL;
+}
+
 static struct socket *get_socket(int fd)
 {
 	struct socket *sock;
@@ -1570,10 +1588,14 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	if (r)
 		goto err_used;
 	if (index == VHOST_NET_VQ_RX) {
-		if (sock)
+		if (sock) {
 			nvq->rx_ring = get_tap_ptr_ring(sock->file);
-		else
+			nvq->wake_netdev_queue =
+				get_wake_netdev_queue(sock->file);
+		} else {
 			nvq->rx_ring = NULL;
+			nvq->wake_netdev_queue = NULL;
+		}
 	}
 
 	oldubufs = nvq->ubufs;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 553552fa635c..02b2809784b5 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -10,6 +10,7 @@ struct socket;
 
 #if IS_ENABLED(CONFIG_TAP)
 struct socket *tap_get_socket(struct file *);
+void tap_wake_netdev_queue(struct file *file);
 struct ptr_ring *tap_get_ptr_ring(struct file *file);
 #else
 #include <linux/err.h>
@@ -18,6 +19,7 @@ static inline struct socket *tap_get_socket(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
+static inline void tap_wake_netdev_queue(struct file *f) {}
 static inline struct ptr_ring *tap_get_ptr_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 80166eb62f41..04c504bb1954 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -21,6 +21,7 @@ struct tun_msg_ctl {
 
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
+void tun_wake_netdev_queue(struct file *file);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
 
 static inline bool tun_is_xdp_frame(void *ptr)
@@ -50,6 +51,8 @@ static inline struct socket *tun_get_socket(struct file *f)
 	return ERR_PTR(-EINVAL);
 }
 
+static inline void tun_wake_netdev_queue(struct file *f) {}
+
 static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
-- 
2.43.0
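The declarator static void (*get_wake_netdev_queue(struct file *file))(struct file *file) is dense: it declares a function taking a struct file * and returning a pointer to a function that also takes a struct file *. An equivalent spelling with a typedef, shown only to unpack the syntax (wake_fn_t and the _alt suffix are not part of the patch):

typedef void (*wake_fn_t)(struct file *file);

/* Same behavior as the patch's get_wake_netdev_queue(). */
static wake_fn_t get_wake_netdev_queue_alt(struct file *file)
{
	struct ptr_ring *ring = tun_get_tx_ring(file);

	if (!IS_ERR(ring))
		return tun_wake_netdev_queue;
	ring = tap_get_ptr_ring(file);
	if (!IS_ERR(ring))
		return tap_wake_netdev_queue;
	return NULL;
}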