The implementation is inspired by ptr_ring_empty.

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Signed-off-by: Simon Schippers
---
 include/linux/ptr_ring.h | 71 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 551329220e4f..6b8cfaecf478 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -243,6 +243,77 @@ static inline bool ptr_ring_empty_bh(struct ptr_ring *r)
 	return ret;
 }
 
+/*
+ * Check if a spare capacity of cnt is available without taking any locks.
+ *
+ * If cnt==0 or cnt > r->size it acts the same as __ptr_ring_empty.
+ *
+ * The same requirements apply as described for __ptr_ring_empty.
+ */
+static inline bool __ptr_ring_spare(struct ptr_ring *r, int cnt)
+{
+	int size = r->size;
+	int to_check;
+
+	if (unlikely(!size || cnt < 0))
+		return true;
+
+	if (cnt > size)
+		cnt = 0;
+
+	to_check = READ_ONCE(r->consumer_head) - cnt;
+
+	if (to_check < 0)
+		to_check += size;
+
+	return !r->queue[to_check];
+}
+
+static inline bool ptr_ring_spare(struct ptr_ring *r, int cnt)
+{
+	bool ret;
+
+	spin_lock(&r->consumer_lock);
+	ret = __ptr_ring_spare(r, cnt);
+	spin_unlock(&r->consumer_lock);
+
+	return ret;
+}
+
+static inline bool ptr_ring_spare_irq(struct ptr_ring *r, int cnt)
+{
+	bool ret;
+
+	spin_lock_irq(&r->consumer_lock);
+	ret = __ptr_ring_spare(r, cnt);
+	spin_unlock_irq(&r->consumer_lock);
+
+	return ret;
+}
+
+static inline bool ptr_ring_spare_any(struct ptr_ring *r, int cnt)
+{
+	unsigned long flags;
+	bool ret;
+
+	spin_lock_irqsave(&r->consumer_lock, flags);
+	ret = __ptr_ring_spare(r, cnt);
+	spin_unlock_irqrestore(&r->consumer_lock, flags);
+
+	return ret;
+}
+
+static inline bool ptr_ring_spare_bh(struct ptr_ring *r, int cnt)
+{
+	bool ret;
+
+	spin_lock_bh(&r->consumer_lock);
+	ret = __ptr_ring_spare(r, cnt);
+	spin_unlock_bh(&r->consumer_lock);
+
+	return ret;
+}
+
 /* Must only be called after __ptr_ring_peek returned !NULL */
 static inline void __ptr_ring_discard_one(struct ptr_ring *r)
 {
-- 
2.43.0
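For illustration, the wrap-around index arithmetic of __ptr_ring_spare can be checked in plain C outside the kernel. The sketch below only mirrors the index math; spare_slot_to_check is a hypothetical name, not part of the patch. With size == 8, consumer_head == 2 and cnt == 3 it yields slot 7: the producer fills slots forward, so if the slot cnt entries behind the consumer is still NULL, the producer has not wrapped around to within cnt slots of the consumer and at least cnt slots are spare.

#include <stdio.h>

/* Mirrors the index computation in __ptr_ring_spare(). */
static int spare_slot_to_check(int consumer_head, int cnt, int size)
{
	int to_check = consumer_head - cnt;

	if (to_check < 0)
		to_check += size;	/* wrap around, as in the patch */
	return to_check;
}

int main(void)
{
	/* size 8, consumer at 2, asking for 3 spare slots -> slot 7 */
	printf("%d\n", spare_slot_to_check(2, 3, 8));
	return 0;
}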
The netdev queue is stopped in tun_net_xmit after inserting an SKB into the ring buffer if the ring buffer became full because of that insertion. If the insertion into the ptr_ring fails, the netdev queue is also stopped and the SKB is dropped; this case, however, never occurred in my testing. To ensure that the ptr_ring change is visible to the consumer before the netdev queue is stopped, an smp_wmb() is used.

Then in tun_ring_recv, the new helper wake_netdev_queue is called in the blocking wait loop and after consuming an SKB from the ptr_ring. This helper first checks whether the netdev queue has been stopped. With the paired smp_rmb() it is then known that tun_net_xmit will not produce any more SKBs. With that knowledge, the helper can wake the netdev queue if there is at least a single spare slot in the ptr_ring, which it checks by calling ptr_ring_spare with cnt=1.

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Signed-off-by: Simon Schippers
---
 drivers/net/tun.c | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index cc6c50180663..735498e221d8 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1060,13 +1060,21 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	nf_reset_ct(skb);
 
-	if (ptr_ring_produce(&tfile->tx_ring, skb)) {
+	queue = netdev_get_tx_queue(dev, txq);
+	if (unlikely(ptr_ring_produce(&tfile->tx_ring, skb))) {
+		/* Paired with smp_rmb() in wake_netdev_queue. */
+		smp_wmb();
+		netif_tx_stop_queue(queue);
 		drop_reason = SKB_DROP_REASON_FULL_RING;
 		goto drop;
 	}
+	if (ptr_ring_full(&tfile->tx_ring)) {
+		/* Paired with smp_rmb() in wake_netdev_queue. */
+		smp_wmb();
+		netif_tx_stop_queue(queue);
+	}
 
 	/* dev->lltx requires to do our own update of trans_start */
-	queue = netdev_get_tx_queue(dev, txq);
 	txq_trans_cond_update(queue);
 
 	/* Notify and wake up reader process */
@@ -2110,6 +2118,24 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	return total;
 }
 
+static inline void wake_netdev_queue(struct tun_file *tfile)
+{
+	struct netdev_queue *txq;
+	struct net_device *dev;
+
+	rcu_read_lock();
+	dev = rcu_dereference(tfile->tun)->dev;
+	txq = netdev_get_tx_queue(dev, tfile->queue_index);
+
+	if (netif_tx_queue_stopped(txq)) {
+		/* Paired with smp_wmb() in tun_net_xmit. */
+		smp_rmb();
+		if (ptr_ring_spare(&tfile->tx_ring, 1))
+			netif_tx_wake_queue(txq);
+	}
+	rcu_read_unlock();
+}
+
 static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 {
 	DECLARE_WAITQUEUE(wait, current);
@@ -2139,7 +2165,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 			error = -EFAULT;
 			break;
 		}
-
+		wake_netdev_queue(tfile);
 		schedule();
 	}
 
@@ -2147,6 +2173,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 	remove_wait_queue(&tfile->socket.wq.wait, &wait);
 
 out:
+	wake_netdev_queue(tfile);
 	*err = error;
 	return ptr;
 }
-- 
2.43.0
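The ordering that the two smp_wmb() calls establish can be condensed into one sketch; my_xmit_tail below is a hypothetical distillation of the tail of tun_net_xmit, not the driver code itself, and it assumes the ptr_ring helpers named in the patches.

#include <linux/ptr_ring.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical condensation of the stop logic in tun_net_xmit(). */
static void my_xmit_tail(struct ptr_ring *ring, struct netdev_queue *txq,
			 struct sk_buff *skb)
{
	bool failed = ptr_ring_produce(ring, skb);

	if (failed || ptr_ring_full(ring)) {
		/* Make the ring update visible before the stopped state:
		 * a reader that observes the stopped queue after its
		 * smp_rmb() is then guaranteed to also observe the ring
		 * contents written before the stop, so its spare-slot
		 * check cannot race with a late produce. */
		smp_wmb();
		netif_tx_stop_queue(txq);
	}
	if (failed)
		kfree_skb(skb);	/* mirrors the FULL_RING drop path */
}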
Stopping the netdev queue is done in tun_net_xmit, as TAP uses that function as its ndo_start_xmit. To wake the queue, the new helper wake_netdev_queue is called in tap_do_read, both in the blocking wait loop and after consuming an SKB from the ptr_ring. This helper first checks whether the netdev queue has been stopped. With the smp_rmb(), which is paired with the smp_wmb() of tun_net_xmit, it is then known that tun_net_xmit will not produce any more SKBs. With that knowledge, the helper can wake the netdev queue if there is at least a single spare slot, which it checks by calling ptr_ring_spare.

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Signed-off-by: Simon Schippers
---
 drivers/net/tap.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 1197f245e873..4d874672bcd7 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -753,6 +753,24 @@ static ssize_t tap_put_user(struct tap_queue *q,
 	return ret ? ret : total;
 }
 
+static inline void wake_netdev_queue(struct tap_queue *q)
+{
+	struct netdev_queue *txq;
+	struct net_device *dev;
+
+	rcu_read_lock();
+	dev = rcu_dereference(q->tap)->dev;
+	txq = netdev_get_tx_queue(dev, q->queue_index);
+
+	if (netif_tx_queue_stopped(txq)) {
+		/* Paired with smp_wmb() in tun_net_xmit. */
+		smp_rmb();
+		if (ptr_ring_spare(&q->ring, 1))
+			netif_tx_wake_queue(txq);
+	}
+	rcu_read_unlock();
+}
+
 static ssize_t tap_do_read(struct tap_queue *q,
 			   struct iov_iter *to,
 			   int noblock, struct sk_buff *skb)
@@ -785,12 +803,16 @@ static ssize_t tap_do_read(struct tap_queue *q,
 			ret = -ERESTARTSYS;
 			break;
 		}
+		wake_netdev_queue(q);
+
 		/* Nothing to read, let's sleep */
 		schedule();
 	}
 	if (!noblock)
 		finish_wait(sk_sleep(&q->sk), &wait);
 
+	wake_netdev_queue(q);
+
 put:
 	if (skb) {
 		ret = tap_put_user(q, skb, to);
-- 
2.43.0
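Both drivers now share the same reader-side pattern, and it runs at two points: right before sleeping, so a reader cannot block forever on a queue the producer just stopped, and after every consume, so a freshly freed slot is reported. A hypothetical condensation, my_recv_wake, assuming the ptr_ring_spare helper from the first patch:

#include <linux/ptr_ring.h>
#include <linux/netdevice.h>

/* Hypothetical condensation of wake_netdev_queue() in tun and tap. */
static void my_recv_wake(struct ptr_ring *ring, struct netdev_queue *txq)
{
	if (netif_tx_queue_stopped(txq)) {
		/* Pairs with the smp_wmb() in tun_net_xmit(): after this
		 * barrier the producer is known to be stopped, so a spare
		 * slot found below cannot be refilled concurrently. */
		smp_rmb();
		if (ptr_ring_spare(ring, 1))
			netif_tx_wake_queue(txq);
	}
}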
Stopping the queue is done in tun_net_xmit. Waking the queue is done by calling one of the helpers tun_wake_netdev_queue or tap_wake_netdev_queue. For that, get_wake_netdev_queue determines the correct helper and saves it in the function pointer wake_netdev_queue of the vhost_net_virtqueue. Then, each time after a batch has been consumed in vhost_net_buf_produce, wake_netdev_queue is called.

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Signed-off-by: Simon Schippers
---
 drivers/net/tap.c      |  6 ++++++
 drivers/net/tun.c      |  6 ++++++
 drivers/vhost/net.c    | 34 ++++++++++++++++++++++++++++------
 include/linux/if_tap.h |  2 ++
 include/linux/if_tun.h |  3 +++
 5 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 4d874672bcd7..0bad9e3d59af 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1198,6 +1198,12 @@ struct socket *tap_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tap_get_socket);
 
+void tap_wake_netdev_queue(struct file *file)
+{
+	wake_netdev_queue(file->private_data);
+}
+EXPORT_SYMBOL_GPL(tap_wake_netdev_queue);
+
 struct ptr_ring *tap_get_ptr_ring(struct file *file)
 {
 	struct tap_queue *q;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 735498e221d8..e85589b596ac 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3739,6 +3739,12 @@ struct socket *tun_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tun_get_socket);
 
+void tun_wake_netdev_queue(struct file *file)
+{
+	wake_netdev_queue(file->private_data);
+}
+EXPORT_SYMBOL_GPL(tun_wake_netdev_queue);
+
 struct ptr_ring *tun_get_tx_ring(struct file *file)
 {
 	struct tun_file *tfile;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6edac0c1ba9b..e837d3a334f1 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -130,6 +130,7 @@ struct vhost_net_virtqueue {
 	struct vhost_net_buf rxq;
 	/* Batched XDP buffs */
 	struct xdp_buff *xdp;
+	void (*wake_netdev_queue)(struct file *f);
 };
 
 struct vhost_net {
@@ -175,13 +176,16 @@ static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
 	return ret;
 }
 
-static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
+static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq,
+				 struct sock *sk)
 {
+	struct file *file = sk->sk_socket->file;
 	struct vhost_net_buf *rxq = &nvq->rxq;
 
 	rxq->head = 0;
 	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
 					      VHOST_NET_BATCH);
+	nvq->wake_netdev_queue(file);
 	return rxq->tail;
 }
 
@@ -208,14 +212,15 @@ static int vhost_net_buf_peek_len(void *ptr)
 	return __skb_array_len_with_tag(ptr);
 }
 
-static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
+static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq,
+			      struct sock *sk)
 {
 	struct vhost_net_buf *rxq = &nvq->rxq;
 
 	if (!vhost_net_buf_is_empty(rxq))
 		goto out;
 
-	if (!vhost_net_buf_produce(nvq))
+	if (!vhost_net_buf_produce(nvq, sk))
 		return 0;
 
 out:
@@ -994,7 +999,7 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 	unsigned long flags;
 
 	if (rvq->rx_ring)
-		return vhost_net_buf_peek(rvq);
+		return vhost_net_buf_peek(rvq, sk);
 
 	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
 	head = skb_peek(&sk->sk_receive_queue);
@@ -1499,6 +1504,19 @@ static struct socket *get_tap_socket(int fd)
 	return sock;
 }
 
+static void (*get_wake_netdev_queue(struct file *file))(struct file *file)
+{
+	struct ptr_ring *ring;
+
+	ring = tun_get_tx_ring(file);
+	if (!IS_ERR(ring))
+		return tun_wake_netdev_queue;
+	ring = tap_get_ptr_ring(file);
+	if (!IS_ERR(ring))
+		return tap_wake_netdev_queue;
+	return NULL;
+}
+
 static struct socket *get_socket(int fd)
 {
 	struct socket *sock;
@@ -1570,10 +1588,14 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	if (r)
 		goto err_used;
 	if (index == VHOST_NET_VQ_RX) {
-		if (sock)
+		if (sock) {
 			nvq->rx_ring = get_tap_ptr_ring(sock->file);
-		else
+			nvq->wake_netdev_queue =
+				get_wake_netdev_queue(sock->file);
+		} else {
 			nvq->rx_ring = NULL;
+			nvq->wake_netdev_queue = NULL;
+		}
 	}
 
 	oldubufs = nvq->ubufs;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 553552fa635c..02b2809784b5 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -10,6 +10,7 @@ struct socket;
 
 #if IS_ENABLED(CONFIG_TAP)
 struct socket *tap_get_socket(struct file *);
+void tap_wake_netdev_queue(struct file *file);
 struct ptr_ring *tap_get_ptr_ring(struct file *file);
 #else
 #include <linux/err.h>
@@ -18,6 +19,7 @@ static inline struct socket *tap_get_socket(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
+static inline void tap_wake_netdev_queue(struct file *f) {}
 static inline struct ptr_ring *tap_get_ptr_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 80166eb62f41..04c504bb1954 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -21,6 +21,7 @@ struct tun_msg_ctl {
 
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
+void tun_wake_netdev_queue(struct file *file);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
 
 static inline bool tun_is_xdp_frame(void *ptr)
@@ -50,6 +51,8 @@ static inline struct socket *tun_get_socket(struct file *f)
 	return ERR_PTR(-EINVAL);
 }
 
+static inline void tun_wake_netdev_queue(struct file *f) {}
+
 static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
-- 
2.43.0
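The declarator static void (*get_wake_netdev_queue(struct file *file))(struct file *file) is dense: it declares a function taking a struct file * and returning a pointer to a function that also takes a struct file *. An equivalent spelling with a typedef, shown only to unpack the syntax (wake_fn_t and the _alt suffix are not part of the patch):

typedef void (*wake_fn_t)(struct file *file);

/* Same behavior as the patch's get_wake_netdev_queue(). */
static wake_fn_t get_wake_netdev_queue_alt(struct file *file)
{
	struct ptr_ring *ring = tun_get_tx_ring(file);

	if (!IS_ERR(ring))
		return tun_wake_netdev_queue;
	ring = tap_get_ptr_ring(file);
	if (!IS_ERR(ring))
		return tap_wake_netdev_queue;
	return NULL;
}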