Introduce the __ptr_ring_full_next() helper, which lets callers check
whether the ptr_ring will become full after the next insertion. This is
useful for proactively managing capacity before the ring is actually
full.

Callers must ensure the ring is not already full before using this
helper: __ptr_ring_discard_one() may zero entries in reverse order, so
the slot after the current producer position may be cleared before the
current one. This must be taken into account when using this check.

Note: This function is especially relevant when paired with the memory
ordering guarantees of __ptr_ring_produce() (smp_wmb()), allowing for
safe producer/consumer coordination.

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Co-developed-by: Jon Kohler
Signed-off-by: Jon Kohler
Signed-off-by: Simon Schippers
---
 include/linux/ptr_ring.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 534531807d95..da141cc8b075 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -96,6 +96,31 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r)
 	return ret;
 }
 
+/*
+ * Checks if the ptr_ring will become full after the next insertion.
+ *
+ * Note: Callers must ensure that the ptr_ring is not full before calling
+ * this function, as __ptr_ring_discard_one invalidates entries in
+ * reverse order. Because the next entry (rather than the current one)
+ * may be zeroed after an insertion, failing to account for this can
+ * cause false negatives when checking whether the ring will become full
+ * on the next insertion.
+ */
+static inline bool __ptr_ring_full_next(struct ptr_ring *r)
+{
+	int p;
+
+	if (unlikely(r->size <= 1))
+		return true;
+
+	p = r->producer + 1;
+
+	if (unlikely(p >= r->size))
+		p = 0;
+
+	return r->queue[p];
+}
+
 /* Note: callers invoking this in a loop must use a compiler barrier,
  * for example cpu_relax(). Callers must hold producer_lock.
  * Callers are responsible for making sure pointer that is being queued
-- 
2.43.0
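For illustration, a minimal produce-side sketch built on the helper
above (a hypothetical caller, not part of this series; pause_source()
stands in for whatever throttles the producer):

/* Sketch: proactive throttling with __ptr_ring_full_next().
 * Assumes r->producer_lock is held and that the ring is not already
 * full when the helper is consulted (see the note above).
 */
static int example_produce(struct ptr_ring *r, void *item,
			   void (*pause_source)(void))
{
	/* Throttle before the ring becomes completely full, but never
	 * when it is already full: the consumer might drain it in the
	 * meantime and nothing would ever unthrottle us.
	 */
	if (!__ptr_ring_full(r) && __ptr_ring_full_next(r))
		pause_source();

	return __ptr_ring_produce(r, item); /* includes an smp_wmb() */
}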
Add __ptr_ring_consume_created_space() to check whether the previous
__ptr_ring_consume() call successfully consumed an element and created
space in the ring buffer. This enables callers to conditionally notify
producers when space becomes available.

The function is only valid immediately after a single consume
operation and must not be used after calling
__ptr_ring_consume_batched().

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Co-developed-by: Jon Kohler
Signed-off-by: Jon Kohler
Signed-off-by: Simon Schippers
---
 include/linux/ptr_ring.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index da141cc8b075..76d6840b45a3 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -453,6 +453,23 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
 	return ret;
 }
 
+/*
+ * Check if the previous consume operation created space
+ *
+ * Returns true if the last call to __ptr_ring_consume() has created
+ * space in the ring buffer (i.e., an element was consumed).
+ *
+ * Note: This function is only valid immediately after a single call to
+ * __ptr_ring_consume(). If multiple calls to ptr_ring_consume*() have
+ * been made, this check must be performed after each call individually.
+ * Likewise, do not use this function after calling
+ * __ptr_ring_consume_batched().
+ */
+static inline bool __ptr_ring_consume_created_space(struct ptr_ring *r)
+{
+	return r->consumer_tail >= r->consumer_head;
+}
+
 /* Cast to structure type and call a function without discarding from FIFO.
  * Function must return a value.
  * Callers must take consumer_lock.
-- 
2.43.0
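On the consume side, the check pairs with exactly one
__ptr_ring_consume(). A minimal sketch (hypothetical caller;
producer_resume() stands in for e.g. netif_tx_wake_queue()):

/* Sketch: wake the producer only when space was actually created.
 * Assumes r->consumer_lock is held.
 */
static void *example_consume(struct ptr_ring *r,
			     void (*producer_resume)(void))
{
	void *ptr = __ptr_ring_consume(r);

	/* Only valid immediately after a single __ptr_ring_consume(). */
	if (ptr && __ptr_ring_consume_created_space(r))
		producer_resume();

	return ptr;
}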
Implement new ring buffer produce and consume functions for the tun and
tap drivers that provide lockless producer/consumer synchronization and
netdev queue management to prevent ptr_ring tail drop and permanent
starvation.

- tun_ring_produce(): Produces packets to the ptr_ring with proper
  memory barriers and proactively stops the netdev queue when the ring
  is about to become full.

- __tun_ring_consume() / __tap_ring_consume(): Internal consume
  functions that check whether the netdev queue was stopped due to a
  full ring, and wake it when space becomes available. They use memory
  barriers to ensure proper ordering between producer and consumer.

- tun_ring_consume() / tap_ring_consume(): Wrapper functions that
  acquire the consumer lock before calling the internal consume
  functions.

Key features:

- Proactive queue stopping using __ptr_ring_full_next() to stop the
  queue before it becomes completely full.

- The queue is not stopped when the ptr_ring is already full: if the
  consumer emptied all entries in the meantime, stopping the queue
  would cause permanent starvation.

- Conditional queue waking using __ptr_ring_consume_created_space() to
  wake the queue only when space is actually created in the ring.

- Permanent starvation is also prevented by waking the queue when the
  ring becomes empty, which can happen when racing with the producer.

NB: __always_unused is applied to the as yet unused functions and is
removed later in the series so as not to break bisectability.

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Co-developed-by: Jon Kohler
Signed-off-by: Jon Kohler
Signed-off-by: Simon Schippers
---
 drivers/net/tap.c |  63 +++++++++++++++++++++++++++++
 drivers/net/tun.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 164 insertions(+)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 1197f245e873..c370a02789eb 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -753,6 +753,69 @@ static ssize_t tap_put_user(struct tap_queue *q,
 	return ret ? ret : total;
 }
 
+/*
+ * Consume a packet from the transmit ring. Callers must hold
+ * the consumer_lock of the ptr_ring. If the ring was full and the
+ * queue was stopped, this may wake up the queue if space is created.
+ */
+static void *__tap_ring_consume(struct tap_queue *q)
+{
+	struct ptr_ring *ring = &q->ring;
+	struct netdev_queue *txq;
+	struct net_device *dev;
+	bool stopped;
+	void *ptr;
+
+	ptr = __ptr_ring_peek(ring);
+	if (!ptr)
+		return ptr;
+
+	/* Paired with smp_wmb() in the ring producer path. Ensures we
+	 * see any updated netdev queue state caused by a full ring.
+	 * Needed for proper synchronization between the ring and the
+	 * netdev queue.
+	 */
+	smp_rmb();
+	rcu_read_lock();
+	dev = rcu_dereference(q->tap)->dev;
+	txq = netdev_get_tx_queue(dev, q->queue_index);
+	stopped = netif_tx_queue_stopped(txq);
+
+	/* Ensures the read of the stopped queue state completes before
+	 * the discard, so that we don't miss the window to wake the
+	 * queue if needed.
+	 */
+	smp_rmb();
+	__ptr_ring_discard_one(ring);
+
+	/* If the queue was stopped (meaning the producer couldn't have
+	 * inserted new entries just now), and we have actually created
+	 * space in the ring, or the ring is now empty (due to a race
+	 * with the producer), then it is now safe to wake the queue.
+	 */
+	if (unlikely(stopped &&
+		     (__ptr_ring_consume_created_space(ring) ||
+		      __ptr_ring_empty(ring)))) {
+		/* Paired with smp_rmb() in tun_ring_produce. */
+		smp_wmb();
+		netif_tx_wake_queue(txq);
+	}
+	rcu_read_unlock();
+
+	return ptr;
+}
+
+static __always_unused void *tap_ring_consume(struct tap_queue *q)
+{
+	void *ptr;
+
+	spin_lock(&q->ring.consumer_lock);
+	ptr = __tap_ring_consume(q);
+	spin_unlock(&q->ring.consumer_lock);
+
+	return ptr;
+}
+
 static ssize_t tap_do_read(struct tap_queue *q,
 			   struct iov_iter *to,
 			   int noblock, struct sk_buff *skb)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8192740357a0..3b9d8d406ff5 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -999,6 +999,107 @@ static unsigned int run_ebpf_filter(struct tun_struct *tun,
 	return len;
 }
 
+/* Produce a packet into the transmit ring. If the ring becomes full, the
+ * netdev queue is stopped until the consumer wakes it again.
+ */
+static __always_unused int tun_ring_produce(struct ptr_ring *ring,
+					    struct netdev_queue *queue,
+					    struct sk_buff *skb)
+{
+	int ret;
+
+	spin_lock(&ring->producer_lock);
+
+	/* Pairs with smp_wmb() in __tun_ring_consume/__tap_ring_consume.
+	 * Ensures that space freed by the consumer is visible.
+	 */
+	smp_rmb();
+
+	/* Do not stop the netdev queue if the ptr_ring is full already.
+	 * The consumer could empty out the ptr_ring in the meantime
+	 * without noticing the stopped netdev queue, resulting in a
+	 * stopped netdev queue and an empty ptr_ring. In this case the
+	 * netdev queue would stay stopped forever.
+	 */
+	if (unlikely(!__ptr_ring_full(ring) &&
+		     __ptr_ring_full_next(ring)))
+		netif_tx_stop_queue(queue);
+
+	/* Note: __ptr_ring_produce has an internal smp_wmb() to
+	 * synchronize the state with the consumer. This ensures that
+	 * after adding an entry to the ring, any stopped queue state is
+	 * visible to the consumer after dequeueing.
+	 */
+	ret = __ptr_ring_produce(ring, skb);
+
+	spin_unlock(&ring->producer_lock);
+
+	return ret;
+}
+
+/*
+ * Consume a packet from the transmit ring. Callers must hold
+ * the consumer_lock of the ptr_ring. If the ring was full and the
+ * queue was stopped, this may wake up the queue if space is created.
+ */
+static void *__tun_ring_consume(struct tun_file *tfile)
+{
+	struct ptr_ring *ring = &tfile->tx_ring;
+	struct netdev_queue *txq;
+	struct net_device *dev;
+	bool stopped;
+	void *ptr;
+
+	ptr = __ptr_ring_peek(ring);
+	if (!ptr)
+		return ptr;
+
+	/* Paired with smp_wmb() in the ring producer path. Ensures we
+	 * see any updated netdev queue state caused by a full ring.
+	 * Needed for proper synchronization between the ring and the
+	 * netdev queue.
+	 */
+	smp_rmb();
+	rcu_read_lock();
+	dev = rcu_dereference(tfile->tun)->dev;
+	txq = netdev_get_tx_queue(dev, tfile->queue_index);
+	stopped = netif_tx_queue_stopped(txq);
+
+	/* Ensures the read of the stopped queue state completes before
+	 * the discard, so that we don't miss the window to wake the
+	 * queue if needed.
+	 */
+	smp_rmb();
+	__ptr_ring_discard_one(ring);
+
+	/* If the queue was stopped (meaning the producer couldn't have
+	 * inserted new entries just now), and we have actually created
+	 * space in the ring, or the ring is now empty (due to a race
+	 * with the producer), then it is now safe to wake the queue.
+	 */
+	if (unlikely(stopped &&
+		     (__ptr_ring_consume_created_space(ring) ||
+		      __ptr_ring_empty(ring)))) {
+		/* Paired with smp_rmb() in tun_ring_produce. */
+		smp_wmb();
+		netif_tx_wake_queue(txq);
+	}
+	rcu_read_unlock();
+
+	return ptr;
+}
+
+static void __always_unused *tun_ring_consume(struct tun_file *tfile)
+{
+	void *ptr;
+
+	spin_lock(&tfile->tx_ring.consumer_lock);
+	ptr = __tun_ring_consume(tfile);
+	spin_unlock(&tfile->tx_ring.consumer_lock);
+
+	return ptr;
+}
+
 /* Net device start xmit */
 static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-- 
2.43.0
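The barrier pairing in the two paths above can be summarized by the
following condensed model (illustrative only; helper names are
abbreviated):

/*
 * Producer (producer_lock held)      Consumer (consumer_lock held)
 * ------------------------------     ------------------------------
 * smp_rmb();  // see freed space     ptr = __ptr_ring_peek(ring);
 * if (!full(ring) &&                 smp_rmb();  // see queue state
 *     full_next(ring))               stopped = queue_stopped(txq);
 *         stop_queue(txq);           smp_rmb();  // before discard
 * __ptr_ring_produce(ring, skb);     __ptr_ring_discard_one(ring);
 *   // internal smp_wmb()            if (stopped && (space || empty)) {
 *   // publishes entry + stop                smp_wmb();
 *                                            wake_queue(txq);
 *                                    }
 */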
Add tun_ring_consume_batched() and tap_ring_consume_batched() to allow
consuming multiple items from the respective ring buffer in a single
lock acquisition. Both are heavily inspired by
ptr_ring_consume_batched() and will be used for bulk dequeue operations
(e.g. by vhost-net).

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Co-developed-by: Jon Kohler
Signed-off-by: Jon Kohler
Signed-off-by: Simon Schippers
---
 drivers/net/tap.c      | 21 +++++++++++++++++++++
 drivers/net/tun.c      | 21 +++++++++++++++++++++
 include/linux/if_tap.h |  6 ++++++
 include/linux/if_tun.h |  7 +++++++
 4 files changed, 55 insertions(+)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index c370a02789eb..01717c8fd284 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -816,6 +816,27 @@ static __always_unused void *tap_ring_consume(struct tap_queue *q)
 	return ptr;
 }
 
+int tap_ring_consume_batched(struct file *file, void **array, int n)
+{
+	struct tap_queue *q = file->private_data;
+	void *ptr;
+	int i;
+
+	spin_lock(&q->ring.consumer_lock);
+
+	for (i = 0; i < n; i++) {
+		ptr = __tap_ring_consume(q);
+		if (!ptr)
+			break;
+		array[i] = ptr;
+	}
+
+	spin_unlock(&q->ring.consumer_lock);
+
+	return i;
+}
+EXPORT_SYMBOL_GPL(tap_ring_consume_batched);
+
 static ssize_t tap_do_read(struct tap_queue *q,
 			   struct iov_iter *to,
 			   int noblock, struct sk_buff *skb)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3b9d8d406ff5..42df185341ad 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3816,6 +3816,27 @@ struct socket *tun_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tun_get_socket);
 
+int tun_ring_consume_batched(struct file *file, void **array, int n)
+{
+	struct tun_file *tfile = file->private_data;
+	void *ptr;
+	int i;
+
+	spin_lock(&tfile->tx_ring.consumer_lock);
+
+	for (i = 0; i < n; i++) {
+		ptr = __tun_ring_consume(tfile);
+		if (!ptr)
+			break;
+		array[i] = ptr;
+	}
+
+	spin_unlock(&tfile->tx_ring.consumer_lock);
+
+	return i;
+}
+EXPORT_SYMBOL_GPL(tun_ring_consume_batched);
+
 struct ptr_ring *tun_get_tx_ring(struct file *file)
 {
 	struct tun_file *tfile;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 553552fa635c..cf8b90320b8d 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -11,6 +11,7 @@ struct socket;
 #if IS_ENABLED(CONFIG_TAP)
 struct socket *tap_get_socket(struct file *);
 struct ptr_ring *tap_get_ptr_ring(struct file *file);
+int tap_ring_consume_batched(struct file *file, void **array, int n);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -22,6 +23,11 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
+static inline int tap_ring_consume_batched(struct file *f,
+					   void **array, int n)
+{
+	return 0;
+}
 #endif /* CONFIG_TAP */
 
 /*
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 80166eb62f41..444dda75a372 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -22,6 +22,7 @@ struct tun_msg_ctl {
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
+int tun_ring_consume_batched(struct file *file, void **array, int n);
 
 static inline bool tun_is_xdp_frame(void *ptr)
 {
@@ -55,6 +56,12 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
 	return ERR_PTR(-EINVAL);
 }
 
+static inline int tun_ring_consume_batched(struct file *file,
+					   void **array, int n)
+{
+	return 0;
+}
+
 static inline bool tun_is_xdp_frame(void *ptr)
 {
 	return false;
-- 
2.43.0
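A minimal sketch of the intended bulk-dequeue usage (hypothetical
consumer; process_pkt() is a stand-in for per-packet handling):

#define EXAMPLE_BATCH 64

static void example_drain_tun(struct file *file,
			      void (*process_pkt)(void *))
{
	void *batch[EXAMPLE_BATCH];
	int i, n;

	/* One lock acquisition for up to EXAMPLE_BATCH packets. */
	n = tun_ring_consume_batched(file, batch, EXAMPLE_BATCH);
	for (i = 0; i < n; i++)
		process_pkt(batch[i]);
}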
Add tun_ring_unconsume() and tap_ring_unconsume() wrappers to allow
external modules (e.g. vhost-net) to return previously consumed entries
back to the ring. This complements tun_ring_consume_batched() and
tap_ring_consume_batched() and enables proper error handling when
consumed packets need to be rolled back.

The functions delegate to ptr_ring_unconsume() and take a destroy
callback for entries that cannot be returned to the ring.

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Co-developed-by: Jon Kohler
Signed-off-by: Jon Kohler
Signed-off-by: Simon Schippers
---
 drivers/net/tap.c      | 10 ++++++++++
 drivers/net/tun.c      | 10 ++++++++++
 include/linux/if_tap.h |  4 ++++
 include/linux/if_tun.h |  5 +++++
 4 files changed, 29 insertions(+)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 01717c8fd284..0069e2f177f4 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -837,6 +837,16 @@ int tap_ring_consume_batched(struct file *file, void **array, int n)
 }
 EXPORT_SYMBOL_GPL(tap_ring_consume_batched);
 
+void tap_ring_unconsume(struct file *file, void **batch, int n,
+			void (*destroy)(void *))
+{
+	struct tap_queue *q = file->private_data;
+	struct ptr_ring *ring = &q->ring;
+
+	ptr_ring_unconsume(ring, batch, n, destroy);
+}
+EXPORT_SYMBOL_GPL(tap_ring_unconsume);
+
 static ssize_t tap_do_read(struct tap_queue *q,
 			   struct iov_iter *to,
 			   int noblock, struct sk_buff *skb)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 42df185341ad..bf109440d2c7 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3837,6 +3837,16 @@ int tun_ring_consume_batched(struct file *file, void **array, int n)
 }
 EXPORT_SYMBOL_GPL(tun_ring_consume_batched);
 
+void tun_ring_unconsume(struct file *file, void **batch, int n,
+			void (*destroy)(void *))
+{
+	struct tun_file *tfile = file->private_data;
+	struct ptr_ring *ring = &tfile->tx_ring;
+
+	ptr_ring_unconsume(ring, batch, n, destroy);
+}
+EXPORT_SYMBOL_GPL(tun_ring_unconsume);
+
 struct ptr_ring *tun_get_tx_ring(struct file *file)
 {
 	struct tun_file *tfile;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index cf8b90320b8d..28326a69745a 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -12,6 +12,8 @@ struct socket;
 struct socket *tap_get_socket(struct file *);
 struct ptr_ring *tap_get_ptr_ring(struct file *file);
 int tap_ring_consume_batched(struct file *file, void **array, int n);
+void tap_ring_unconsume(struct file *file, void **batch, int n,
+			void (*destroy)(void *));
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -28,6 +30,8 @@ static inline int tap_ring_consume_batched(struct file *f,
 {
 	return 0;
 }
+static inline void tap_ring_unconsume(struct file *file, void **batch,
+				      int n, void (*destroy)(void *)) {}
 #endif /* CONFIG_TAP */
 
 /*
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 444dda75a372..1274c6b34eb6 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -23,6 +23,8 @@ struct tun_msg_ctl {
 struct socket *tun_get_socket(struct file *);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
 int tun_ring_consume_batched(struct file *file, void **array, int n);
+void tun_ring_unconsume(struct file *file, void **batch, int n,
+			void (*destroy)(void *));
 
 static inline bool tun_is_xdp_frame(void *ptr)
 {
@@ -62,6 +64,9 @@ static inline int tun_ring_consume_batched(struct file *file,
 	return 0;
 }
 
+static inline void tun_ring_unconsume(struct file *file, void **batch,
+				      int n, void (*destroy)(void *)) {}
+
 static inline bool tun_is_xdp_frame(void *ptr)
 {
 	return false;
-- 
2.43.0
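A minimal rollback sketch (hypothetical error path; tun_ptr_free() is
the destroy callback that vhost-net passes later in the series):

static void example_rollback(struct file *file, void **batch,
			     int consumed, int processed)
{
	/* Return the unprocessed tail of the batch to the ring;
	 * entries that no longer fit are freed via the callback.
	 */
	if (processed < consumed)
		tun_ring_unconsume(file, batch + processed,
				   consumed - processed, tun_ptr_free);
}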
Add tun_is_tun_file() and tap_is_tap_file() helper functions to check
whether a file is a TUN or TAP file. They will be utilized by
vhost-net.

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Co-developed-by: Jon Kohler
Signed-off-by: Jon Kohler
Signed-off-by: Simon Schippers
---
 drivers/net/tap.c      | 13 +++++++++++++
 drivers/net/tun.c      | 13 +++++++++++++
 include/linux/if_tap.h |  5 +++++
 include/linux/if_tun.h |  6 ++++++
 4 files changed, 37 insertions(+)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 0069e2f177f4..56b8fe376e4a 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1283,6 +1283,19 @@ struct ptr_ring *tap_get_ptr_ring(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tap_get_ptr_ring);
 
+bool tap_is_tap_file(struct file *file)
+{
+	struct tap_queue *q;
+
+	if (file->f_op != &tap_fops)
+		return false;
+	q = file->private_data;
+	if (!q)
+		return false;
+	return true;
+}
+EXPORT_SYMBOL_GPL(tap_is_tap_file);
+
 int tap_queue_resize(struct tap_dev *tap)
 {
 	struct net_device *dev = tap->dev;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bf109440d2c7..dc2d267d30d7 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3860,6 +3860,19 @@ struct ptr_ring *tun_get_tx_ring(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tun_get_tx_ring);
 
+bool tun_is_tun_file(struct file *file)
+{
+	struct tun_file *tfile;
+
+	if (file->f_op != &tun_fops)
+		return false;
+	tfile = file->private_data;
+	if (!tfile)
+		return false;
+	return true;
+}
+EXPORT_SYMBOL_GPL(tun_is_tun_file);
+
 module_init(tun_init);
 module_exit(tun_cleanup);
 MODULE_DESCRIPTION(DRV_DESCRIPTION);
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 28326a69745a..14194342b784 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -14,6 +14,7 @@ struct ptr_ring *tap_get_ptr_ring(struct file *file);
 int tap_ring_consume_batched(struct file *file, void **array, int n);
 void tap_ring_unconsume(struct file *file, void **batch, int n,
 			void (*destroy)(void *));
+bool tap_is_tap_file(struct file *file);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
@@ -32,6 +33,10 @@ static inline int tap_ring_consume_batched(struct file *f,
 }
 static inline void tap_ring_unconsume(struct file *file, void **batch,
 				      int n, void (*destroy)(void *)) {}
+static inline bool tap_is_tap_file(struct file *f)
+{
+	return false;
+}
 #endif /* CONFIG_TAP */
 
 /*
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 1274c6b34eb6..0910c6dbac20 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -25,6 +25,7 @@ struct ptr_ring *tun_get_tx_ring(struct file *file);
 int tun_ring_consume_batched(struct file *file, void **array, int n);
 void tun_ring_unconsume(struct file *file, void **batch, int n,
 			void (*destroy)(void *));
+bool tun_is_tun_file(struct file *file);
 
 static inline bool tun_is_xdp_frame(void *ptr)
 {
@@ -67,6 +68,11 @@ static inline int tun_ring_consume_batched(struct file *file,
 static inline void tun_ring_unconsume(struct file *file, void **batch,
 				      int n, void (*destroy)(void *)) {}
 
+static inline bool tun_is_tun_file(struct file *f)
+{
+	return false;
+}
+
 static inline bool tun_is_xdp_frame(void *ptr)
 {
 	return false;
-- 
2.43.0
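A minimal classification sketch (hypothetical caller; the next patch
adds the real get_if_type() dispatch in vhost-net):

static bool example_is_ring_backend(struct file *file)
{
	/* Cheap f_op identity checks; safe to call on any file. */
	return tun_is_tun_file(file) || tap_is_tap_file(file);
}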
Switch to {tun|tap}_ring_{consume|produce} in both tun/tap as well as
vhost_net to avoid ptr_ring tail drops.

For tun, disable dev->lltx to ensure that tun_net_xmit is not called
while the netdev queue is stopped (this can happen due to unconsume or
a queue resize). Consequently, the update of trans_start in
tun_net_xmit is also removed.

Instead of the rx_ring, the virtqueue now saves the interface type
(IF_TAP, IF_TUN, or IF_NONE) in order to call the tun/tap wrappers.

+--------------------------------+-----------+----------+
| pktgen benchmarks to Debian VM |   Stock   | Patched  |
| i5 6300HQ, 20M packets         |           |          |
+-----------------+--------------+-----------+----------+
| TAP             | Transmitted  | 195 Kpps  | 183 Kpps |
|                 +--------------+-----------+----------+
|                 | Lost         | 1615 Kpps | 0 pps    |
+-----------------+--------------+-----------+----------+
| TAP+vhost_net   | Transmitted  | 589 Kpps  | 588 Kpps |
|                 +--------------+-----------+----------+
|                 | Lost         | 1164 Kpps | 0 pps    |
+-----------------+--------------+-----------+----------+

Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Co-developed-by: Jon Kohler
Signed-off-by: Jon Kohler
Signed-off-by: Simon Schippers
---
 drivers/net/tap.c   |  4 +-
 drivers/net/tun.c   | 20 ++++------
 drivers/vhost/net.c | 92 ++++++++++++++++++++++++++++++---------------
 3 files changed, 71 insertions(+), 45 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 56b8fe376e4a..2847db4e3cc7 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -805,7 +805,7 @@ static void *__tap_ring_consume(struct tap_queue *q)
 	return ptr;
 }
 
-static __always_unused void *tap_ring_consume(struct tap_queue *q)
+static void *tap_ring_consume(struct tap_queue *q)
 {
 	void *ptr;
 
@@ -868,7 +868,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
 				TASK_INTERRUPTIBLE);
 
 		/* Read frames from the queue */
-		skb = ptr_ring_consume(&q->ring);
+		skb = tap_ring_consume(q);
 		if (skb)
 			break;
 		if (noblock) {
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index dc2d267d30d7..9da6e794a80f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -931,7 +931,6 @@ static int tun_net_init(struct net_device *dev)
 	dev->vlan_features = dev->features &
 			     ~(NETIF_F_HW_VLAN_CTAG_TX |
 			       NETIF_F_HW_VLAN_STAG_TX);
-	dev->lltx = true;
 
 	tun->flags = (tun->flags & ~TUN_FEATURES) |
 		      (ifr->ifr_flags & TUN_FEATURES);
@@ -1002,9 +1001,9 @@ static unsigned int run_ebpf_filter(struct tun_struct *tun,
 /* Produce a packet into the transmit ring. If the ring becomes full, the
  * netdev queue is stopped until the consumer wakes it again.
  */
-static __always_unused int tun_ring_produce(struct ptr_ring *ring,
-					    struct netdev_queue *queue,
-					    struct sk_buff *skb)
+static int tun_ring_produce(struct ptr_ring *ring,
+			    struct netdev_queue *queue,
+			    struct sk_buff *skb)
 {
 	int ret;
 
@@ -1089,7 +1088,7 @@ static void *__tun_ring_consume(struct tun_file *tfile)
 	return ptr;
 }
 
-static void __always_unused *tun_ring_consume(struct tun_file *tfile)
+static void *tun_ring_consume(struct tun_file *tfile)
 {
 	void *ptr;
 
@@ -1161,15 +1160,12 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	nf_reset_ct(skb);
 
-	if (ptr_ring_produce(&tfile->tx_ring, skb)) {
+	queue = netdev_get_tx_queue(dev, txq);
+	if (unlikely(tun_ring_produce(&tfile->tx_ring, queue, skb))) {
 		drop_reason = SKB_DROP_REASON_FULL_RING;
 		goto drop;
 	}
 
-	/* dev->lltx requires to do our own update of trans_start */
-	queue = netdev_get_tx_queue(dev, txq);
-	txq_trans_cond_update(queue);
-
 	/* Notify and wake up reader process */
 	if (tfile->flags & TUN_FASYNC)
 		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
@@ -2220,7 +2216,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 	void *ptr = NULL;
 	int error = 0;
 
-	ptr = ptr_ring_consume(&tfile->tx_ring);
+	ptr = tun_ring_consume(tfile);
 	if (ptr)
 		goto out;
 	if (noblock) {
@@ -2232,7 +2228,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		ptr = ptr_ring_consume(&tfile->tx_ring);
+		ptr = tun_ring_consume(tfile);
 		if (ptr)
 			break;
 		if (signal_pending(current)) {
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 35ded4330431..022efca1d4af 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -90,6 +90,12 @@ enum {
 	VHOST_NET_VQ_MAX = 2,
 };
 
+enum if_type {
+	IF_NONE = 0,
+	IF_TUN = 1,
+	IF_TAP = 2,
+};
+
 struct vhost_net_ubuf_ref {
 	/* refcount follows semantics similar to kref:
 	 *    0: object is released
@@ -131,6 +137,8 @@ struct vhost_net_virtqueue {
 	struct vhost_net_buf rxq;
 	/* Batched XDP buffs */
 	struct xdp_buff *xdp;
+	/* Interface type */
+	enum if_type type;
 };
 
 struct vhost_net {
@@ -176,24 +184,50 @@ static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
 	return ret;
 }
 
-static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
+static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq,
+				 struct sock *sk)
 {
+	struct file *file = sk->sk_socket->file;
 	struct vhost_net_buf *rxq = &nvq->rxq;
 
 	rxq->head = 0;
-	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
-					     VHOST_NET_BATCH);
+	switch (nvq->type) {
+	case IF_TUN:
+		rxq->tail = tun_ring_consume_batched(file, rxq->queue,
+						     VHOST_NET_BATCH);
+		break;
+	case IF_TAP:
+		rxq->tail = tap_ring_consume_batched(file, rxq->queue,
+						     VHOST_NET_BATCH);
+		break;
+	case IF_NONE:
+		return 0;
+	}
 	return rxq->tail;
 }
 
-static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
+static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq,
+				    struct socket *sk)
 {
 	struct vhost_net_buf *rxq = &nvq->rxq;
-
-	if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
-		ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
-				   vhost_net_buf_get_size(rxq),
-				   tun_ptr_free);
+	struct file *file;
+
+	if (sk && !vhost_net_buf_is_empty(rxq)) {
+		file = sk->file;
+		switch (nvq->type) {
+		case IF_TUN:
+			tun_ring_unconsume(file, rxq->queue + rxq->head,
+					   vhost_net_buf_get_size(rxq),
+					   tun_ptr_free);
+			break;
+		case IF_TAP:
+			tap_ring_unconsume(file, rxq->queue + rxq->head,
+					   vhost_net_buf_get_size(rxq),
+					   tun_ptr_free);
+			break;
+		case IF_NONE:
+			return;
+		}
 		rxq->head = rxq->tail = 0;
 	}
 }
@@ -209,14 +243,15 @@ static int vhost_net_buf_peek_len(void *ptr)
 	return __skb_array_len_with_tag(ptr);
 }
 
-static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
+static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq,
+			      struct sock *sk)
 {
 	struct vhost_net_buf *rxq = &nvq->rxq;
 
 	if (!vhost_net_buf_is_empty(rxq))
 		goto out;
 
-	if (!vhost_net_buf_produce(nvq))
+	if (!vhost_net_buf_produce(nvq, sk))
 		return 0;
 
 out:
@@ -991,8 +1026,8 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 	int len = 0;
 	unsigned long flags;
 
-	if (rvq->rx_ring)
-		return vhost_net_buf_peek(rvq);
+	if (rvq->type)
+		return vhost_net_buf_peek(rvq, sk);
 
 	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
 	head = skb_peek(&sk->sk_receive_queue);
@@ -1201,7 +1236,7 @@ static void handle_rx(struct vhost_net *net)
 			goto out;
 		}
 		busyloop_intr = false;
-		if (nvq->rx_ring)
+		if (nvq->type)
 			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
 		/* On overrun, truncate and discard */
 		if (unlikely(headcount > UIO_MAXIOV)) {
@@ -1357,7 +1392,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 		n->vqs[i].batched_xdp = 0;
 		n->vqs[i].vhost_hlen = 0;
 		n->vqs[i].sock_hlen = 0;
-		n->vqs[i].rx_ring = NULL;
+		n->vqs[i].type = IF_NONE;
 		vhost_net_buf_init(&n->vqs[i].rxq);
 	}
 	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
@@ -1387,8 +1422,8 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n,
 	sock = vhost_vq_get_backend(vq);
 	vhost_net_disable_vq(n, vq);
 	vhost_vq_set_backend(vq, NULL);
-	vhost_net_buf_unproduce(nvq);
-	nvq->rx_ring = NULL;
+	vhost_net_buf_unproduce(nvq, sock);
+	nvq->type = IF_NONE;
 	mutex_unlock(&vq->mutex);
 	return sock;
 }
@@ -1468,18 +1503,13 @@ static struct socket *get_raw_socket(int fd)
 	return ERR_PTR(r);
 }
 
-static struct ptr_ring *get_tap_ptr_ring(struct file *file)
+static enum if_type get_if_type(struct file *file)
 {
-	struct ptr_ring *ring;
-
-	ring = tun_get_tx_ring(file);
-	if (!IS_ERR(ring))
-		goto out;
-	ring = tap_get_ptr_ring(file);
-	if (!IS_ERR(ring))
-		goto out;
-	ring = NULL;
-out:
-	return ring;
+	if (tap_is_tap_file(file))
+		return IF_TAP;
+	if (tun_is_tun_file(file))
+		return IF_TUN;
+	return IF_NONE;
 }
 
 static struct socket *get_tap_socket(int fd)
@@ -1561,7 +1591,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 
 		vhost_net_disable_vq(n, vq);
 		vhost_vq_set_backend(vq, sock);
-		vhost_net_buf_unproduce(nvq);
+		vhost_net_buf_unproduce(nvq, sock);
 		r = vhost_vq_init_access(vq);
 		if (r)
 			goto err_used;
@@ -1570,9 +1600,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 			goto err_used;
 		if (index == VHOST_NET_VQ_RX) {
 			if (sock)
-				nvq->rx_ring = get_tap_ptr_ring(sock->file);
+				nvq->type = get_if_type(sock->file);
 			else
-				nvq->rx_ring = NULL;
+				nvq->type = IF_NONE;
 		}
 
 		oldubufs = nvq->ubufs;
-- 
2.43.0
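The type-dispatch pattern introduced above condenses to the following
sketch (illustrative only; it mirrors vhost_net_buf_produce(), and
enum if_type is local to drivers/vhost/net.c):

static int example_consume_dispatch(enum if_type type, struct file *file,
				    void **array, int n)
{
	switch (type) {
	case IF_TUN:
		return tun_ring_consume_batched(file, array, n);
	case IF_TAP:
		return tap_ring_consume_batched(file, array, n);
	case IF_NONE:
	default:
		return 0;
	}
}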
tun_get_tx_ring() and tap_get_ptr_ring() no longer have any in-tree
users and can be dropped.
Co-developed-by: Tim Gebauer
Signed-off-by: Tim Gebauer
Co-developed-by: Jon Kohler
Signed-off-by: Jon Kohler
Signed-off-by: Simon Schippers
---
 drivers/net/tap.c      | 13 -------------
 drivers/net/tun.c      | 13 -------------
 include/linux/if_tap.h |  5 -----
 include/linux/if_tun.h |  6 ------
 4 files changed, 37 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 2847db4e3cc7..fd87db829913 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1270,19 +1270,6 @@ struct socket *tap_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tap_get_socket);
 
-struct ptr_ring *tap_get_ptr_ring(struct file *file)
-{
-	struct tap_queue *q;
-
-	if (file->f_op != &tap_fops)
-		return ERR_PTR(-EINVAL);
-	q = file->private_data;
-	if (!q)
-		return ERR_PTR(-EBADFD);
-	return &q->ring;
-}
-EXPORT_SYMBOL_GPL(tap_get_ptr_ring);
-
 bool tap_is_tap_file(struct file *file)
 {
 	struct tap_queue *q;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9da6e794a80f..32f53e31a5a7 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3843,19 +3843,6 @@ void tun_ring_unconsume(struct file *file, void **batch, int n,
 }
 EXPORT_SYMBOL_GPL(tun_ring_unconsume);
 
-struct ptr_ring *tun_get_tx_ring(struct file *file)
-{
-	struct tun_file *tfile;
-
-	if (file->f_op != &tun_fops)
-		return ERR_PTR(-EINVAL);
-	tfile = file->private_data;
-	if (!tfile)
-		return ERR_PTR(-EBADFD);
-	return &tfile->tx_ring;
-}
-EXPORT_SYMBOL_GPL(tun_get_tx_ring);
-
 bool tun_is_tun_file(struct file *file)
 {
 	struct tun_file *tfile;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 14194342b784..0e427b979c11 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -10,7 +10,6 @@ struct socket;
 
 #if IS_ENABLED(CONFIG_TAP)
 struct socket *tap_get_socket(struct file *);
-struct ptr_ring *tap_get_ptr_ring(struct file *file);
 int tap_ring_consume_batched(struct file *file, void **array, int n);
 void tap_ring_unconsume(struct file *file, void **batch, int n,
 			void (*destroy)(void *));
@@ -22,10 +21,6 @@ static inline struct socket *tap_get_socket(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
-static inline struct ptr_ring *tap_get_ptr_ring(struct file *f)
-{
-	return ERR_PTR(-EINVAL);
-}
 static inline int tap_ring_consume_batched(struct file *f,
 					   void **array, int n)
 {
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 0910c6dbac20..80b734173a80 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -21,7 +21,6 @@ struct tun_msg_ctl {
 
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
-struct ptr_ring *tun_get_tx_ring(struct file *file);
 int tun_ring_consume_batched(struct file *file, void **array, int n);
 void tun_ring_unconsume(struct file *file, void **batch, int n,
 			void (*destroy)(void *));
@@ -54,11 +53,6 @@ static inline struct socket *tun_get_socket(struct file *f)
 	return ERR_PTR(-EINVAL);
 }
 
-static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
-{
-	return ERR_PTR(-EINVAL);
-}
-
 static inline int tun_ring_consume_batched(struct file *file,
 					   void **array, int n)
 {
-- 
2.43.0