From: Jesper Dangaard Brouer

With the introduction of BQL (Byte Queue Limits) for veth, there are now
two independent mechanisms that can stop a transmit queue:

 - DRV_XOFF: set by netif_tx_stop_queue() when the ptr_ring is full
 - STACK_XOFF: set by BQL when the byte-in-flight limit is reached

If either mechanism stalls without a corresponding wake/completion, the
queue stops permanently. Enable the net device watchdog timer and
implement ndo_tx_timeout as a failsafe recovery.

The timeout handler resets BQL state (clearing STACK_XOFF) and wakes the
queue (clearing DRV_XOFF), covering both stop mechanisms. The watchdog
fires after 16 seconds, which accommodates worst-case NAPI processing
(budget=64 packets x 250ms per-packet consumer delay) without false
positives under normal backpressure.

Signed-off-by: Jesper Dangaard Brouer
Tested-by: Jonas Köppeler
---
 drivers/net/veth.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index b9a79d066703..beb4c31d8fd7 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1426,6 +1426,22 @@ static int veth_set_channels(struct net_device *dev,
 	goto out;
 }
 
+static void veth_tx_timeout(struct net_device *dev, unsigned int txqueue)
+{
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, txqueue);
+
+	netdev_err(dev,
+		   "veth backpressure(0x%lX) stalled(n:%ld) TXQ(%u) re-enable\n",
+		   txq->state, atomic_long_read(&txq->trans_timeout), txqueue);
+
+	/* Clear both stop mechanisms:
+	 * - DRV_XOFF: set by netif_tx_stop_queue (ptr_ring backpressure)
+	 * - STACK_XOFF: set by BQL when byte limit is reached
+	 */
+	netdev_tx_reset_queue(txq);
+	netif_tx_wake_queue(txq);
+}
+
 static int veth_open(struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
@@ -1764,6 +1780,7 @@ static const struct net_device_ops veth_netdev_ops = {
 	.ndo_bpf = veth_xdp,
 	.ndo_xdp_xmit = veth_ndo_xdp_xmit,
 	.ndo_get_peer_dev = veth_peer_dev,
+	.ndo_tx_timeout = veth_tx_timeout,
 };
 
 static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
@@ -1803,6 +1820,7 @@ static void veth_setup(struct net_device *dev)
 	dev->priv_destructor = veth_dev_free;
 	dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
 	dev->max_mtu = ETH_MAX_MTU;
+	dev->watchdog_timeo = msecs_to_jiffies(16000);
 
 	dev->hw_features = VETH_FEATURES;
 	dev->hw_enc_features = VETH_FEATURES;
-- 
2.43.0