Patches 1/3 and 2/3 address two candidate races that could lead to a TCOMP interrupt being missed on PCIe-attached macb instances. This patch adds a defence-in-depth safety net, in case a further race remains that we have not identified. The watchdog is a per-queue delayed_work that runs once per second. It snapshots queue->tx_tail; if the ring is non-empty (queue->tx_head != queue->tx_tail) and tx_tail has not advanced since the previous tick, it calls macb_tx_restart(). No new recovery logic is introduced. macb_tx_restart() already exists in this file, takes the appropriate locks (tx_ptr_lock, bp->lock), and verifies that the hardware's TBQP is behind the driver's head index before re-asserting TSTART. On a healthy ring it is a no-op at the hardware level; the watchdog only supplies the missing trigger. On a healthy queue the per-tick cost is a netif_running() check, one spin_lock_irqsave() / spin_unlock_irqrestore() pair around a single comparison, and re-arming the delayed work. The delayed_work is first scheduled in macb_open(), re-arms itself on every tick while the interface is running, and is cancelled with cancel_delayed_work_sync() in macb_close(). Context for submission: on our 24-node Raspberry Pi 5 fleet, before this series, an out-of-band user-space watchdog (monitoring tx_packets from /sys/class/net/.../statistics and toggling the link down/up when the counter froze) was required to keep nodes usable. We include this kernel-side watchdog as a cleaner in-kernel equivalent for any residual stall that patches 1 and 2 do not cover. We are willing to drop this patch if the view is that patches 1 and 2 should stand alone. 
Link: https://github.com/cilium/cilium/issues/43198 Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877 Signed-off-by: Lukasz Raczylo --- drivers/net/ethernet/cadence/macb.h | 5 ++ drivers/net/ethernet/cadence/macb_main.c | 59 ++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h index 2de56017e..9115f2b47 100644 --- a/drivers/net/ethernet/cadence/macb.h +++ b/drivers/net/ethernet/cadence/macb.h @@ -1278,6 +1278,11 @@ struct macb_queue { dma_addr_t tx_ring_dma; struct work_struct tx_error_task; bool txubr_pending; + + /* TX stall watchdog -- see macb_tx_stall_watchdog() in macb_main.c */ + struct delayed_work tx_stall_watchdog_work; + unsigned int tx_stall_last_tail; + struct napi_struct napi_tx; dma_addr_t rx_ring_dma; diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index ea231b1c5..ea2306ef7 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -2002,6 +2002,59 @@ static int macb_tx_poll(struct napi_struct *napi, int budget) return work_done; } +#define MACB_TX_STALL_INTERVAL_MS 1000 + +/* + * TX stall watchdog. + * + * Defence-in-depth against lost TCOMP interrupts. macb already has a + * recovery chain (tx_pending -> txubr_pending -> macb_tx_restart()) + * that fires on TCOMP; if TCOMP itself is lost the TX ring stalls + * silently until something else kicks TSTART. This watchdog runs + * once per second per queue, snapshots tx_tail, and calls + * macb_tx_restart() if the ring is non-empty and tx_tail has not + * advanced since the previous tick. + * + * macb_tx_restart() already checks the hardware's TBQP against the + * driver's head index before re-asserting TSTART, so on a healthy + * ring this is a no-op at the hardware level. The watchdog only + * adds the missing trigger. 
+ */ +static void macb_tx_stall_watchdog(struct work_struct *work) +{ + struct macb_queue *queue = container_of(to_delayed_work(work), + struct macb_queue, + tx_stall_watchdog_work); + struct macb *bp = queue->bp; + unsigned int cur_tail, cur_head; + bool stalled = false; + unsigned long flags; + + if (!netif_running(bp->dev)) + return; + + spin_lock_irqsave(&queue->tx_ptr_lock, flags); + cur_tail = queue->tx_tail; + cur_head = queue->tx_head; + if (cur_head != cur_tail && + cur_tail == queue->tx_stall_last_tail) + stalled = true; + else + queue->tx_stall_last_tail = cur_tail; + spin_unlock_irqrestore(&queue->tx_ptr_lock, flags); + + if (stalled) { + netdev_warn_once(bp->dev, + "TX stall detected on queue %u (tail=%u head=%u); re-kicking TSTART\n", + (unsigned int)(queue - bp->queues), + cur_tail, cur_head); + macb_tx_restart(queue); + } + + schedule_delayed_work(&queue->tx_stall_watchdog_work, + msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS)); +} + static void macb_hresp_error_task(struct work_struct *work) { struct macb *bp = from_work(bp, work, hresp_err_bh_work); @@ -3190,6 +3243,9 @@ static int macb_open(struct net_device *dev) for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) { napi_enable(&queue->napi_rx); napi_enable(&queue->napi_tx); + queue->tx_stall_last_tail = queue->tx_tail; + schedule_delayed_work(&queue->tx_stall_watchdog_work, + msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS)); } macb_init_hw(bp); @@ -3240,6 +3296,7 @@ static int macb_close(struct net_device *dev) for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) { napi_disable(&queue->napi_rx); napi_disable(&queue->napi_tx); + cancel_delayed_work_sync(&queue->tx_stall_watchdog_work); netdev_tx_reset_queue(netdev_get_tx_queue(dev, q)); } @@ -4802,6 +4859,8 @@ static int macb_init_dflt(struct platform_device *pdev) } INIT_WORK(&queue->tx_error_task, macb_tx_error_task); + INIT_DELAYED_WORK(&queue->tx_stall_watchdog_work, + macb_tx_stall_watchdog); q++; } -- 2.53.0