Implement .ndo_tx_timeout for MANA so that a stalled TX queue can be
detected and a device-controlled port reset of all queues can be
scheduled on an ordered workqueue. Resetting all queues on stall
detection is the recovery recommended by the hardware team.

The change introduces a single ordered workqueue
("mana_per_port_queue_reset_wq"), allocated with
WQ_UNBOUND | WQ_MEM_RECLAIM, and queues exactly one work_struct per
port onto it. This achieves:

* Global FIFO ordering across all port reset requests
  (alloc_ordered_workqueue).
* Natural per-port de-duplication: a work_struct that is still pending
  cannot be queued a second time.
* No hogging of a per-CPU kworker for the long, may-sleep reset path
  (WQ_UNBOUND).
* Forward progress even under memory pressure (the WQ_MEM_RECLAIM
  rescuer).

Reviewed-by: Haiyang Zhang
Signed-off-by: Dipayaan Roy
---
Changes in v2:
- Addressed cosmetic review comments.
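
Notes for reviewers (illustrative only, not part of the commit message):

The de-duplication bullet above relies on a documented workqueue
property: queue_work() returns false and does nothing when the given
work_struct is already pending. A minimal sketch of the pattern this
patch uses follows; names prefixed demo_ are hypothetical, and the
snippet assumes <linux/workqueue.h> in kernel context:

	/* Illustrative sketch only; not part of the patch below. */
	static struct workqueue_struct *demo_reset_wq;	/* one per device */
	static struct work_struct demo_port_work;	/* one per port */

	static void demo_port_reset_fn(struct work_struct *work)
	{
		/* The long, may-sleep recovery path runs here, on an
		 * unbound worker instead of a per-CPU kworker.
		 */
	}

	static int demo_probe(void)
	{
		demo_reset_wq = alloc_ordered_workqueue("demo_reset_wq",
							WQ_UNBOUND | WQ_MEM_RECLAIM);
		if (!demo_reset_wq)
			return -ENOMEM;
		INIT_WORK(&demo_port_work, demo_port_reset_fn);
		return 0;
	}

	static void demo_tx_timeout(void)
	{
		/* Re-queuing a still-pending work item is a no-op, so a
		 * burst of TX timeouts on one port collapses into a
		 * single reset, while the ordered workqueue serializes
		 * resets across ports in FIFO order.
		 */
		if (!queue_work(demo_reset_wq, &demo_port_work))
			pr_debug("port reset already pending\n");
	}

Relatedly, the rtnl_trylock()/requeue in the handler (rather than a
blocking rtnl_lock()) is what lets mana_remove() safely call
cancel_work_sync() while holding RTNL: a handler blocked on RTNL there
could never finish, and cancel_work_sync() would wait on it forever.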
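A second note on the timeout value: the patch sets
ndev->watchdog_timeo = MANA_TXQ_TIMEOUT (15 * HZ), which arms the
stack's TX watchdog for the device. Below is a greatly simplified
sketch of how the core then invokes the new hook; the real logic is
dev_watchdog() in net/sched/sch_generic.c and handles more cases:

	/* Simplified sketch of the core TX watchdog; illustrative only. */
	static void watchdog_sketch(struct net_device *dev)
	{
		unsigned int i;

		for (i = 0; i < dev->num_tx_queues; i++) {
			struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

			/* A queue counts as stalled once it has been
			 * stopped for longer than dev->watchdog_timeo
			 * since its last transmit.
			 */
			if (netif_xmit_stopped(txq) &&
			    time_after(jiffies, READ_ONCE(txq->trans_start) +
					dev->watchdog_timeo))
				dev->netdev_ops->ndo_tx_timeout(dev, i);
		}
	}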
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 81 ++++++++++++++++++
 include/net/mana/gdma.h                       |  6 +-
 include/net/mana/mana.h                       | 14 ++++
 3 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index f4fc86f20213..05b7046ae3b5 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -258,6 +258,45 @@ static int mana_get_gso_hs(struct sk_buff *skb)
 	return gso_hs;
 }
 
+static void mana_per_port_queue_reset_work_handler(struct work_struct *work)
+{
+	struct mana_queue_reset_work *reset_queue_work =
+		container_of(work, struct mana_queue_reset_work, work);
+	struct mana_port_context *apc = reset_queue_work->apc;
+	struct net_device *ndev = apc->ndev;
+	struct mana_context *ac = apc->ac;
+	int err;
+
+	if (!rtnl_trylock()) {
+		/* Someone else holds the RTNL lock; requeue and exit. */
+		queue_work(ac->per_port_queue_reset_wq,
+			   &apc->queue_reset_work.work);
+		return;
+	}
+
+	/* Pre-allocate buffers to prevent failure in mana_attach later */
+	err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
+	if (err) {
+		netdev_err(ndev, "Insufficient memory for reset after TX stall detection\n");
+		goto out;
+	}
+
+	err = mana_detach(ndev, false);
+	if (err) {
+		netdev_err(ndev, "mana_detach failed: %d\n", err);
+		goto dealloc_pre_rxbufs;
+	}
+
+	err = mana_attach(ndev);
+	if (err)
+		netdev_err(ndev, "mana_attach failed: %d\n", err);
+
+dealloc_pre_rxbufs:
+	mana_pre_dealloc_rxbufs(apc);
+out:
+	rtnl_unlock();
+}
+
 netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 {
 	enum mana_tx_pkt_format pkt_fmt = MANA_SHORT_PKT_FMT;
@@ -762,6 +801,25 @@ static int mana_change_mtu(struct net_device *ndev, int new_mtu)
 	return err;
 }
 
+static void mana_tx_timeout(struct net_device *netdev, unsigned int txqueue)
+{
+	struct mana_port_context *apc = netdev_priv(netdev);
+	struct mana_context *ac = apc->ac;
+	struct gdma_context *gc = ac->gdma_dev->gdma_context;
+
+	netdev_warn(netdev, "%s(): called on txq: %u\n", __func__, txqueue);
+
+	/* The device is already being serviced, so a TX queue reset is
+	 * not required.
+	 */
+	if (gc->in_service)
+		return;
+
+	/* Note: if queue reset work is already pending for this port
+	 * (apc), subsequent requests queued from here are ignored,
+	 * because the same work instance is used per port (apc).
+	 */
+	queue_work(ac->per_port_queue_reset_wq, &apc->queue_reset_work.work);
+}
+
 static int mana_shaper_set(struct net_shaper_binding *binding,
 			   const struct net_shaper *shaper,
 			   struct netlink_ext_ack *extack)
@@ -844,7 +902,8 @@ static const struct net_device_ops mana_devops = {
 	.ndo_bpf = mana_bpf,
 	.ndo_xdp_xmit = mana_xdp_xmit,
 	.ndo_change_mtu = mana_change_mtu,
+	.ndo_tx_timeout = mana_tx_timeout,
 	.net_shaper_ops = &mana_shaper_ops,
 };
 
 static void mana_cleanup_port_context(struct mana_port_context *apc)
@@ -3218,6 +3278,7 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 	ndev->min_mtu = ETH_MIN_MTU;
 	ndev->needed_headroom = MANA_HEADROOM;
 	ndev->dev_port = port_idx;
+	ndev->watchdog_timeo = MANA_TXQ_TIMEOUT;
 	SET_NETDEV_DEV(ndev, gc->dev);
 
 	netif_set_tso_max_size(ndev, GSO_MAX_SIZE);
@@ -3255,6 +3316,11 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 	debugfs_create_u32("current_speed", 0400, apc->mana_port_debugfs,
 			   &apc->speed);
 
+	/* Initialize the per-port queue reset work. */
+	apc->queue_reset_work.apc = apc;
+	INIT_WORK(&apc->queue_reset_work.work,
+		  mana_per_port_queue_reset_work_handler);
+
 	return 0;
 
 free_indir:
@@ -3456,6 +3522,15 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 	if (ac->num_ports > MAX_PORTS_IN_MANA_DEV)
 		ac->num_ports = MAX_PORTS_IN_MANA_DEV;
 
+	ac->per_port_queue_reset_wq =
+		alloc_ordered_workqueue("mana_per_port_queue_reset_wq",
+					WQ_UNBOUND | WQ_MEM_RECLAIM);
+	if (!ac->per_port_queue_reset_wq) {
+		dev_err(dev, "Failed to allocate per-port queue reset workqueue\n");
+		err = -ENOMEM;
+		goto out;
+	}
+
 	if (!resuming) {
 		for (i = 0; i < ac->num_ports; i++) {
 			err = mana_probe_port(ac, i, &ac->ports[i]);
@@ -3528,6 +3603,8 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
 		 */
 		rtnl_lock();
 
+		cancel_work_sync(&apc->queue_reset_work.work);
+
 		err = mana_detach(ndev, false);
 		if (err)
 			netdev_err(ndev, "Failed to detach vPort %d: %d\n",
@@ -3547,6 +3624,11 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
 		free_netdev(ndev);
 	}
 
+	if (ac->per_port_queue_reset_wq) {
+		destroy_workqueue(ac->per_port_queue_reset_wq);
+		ac->per_port_queue_reset_wq = NULL;
+	}
+
 	mana_destroy_eq(ac);
 out:
 	mana_gd_deregister_device(gd);
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 57df78cfbf82..1f8c536ba3be 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -591,6 +591,9 @@ enum {
 /* Driver can self reset on FPGA Reconfig EQE notification */
 #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17)
 
+/* Driver detects stalled send queues and recovers them */
+#define GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY BIT(18)
+
 #define GDMA_DRV_CAP_FLAGS1 \
 	(GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
 	 GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \
@@ -599,7 +602,8 @@ enum {
 	 GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \
 	 GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \
 	 GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \
-	 GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE)
+	 GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \
+	 GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY)
 
 #define GDMA_DRV_CAP_FLAGS2 0
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 0921485565c0..e0b44ae2226a 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -67,6 +67,11 @@ enum TRI_STATE {
 
 #define MANA_RX_FRAG_ALIGNMENT 64
 
+/* Timeout value for TX queue stall detection and recovery, used by
+ * ndo_tx_timeout. The value is chosen to accommodate FPGA reconfig
+ * scenarios.
+ */
+#define MANA_TXQ_TIMEOUT (15 * HZ)
+
 struct mana_stats_rx {
 	u64 packets;
 	u64 bytes;
@@ -475,13 +480,22 @@ struct mana_context {
 	struct mana_eq *eqs;
 	struct dentry *mana_eqs_debugfs;
 
+	struct workqueue_struct *per_port_queue_reset_wq;
 	struct net_device *ports[MAX_PORTS_IN_MANA_DEV];
 };
 
+struct mana_queue_reset_work {
+	/* Work structure */
+	struct work_struct work;
+	/* Pointer to the port context */
+	struct mana_port_context *apc;
+};
+
 struct mana_port_context {
 	struct mana_context *ac;
 	struct net_device *ndev;
+	struct mana_queue_reset_work queue_reset_work;
 
 	u8 mac_addr[ETH_ALEN];
-- 
2.43.0