net: usb: lan78xx: add devlink health support for diagnostics

Register a devlink instance for the adapter and expose two health
reporters:

 - "fifo": raised by the statistics update when it sees a pause frame
   storm without RX progress or an excessive HW drop ratio, and by the
   TX timeout handler.
 - "internal_err": raised when the interrupt status word signals a FIFO
   overflow/underflow or a MAC error.

Both reporters dump the trigger context followed by a set of live
registers. Reports raised from atomic context (interrupt URB handling,
TX watchdog) are handed to a work item, because devlink_health_report()
and the register reads may sleep.

devlink info additionally exposes the chip ID and revision.

Signed-off-by: Oleksij Rempel
---
 drivers/net/usb/lan78xx.c | 535 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 534 insertions(+), 1 deletion(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index ad620b56443b..221be42e06f4 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include <net/devlink.h>
 #include
 #include
 #include
@@ -41,6 +42,11 @@
 #define THROTTLE_JIFFIES		(HZ / 8)
 #define UNLINK_TIMEOUT_MS		3
 
+#define LAN78XX_STALL_PAUSE_THRESH	100
+#define LAN78XX_LIVELOCK_DROP_THRESH	10000
+#define LAN78XX_LIVELOCK_DROP_RATIO	10
+#define LAN78XX_TX_TIMEOUT_DROP_THRESH	1000
+
 #define RX_MAX_QUEUE_MEMORY		(60 * 1518)
 
 #define SS_USB_PKT_SIZE			(1024)
@@ -373,6 +379,11 @@ struct lan78xx_priv {
 	u32 wol;
 };
 
+/* Private area of the devlink instance: back-pointer to the adapter */
+struct lan78xx_devlink_priv {
+	struct lan78xx_net *dev;
+};
+
 enum skb_state {
 	illegal = 0,
 	tx_start,
@@ -411,6 +422,38 @@ struct statstage {
 	struct lan78xx_statstage64	curr_stat;
 };
 
+/* Totals sampled at the previous statistics update, used to build deltas */
+struct lan78xx_stat_snapshot {
+	ktime_t time;
+
+	u64 tx_pause_total;
+	u64 tx_unicast_total;
+	u64 rx_total_frames;
+	u64 rx_hw_drop_total;
+	u64 rx_sw_packets_total;
+
+	u32 last_delta_pause;
+	u32 last_delta_drops;
+};
+
+/* Trigger context captured at detection time and passed to the dump ops */
+struct lan78xx_dump_ctx {
+	const char *msg;
+	ktime_t ts; /* Timestamp of detection */
+
+	union {
+		struct {
+			u64 delta_pause;
+			u64 delta_rx;
+			u64 delta_hw_drop;
+			u64 delta_sw_rx;
+		} fifo;
+		struct {
+			u32 int_sts; /* As seen by lan78xx_status() */
+		} err;
+	};
+};
+
 struct irq_domain_data {
 	struct irq_domain	*irqdomain;
 	unsigned int		phyirq;
@@ -477,6 +520,27 @@ struct lan78xx_net {
 
 	struct phylink		*phylink;
 	struct phylink_config	phylink_config;
+
+	struct devlink *devlink;
+	struct devlink_health_reporter *fifo_reporter;
+	struct devlink_health_reporter *internal_err_reporter;
+	struct lan78xx_stat_snapshot snapshot;
+
+	/* Reports raised in atomic context are parked here and delivered
+	 * by health_work; health_lock protects the pending slots.
+	 */
+	struct work_struct health_work;
+	spinlock_t health_lock;
+	struct lan78xx_dump_ctx pending_err_ctx;
+	struct lan78xx_dump_ctx pending_fifo_ctx;
+	bool err_report_pending;
+	bool fifo_report_pending;
+};
+
+/* Register Dump Map Structure */
+struct lan78xx_reg_map {
+	u32 reg;
+	const char *name;
 };
 
 /* use ethtool to change the level for any given device */
@@ -484,6 +548,87 @@ static int msg_level = -1;
 module_param(msg_level, int, 0);
 MODULE_PARM_DESC(msg_level, "Override default message level");
 
+/* Helper macro to map register to name string */
+#define LAN78XX_DUMP_REG(reg) { reg, #reg }
+
+static const struct lan78xx_reg_map lan78xx_fifo_regs[] = {
+	/* --- FIFO Control & Status ---
+	 * specific enable/reset bits.
+	 * used_bytes tells us if the bottleneck is USB (TX high) or MAC
+	 * (RX high).
+	 */
+	LAN78XX_DUMP_REG(FCT_TX_CTL),
+	LAN78XX_DUMP_REG(FCT_RX_CTL),
+
+	/* --- Data Path Usage ---
+	 * Capture total buffer usage including USB endpoint overhead.
+	 * If DP_STOR is high but FCT_USED is low, data is stuck in the USB
+	 * layer.
+	 */
+	LAN78XX_DUMP_REG(TX_DP_STOR),
+	LAN78XX_DUMP_REG(RX_DP_STOR),
+
+	/* --- FIFO Boundaries ---
+	 * verify if the FIFO partitioning has been corrupted or misconfigured.
+	 */
+	LAN78XX_DUMP_REG(FCT_TX_FIFO_END),
+	LAN78XX_DUMP_REG(FCT_RX_FIFO_END),
+
+	/* --- Flow Control ---
+	 * Critical for "Pause Storm" debugging.
+	 * Check if thresholds are set correctly and if Pause frames are enabled.
+	 */
+	LAN78XX_DUMP_REG(FCT_FLOW),
+	LAN78XX_DUMP_REG(FLOW),
+
+	/* --- Configuration & Speed ---
+	 * Mismatches between MAC speed (1G) and USB speed (HighSpeed)
+	 * are the #1 cause of buffer overflows.
+	 */
+	LAN78XX_DUMP_REG(MAC_CR),	/* MAC Speed/Duplex */
+	LAN78XX_DUMP_REG(USB_CFG0),	/* USB Speed/Burst Cap Enable */
+	LAN78XX_DUMP_REG(BURST_CAP),	/* USB Burst Size Limit */
+	LAN78XX_DUMP_REG(BULK_IN_DLY),	/* Inter-packet delay settings */
+
+	/* --- Debug Pointers ---
+	 * Internal read/write pointers for the FIFO RAM.
+	 * Helps detect if the hardware pointer logic has wrapped or frozen.
+	 */
+	LAN78XX_DUMP_REG(DP_SEL),
+	LAN78XX_DUMP_REG(DP_CMD),
+};
+
+static const struct lan78xx_reg_map lan78xx_err_regs[] = {
+	/* --- Interrupt Status ---
+	 * The "Smoking Gun". Reveals if the error was triggered by:
+	 * - MAC_ERR_INT: Internal logic overflow/underflow.
+	 * - PHY_INT: Link loss or signal degradation.
+	 * - TDFO/RDFO: FIFO Overflows (redundant but explicit).
+	 */
+	LAN78XX_DUMP_REG(INT_STS),
+	LAN78XX_DUMP_REG(INT_EP_CTL),
+
+	/* --- System Health ---
+	 * Check for invalid power states (D3 while active) or stuck resets.
+	 * HW_CFG also contains the "Soft Reset" status bit.
+	 */
+	LAN78XX_DUMP_REG(HW_CFG),
+	LAN78XX_DUMP_REG(PMT_CTL),
+
+	/* --- Bus Integrity ---
+	 * USB_CFG1 contains Low Power Mode (LPM) and Suspend guards.
+	 */
+	LAN78XX_DUMP_REG(USB_CFG0),
+	LAN78XX_DUMP_REG(USB_CFG1),
+
+	/* --- MAC Status ---
+	 * Verify if the receiver is actually enabled (RXEN) and if
+	 * filtering (Promiscuous/Multicast) is set as expected.
+	 */
+	LAN78XX_DUMP_REG(MAC_CR),
+	LAN78XX_DUMP_REG(MAC_RX),
+};
+
 static struct sk_buff *lan78xx_get_buf(struct sk_buff_head *buf_pool)
 {
 	if (skb_queue_empty(buf_pool))
@@ -831,12 +976,125 @@ static void lan78xx_check_stat_rollover(struct lan78xx_net *dev,
 	memcpy(&dev->stats.saved, stats, sizeof(struct lan78xx_statstage));
 }
 
+/* Deliver a health report from process context. A NULL reporter (setup
+ * failed or teardown already started) turns the call into a no-op.
+ */
+static void lan78xx_health_report(struct devlink_health_reporter *reporter,
+				  struct lan78xx_dump_ctx *ctx)
+{
+	if (reporter)
+		devlink_health_report(reporter, ctx->msg, ctx);
+}
+
+/* Park a report raised in atomic context and let health_work deliver it.
+ * devlink_health_report() and the register dump (USB control transfers)
+ * may sleep, so they must not run from URB completion or the TX watchdog.
+ * An undelivered report of the same kind is overwritten by the newer one.
+ */
+static void lan78xx_health_report_deferred(struct lan78xx_net *dev,
+					   const struct lan78xx_dump_ctx *ctx,
+					   bool internal_err)
+{
+	struct devlink_health_reporter *reporter;
+	unsigned long flags;
+
+	/* No reporter means the work item may not be initialised yet */
+	reporter = internal_err ? dev->internal_err_reporter :
+				  dev->fifo_reporter;
+	if (!reporter)
+		return;
+
+	spin_lock_irqsave(&dev->health_lock, flags);
+	if (internal_err) {
+		dev->pending_err_ctx = *ctx;
+		dev->err_report_pending = true;
+	} else {
+		dev->pending_fifo_ctx = *ctx;
+		dev->fifo_report_pending = true;
+	}
+	spin_unlock_irqrestore(&dev->health_lock, flags);
+
+	schedule_work(&dev->health_work);
+}
+
+/* Compare the freshly updated statistics with the previous snapshot and
+ * raise a "fifo" health report when the deltas look like an RX stall.
+ * Takes stats.access_lock, so it must be called from sleepable context.
+ */
+static void lan78xx_check_stat_anomalies(struct lan78xx_net *dev)
+{
+	u64 delta_pause, delta_rx, delta_hw_drop, delta_sw_rx;
+	struct lan78xx_dump_ctx ctx = {0};
+	struct lan78xx_stat_snapshot now;
+	const char *anomaly_msg = NULL;
+
+	/* 1. Capture "Now" (Atomic-ish collection) */
+	now.time = ktime_get_real();
+
+	mutex_lock(&dev->stats.access_lock);
+	now.tx_pause_total = dev->stats.curr_stat.tx_pause_frames;
+	now.rx_total_frames = dev->stats.curr_stat.rx_unicast_frames +
+			      dev->stats.curr_stat.rx_broadcast_frames +
+			      dev->stats.curr_stat.rx_multicast_frames;
+	now.rx_hw_drop_total = dev->stats.curr_stat.rx_dropped_frames;
+	now.tx_unicast_total = dev->stats.curr_stat.tx_unicast_frames;
+	mutex_unlock(&dev->stats.access_lock);
+
+	now.rx_sw_packets_total = dev->net->stats.rx_packets;
+
+	/* The first sample only establishes the baseline: deltas against a
+	 * snapshot that was never filled (time == 0) would be lifetime totals.
+	 */
+	if (!dev->snapshot.time) {
+		now.last_delta_pause = 0;
+		now.last_delta_drops = 0;
+		dev->snapshot = now;
+		return;
+	}
+
+	/* 2. Deltas against the previous snapshot */
+	delta_pause = now.tx_pause_total - dev->snapshot.tx_pause_total;
+	delta_rx = now.rx_total_frames - dev->snapshot.rx_total_frames;
+	delta_hw_drop = now.rx_hw_drop_total - dev->snapshot.rx_hw_drop_total;
+	delta_sw_rx = now.rx_sw_packets_total - dev->snapshot.rx_sw_packets_total;
+
+	/* Saturate instead of truncating so a huge delta cannot read as 0 */
+	now.last_delta_pause = (u32)min_t(u64, delta_pause, U32_MAX);
+	now.last_delta_drops = (u32)min_t(u64, delta_hw_drop, U32_MAX);
+
+	dev->snapshot = now;
+
+	/* 3. Classify */
+	if (delta_pause > LAN78XX_STALL_PAUSE_THRESH && delta_rx == 0) {
+		anomaly_msg = "Stall: Pause Storm & No RX";
+	} else if (delta_hw_drop > LAN78XX_LIVELOCK_DROP_THRESH &&
+		   delta_hw_drop > (delta_sw_rx * LAN78XX_LIVELOCK_DROP_RATIO)) {
+		anomaly_msg = "Stall: RX Livelock Detected (Excessive Drop Ratio)";
+	}
+
+	if (!anomaly_msg)
+		return;
+
+	/* 4. Reporting */
+	ctx.msg = anomaly_msg;
+	ctx.ts = now.time;
+	ctx.fifo.delta_pause = delta_pause;
+	ctx.fifo.delta_rx = delta_rx;
+	ctx.fifo.delta_hw_drop = delta_hw_drop;
+	ctx.fifo.delta_sw_rx = delta_sw_rx;
+
+	netdev_warn(dev->net, "%s (HW Drops: +%llu, SW RX: +%llu)\n",
+		    ctx.msg, delta_hw_drop, delta_sw_rx);
+
+	lan78xx_health_report(dev->fifo_reporter, &ctx);
+}
+
 static void lan78xx_update_stats(struct lan78xx_net *dev)
 {
+	struct lan78xx_statstage lan78xx_stats;
 	u32 *p, *count, *max;
 	u64 *data;
 	int i;
-	struct lan78xx_statstage lan78xx_stats;
 
 	if (usb_autopm_get_interface(dev->intf) < 0)
 		return;
@@ -856,6 +1114,8 @@ static void lan78xx_update_stats(struct lan78xx_net *dev)
 
 	mutex_unlock(&dev->stats.access_lock);
 
+	lan78xx_check_stat_anomalies(dev);
+
 	usb_autopm_put_interface(dev->intf);
 }
 
@@ -1625,6 +1885,19 @@ static void lan78xx_status(struct lan78xx_net *dev, struct urb *urb)
 
 		if (dev->domain_data.phyirq > 0)
 			generic_handle_irq_safe(dev->domain_data.phyirq);
+	} else if (intdata & (INT_ENP_TDFO_INT | INT_ENP_TDFU_INT |
+			      INT_ENP_RDFO_INT | INT_ENP_MAC_ERR_INT)) {
+		struct lan78xx_dump_ctx ctx = {0};
+
+		ctx.msg = "HW Interrupt Error";
+		ctx.ts = ktime_get_real();
+		ctx.err.int_sts = intdata;
+
+		netdev_warn(dev->net, "HW Error detected: 0x%08x, triggering health report\n",
+			    intdata);
+
+		/* Status URB handling may be atomic: defer the report */
+		lan78xx_health_report_deferred(dev, &ctx, true);
 	} else {
 		netdev_warn(dev->net,
 			    "unexpected interrupt: 0x%08x\n", intdata);
@@ -4705,6 +4978,210 @@ static void intr_complete(struct urb *urb)
 	}
 }
 
+/* Read every register listed in @map and emit it as a name/value pair.
+ * Returns 0 on success or the error returned by the failed access.
+ */
+static int lan78xx_dump_regs(struct lan78xx_net *dev, struct devlink_fmsg *fmsg,
+			     const struct lan78xx_reg_map *map, size_t count)
+{
+	size_t i;
+	u32 val;
+	int ret;
+
+	/* A dump without trigger context (priv_ctx == NULL) is not tied to
+	 * any driver activity, so keep the device resumed while reading.
+	 */
+	ret = usb_autopm_get_interface(dev->intf);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < count; i++) {
+		ret = lan78xx_read_reg(dev, map[i].reg, &val);
+		if (ret)
+			break;
+		devlink_fmsg_u32_pair_put(fmsg, map[i].name, val);
+	}
+
+	usb_autopm_put_interface(dev->intf);
+
+	return ret;
+}
+
+/* devlink "fifo" reporter dump: trigger context (if any) + live registers */
+static int lan78xx_fifo_dump(struct devlink_health_reporter *reporter,
+			     struct devlink_fmsg *fmsg, void *priv_ctx,
+			     struct netlink_ext_ack *extack)
+{
+	struct lan78xx_net *dev = devlink_health_reporter_priv(reporter);
+	struct lan78xx_dump_ctx *ctx = priv_ctx;
+
+	/* 1. Context Snapshot:
+	 * Dump the specific counters that triggered the threshold.
+	 * Registers may have changed since the decision was made.
+	 */
+	if (ctx) {
+		devlink_fmsg_string_pair_put(fmsg, "trigger_reason", ctx->msg);
+		devlink_fmsg_u64_pair_put(fmsg, "timestamp_ns",
+					  ktime_to_ns(ctx->ts));
+
+		/* A nested object needs a key of its own inside the dump */
+		devlink_fmsg_pair_nest_start(fmsg, "trigger_deltas");
+		devlink_fmsg_obj_nest_start(fmsg);
+		devlink_fmsg_u64_pair_put(fmsg, "trigger_delta_pause",
+					  ctx->fifo.delta_pause);
+		devlink_fmsg_u64_pair_put(fmsg, "trigger_delta_rx",
+					  ctx->fifo.delta_rx);
+		devlink_fmsg_u64_pair_put(fmsg, "trigger_delta_hw_drop",
+					  ctx->fifo.delta_hw_drop);
+		devlink_fmsg_u64_pair_put(fmsg, "trigger_delta_sw_rx",
+					  ctx->fifo.delta_sw_rx);
+		devlink_fmsg_obj_nest_end(fmsg);
+		devlink_fmsg_pair_nest_end(fmsg);
+	}
+
+	/* USB Speed is critical for interpreting throughput/stall issues */
+	devlink_fmsg_u8_pair_put(fmsg, "usb_speed_enum", dev->udev->speed);
+
+	/* 2. Live Register Dump */
+	return lan78xx_dump_regs(dev, fmsg, lan78xx_fifo_regs,
+				 ARRAY_SIZE(lan78xx_fifo_regs));
+}
+
+/* devlink "internal_err" reporter dump: trigger context + live registers */
+static int lan78xx_internal_err_dump(struct devlink_health_reporter *reporter,
+				     struct devlink_fmsg *fmsg, void *priv_ctx,
+				     struct netlink_ext_ack *extack)
+{
+	struct lan78xx_net *dev = devlink_health_reporter_priv(reporter);
+	struct lan78xx_dump_ctx *ctx = priv_ctx;
+
+	/* Interrupt status is "write-1-to-clear" or cleared on read.
+	 * We must dump the value seen when the error was detected, not the
+	 * current register value.
+	 */
+	if (ctx) {
+		devlink_fmsg_string_pair_put(fmsg, "trigger_reason", ctx->msg);
+		devlink_fmsg_u64_pair_put(fmsg, "timestamp_ns",
+					  ktime_to_ns(ctx->ts));
+
+		devlink_fmsg_u32_pair_put(fmsg, "trigger_int_sts",
+					  ctx->err.int_sts);
+	}
+
+	return lan78xx_dump_regs(dev, fmsg, lan78xx_err_regs,
+				 ARRAY_SIZE(lan78xx_err_regs));
+}
+
+static const struct devlink_health_reporter_ops lan78xx_fifo_ops = {
+	.name = "fifo",
+	.dump = lan78xx_fifo_dump,
+};
+
+static const struct devlink_health_reporter_ops lan78xx_internal_err_ops = {
+	.name = "internal_err",
+	.dump = lan78xx_internal_err_dump,
+};
+
+/* Deliver the reports parked by lan78xx_health_report_deferred() */
+static void lan78xx_health_work(struct work_struct *work)
+{
+	struct lan78xx_net *dev = container_of(work, struct lan78xx_net,
+					       health_work);
+	struct lan78xx_dump_ctx err_ctx, fifo_ctx;
+	bool report_err, report_fifo;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->health_lock, flags);
+	report_err = dev->err_report_pending;
+	report_fifo = dev->fifo_report_pending;
+	err_ctx = dev->pending_err_ctx;
+	fifo_ctx = dev->pending_fifo_ctx;
+	dev->err_report_pending = false;
+	dev->fifo_report_pending = false;
+	spin_unlock_irqrestore(&dev->health_lock, flags);
+
+	if (report_err)
+		lan78xx_health_report(dev->internal_err_reporter, &err_ctx);
+	if (report_fifo)
+		lan78xx_health_report(dev->fifo_reporter, &fifo_ctx);
+}
+
+/* Create both health reporters. On failure neither reporter pointer is
+ * set, which turns every later report into a no-op.
+ */
+static int lan78xx_health_init(struct lan78xx_net *dev)
+{
+	struct devlink_health_reporter *fifo, *err;
+
+	fifo = devlink_health_reporter_create(dev->devlink, &lan78xx_fifo_ops,
+					      dev);
+	if (IS_ERR(fifo)) {
+		netdev_warn(dev->net, "Failed to create fifo reporter\n");
+
+		return PTR_ERR(fifo);
+	}
+
+	err = devlink_health_reporter_create(dev->devlink,
+					     &lan78xx_internal_err_ops, dev);
+	if (IS_ERR(err)) {
+		netdev_warn(dev->net, "Failed to create internal_err reporter\n");
+		devlink_health_reporter_destroy(fifo);
+
+		return PTR_ERR(err);
+	}
+
+	dev->fifo_reporter = fifo;
+	dev->internal_err_reporter = err;
+
+	return 0;
+}
+
+/* Destroy the health reporters; safe if lan78xx_health_init() failed or
+ * never ran. Requires health_work to have been initialised.
+ */
+static void lan78xx_health_cleanup(struct lan78xx_net *dev)
+{
+	struct devlink_health_reporter *fifo = dev->fifo_reporter;
+	struct devlink_health_reporter *err = dev->internal_err_reporter;
+
+	/* Stop new reports first, then wait for a queued deferred one */
+	dev->fifo_reporter = NULL;
+	dev->internal_err_reporter = NULL;
+	cancel_work_sync(&dev->health_work);
+
+	if (fifo)
+		devlink_health_reporter_destroy(fifo);
+	if (err)
+		devlink_health_reporter_destroy(err);
+}
+
+/* devlink info: expose chip ID and revision as fixed versions */
+static int lan78xx_devlink_info_get(struct devlink *devlink,
+				    struct devlink_info_req *req,
+				    struct netlink_ext_ack *extack)
+{
+	struct lan78xx_devlink_priv *dl_priv = devlink_priv(devlink);
+	struct lan78xx_net *dev = dl_priv->dev;
+	char buf[16];
+	int err;
+
+	snprintf(buf, sizeof(buf), "0x%04X", dev->chipid);
+	err = devlink_info_version_fixed_put(req,
+					     DEVLINK_INFO_VERSION_GENERIC_ASIC_ID,
+					     buf);
+	if (err)
+		return err;
+
+	snprintf(buf, sizeof(buf), "0x%04X", dev->chiprev);
+	return devlink_info_version_fixed_put(req,
+					      DEVLINK_INFO_VERSION_GENERIC_ASIC_REV,
+					      buf);
+}
+
+static const struct devlink_ops lan78xx_devlink_ops = {
+	.info_get = lan78xx_devlink_info_get,
+};
+
 static void lan78xx_disconnect(struct usb_interface *intf)
 {
 	struct lan78xx_net *dev;
@@ -4719,6 +5196,18 @@ static void lan78xx_disconnect(struct usb_interface *intf)
 	udev = interface_to_usbdev(intf);
 	net = dev->net;
 
+	/* NOTE(review): the reporters go away before the netdev is
+	 * unregistered; confirm nothing (stats update, status URB) can
+	 * still raise a report at this point, or move this block after
+	 * the netdev/worker teardown.
+	 */
+	lan78xx_health_cleanup(dev);
+	if (dev->devlink) {
+		devlink_unregister(dev->devlink);
+		devlink_free(dev->devlink);
+		dev->devlink = NULL;
+	}
+
 	rtnl_lock();
 	phylink_stop(dev->phylink);
 	phylink_disconnect_phy(dev->phylink);
@@ -4749,6 +5238,31 @@ static void lan78xx_disconnect(struct usb_interface *intf)
 static void lan78xx_tx_timeout(struct net_device *net, unsigned int txqueue)
 {
 	struct lan78xx_net *dev = netdev_priv(net);
+	struct lan78xx_dump_ctx ctx = {0};
+	s64 diff_ms;
+
+	/* Calculate time since last health check */
+	ctx.ts = ktime_get_real();
+	diff_ms = ktime_ms_delta(ctx.ts, dev->snapshot.time);
+
+	/* We rely on the trend data captured during the last valid stat update
+	 * to infer the system state before the crash.
+	 */
+	if (dev->snapshot.last_delta_pause > LAN78XX_STALL_PAUSE_THRESH)
+		ctx.msg = "TX Timeout (Flow Control Storm?)";
+	else if (dev->snapshot.last_delta_drops > LAN78XX_TX_TIMEOUT_DROP_THRESH)
+		ctx.msg = "TX Timeout (FIFO Drop Storm?)";
+	else
+		ctx.msg = "TX Timeout";
+
+	ctx.fifo.delta_pause = dev->snapshot.last_delta_pause;
+	ctx.fifo.delta_hw_drop = dev->snapshot.last_delta_drops;
+
+	netdev_warn(dev->net, "%s (Last stat update: %lld ms ago)\n",
+		    ctx.msg, diff_ms);
+
+	/* The TX watchdog runs in atomic context: defer the report */
+	lan78xx_health_report_deferred(dev, &ctx, false);
 
 	unlink_urbs(dev, &dev->txq);
 	napi_schedule(&dev->napi);
@@ -5157,6 +5671,25 @@ static int lan78xx_probe(struct usb_interface *intf,
 	pm_runtime_set_autosuspend_delay(&udev->dev,
 					 DEFAULT_AUTOSUSPEND_DELAY);
 
+	spin_lock_init(&dev->health_lock);
+	INIT_WORK(&dev->health_work, lan78xx_health_work);
+
+	/* Diagnostics are best effort: the interface stays usable without
+	 * devlink, and every report is a no-op while no reporter exists.
+	 */
+	dev->devlink = devlink_alloc(&lan78xx_devlink_ops,
+				     sizeof(struct lan78xx_devlink_priv),
+				     &udev->dev);
+	if (dev->devlink) {
+		struct lan78xx_devlink_priv *dl_priv = devlink_priv(dev->devlink);
+
+		dl_priv->dev = dev;
+		devlink_register(dev->devlink);
+		lan78xx_health_init(dev);
+	} else {
+		netdev_warn(dev->net, "devlink unavailable, health reporting disabled\n");
+	}
+
 	return 0;
 
 phy_uninit:
-- 
2.47.3