From: Eddie Phillips When the system is under extreme memory pressure, page allocations can fail during the Rx buffer refill loop. If the number of buffers posted to hardware falls below a critical low threshold and the refill loop exits due to allocation failures, the queue can stall: 1. The device drops incoming packets because there are no descriptors. 2. Since no packets are processed, no Rx completions are generated. 3. Because no completions occur, NAPI is never scheduled, preventing the refill loop from running again even after memory is freed. This results in a permanent queue stall. Resolve this by introducing a starvation recovery timer for each Rx queue. If the number of buffers posted to hardware falls below a critical low threshold, start a timer to periodically reschedule NAPI. Once NAPI runs and successfully refills the queue above the threshold, the timer is not rescheduled. Also add a new ethtool statistic "rx_critical_low_bufs" to track the number of times the starvation recovery timer is triggered. Cc: stable@vger.kernel.org Fixes: 9b8dd5e5ea48 ("gve: DQO: Add RX path") Reviewed-by: Jordan Rhee Signed-off-by: Eddie Phillips Signed-off-by: Harshitha Ramamurthy --- drivers/net/ethernet/google/gve/gve.h | 4 ++++ drivers/net/ethernet/google/gve/gve_ethtool.c | 14 +++++++++++++- drivers/net/ethernet/google/gve/gve_rx_dqo.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 2f7bd330..8378bef2 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ /* Interval to schedule a stats report update, 20000ms. */ #define GVE_STATS_REPORT_TIMER_PERIOD 20000 +#define GVE_RX_NAPI_RESCHED_MS 20 /* msecs */ /* Numbers of NIC tx/rx stats in stats report. */ #define NIC_TX_STATS_REPORT_NUM 0 @@ -318,6 +320,7 @@ struct gve_rx_ring { u64 rx_copied_pkt; /* free-running total number of copied packets */ u64 rx_skb_alloc_fail; /* free-running count of skb alloc fails */ u64 rx_buf_alloc_fail; /* free-running count of buffer alloc fails */ + u64 rx_critical_low_bufs; /* count of critical low buffer events */ u64 rx_desc_err_dropped_pkt; /* free-running count of packets dropped by descriptor error */ /* free-running count of unsplit packets due to header buffer overflow or hdr_len is 0 */ u64 rx_hsplit_unsplit_pkt; @@ -334,6 +337,7 @@ struct gve_rx_ring { struct gve_queue_resources *q_resources; /* head and tail pointer idx */ dma_addr_t q_resources_bus; /* dma address for the queue resources */ struct u64_stats_sync statss; /* sync stats for 32bit archs */ + struct timer_list starvation_timer; /* for queue starvation recovery */ struct gve_rx_ctx ctx; /* Info for packet currently being processed in this ring. */ diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index a0e0472b..71b6efbf 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -46,6 +46,7 @@ static const char gve_gstrings_main_stats[][ETH_GSTRING_LEN] = { "rx_hsplit_unsplit_pkt", "interface_up_cnt", "interface_down_cnt", "reset_cnt", "page_alloc_fail", "dma_mapping_error", "stats_report_trigger_cnt", + "rx_critical_low_bufs", }; static const char gve_gstrings_rx_stats[][ETH_GSTRING_LEN] = { @@ -58,6 +59,7 @@ static const char gve_gstrings_rx_stats[][ETH_GSTRING_LEN] = { "rx_xdp_aborted[%u]", "rx_xdp_drop[%u]", "rx_xdp_pass[%u]", "rx_xdp_tx[%u]", "rx_xdp_redirect[%u]", "rx_xdp_tx_errors[%u]", "rx_xdp_redirect_errors[%u]", "rx_xdp_alloc_fails[%u]", + "rx_critical_low_bufs[%u]", }; static const char gve_gstrings_tx_stats[][ETH_GSTRING_LEN] = { @@ -151,12 +153,14 @@ gve_get_ethtool_stats(struct net_device *netdev, { u64 tmp_rx_pkts, tmp_rx_hsplit_pkt, tmp_rx_bytes, tmp_rx_hsplit_bytes, tmp_rx_skb_alloc_fail, tmp_rx_buf_alloc_fail, + tmp_rx_critical_low_bufs, tmp_rx_desc_err_dropped_pkt, tmp_rx_hsplit_unsplit_pkt, tmp_tx_pkts, tmp_tx_bytes, tmp_xdp_tx_errors, tmp_xdp_redirect_errors; u64 rx_buf_alloc_fail, rx_desc_err_dropped_pkt, rx_hsplit_unsplit_pkt, rx_pkts, rx_hsplit_pkt, rx_skb_alloc_fail, rx_bytes, tx_pkts, tx_bytes, - tx_dropped, xdp_tx_errors, xdp_redirect_errors; + rx_critical_low_bufs, tx_dropped, xdp_tx_errors, + xdp_redirect_errors; int rx_base_stats_idx, max_rx_stats_idx, max_tx_stats_idx; int stats_idx, stats_region_len, nic_stats_len; struct stats *report_stats; @@ -197,6 +201,7 @@ gve_get_ethtool_stats(struct net_device *netdev, for (rx_pkts = 0, rx_bytes = 0, rx_hsplit_pkt = 0, rx_skb_alloc_fail = 0, rx_buf_alloc_fail = 0, + rx_critical_low_bufs = 0, rx_desc_err_dropped_pkt = 0, rx_hsplit_unsplit_pkt = 0, xdp_tx_errors = 0, xdp_redirect_errors = 0, ring = 0; @@ -212,6 +217,8 @@ gve_get_ethtool_stats(struct net_device *netdev, tmp_rx_bytes = rx->rbytes; tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail; tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail; + tmp_rx_critical_low_bufs = + rx->rx_critical_low_bufs; tmp_rx_desc_err_dropped_pkt = rx->rx_desc_err_dropped_pkt; tmp_rx_hsplit_unsplit_pkt = @@ -226,6 +233,7 @@ gve_get_ethtool_stats(struct net_device *netdev, rx_bytes += tmp_rx_bytes; rx_skb_alloc_fail += tmp_rx_skb_alloc_fail; rx_buf_alloc_fail += tmp_rx_buf_alloc_fail; + rx_critical_low_bufs += tmp_rx_critical_low_bufs; rx_desc_err_dropped_pkt += tmp_rx_desc_err_dropped_pkt; rx_hsplit_unsplit_pkt += tmp_rx_hsplit_unsplit_pkt; xdp_tx_errors += tmp_xdp_tx_errors; @@ -269,6 +277,7 @@ gve_get_ethtool_stats(struct net_device *netdev, data[i++] = priv->page_alloc_fail; data[i++] = priv->dma_mapping_error; data[i++] = priv->stats_report_trigger_cnt; + data[i++] = rx_critical_low_bufs; i = GVE_MAIN_STATS_LEN; rx_base_stats_idx = 0; @@ -337,6 +346,8 @@ gve_get_ethtool_stats(struct net_device *netdev, tmp_rx_hsplit_bytes = rx->rx_hsplit_bytes; tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail; tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail; + tmp_rx_critical_low_bufs = + rx->rx_critical_low_bufs; tmp_rx_desc_err_dropped_pkt = rx->rx_desc_err_dropped_pkt; tmp_xdp_tx_errors = rx->xdp_tx_errors; @@ -381,6 +392,7 @@ gve_get_ethtool_stats(struct net_device *netdev, } while (u64_stats_fetch_retry(&priv->rx[ring].statss, start)); i += GVE_XDP_ACTIONS + 3; /* XDP rx counters */ + data[i++] = tmp_rx_critical_low_bufs; } } else { i += priv->rx_cfg.num_queues * NUM_GVE_RX_CNTS; diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index 02cba280..303db4fa 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -18,6 +18,16 @@ #include #include +static void gve_rx_starvation_timer(struct timer_list *t) +{ + struct gve_rx_ring *rx = timer_container_of(rx, t, starvation_timer); + struct gve_priv *priv = rx->gve; + struct gve_notify_block *block; + + block = &priv->ntfy_blocks[rx->ntfy_id]; + napi_schedule(&block->napi); +} + static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx) { struct device *hdev = &priv->pdev->dev; @@ -120,6 +130,7 @@ void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx) if (rx->dqo.page_pool) page_pool_disable_direct_recycling(rx->dqo.page_pool); + timer_delete_sync(&rx->starvation_timer); gve_remove_napi(priv, ntfy_idx); gve_rx_remove_from_block(priv, idx); gve_rx_reset_ring_dqo(priv, idx); @@ -136,6 +147,8 @@ void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, u32 qpl_id; int i; + timer_shutdown_sync(&rx->starvation_timer); + completion_queue_slots = rx->dqo.complq.mask + 1; buffer_queue_slots = rx->dqo.bufq.mask + 1; @@ -232,6 +245,7 @@ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, rx->gve = priv; rx->q_num = idx; rx->packet_buffer_size = cfg->packet_buffer_size; + timer_setup(&rx->starvation_timer, gve_rx_starvation_timer, 0); if (cfg->xdp) { rx->packet_buffer_truesize = GVE_XDP_RX_BUFFER_SIZE_DQO; @@ -365,6 +379,7 @@ void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx) struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq; struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq; struct gve_priv *priv = rx->gve; + u32 num_bufs_avail_to_hw; u32 num_avail_slots; u32 num_full_slots; u32 num_posted = 0; @@ -400,6 +415,23 @@ void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx) } rx->fill_cnt += num_posted; + + /* If the queue has fewer than GVE_RX_BUF_THRESH_DQO descriptors + * visible to the hardware, and no doorbell was written, the hardware + * is in danger of starving and cannot trigger interrupts. Start the + * timer to periodically reschedule NAPI and recover from starvation. + */ + num_bufs_avail_to_hw = + ((bufq->tail & ~(GVE_RX_BUF_THRESH_DQO - 1)) - + bufq->head) & bufq->mask; + + if (num_bufs_avail_to_hw < GVE_RX_BUF_THRESH_DQO) { + u64_stats_update_begin(&rx->statss); + rx->rx_critical_low_bufs++; + u64_stats_update_end(&rx->statss); + mod_timer(&rx->starvation_timer, + jiffies + msecs_to_jiffies(GVE_RX_NAPI_RESCHED_MS)); + } } static void gve_rx_skb_csum(struct sk_buff *skb, -- 2.55.0.rc2.803.g1fd1e6609c-goog