mlx5e_xdp_xmit() selects an XDP SQ (send queue) using smp_processor_id()
(the CPU ID). When doing XDP_REDIRECT from a CPU whose ID is >=
priv->channels.num, mlx5e_xdp_xmit() returns -ENXIO and the redirect
fails.

Previous discussion proposed using a modulo in mlx5e_xdp_xmit() to map
CPU IDs into the channel range, but a modulo/division is too costly in
the hot path. Instead, precompute the per-CPU xdpsq assignments when
channels are (re)configured, so that mlx5e_xdp_xmit() only performs a
single per-CPU lookup. Because multiple CPUs map to the same xdpsq when
the CPU count exceeds the channel count, serialize xdp_xmit on each
ring with a new xdp_tx_lock.

Fixes: 58b99ee3e3eb ("net/mlx5e: Add support for XDP_REDIRECT in device-out side")
Link: https://lore.kernel.org/netdev/20251031231038.1092673-1-zijianzhang@bytedance.com/
Link: https://lore.kernel.org/netdev/44f69955-b566-4fb1-904d-f551046ff2d4@gmail.com
Cc: stable@vger.kernel.org # 6.12+
Signed-off-by: Finn Dayton
---
Testing:
- XDP forwarding / XDP_REDIRECT verified with CPU IDs both below and
  above the number of send queues.
- No -ENXIO observed; forwarding succeeds.
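For illustration only (not part of the patch; NR_CPUS_DEMO and
NUM_CHANNELS_DEMO are made-up values), a minimal userspace sketch of why
precomputing the modulo moves the division out of the transmit path and
why a lock is needed once CPUs outnumber channels:

	#include <stdio.h>

	#define NR_CPUS_DEMO      8	/* hypothetical CPU count */
	#define NUM_CHANNELS_DEMO 3	/* hypothetical channel count */

	int main(void)
	{
		int send_queue_idx[NR_CPUS_DEMO];
		int cpu;

		/* Configuration time: pay for the modulo once per CPU. */
		for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
			send_queue_idx[cpu] = cpu % NUM_CHANNELS_DEMO;

		/* Hot-path equivalent: a single table lookup per xmit. */
		for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
			printf("cpu %d -> xdpsq %d\n", cpu, send_queue_idx[cpu]);
		return 0;
	}

With 8 CPUs and 3 channels, CPUs 0, 3 and 6 all map to xdpsq 0, which is
why concurrent mlx5e_xdp_xmit() callers must be serialized by
xdp_tx_lock.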
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  4 +++
 .../net/ethernet/mellanox/mlx5/core/en/xdp.c  | 16 +++++++----
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 28 +++++++++++++++++++
 3 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index ea2cd1f5d1d0..387954201640 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -519,6 +519,8 @@ struct mlx5e_xdpsq {
 	/* control path */
 	struct mlx5_wq_ctrl       wq_ctrl;
 	struct mlx5e_channel      *channel;
+	/* serialize writes by multiple CPUs to this send queue */
+	spinlock_t                xdp_tx_lock;
 } ____cacheline_aligned_in_smp;
 
 struct mlx5e_xdp_buff {
@@ -909,6 +911,8 @@ struct mlx5e_priv {
 	struct mlx5e_rq            drop_rq;
 
 	struct mlx5e_channels      channels;
+	/* selects the xdpsq during mlx5e_xdp_xmit() */
+	int __percpu              *send_queue_idx_ptr;
 	struct mlx5e_rx_res       *rx_res;
 	u32                       *tx_rates;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 80f9fc10877a..2dd44ad873a1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -845,7 +845,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	struct mlx5e_xdpsq *sq;
 	int nxmit = 0;
-	int sq_num;
+	int send_queue_idx = 0;
 	int i;
 
 	/* this flag is sufficient, no need to test internal sq state */
@@ -855,13 +855,19 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
-	sq_num = smp_processor_id();
-
-	if (unlikely(sq_num >= priv->channels.num))
+	if (unlikely(!priv->send_queue_idx_ptr))
 		return -ENXIO;
 
-	sq = priv->channels.c[sq_num]->xdpsq;
+	send_queue_idx = *this_cpu_ptr(priv->send_queue_idx_ptr);
+	if (unlikely(send_queue_idx >= priv->channels.num || send_queue_idx < 0))
+		return -ENXIO;
 
+	sq = priv->channels.c[send_queue_idx]->xdpsq;
+	/* The number of queues on a netdev may be smaller than the CPU
+	 * pool, so multiple CPUs may map to this queue; serialize writes.
+	 */
+	spin_lock(&sq->xdp_tx_lock);
 	for (i = 0; i < n; i++) {
 		struct mlx5e_xmit_data_frags xdptxdf = {};
 		struct xdp_frame *xdpf = frames[i];
@@ -941,7 +947,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 
 	if (flags & XDP_XMIT_FLUSH)
 		mlx5e_xmit_xdp_doorbell(sq);
-
+	spin_unlock(&sq->xdp_tx_lock);
 	return nxmit;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 7eb691c2a1bd..adef35d06b89 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1492,6 +1492,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
 	sq->pdev      = c->pdev;
 	sq->mkey_be   = c->mkey_be;
 	sq->channel   = c;
+	spin_lock_init(&sq->xdp_tx_lock);
 	sq->uar_map   = c->bfreg->map;
 	sq->min_inline_mode = params->tx_min_inline_mode;
 	sq->hw_mtu    = MLX5E_SW2HW_MTU(params, params->sw_mtu) - ETH_FCS_LEN;
@@ -3283,10 +3284,30 @@ static void mlx5e_build_txq_maps(struct mlx5e_priv *priv)
 	smp_wmb();
 }
 
+static void build_priv_to_xdpsq_associations(struct mlx5e_priv *priv)
+{
+	/*
+	 * Build the mapping from CPU to XDP send queue index for priv.
+	 * mlx5e_xdp_xmit() uses it to determine which xdpsq (send
+	 * queue) should handle the XDP TX data, based on the CPU running
+	 * mlx5e_xdp_xmit() and the target priv (netdev).
+	 */
+	int send_queue_idx, cpu;
+
+	if (unlikely(priv->channels.num == 0))
+		return;
+
+	for_each_possible_cpu(cpu) {
+		send_queue_idx = cpu % priv->channels.num;
+		*per_cpu_ptr(priv->send_queue_idx_ptr, cpu) = send_queue_idx;
+	}
+}
+
 void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
 {
 	mlx5e_build_txq_maps(priv);
 	mlx5e_activate_channels(priv, &priv->channels);
+	build_priv_to_xdpsq_associations(priv);
 	mlx5e_xdp_tx_enable(priv);
 
 	/* dev_watchdog() wants all TX queues to be started when the carrier is
@@ -6263,8 +6284,14 @@ int mlx5e_priv_init(struct mlx5e_priv *priv,
 	if (!priv->fec_ranges)
 		goto err_free_channel_stats;
 
+	priv->send_queue_idx_ptr = alloc_percpu(int);
+	if (!priv->send_queue_idx_ptr)
+		goto err_free_fec_ranges;
+
 	return 0;
 
+err_free_fec_ranges:
+	kfree(priv->fec_ranges);
 err_free_channel_stats:
 	kfree(priv->channel_stats);
 err_free_tx_rates:
@@ -6295,6 +6322,7 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv)
 	for (i = 0; i < priv->stats_nch; i++)
 		kvfree(priv->channel_stats[i]);
 	kfree(priv->channel_stats);
+	free_percpu(priv->send_queue_idx_ptr);
 	kfree(priv->tx_rates);
 	kfree(priv->txq2sq_stats);
 	kfree(priv->txq2sq);
-- 
2.43.0