mlx5e_xdp_xmit() currently selects the xdpsq (send queue) using smp_processor_id() (i.e. cpu id). When doing XDP_REDIRECT from a cpu with id >= priv->channels.num, however, mlx5e_xdp_xmit() returns -ENXIO and the redirect fails. Previous discussion proposed using modulo or while loop subtraction in mlx5e_xdp_xmit() to map cpu id to send queue, but these approaches introduce hot path overhead on modern systems where the number of logical cores >> the number of XDP send queues (xdpsq). The below approach precomputes per-cpu priv->xdpsq assignments when channels are (re)configured and does a constant-time lookup in mlx5e_xdp_xmit(). Because multiple CPUs may now map to the same xdpsq (whenever cpu count exceeds channel count), we serialize writes in xdp_xmit with a tx_lock. Link: https://lore.kernel.org/all/610D8F9E-0038-46D9-AD8A-1D596236B1EF@spacex.com/ Link: https://lore.kernel.org/all/474c1f71-3a5c-4fe5-a01e-80f2ba95fd7e@bytedance.com/ Signed-off-by: Finn Dayton --- v2: - Removed unnecessary guards - Improved variable naming and placement - Change mapping from cpu -> index to cpu -> xdpsq - Call smp_wmb() after updates to mapping drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 +++ .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 17 ++++++------- .../net/ethernet/mellanox/mlx5/core/en_main.c | 25 +++++++++++++++++++ 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index ea2cd1f5d1d0..713dc7f9bae3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -493,6 +493,8 @@ struct mlx5e_xdpsq { u16 pc; struct mlx5_wqe_ctrl_seg *doorbell_cseg; struct mlx5e_tx_mpwqe mpwqe; + /* serialize writes by multiple CPUs to this send queue */ + spinlock_t tx_lock; struct mlx5e_cq cq; @@ -898,6 +900,8 @@ struct mlx5e_priv { struct mlx5e_selq selq; struct mlx5e_txqsq **txq2sq; struct mlx5e_sq_stats **txq2sq_stats; + /* selects the xdpsq during mlx5e_xdp_xmit() */ + struct mlx5e_xdpsq * __percpu *send_queue_ptr; #ifdef CONFIG_MLX5_CORE_EN_DCB struct mlx5e_dcbx_dp dcbx_dp; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 80f9fc10877a..1db83a69055c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -845,7 +845,6 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5e_xdpsq *sq; int nxmit = 0; - int sq_num; int i; /* this flag is sufficient, no need to test internal sq state */ @@ -854,14 +853,12 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; - - sq_num = smp_processor_id(); - - if (unlikely(sq_num >= priv->channels.num)) - return -ENXIO; - - sq = priv->channels.c[sq_num]->xdpsq; - + /* Per-CPU xdpsq mapping, rebuilt on channel (re)configuration while XDP TX is disabled */ + sq = *this_cpu_ptr(priv->send_queue_ptr); + /* The number of queues configured on a netdev may be smaller than the + * CPU pool, so two CPUs might map to this queue. We must serialize writes. + */ + spin_lock(&sq->tx_lock); for (i = 0; i < n; i++) { struct mlx5e_xmit_data_frags xdptxdf = {}; struct xdp_frame *xdpf = frames[i]; @@ -941,7 +938,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, if (flags & XDP_XMIT_FLUSH) mlx5e_xmit_xdp_doorbell(sq); - + spin_unlock(&sq->tx_lock); return nxmit; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b6c12460b54a..434db74f096b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1505,6 +1505,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->stop_room = param->is_mpw ? mlx5e_stop_room_for_mpwqe(mdev) : mlx5e_stop_room_for_max_wqe(mdev); sq->max_sq_mpw_wqebbs = mlx5e_get_max_sq_aligned_wqebbs(mdev); + spin_lock_init(&sq->tx_lock); param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, wq, &sq->wq_ctrl); @@ -3283,10 +3284,27 @@ static void mlx5e_build_txq_maps(struct mlx5e_priv *priv) smp_wmb(); } +static void mlx5e_build_xdpsq_maps(struct mlx5e_priv *priv) +{ + /* Build mapping from CPU id to XDP send queue, used by + * mlx5e_xdp_xmit() to determine which send queue to transmit packet on. + */ + int cpu; + + for_each_possible_cpu(cpu) { + int send_queue_idx = cpu % priv->channels.num; + struct mlx5e_xdpsq *sq = priv->channels.c[send_queue_idx]->xdpsq; + *per_cpu_ptr(priv->send_queue_ptr, cpu) = sq; + } + /* Publish the new CPU->xdpsq map before re-enabling XDP TX */ + smp_wmb(); +} + void mlx5e_activate_priv_channels(struct mlx5e_priv *priv) { mlx5e_build_txq_maps(priv); mlx5e_activate_channels(priv, &priv->channels); + mlx5e_build_xdpsq_maps(priv); mlx5e_xdp_tx_enable(priv); /* dev_watchdog() wants all TX queues to be started when the carrier is @@ -6262,8 +6280,14 @@ int mlx5e_priv_init(struct mlx5e_priv *priv, if (!priv->fec_ranges) goto err_free_channel_stats; + priv->send_queue_ptr = alloc_percpu(struct mlx5e_xdpsq *); + if (!priv->send_queue_ptr) + goto err_free_fec_ranges; + return 0; +err_free_fec_ranges: + kfree(priv->fec_ranges); err_free_channel_stats: kfree(priv->channel_stats); err_free_tx_rates: @@ -6290,6 +6314,7 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv) if (!priv->mdev) return; + free_percpu(priv->send_queue_ptr); kfree(priv->fec_ranges); for (i = 0; i < priv->stats_nch; i++) kvfree(priv->channel_stats[i]); -- 2.43.0