From: Shay Drory Refactor shared FDB LAG logic into a new lag/shared_fdb.c file to improve code organization and enable reuse. Move shared FDB specific functions from lag.c and introduce consolidated APIs: - mlx5_lag_shared_fdb_create() handles LAG activation with shared FDB - mlx5_lag_shared_fdb_destroy() handles LAG deactivation with shared FDB Update mlx5_do_bond(), mlx5_disable_lag() and mpesw.c to use the new APIs, which simplifies the shared FDB code paths. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan --- .../net/ethernet/mellanox/mlx5/core/Makefile | 2 +- .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 156 ++++-------------- .../net/ethernet/mellanox/mlx5/core/lag/lag.h | 26 +++ .../ethernet/mellanox/mlx5/core/lag/mpesw.c | 25 +-- .../mellanox/mlx5/core/lag/shared_fdb.c | 143 ++++++++++++++++ 5 files changed, 210 insertions(+), 142 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index d39fe9c4a87c..19e50f0d55af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -41,7 +41,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o mlx5_core-$(CONFIG_MLX5_ESWITCH) += lag/mp.o lag/port_sel.o lib/geneve.o lib/port_tun.o \ en_rep.o en/rep/bond.o en/mod_hdr.o \ - en/mapping.o lag/mpesw.o + en/mapping.o lag/mpesw.o lag/shared_fdb.o mlx5_core-$(CONFIG_MLX5_CLS_ACT) += en_tc.o en/rep/tc.o en/rep/neigh.o \ lib/fs_chains.o en/tc_tun.o \ esw/indir_table.o en/tc_tun_encap.o \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 22b7efea34b8..5dfdd799828f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ 
-817,43 +817,6 @@ char *mlx5_get_str_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags) } } -static int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev) -{ - int master_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); - struct mlx5_eswitch *master_esw; - struct mlx5_core_dev *dev0; - int i, j; - int err; - - if (master_idx < 0) - return -EINVAL; - - dev0 = mlx5_lag_pf(ldev, master_idx)->dev; - master_esw = dev0->priv.eswitch; - mlx5_ldev_for_each(i, 0, ldev) { - struct mlx5_eswitch *slave_esw; - - if (i == master_idx) - continue; - - slave_esw = mlx5_lag_pf(ldev, i)->dev->priv.eswitch; - - err = mlx5_eswitch_offloads_single_fdb_add_one(master_esw, - slave_esw, ldev->ports); - if (err) - goto err; - } - return 0; -err: - mlx5_ldev_for_each_reverse(j, i, 0, ldev) { - if (j == master_idx) - continue; - mlx5_eswitch_offloads_single_fdb_del_one(master_esw, - mlx5_lag_pf(ldev, j)->dev->priv.eswitch); - } - return err; -} - static int mlx5_create_lag(struct mlx5_lag *ldev, struct lag_tracker *tracker, enum mlx5_lag_mode mode, @@ -1218,12 +1181,15 @@ void mlx5_disable_lag(struct mlx5_lag *ldev) if (idx < 0) return; + if (shared_fdb) { + mlx5_lag_shared_fdb_destroy(ldev); + return; + } + dev0 = mlx5_lag_pf(ldev, idx)->dev; roce_lag = __mlx5_lag_is_roce(ldev); - if (shared_fdb) { - mlx5_lag_remove_devices(ldev); - } else if (roce_lag) { + if (roce_lag) { mlx5_lag_rescan_dev_locked(ldev, dev0, false); mlx5_ldev_for_each(i, 0, ldev) { if (i == idx) @@ -1236,49 +1202,8 @@ void mlx5_disable_lag(struct mlx5_lag *ldev) if (err) return; - if (shared_fdb || roce_lag) + if (roce_lag) mlx5_lag_add_devices(ldev); - - if (shared_fdb) - mlx5_lag_reload_ib_reps_from_locked(ldev, - MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV, - true); -} - -bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev) -{ - struct mlx5_core_dev *dev; - bool ret = false; - int idx; - int i; - - idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); - if (idx < 0) - return false; - - mlx5_ldev_for_each(i, 
0, ldev) { - if (i == idx) - continue; - dev = mlx5_lag_pf(ldev, i)->dev; - if (is_mdev_switchdev_mode(dev) && - mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) && - MLX5_CAP_GEN(dev, lag_native_fdb_selection) && - MLX5_CAP_ESW(dev, root_ft_on_other_esw) && - mlx5_eswitch_get_npeers(dev->priv.eswitch) == - MLX5_CAP_GEN(dev, num_lag_ports) - 1) - continue; - return false; - } - - dev = mlx5_lag_pf(ldev, idx)->dev; - if (is_mdev_switchdev_mode(dev) && - mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) && - mlx5_esw_offloads_devcom_is_ready(dev->priv.eswitch) && - MLX5_CAP_ESW(dev, esw_shared_ingress_acl) && - mlx5_eswitch_get_npeers(dev->priv.eswitch) == MLX5_CAP_GEN(dev, num_lag_ports) - 1) - ret = true; - - return ret; } static bool mlx5_lag_is_roce_lag(struct mlx5_lag *ldev) @@ -1493,47 +1418,37 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) roce_lag = mlx5_lag_is_roce_lag(ldev); - if (shared_fdb || roce_lag) - mlx5_lag_remove_devices(ldev); - - err = mlx5_activate_lag(ldev, &tracker, - roce_lag ? MLX5_LAG_MODE_ROCE : - MLX5_LAG_MODE_SRIOV, - shared_fdb); - if (err) { - if (shared_fdb || roce_lag) - mlx5_lag_add_devices(ldev); - if (shared_fdb) - mlx5_lag_reload_ib_reps_from_locked(ldev, 0, - true); - - return; - } + if (shared_fdb) { + err = mlx5_lag_shared_fdb_create(ldev, &tracker, + MLX5_LAG_MODE_SRIOV); + if (err) + return; + } else { + if (roce_lag) + mlx5_lag_remove_devices(ldev); - if (roce_lag) { - struct mlx5_core_dev *dev; - - mlx5_lag_rescan_dev_locked(ldev, dev0, true); - mlx5_ldev_for_each(i, 0, ldev) { - if (i == idx) - continue; - dev = mlx5_lag_pf(ldev, i)->dev; - if (mlx5_get_roce_state(dev)) - mlx5_nic_vport_enable_roce(dev); - } - } else if (shared_fdb) { - mlx5_lag_rescan_dev_locked(ldev, dev0, true); - err = mlx5_lag_reload_ib_reps_from_locked(ldev, 0, - false); + err = mlx5_activate_lag(ldev, &tracker, + roce_lag ? 
MLX5_LAG_MODE_ROCE : + MLX5_LAG_MODE_SRIOV, + false); if (err) { - mlx5_lag_rescan_dev_locked(ldev, dev0, false); - mlx5_deactivate_lag(ldev); - mlx5_lag_add_devices(ldev); - mlx5_lag_reload_ib_reps_from_locked(ldev, 0, - true); - mlx5_core_err(dev0, "Failed to enable lag\n"); + if (roce_lag) + mlx5_lag_add_devices(ldev); return; } + + if (roce_lag) { + struct mlx5_core_dev *dev; + + mlx5_lag_rescan_dev_locked(ldev, dev0, true); + mlx5_ldev_for_each(i, 0, ldev) { + if (i == idx) + continue; + dev = mlx5_lag_pf(ldev, i)->dev; + if (mlx5_get_roce_state(dev)) + mlx5_nic_vport_enable_roce(dev); + } + } } if (tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { ndev = mlx5_lag_active_backup_get_netdev(dev0); @@ -1545,7 +1460,8 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) ndev); dev_put(ndev); } - mlx5_lag_set_vports_agg_speed(ldev); + if (!shared_fdb) + mlx5_lag_set_vports_agg_speed(ldev); } else if (mlx5_lag_should_modify_lag(ldev, do_bond)) { mlx5_modify_lag(ldev, &tracker); mlx5_lag_set_vports_agg_speed(ldev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 6afe7707d076..23c0457ce799 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -137,7 +137,33 @@ mlx5_lag_is_ready(struct mlx5_lag *ldev) return test_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags); } +#ifdef CONFIG_MLX5_ESWITCH +int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + enum mlx5_lag_mode mode); +void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev); +int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev); bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev); +#else +static inline int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + enum mlx5_lag_mode mode) +{ + return -EOPNOTSUPP; +} + +static inline void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev) {} + +static inline int 
mlx5_lag_create_single_fdb(struct mlx5_lag *ldev) +{ + return -EOPNOTSUPP; +} + +static inline bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev) +{ + return false; +} +#endif bool mlx5_lag_check_prereq(struct mlx5_lag *ldev); int mlx5_lag_demux_init(struct mlx5_core_dev *dev, struct mlx5_flow_table_attr *ft_attr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c index 8a349f8fd823..64e2d1dd5308 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c @@ -92,38 +92,21 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev) if (err) return err; - mlx5_lag_remove_devices(ldev); - - err = mlx5_activate_lag(ldev, NULL, MLX5_LAG_MODE_MPESW, true); + err = mlx5_lag_shared_fdb_create(ldev, NULL, MLX5_LAG_MODE_MPESW); if (err) { mlx5_core_warn(dev0, "Failed to create LAG in MPESW mode (%d)\n", err); - goto err_add_devices; + mlx5_mpesw_metadata_cleanup(ldev); + return err; } - mlx5_lag_rescan_dev_locked(ldev, dev0, true); - err = mlx5_lag_reload_ib_reps_from_locked(ldev, 0, false); - if (err) - goto err_rescan_drivers; - - mlx5_lag_set_vports_agg_speed(ldev); - return 0; - -err_rescan_drivers: - mlx5_lag_rescan_dev_locked(ldev, dev0, false); - mlx5_deactivate_lag(ldev); -err_add_devices: - mlx5_lag_add_devices(ldev); - mlx5_lag_reload_ib_reps_from_locked(ldev, 0, true); - mlx5_mpesw_metadata_cleanup(ldev); - return err; } void mlx5_lag_disable_mpesw(struct mlx5_lag *ldev) { if (ldev->mode == MLX5_LAG_MODE_MPESW) { mlx5_mpesw_metadata_cleanup(ldev); - mlx5_disable_lag(ldev); + mlx5_lag_shared_fdb_destroy(ldev); } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c new file mode 100644 index 000000000000..e5b8e9f1e6fd --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0 OR 
Linux-OpenIB +/* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include +#include +#include +#include "mlx5_core.h" +#include "lag.h" +#include "eswitch.h" + +bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *dev; + bool ret = false; + int idx; + int i; + + idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + if (idx < 0) + return false; + + mlx5_ldev_for_each(i, 0, ldev) { + if (i == idx) + continue; + dev = mlx5_lag_pf(ldev, i)->dev; + if (is_mdev_switchdev_mode(dev) && + mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) && + MLX5_CAP_GEN(dev, lag_native_fdb_selection) && + MLX5_CAP_ESW(dev, root_ft_on_other_esw) && + mlx5_eswitch_get_npeers(dev->priv.eswitch) == + MLX5_CAP_GEN(dev, num_lag_ports) - 1) + continue; + return false; + } + + dev = mlx5_lag_pf(ldev, idx)->dev; + if (is_mdev_switchdev_mode(dev) && + mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) && + mlx5_esw_offloads_devcom_is_ready(dev->priv.eswitch) && + MLX5_CAP_ESW(dev, esw_shared_ingress_acl) && + mlx5_eswitch_get_npeers(dev->priv.eswitch) == + MLX5_CAP_GEN(dev, num_lag_ports) - 1) + ret = true; + + return ret; +} + +int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev) +{ + int master_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + struct mlx5_eswitch *master_esw; + struct mlx5_core_dev *dev0; + int i, j; + int err; + + if (master_idx < 0) + return -EINVAL; + + dev0 = mlx5_lag_pf(ldev, master_idx)->dev; + master_esw = dev0->priv.eswitch; + mlx5_ldev_for_each(i, 0, ldev) { + struct mlx5_eswitch *slave_esw; + + if (i == master_idx) + continue; + + slave_esw = mlx5_lag_pf(ldev, i)->dev->priv.eswitch; + + err = mlx5_eswitch_offloads_single_fdb_add_one(master_esw, + slave_esw, + ldev->ports); + if (err) + goto err; + } + return 0; +err: + mlx5_ldev_for_each_reverse(j, i, 0, ldev) { + struct mlx5_eswitch *slave_esw; + + if (j == master_idx) + continue; + slave_esw = mlx5_lag_pf(ldev, 
j)->dev->priv.eswitch; + mlx5_eswitch_offloads_single_fdb_del_one(master_esw, slave_esw); + } + return err; +} + +int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + enum mlx5_lag_mode mode) +{ + int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + struct mlx5_core_dev *dev0; + int err; + + if (idx < 0) + return -EINVAL; + + dev0 = mlx5_lag_pf(ldev, idx)->dev; + + mlx5_lag_remove_devices(ldev); + + err = mlx5_activate_lag(ldev, tracker, mode, true); + if (err) { + mlx5_core_warn(dev0, "Failed to create LAG in shared FDB mode (%d)\n", + err); + goto err_add_devices; + } + + mlx5_lag_rescan_dev_locked(ldev, dev0, true); + err = mlx5_lag_reload_ib_reps_from_locked(ldev, 0, false); + if (err) { + mlx5_core_err(dev0, "Failed to enable lag\n"); + goto err_rescan_drivers; + } + + mlx5_lag_set_vports_agg_speed(ldev); + return 0; + +err_rescan_drivers: + mlx5_lag_rescan_dev_locked(ldev, dev0, false); + mlx5_deactivate_lag(ldev); +err_add_devices: + mlx5_lag_add_devices(ldev); + mlx5_lag_reload_ib_reps_from_locked(ldev, 0, true); + return err; +} + +void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev) +{ + int err; + + mlx5_lag_remove_devices(ldev); + + err = mlx5_deactivate_lag(ldev); + if (err) + return; + + mlx5_lag_add_devices(ldev); + mlx5_lag_reload_ib_reps_from_locked(ldev, + MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV, + true); +} -- 2.44.0 From: Shay Drory This patch aligns the eswitch disable sequence with the switchdev-to-legacy mode transition, where eswitch must be disabled before device detachment. The consistent ordering is required for proper SD LAG cleanup which depends on eswitch state during teardown. 
Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 0c6e4efe38c8..fd285aeb9630 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1369,7 +1369,6 @@ static int mlx5_load(struct mlx5_core_dev *dev) static void mlx5_unload(struct mlx5_core_dev *dev) { - mlx5_eswitch_disable(dev->priv.eswitch); mlx5_devlink_traps_unregister(priv_to_devlink(dev)); mlx5_vhca_event_stop(dev); mlx5_sf_dev_table_destroy(dev); @@ -1484,6 +1483,7 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev) mlx5_hwmon_dev_unregister(dev); mlx5_crdump_disable(dev); + mlx5_eswitch_disable(dev->priv.eswitch); mlx5_unregister_device(dev); if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { @@ -1568,6 +1568,7 @@ void mlx5_unload_one_devl_locked(struct mlx5_core_dev *dev, bool suspend) devl_assert_locked(priv_to_devlink(dev)); mutex_lock(&dev->intf_state_mutex); + mlx5_eswitch_disable(dev->priv.eswitch); mlx5_detach_device(dev, suspend); if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { -- 2.44.0 From: Shay Drory Move the E-switch devcom component management from TC layer to ESW layer. This refactoring places devcom lifecycle management at the appropriate layer and prepares for SD LAG which needs devcom registration independent of the TC/representor initialization. 
Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 20 ------------------- .../mellanox/mlx5/core/eswitch_offloads.c | 6 ++++++ 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index a9001d1c902f..3846c16c3138 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -5394,8 +5394,6 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) { const size_t sz_enc_opts = sizeof(struct tunnel_match_enc_opts); u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; - struct mlx5_devcom_match_attr attr = {}; - struct netdev_phys_item_id ppid; struct mlx5e_rep_priv *rpriv; struct mapping_ctx *mapping; struct mlx5_eswitch *esw; @@ -5456,14 +5454,6 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) goto err_action_counter; } - err = netif_get_port_parent_id(priv->netdev, &ppid, false); - if (!err) { - memcpy(&attr.key.buf, &ppid.id, ppid.id_len); - attr.flags = MLX5_DEVCOM_MATCH_FLAGS_NS; - attr.net = mlx5_core_net(esw->dev); - mlx5_esw_offloads_devcom_init(esw, &attr); - } - return 0; err_action_counter: @@ -5484,16 +5474,6 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) void mlx5e_tc_esw_cleanup(struct mlx5_rep_uplink_priv *uplink_priv) { - struct mlx5e_rep_priv *rpriv; - struct mlx5_eswitch *esw; - struct mlx5e_priv *priv; - - rpriv = container_of(uplink_priv, struct mlx5e_rep_priv, uplink_priv); - priv = netdev_priv(rpriv->netdev); - esw = priv->mdev->priv.eswitch; - - mlx5_esw_offloads_devcom_cleanup(esw); - mlx5e_tc_tun_cleanup(uplink_priv->encap); mapping_destroy(uplink_priv->tunnel_enc_opts_mapping); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 189be11c4c39..d9683d3ea0e7 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3866,6 +3866,7 @@ bool mlx5_esw_offloads_controller_valid(const struct mlx5_eswitch *esw, u32 cont int esw_offloads_enable(struct mlx5_eswitch *esw) { u8 mapping_id[MLX5_SW_IMAGE_GUID_MAX_BYTES]; + struct mlx5_devcom_match_attr attr = {}; struct mapping_ctx *reg_c0_obj_pool; struct mlx5_vport *vport; unsigned long i; @@ -3926,6 +3927,10 @@ int esw_offloads_enable(struct mlx5_eswitch *esw) if (err) goto err_vports; + memcpy(attr.key.buf, mapping_id, id_len); + attr.flags = MLX5_DEVCOM_MATCH_FLAGS_NS; + attr.net = mlx5_core_net(esw->dev); + mlx5_esw_offloads_devcom_init(esw, &attr); return 0; err_vports: @@ -3970,6 +3975,7 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw, void esw_offloads_disable(struct mlx5_eswitch *esw) { + mlx5_esw_offloads_devcom_cleanup(esw); mlx5_eswitch_disable_pf_vf_vports(esw); mlx5_esw_offloads_rep_unload(esw, MLX5_VPORT_UPLINK); esw_set_passing_vport_metadata(esw, false); -- 2.44.0 From: Shay Drory Replace mlx5_eswitch_get_npeers() count-based check with a new mlx5_eswitch_is_peer() function that directly verifies the peer relationship between two eswitches. This change prepares for SD LAG support, which is a virtual LAG that does not have num_lag_ports capability and cannot use the count-based peer validation. 
Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan --- .../net/ethernet/mellanox/mlx5/core/eswitch.h | 11 ++--------- .../mellanox/mlx5/core/eswitch_offloads.c | 12 ++++++++++++ .../mellanox/mlx5/core/lag/shared_fdb.c | 17 +++++++---------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 8a94c38f8566..94a530d19828 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -955,6 +955,8 @@ int mlx5_eswitch_offloads_single_fdb_add_one(struct mlx5_eswitch *master_esw, void mlx5_eswitch_offloads_single_fdb_del_one(struct mlx5_eswitch *master_esw, struct mlx5_eswitch *slave_esw); int mlx5_eswitch_reload_ib_reps(struct mlx5_eswitch *esw); +bool mlx5_eswitch_is_peer(struct mlx5_eswitch *esw, + struct mlx5_eswitch *peer_esw); bool mlx5_eswitch_block_encap(struct mlx5_core_dev *dev, bool from_fdb); void mlx5_eswitch_unblock_encap(struct mlx5_core_dev *dev); @@ -970,13 +972,6 @@ static inline int mlx5_eswitch_num_vfs(struct mlx5_eswitch *esw) return 0; } -static inline int mlx5_eswitch_get_npeers(struct mlx5_eswitch *esw) -{ - if (mlx5_esw_allowed(esw)) - return esw->num_peers; - return 0; -} - static inline struct mlx5_flow_table * mlx5_eswitch_get_slow_fdb(struct mlx5_eswitch *esw) { @@ -1058,8 +1053,6 @@ static inline void mlx5_eswitch_offloads_single_fdb_del_one(struct mlx5_eswitch *master_esw, struct mlx5_eswitch *slave_esw) {} -static inline int mlx5_eswitch_get_npeers(struct mlx5_eswitch *esw) { return 0; } - static inline int mlx5_eswitch_reload_ib_reps(struct mlx5_eswitch *esw) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index d9683d3ea0e7..d65f30bb2f80 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3296,6 +3296,18 @@ static int mlx5_esw_offloads_set_ns_peer(struct mlx5_eswitch *esw, return 0; } +bool mlx5_eswitch_is_peer(struct mlx5_eswitch *esw, + struct mlx5_eswitch *peer_esw) +{ + u16 peer_esw_i; + + if (!mlx5_esw_allowed(esw) || !mlx5_esw_allowed(peer_esw)) + return false; + + peer_esw_i = MLX5_CAP_GEN(peer_esw->dev, vhca_id); + return !!xa_load(&esw->paired, peer_esw_i); +} + static int mlx5_esw_offloads_devcom_event(int event, void *my_data, void *event_data) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c index e5b8e9f1e6fd..b5cbe3409720 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c @@ -10,7 +10,7 @@ bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev) { - struct mlx5_core_dev *dev; + struct mlx5_core_dev *dev0, *dev; bool ret = false; int idx; int i; @@ -19,6 +19,7 @@ bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev) if (idx < 0) return false; + dev0 = mlx5_lag_pf(ldev, idx)->dev; mlx5_ldev_for_each(i, 0, ldev) { if (i == idx) continue; @@ -27,19 +28,15 @@ bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev) mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) && MLX5_CAP_GEN(dev, lag_native_fdb_selection) && MLX5_CAP_ESW(dev, root_ft_on_other_esw) && - mlx5_eswitch_get_npeers(dev->priv.eswitch) == - MLX5_CAP_GEN(dev, num_lag_ports) - 1) + mlx5_eswitch_is_peer(dev0->priv.eswitch, dev->priv.eswitch)) continue; return false; } - dev = mlx5_lag_pf(ldev, idx)->dev; - if (is_mdev_switchdev_mode(dev) && - mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch) && - mlx5_esw_offloads_devcom_is_ready(dev->priv.eswitch) && - MLX5_CAP_ESW(dev, esw_shared_ingress_acl) && - mlx5_eswitch_get_npeers(dev->priv.eswitch) == - MLX5_CAP_GEN(dev, num_lag_ports) - 1) + if (is_mdev_switchdev_mode(dev0) && + 
mlx5_eswitch_vport_match_metadata_enabled(dev0->priv.eswitch) && + mlx5_esw_offloads_devcom_is_ready(dev0->priv.eswitch) && + MLX5_CAP_ESW(dev0, esw_shared_ingress_acl)) ret = true; return ret; -- 2.44.0 From: Shay Drory Socket Direct (SD) secondary devices will participate in LAG, even though they are silent. SD secondary devices share the same physical port as their primary but are separate PCI functions that need to be tracked alongside regular LAG ports. Extend lag_func with a group_id field to identify SD group membership and introduce a unified iterator that can filter by group. Add APIs for registering SD secondary devices in an existing LAG. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan --- .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 59 ++++++++++++++----- .../net/ethernet/mellanox/mlx5/core/lag/lag.h | 53 +++++++++++++++-- 2 files changed, 90 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 5dfdd799828f..03cb02c7000d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -242,7 +242,7 @@ static void mlx5_ldev_free(struct kref *ref) unregister_netdevice_notifier_net(net, &ldev->nb); } - mlx5_ldev_for_each(i, 0, ldev) { + mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) { pf = mlx5_lag_pf(ldev, i); if (pf->port_change_nb.nb.notifier_call) { struct mlx5_nb *nb = &pf->port_change_nb; @@ -391,7 +391,7 @@ int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev) if (pf && pf->dev == dev) return 0; - mlx5_ldev_for_each(i, 0, ldev) { + mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) { if (i == master_idx) continue; pf = mlx5_lag_pf(ldev, i); @@ -1034,7 +1034,7 @@ static void mlx5_lag_assert_locked_transition(struct mlx5_lag *ldev) lockdep_assert_held(&ldev->lock); - i = mlx5_get_next_ldev_func(ldev, 0); + i = mlx5_get_next_lag_func(ldev, 0, 
MLX5_LAG_FILTER_PORTS); if (i < MLX5_MAX_PORTS) { pf = mlx5_lag_pf(ldev, i); devcom = pf->dev->priv.hca_devcom_comp; @@ -1482,7 +1482,7 @@ struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev) int i; mutex_lock(&ldev->lock); - i = mlx5_get_next_ldev_func(ldev, 0); + i = mlx5_get_next_lag_func(ldev, 0, MLX5_LAG_FILTER_PORTS); if (i < MLX5_MAX_PORTS) { pf = mlx5_lag_pf(ldev, i); devcom = pf->dev->priv.hca_devcom_comp; @@ -1965,8 +1965,9 @@ static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev, spin_unlock_irqrestore(&lag_lock, flags); } -static int mlx5_ldev_add_mdev(struct mlx5_lag *ldev, - struct mlx5_core_dev *dev) +int mlx5_ldev_add_mdev(struct mlx5_lag *ldev, + struct mlx5_core_dev *dev, + u32 group_id) { struct lag_func *pf; u32 idx; @@ -1985,8 +1986,14 @@ static int mlx5_ldev_add_mdev(struct mlx5_lag *ldev, pf->idx = idx; pf->dev = dev; + pf->group_id = group_id; dev->priv.lag = ldev; + if (group_id) + return 0; + + xa_set_mark(&ldev->pfs, idx, MLX5_LAG_XA_MARK_PORT); + MLX5_NB_INIT(&pf->port_change_nb, mlx5_lag_mpesw_port_change_event, PORT_CHANGE); mlx5_eq_notifier_register(dev, &pf->port_change_nb); @@ -1994,13 +2001,13 @@ static int mlx5_ldev_add_mdev(struct mlx5_lag *ldev, return 0; } -static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, - struct mlx5_core_dev *dev) +void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, + struct mlx5_core_dev *dev) { struct lag_func *pf; int i; - mlx5_ldev_for_each(i, 0, ldev) { + mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) { pf = mlx5_lag_pf(ldev, i); if (pf->dev == dev) break; @@ -2035,7 +2042,7 @@ static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev) mlx5_core_err(dev, "Failed to alloc lag dev\n"); return 0; } - err = mlx5_ldev_add_mdev(ldev, dev); + err = mlx5_ldev_add_mdev(ldev, dev, 0); if (err) { mlx5_core_err(dev, "Failed to add mdev to lag dev\n"); mlx5_ldev_put(ldev); @@ -2050,7 +2057,7 @@ static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev) return -EAGAIN; } 
mlx5_ldev_get(ldev); - err = mlx5_ldev_add_mdev(ldev, dev); + err = mlx5_ldev_add_mdev(ldev, dev, 0); if (err) { mlx5_ldev_put(ldev); mutex_unlock(&ldev->lock); @@ -2187,27 +2194,47 @@ void mlx5_lag_add_netdev(struct mlx5_core_dev *dev, mlx5_queue_bond_work(ldev, 0); } -int mlx5_get_pre_ldev_func(struct mlx5_lag *ldev, int start_idx, int end_idx) +int mlx5_get_pre_lag_func(struct mlx5_lag *ldev, int start_idx, int end_idx, + u32 filter) { struct lag_func *pf; int i; for (i = start_idx; i >= end_idx; i--) { pf = xa_load(&ldev->pfs, i); - if (pf && pf->dev) + if (!pf || !pf->dev) + continue; + if (filter == MLX5_LAG_FILTER_PORTS) { + if (xa_get_mark(&ldev->pfs, i, MLX5_LAG_XA_MARK_PORT)) + return i; + } else if (filter == MLX5_LAG_FILTER_ALL || + filter == pf->group_id) { return i; + } } return -1; } -int mlx5_get_next_ldev_func(struct mlx5_lag *ldev, int start_idx) +int mlx5_get_next_lag_func(struct mlx5_lag *ldev, int start_idx, u32 filter) { struct lag_func *pf; unsigned long idx; - xa_for_each_start(&ldev->pfs, idx, pf, start_idx) - if (pf->dev) + if (filter == MLX5_LAG_FILTER_PORTS) { + xa_for_each_marked_start(&ldev->pfs, idx, pf, + MLX5_LAG_XA_MARK_PORT, start_idx) + if (pf->dev) + return idx; + return MLX5_MAX_PORTS; + } + + xa_for_each_start(&ldev->pfs, idx, pf, start_idx) { + if (!pf->dev) + continue; + if (filter == MLX5_LAG_FILTER_ALL || + filter == pf->group_id) return idx; + } return MLX5_MAX_PORTS; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 23c0457ce799..70baa7997364 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -15,6 +15,13 @@ * Note: XA_MARK_0 is reserved by XA_FLAGS_ALLOC for free-slot tracking. 
*/ #define MLX5_LAG_XA_MARK_MASTER XA_MARK_1 +/* XArray mark for port-level entries (excludes SD secondaries) */ +#define MLX5_LAG_XA_MARK_PORT XA_MARK_2 + +/* Like xa_for_each_marked but starting from a given index */ +#define xa_for_each_marked_start(xa, index, entry, filter, start) \ + for (index = start, entry = xa_find(xa, &index, ULONG_MAX, filter); \ + entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) #include "mlx5_core.h" #include "mp.h" @@ -50,6 +57,8 @@ struct lag_func { bool has_drop; unsigned int idx; /* xarray index assigned by LAG */ struct mlx5_nb port_change_nb; + u32 group_id; /* SD group ID, 0 = not SD */ + bool sd_fdb_active; /* set on all SD group members */ }; /* Used for collection of netdev event info. */ @@ -125,6 +134,20 @@ mlx5_lag_pf_by_dev_idx(struct mlx5_lag *ldev, int dev_idx) return NULL; } +/* Find lag_func by mlx5_core_dev pointer */ +static inline struct lag_func * +mlx5_lag_pf_by_dev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev) +{ + struct lag_func *pf; + unsigned long idx; + + xa_for_each(&ldev->pfs, idx, pf) { + if (pf->dev == dev) + return pf; + } + return NULL; +} + static inline bool __mlx5_lag_is_active(struct mlx5_lag *ldev) { @@ -214,20 +237,38 @@ static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev) return true; } -#define mlx5_ldev_for_each(i, start_index, ldev) \ - for (int tmp = start_index; tmp = mlx5_get_next_ldev_func(ldev, tmp), \ +/* Iterator filter constants for mlx5_lag_for_each() */ +#define MLX5_LAG_FILTER_ALL 0 /* iterate ALL devices */ +#define MLX5_LAG_FILTER_PORTS U32_MAX /* iterate ports only (XA_MARK_PORT) */ +/* any other value = iterate devices with that specific group_id */ + +#define mlx5_lag_for_each(i, start_index, ldev, filter) \ + for (int tmp = start_index; \ + tmp = mlx5_get_next_lag_func(ldev, tmp, filter), \ i = tmp, tmp < MLX5_MAX_PORTS; tmp++) -#define mlx5_ldev_for_each_reverse(i, start_index, end_index, ldev) \ +#define mlx5_lag_for_each_reverse(i, 
start_index, end_index, ldev, filter) \ for (int tmp = start_index, tmp1 = end_index; \ - tmp = mlx5_get_pre_ldev_func(ldev, tmp, tmp1), \ + tmp = mlx5_get_pre_lag_func(ldev, tmp, tmp1, filter), \ i = tmp, tmp >= tmp1; tmp--) -int mlx5_get_pre_ldev_func(struct mlx5_lag *ldev, int start_idx, int end_idx); -int mlx5_get_next_ldev_func(struct mlx5_lag *ldev, int start_idx); +/* Convenience wrappers - keeps existing behavior */ +#define mlx5_ldev_for_each(i, start_index, ldev) \ + mlx5_lag_for_each(i, start_index, ldev, MLX5_LAG_FILTER_PORTS) + +#define mlx5_ldev_for_each_reverse(i, start_index, end_index, ldev) \ + mlx5_lag_for_each_reverse(i, start_index, end_index, ldev, \ + MLX5_LAG_FILTER_PORTS) + +int mlx5_get_pre_lag_func(struct mlx5_lag *ldev, int start_idx, int end_idx, + u32 filter); +int mlx5_get_next_lag_func(struct mlx5_lag *ldev, int start_idx, u32 filter); int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq); int mlx5_lag_num_devs(struct mlx5_lag *ldev); int mlx5_lag_num_netdevs(struct mlx5_lag *ldev); int mlx5_lag_reload_ib_reps_from_locked(struct mlx5_lag *ldev, u32 flags, bool cont_on_fail); +int mlx5_ldev_add_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev, + u32 group_id); +void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev); #endif /* __MLX5_LAG_H__ */ -- 2.44.0 From: Shay Drory Add a group_id parameter to mlx5_lag_shared_fdb_create() and mlx5_lag_shared_fdb_destroy() to scope shared FDB operations to a specific SD group. When group_id is U32_MAX, the functions operate on all LAG devices. When group_id is non-zero, they operate only on devices in that SD group without issuing FW LAG commands, since SD LAG is a pure software construct. 
Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan --- .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 195 ++++++++++++++---- .../net/ethernet/mellanox/mlx5/core/lag/lag.h | 32 ++- .../ethernet/mellanox/mlx5/core/lag/mpesw.c | 7 +- .../mellanox/mlx5/core/lag/shared_fdb.c | 151 +++++++++++--- .../net/ethernet/mellanox/mlx5/core/lib/sd.c | 10 + .../net/ethernet/mellanox/mlx5/core/lib/sd.h | 10 + 6 files changed, 322 insertions(+), 83 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 03cb02c7000d..3decb49e9f19 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -370,6 +370,22 @@ int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq) return -ENOENT; } +/* Return the appropriate iterator filter for a device in LAG: + * - SD shared FDB active: iterate only the device's SD group + * - SD group exists but shared FDB not active: iterate all devices + * - No SD: iterate ports only + */ +static u32 mlx5_lag_get_filter(struct mlx5_lag *ldev, struct mlx5_core_dev *dev) +{ + struct lag_func *pf = mlx5_lag_pf_by_dev(ldev, dev); + + if (pf && pf->sd_fdb_active) + return pf->group_id; + if (pf && pf->group_id) + return MLX5_LAG_FILTER_ALL; + return MLX5_LAG_FILTER_PORTS; +} + /* Reverse of mlx5_lag_get_dev_index_by_seq: given a device, return its * sequence number in the LAG. Master is always 0, others numbered * sequentially starting from 1. 
@@ -379,11 +395,13 @@ int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev) struct mlx5_lag *ldev = mlx5_lag_dev(dev); int master_idx, i, num = 1; struct lag_func *pf; + u32 filter; if (!ldev) return -ENOENT; - master_idx = mlx5_lag_get_master_idx(ldev); + filter = mlx5_lag_get_filter(ldev, dev); + master_idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, 0, filter); if (master_idx < 0) return -ENOENT; @@ -391,7 +409,7 @@ int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev) if (pf && pf->dev == dev) return 0; - mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) { + mlx5_lag_for_each(i, 0, ldev, filter) { if (i == master_idx) continue; pf = mlx5_lag_pf(ldev, i); @@ -403,6 +421,69 @@ int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev) } EXPORT_SYMBOL(mlx5_lag_get_dev_seq); +/* seq 0 = master, then all remaining devices */ +static int mlx5_lag_get_dev_index_by_seq_all(struct mlx5_lag *ldev, int seq) +{ + int master_idx, i, num = 0; + + master_idx = mlx5_lag_get_master_idx(ldev); + + if (master_idx >= 0) { + if (seq == 0) + return master_idx; + num++; + } + + mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) { + if (i == master_idx) + continue; + if (num == seq) + return i; + num++; + } + return -ENOENT; +} + +/* From group POV, port-marked entry is the lag master */ +static int mlx5_lag_get_dev_index_by_seq_group(struct mlx5_lag *ldev, int seq, + u32 group_id) +{ + int i, num = 0; + + mlx5_lag_for_each(i, 0, ldev, group_id) { + if (xa_get_mark(&ldev->pfs, i, MLX5_LAG_XA_MARK_PORT)) { + if (seq == 0) + return i; + num++; + break; + } + } + + mlx5_lag_for_each(i, 0, ldev, group_id) { + if (xa_get_mark(&ldev->pfs, i, MLX5_LAG_XA_MARK_PORT)) + continue; + if (num == seq) + return i; + num++; + } + return -ENOENT; +} + +int mlx5_lag_get_dev_index_by_seq_filter(struct mlx5_lag *ldev, int seq, + u32 filter) +{ + if (!ldev) + return -ENOENT; + + if (!filter || filter == MLX5_LAG_FILTER_PORTS) + return mlx5_lag_get_dev_index_by_seq(ldev, seq); + + if (filter == 
MLX5_LAG_FILTER_ALL) + return mlx5_lag_get_dev_index_by_seq_all(ldev, seq); + + return mlx5_lag_get_dev_index_by_seq_group(ldev, seq, filter); +} + /* Devcom events for LAG master marking */ #define LAG_DEVCOM_PAIR (0) #define LAG_DEVCOM_UNPAIR (1) @@ -512,6 +593,14 @@ static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev) return ldev->mode == MLX5_LAG_MODE_SRIOV; } +static bool __mlx5_lag_is_sd_active(struct mlx5_lag *ldev, + struct mlx5_core_dev *dev) +{ + struct lag_func *pf = mlx5_lag_pf_by_dev(ldev, dev); + + return pf && pf->sd_fdb_active; +} + /* Create a mapping between steering slots and active ports. * As we have ldev->buckets slots per port first assume the native * mapping should be used. @@ -927,27 +1016,19 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev) u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {}; bool roce_lag = __mlx5_lag_is_roce(ldev); unsigned long flags = ldev->mode_flags; - struct mlx5_eswitch *master_esw; struct mlx5_core_dev *dev0; int err; - int i; if (master_idx < 0) return -EINVAL; dev0 = mlx5_lag_pf(ldev, master_idx)->dev; - master_esw = dev0->priv.eswitch; ldev->mode = MLX5_LAG_MODE_NONE; ldev->mode_flags = 0; mlx5_lag_mp_reset(ldev); if (test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags)) { - mlx5_ldev_for_each(i, 0, ldev) { - if (i == master_idx) - continue; - mlx5_eswitch_offloads_single_fdb_del_one(master_esw, - mlx5_lag_pf(ldev, i)->dev->priv.eswitch); - } + mlx5_lag_destroy_single_fdb(ldev); clear_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags); } @@ -1026,7 +1107,7 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) return true; } -static void mlx5_lag_assert_locked_transition(struct mlx5_lag *ldev) +static void mlx5_lag_assert_locked_transition(struct mlx5_lag *ldev, u32 filter) { struct mlx5_devcom_comp_dev *devcom = NULL; struct lag_func *pf; @@ -1034,17 +1115,21 @@ static void mlx5_lag_assert_locked_transition(struct mlx5_lag *ldev) lockdep_assert_held(&ldev->lock); - i = mlx5_get_next_lag_func(ldev, 0, MLX5_LAG_FILTER_PORTS); + i = 
mlx5_get_next_lag_func(ldev, 0, filter); if (i < MLX5_MAX_PORTS) { pf = mlx5_lag_pf(ldev, i); - devcom = pf->dev->priv.hca_devcom_comp; + if (filter == MLX5_LAG_FILTER_PORTS || + filter == MLX5_LAG_FILTER_ALL) + devcom = pf->dev->priv.hca_devcom_comp; + else + devcom = mlx5_sd_get_devcom(pf->dev); } mlx5_devcom_comp_assert_locked(devcom); } -static void mlx5_lag_drop_lock_for_reps(struct mlx5_lag *ldev) +static void mlx5_lag_drop_lock_for_reps(struct mlx5_lag *ldev, u32 filter) { - mlx5_lag_assert_locked_transition(ldev); + mlx5_lag_assert_locked_transition(ldev, filter); /* Keep PF membership stable while ldev->lock is dropped. Device add * and remove paths observe mode_changes_in_progress and retry. @@ -1075,21 +1160,22 @@ void mlx5_lag_rescan_dev_locked(struct mlx5_lag *ldev, * callbacks and take reps_lock. Drop ldev->lock so the only ordering * remains reps_lock -> ldev->lock from representor callbacks. */ - mlx5_lag_drop_lock_for_reps(ldev); + mlx5_lag_drop_lock_for_reps(ldev, mlx5_lag_get_filter(ldev, dev)); mlx5_rescan_drivers_locked(dev); mlx5_lag_retake_lock_after_reps(ldev); } -static void mlx5_lag_rescan_devices_locked(struct mlx5_lag *ldev, bool enable) +static void mlx5_lag_rescan_devices_locked_filter(struct mlx5_lag *ldev, + bool enable, u32 filter) { struct mlx5_core_dev *devs[MLX5_MAX_PORTS]; struct lag_func *pf; int num_devs = 0; int i; - mlx5_lag_assert_locked_transition(ldev); + mlx5_lag_assert_locked_transition(ldev, filter); - mlx5_ldev_for_each(i, 0, ldev) { + mlx5_lag_for_each(i, 0, ldev, filter) { pf = mlx5_lag_pf(ldev, i); if (pf->dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV) continue; @@ -1101,30 +1187,40 @@ static void mlx5_lag_rescan_devices_locked(struct mlx5_lag *ldev, bool enable) devs[num_devs++] = pf->dev; } - mlx5_lag_drop_lock_for_reps(ldev); + mlx5_lag_drop_lock_for_reps(ldev, filter); for (i = 0; i < num_devs; i++) mlx5_rescan_drivers_locked(devs[i]); mlx5_lag_retake_lock_after_reps(ldev); } +void 
mlx5_lag_add_devices_filter(struct mlx5_lag *ldev, u32 filter) +{ + mlx5_lag_rescan_devices_locked_filter(ldev, true, filter); +} + void mlx5_lag_add_devices(struct mlx5_lag *ldev) { - mlx5_lag_rescan_devices_locked(ldev, true); + mlx5_lag_add_devices_filter(ldev, MLX5_LAG_FILTER_PORTS); +} + +void mlx5_lag_remove_devices_filter(struct mlx5_lag *ldev, u32 filter) +{ + mlx5_lag_rescan_devices_locked_filter(ldev, false, filter); } void mlx5_lag_remove_devices(struct mlx5_lag *ldev) { - mlx5_lag_rescan_devices_locked(ldev, false); + mlx5_lag_remove_devices_filter(ldev, MLX5_LAG_FILTER_PORTS); } static int mlx5_lag_reload_ib_reps_unlocked(struct mlx5_lag *ldev, u32 flags, - bool cont_on_fail) + u32 filter, bool cont_on_fail) { struct lag_func *pf; int ret; int i; - mlx5_ldev_for_each(i, 0, ldev) { + mlx5_lag_for_each(i, 0, ldev, filter) { pf = mlx5_lag_pf(ldev, i); if (!(pf->dev->priv.flags & flags)) { struct mlx5_eswitch *esw; @@ -1142,7 +1238,7 @@ static int mlx5_lag_reload_ib_reps_unlocked(struct mlx5_lag *ldev, u32 flags, } static int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags, - bool cont_on_fail) + u32 filter, bool cont_on_fail) { int ret; @@ -1152,21 +1248,18 @@ static int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags, * load/unload callbacks can re-enter LAG netdev add/remove and take * ldev->lock. Keep the ordering reps_lock -> ldev->lock. 
*/ - mlx5_lag_drop_lock_for_reps(ldev); - ret = mlx5_lag_reload_ib_reps_unlocked(ldev, flags, cont_on_fail); + mlx5_lag_drop_lock_for_reps(ldev, filter); + ret = mlx5_lag_reload_ib_reps_unlocked(ldev, flags, filter, + cont_on_fail); mlx5_lag_retake_lock_after_reps(ldev); return ret; } int mlx5_lag_reload_ib_reps_from_locked(struct mlx5_lag *ldev, u32 flags, - bool cont_on_fail) + u32 filter, bool cont_on_fail) { - int ret; - - ret = mlx5_lag_reload_ib_reps(ldev, flags, cont_on_fail); - - return ret; + return mlx5_lag_reload_ib_reps(ldev, flags, filter, cont_on_fail); } void mlx5_disable_lag(struct mlx5_lag *ldev) @@ -1182,7 +1275,7 @@ void mlx5_disable_lag(struct mlx5_lag *ldev) return; if (shared_fdb) { - mlx5_lag_shared_fdb_destroy(ldev); + mlx5_lag_shared_fdb_destroy(ldev, 0); return; } @@ -1420,7 +1513,8 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) if (shared_fdb) { err = mlx5_lag_shared_fdb_create(ldev, &tracker, - MLX5_LAG_MODE_SRIOV); + MLX5_LAG_MODE_SRIOV, + 0); if (err) return; } else { @@ -2261,7 +2355,8 @@ bool mlx5_lag_is_active(struct mlx5_core_dev *dev) spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); - res = ldev && __mlx5_lag_is_active(ldev); + res = ldev && (__mlx5_lag_is_active(ldev) || + __mlx5_lag_is_sd_active(ldev, dev)); spin_unlock_irqrestore(&lag_lock, flags); return res; @@ -2294,10 +2389,17 @@ bool mlx5_lag_is_master(struct mlx5_core_dev *dev) spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); - idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); - if (ldev && __mlx5_lag_is_active(ldev) && idx >= 0) { - pf = mlx5_lag_pf(ldev, idx); - res = pf && dev == pf->dev; + if (ldev) { + u32 filter; + + filter = mlx5_lag_get_filter(ldev, dev); + idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1, + filter); + if ((__mlx5_lag_is_active(ldev) || + __mlx5_lag_is_sd_active(ldev, dev)) && idx >= 0) { + pf = mlx5_lag_pf(ldev, idx); + res = pf && dev == pf->dev; + } } spin_unlock_irqrestore(&lag_lock, flags); @@ 
-2324,11 +2426,16 @@ bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev) { struct mlx5_lag *ldev; unsigned long flags; - bool res; + bool res = false; spin_lock_irqsave(&lag_lock, flags); ldev = mlx5_lag_dev(dev); - res = ldev && test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags); + if (ldev) { + res = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, + &ldev->mode_flags); + if (__mlx5_lag_is_sd(ldev, dev) && !__mlx5_lag_is_active(ldev)) + res = __mlx5_lag_is_sd_active(ldev, dev); + } spin_unlock_irqrestore(&lag_lock, flags); return res; @@ -2429,7 +2536,7 @@ struct mlx5_core_dev *mlx5_lag_get_next_peer_mdev(struct mlx5_core_dev *dev, int if (*i == MLX5_MAX_PORTS) goto unlock; - mlx5_ldev_for_each(idx, *i, ldev) { + mlx5_lag_for_each(idx, *i, ldev, mlx5_lag_get_filter(ldev, dev)) { pf = mlx5_lag_pf(ldev, idx); if (pf->dev != dev) break; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 70baa7997364..cbe201529661 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -148,6 +148,14 @@ mlx5_lag_pf_by_dev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev) return NULL; } +static inline bool +__mlx5_lag_is_sd(struct mlx5_lag *ldev, struct mlx5_core_dev *dev) +{ + struct lag_func *pf = mlx5_lag_pf_by_dev(ldev, dev); + + return pf && pf->group_id != 0; +} + static inline bool __mlx5_lag_is_active(struct mlx5_lag *ldev) { @@ -163,25 +171,31 @@ mlx5_lag_is_ready(struct mlx5_lag *ldev) #ifdef CONFIG_MLX5_ESWITCH int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev, struct lag_tracker *tracker, - enum mlx5_lag_mode mode); -void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev); + enum mlx5_lag_mode mode, + u32 group_id); +void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev, u32 group_id); int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev); +void mlx5_lag_destroy_single_fdb(struct mlx5_lag *ldev); bool 
mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev); +bool mlx5_lag_shared_fdb_supported_filter(struct mlx5_lag *ldev, u32 filter); #else static inline int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev, struct lag_tracker *tracker, - enum mlx5_lag_mode mode) + enum mlx5_lag_mode mode, + u32 group_id) { return -EOPNOTSUPP; } -static inline void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev) {} +static inline void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev, + u32 group_id) {} static inline int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev) { return -EOPNOTSUPP; } +static inline void mlx5_lag_destroy_single_fdb(struct mlx5_lag *ldev) {} static inline bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev) { return false; @@ -211,11 +225,13 @@ void mlx5_ldev_add_debugfs(struct mlx5_core_dev *dev); void mlx5_ldev_remove_debugfs(struct dentry *dbg); void mlx5_disable_lag(struct mlx5_lag *ldev); void mlx5_lag_remove_devices(struct mlx5_lag *ldev); +void mlx5_lag_remove_devices_filter(struct mlx5_lag *ldev, u32 filter); int mlx5_deactivate_lag(struct mlx5_lag *ldev); void mlx5_lag_add_devices(struct mlx5_lag *ldev); void mlx5_lag_rescan_dev_locked(struct mlx5_lag *ldev, struct mlx5_core_dev *dev, bool enable); +void mlx5_lag_add_devices_filter(struct mlx5_lag *ldev, u32 filter); struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev); #ifdef CONFIG_MLX5_ESWITCH @@ -238,8 +254,8 @@ static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev) } /* Iterator filter constants for mlx5_lag_for_each() */ -#define MLX5_LAG_FILTER_ALL 0 /* iterate ALL devices */ -#define MLX5_LAG_FILTER_PORTS U32_MAX /* iterate ports only (XA_MARK_PORT) */ +#define MLX5_LAG_FILTER_PORTS 0 /* iterate ports only (XA_MARK_PORT) */ +#define MLX5_LAG_FILTER_ALL U32_MAX /* iterate ALL devices */ /* any other value = iterate devices with that specific group_id */ #define mlx5_lag_for_each(i, start_index, ldev, filter) \ @@ -264,10 +280,12 @@ int 
mlx5_get_pre_lag_func(struct mlx5_lag *ldev, int start_idx, int end_idx, u32 filter); int mlx5_get_next_lag_func(struct mlx5_lag *ldev, int start_idx, u32 filter); int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq); +int mlx5_lag_get_dev_index_by_seq_filter(struct mlx5_lag *ldev, int seq, + u32 filter); int mlx5_lag_num_devs(struct mlx5_lag *ldev); int mlx5_lag_num_netdevs(struct mlx5_lag *ldev); int mlx5_lag_reload_ib_reps_from_locked(struct mlx5_lag *ldev, u32 flags, - bool cont_on_fail); + u32 filter, bool cont_on_fail); int mlx5_ldev_add_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev, u32 group_id); void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c index 64e2d1dd5308..2cb44084e239 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c @@ -85,14 +85,15 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev) !MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table) || !MLX5_CAP_GEN(dev0, create_lag_when_not_master_up) || !mlx5_lag_check_prereq(ldev) || - !mlx5_lag_shared_fdb_supported(ldev)) + !mlx5_lag_shared_fdb_supported_filter(ldev, MLX5_LAG_FILTER_ALL)) return -EOPNOTSUPP; err = mlx5_mpesw_metadata_set(ldev); if (err) return err; - err = mlx5_lag_shared_fdb_create(ldev, NULL, MLX5_LAG_MODE_MPESW); + err = mlx5_lag_shared_fdb_create(ldev, NULL, MLX5_LAG_MODE_MPESW, + MLX5_LAG_FILTER_ALL); if (err) { mlx5_core_warn(dev0, "Failed to create LAG in MPESW mode (%d)\n", err); mlx5_mpesw_metadata_cleanup(ldev); @@ -106,7 +107,7 @@ void mlx5_lag_disable_mpesw(struct mlx5_lag *ldev) { if (ldev->mode == MLX5_LAG_MODE_MPESW) { mlx5_mpesw_metadata_cleanup(ldev); - mlx5_lag_shared_fdb_destroy(ldev); + mlx5_lag_shared_fdb_destroy(ldev, MLX5_LAG_FILTER_ALL); } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c index b5cbe3409720..74d17664f54c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c @@ -8,19 +8,19 @@ #include "lag.h" #include "eswitch.h" -bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev) +bool mlx5_lag_shared_fdb_supported_filter(struct mlx5_lag *ldev, u32 filter) { + int idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1, + filter); struct mlx5_core_dev *dev0, *dev; bool ret = false; - int idx; int i; - idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); if (idx < 0) return false; dev0 = mlx5_lag_pf(ldev, idx)->dev; - mlx5_ldev_for_each(i, 0, ldev) { + mlx5_lag_for_each(i, 0, ldev, filter) { if (i == idx) continue; dev = mlx5_lag_pf(ldev, i)->dev; @@ -42,9 +42,16 @@ bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev) return ret; } -int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev) +bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev) { - int master_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + return mlx5_lag_shared_fdb_supported_filter(ldev, + MLX5_LAG_FILTER_PORTS); +} + +static int mlx5_lag_create_single_fdb_filter(struct mlx5_lag *ldev, u32 filter) +{ + int master_idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1, + filter); struct mlx5_eswitch *master_esw; struct mlx5_core_dev *dev0; int i, j; @@ -55,7 +62,7 @@ int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev) dev0 = mlx5_lag_pf(ldev, master_idx)->dev; master_esw = dev0->priv.eswitch; - mlx5_ldev_for_each(i, 0, ldev) { + mlx5_lag_for_each(i, 0, ldev, filter) { struct mlx5_eswitch *slave_esw; if (i == master_idx) @@ -71,7 +78,7 @@ int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev) } return 0; err: - mlx5_ldev_for_each_reverse(j, i, 0, ldev) { + mlx5_lag_for_each_reverse(j, i, 0, ldev, filter) { struct mlx5_eswitch *slave_esw; if (j == master_idx) @@ -82,59 +89,145 @@ int 
mlx5_lag_create_single_fdb(struct mlx5_lag *ldev) return err; } +static void mlx5_lag_destroy_single_fdb_filter(struct mlx5_lag *ldev, + u32 filter) +{ + int master_idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1, + filter); + struct mlx5_eswitch *master_esw; + struct mlx5_eswitch *peer_esw; + int i; + + if (master_idx < 0) + return; + + master_esw = mlx5_lag_pf(ldev, master_idx)->dev->priv.eswitch; + mlx5_lag_for_each(i, 0, ldev, filter) { + if (i == master_idx) + continue; + + peer_esw = mlx5_lag_pf(ldev, i)->dev->priv.eswitch; + mlx5_eswitch_offloads_single_fdb_del_one(master_esw, peer_esw); + } +} + +int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev) +{ + return mlx5_lag_create_single_fdb_filter(ldev, MLX5_LAG_FILTER_ALL); +} + +void mlx5_lag_destroy_single_fdb(struct mlx5_lag *ldev) +{ + mlx5_lag_destroy_single_fdb_filter(ldev, MLX5_LAG_FILTER_ALL); +} + +/** + * mlx5_lag_shared_fdb_create - Create shared FDB LAG + * @ldev: LAG device + * @tracker: LAG tracker (NULL for SD) + * @mode: LAG mode (unused for SD) + * @group_id: SD group ID; 0 (MLX5_LAG_FILTER_PORTS) for ports LAG; + * MLX5_LAG_FILTER_ALL for all-device (mpesw) LAG + * + * When group_id is 0 (MLX5_LAG_FILTER_PORTS) or MLX5_LAG_FILTER_ALL, + * activates a FW LAG with shared FDB. + * When group_id is a specific SD group ID, creates a software-only shared + * FDB scoped to that group (no FW LAG commands). + */ int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev, struct lag_tracker *tracker, - enum mlx5_lag_mode mode) + enum mlx5_lag_mode mode, + u32 group_id) { - int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); + u32 filter = group_id ? 
group_id : MLX5_LAG_FILTER_PORTS; + int idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1, + filter); struct mlx5_core_dev *dev0; + struct lag_func *pf; int err; + int i; if (idx < 0) return -EINVAL; dev0 = mlx5_lag_pf(ldev, idx)->dev; - mlx5_lag_remove_devices(ldev); - - err = mlx5_activate_lag(ldev, tracker, mode, true); - if (err) { - mlx5_core_warn(dev0, "Failed to create LAG in shared FDB mode (%d)\n", - err); - goto err_add_devices; + mlx5_lag_remove_devices_filter(ldev, filter); + + if (filter == MLX5_LAG_FILTER_PORTS || filter == MLX5_LAG_FILTER_ALL) { + err = mlx5_activate_lag(ldev, tracker, mode, true); + if (err) { + mlx5_core_warn(dev0, + "Failed to create LAG in shared FDB mode (%d)\n", + err); + goto err_add_devices; + } + } else { + err = mlx5_lag_create_single_fdb_filter(ldev, group_id); + if (err) { + mlx5_core_warn(dev0, + "Failed to create SD shared FDB (%d)\n", + err); + goto err_add_devices; + } + mlx5_lag_for_each(i, 0, ldev, filter) { + pf = mlx5_lag_pf(ldev, i); + pf->sd_fdb_active = true; + } + BLOCKING_INIT_NOTIFIER_HEAD(&dev0->priv.lag_nh); } mlx5_lag_rescan_dev_locked(ldev, dev0, true); - err = mlx5_lag_reload_ib_reps_from_locked(ldev, 0, false); + err = mlx5_lag_reload_ib_reps_from_locked(ldev, 0, filter, false); if (err) { mlx5_core_err(dev0, "Failed to enable lag\n"); goto err_rescan_drivers; } - mlx5_lag_set_vports_agg_speed(ldev); + if (filter == MLX5_LAG_FILTER_PORTS || filter == MLX5_LAG_FILTER_ALL) + mlx5_lag_set_vports_agg_speed(ldev); return 0; err_rescan_drivers: mlx5_lag_rescan_dev_locked(ldev, dev0, false); - mlx5_deactivate_lag(ldev); + if (filter == MLX5_LAG_FILTER_PORTS || filter == MLX5_LAG_FILTER_ALL) { + mlx5_deactivate_lag(ldev); + } else { + mlx5_lag_for_each(i, 0, ldev, filter) { + pf = mlx5_lag_pf(ldev, i); + pf->sd_fdb_active = false; + } + mlx5_lag_destroy_single_fdb_filter(ldev, group_id); + } err_add_devices: - mlx5_lag_add_devices(ldev); - mlx5_lag_reload_ib_reps_from_locked(ldev, 0, true); + 
mlx5_lag_add_devices_filter(ldev, filter); + mlx5_lag_reload_ib_reps_from_locked(ldev, 0, filter, true); return err; } -void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev) +void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev, u32 group_id) { + u32 filter = group_id ? group_id : MLX5_LAG_FILTER_PORTS; + struct lag_func *pf; int err; + int i; - mlx5_lag_remove_devices(ldev); + mlx5_lag_remove_devices_filter(ldev, filter); - err = mlx5_deactivate_lag(ldev); - if (err) - return; + if (filter == MLX5_LAG_FILTER_PORTS || filter == MLX5_LAG_FILTER_ALL) { + err = mlx5_deactivate_lag(ldev); + if (err) + return; + } else { + mlx5_lag_for_each(i, 0, ldev, filter) { + pf = mlx5_lag_pf(ldev, i); + pf->sd_fdb_active = false; + } + mlx5_lag_destroy_single_fdb_filter(ldev, group_id); + } - mlx5_lag_add_devices(ldev); + mlx5_lag_add_devices_filter(ldev, filter); mlx5_lag_reload_ib_reps_from_locked(ldev, MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV, - true); + filter, true); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index 6e199161b008..bbd77ae11e84 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -57,6 +57,16 @@ static struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev) return sd->primary ? 
dev : sd->primary_dev; } +struct mlx5_devcom_comp_dev *mlx5_sd_get_devcom(struct mlx5_core_dev *dev) +{ + struct mlx5_sd *sd = mlx5_get_sd(dev); + + if (!sd) + return NULL; + + return sd->devcom; +} + struct mlx5_core_dev * mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h index 9bfd5b9756b5..2ab259095d7e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h @@ -21,6 +21,16 @@ void mlx5_sd_put_adev(struct auxiliary_device *actual_adev, int mlx5_sd_init(struct mlx5_core_dev *dev); void mlx5_sd_cleanup(struct mlx5_core_dev *dev); +#ifdef CONFIG_MLX5_CORE_EN +struct mlx5_devcom_comp_dev *mlx5_sd_get_devcom(struct mlx5_core_dev *dev); +#else +static inline struct mlx5_devcom_comp_dev * +mlx5_sd_get_devcom(struct mlx5_core_dev *dev) +{ + return NULL; +} +#endif + #define mlx5_sd_for_each_dev_from_to(i, primary, ix_from, to, pos) \ for (i = ix_from; \ (pos = mlx5_sd_primary_get_peer(primary, i)) && pos != (to); i++) -- 2.44.0 From: Shay Drory Register SD secondary devices with the existing LAG structure by adding them to the primary's ldev xarray with a shared group_id. This ties the SD LAG lifecycle to the SD group lifecycle. Add an sd_lag_state debugfs entry for LAG state visibility. To avoid a race between this entry and LAG deletion, create the debugfs entries last during SD init and remove them first during SD cleanup. 
Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan --- .../net/ethernet/mellanox/mlx5/core/lib/sd.c | 135 ++++++++++++++++-- 1 file changed, 121 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index bbd77ae11e84..e341d814873a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -2,6 +2,7 @@ /* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ #include "lib/sd.h" +#include "../lag/lag.h" #include "mlx5_core.h" #include "lib/mlx5.h" #include "fs_cmd.h" @@ -223,6 +224,108 @@ static void sd_cleanup(struct mlx5_core_dev *dev) kfree(sd); } +static int sd_lag_state_show(struct seq_file *file, void *priv) +{ + struct mlx5_core_dev *dev = file->private; + struct mlx5_lag *ldev; + struct lag_func *pf; + bool active = false; + int i; + + ldev = mlx5_lag_dev(dev); + if (!ldev) + return -EINVAL; + + mutex_lock(&ldev->lock); + mlx5_ldev_for_each(i, 0, ldev) { + pf = mlx5_lag_pf(ldev, i); + if (pf->dev == dev) { + active = pf->sd_fdb_active; + break; + } + } + mutex_unlock(&ldev->lock); + + seq_printf(file, "%s\n", active ? "active" : "disabled"); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(sd_lag_state); + +/* SD LAG integration is optional. If LAG isn't available on this device + * (e.g. lag caps are off), or registering secondaries fails, just warn + * and continue - SD can operate without the LAG-side bookkeeping. 
+ */ +static void sd_lag_init(struct mlx5_core_dev *dev) +{ + struct mlx5_core_dev *primary = mlx5_sd_get_primary(dev); + struct mlx5_sd *sd = mlx5_get_sd(primary); + struct mlx5_core_dev *pos, *to; + struct mlx5_lag *ldev; + struct lag_func *pf; + int err; + int i; + + ldev = mlx5_lag_dev(primary); + if (!ldev) { + sd_warn(primary, "%s: no ldev (LAG caps off?), skipping\n", + __func__); + return; + } + + mutex_lock(&ldev->lock); + pf = mlx5_lag_pf_by_dev(ldev, primary); + if (!pf) { + sd_warn(primary, "%s: primary not registered in ldev, skipping\n", + __func__); + goto out; + } + + pf->group_id = sd->group_id; + + mlx5_sd_for_each_secondary(i, primary, pos) { + err = mlx5_ldev_add_mdev(ldev, pos, sd->group_id); + if (err) { + sd_warn(primary, "%s: failed to add secondary %s to ldev: %d\n", + __func__, dev_name(pos->device), err); + goto err; + } + } + +out: + mutex_unlock(&ldev->lock); + return; + +err: + to = pos; + mlx5_sd_for_each_secondary_to(i, primary, to, pos) + mlx5_ldev_remove_mdev(ldev, pos); + pf->group_id = 0; + mutex_unlock(&ldev->lock); +} + +static void sd_lag_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_core_dev *primary = mlx5_sd_get_primary(dev); + struct mlx5_core_dev *pos; + struct mlx5_lag *ldev; + struct lag_func *pf; + int i; + + ldev = mlx5_lag_dev(primary); + if (!ldev) + return; + + mutex_lock(&ldev->lock); + mlx5_sd_for_each_secondary(i, primary, pos) + mlx5_ldev_remove_mdev(ldev, pos); + + pf = mlx5_lag_pf_by_dev(ldev, primary); + if (pf) + pf->group_id = 0; + mutex_unlock(&ldev->lock); +} + static int sd_register(struct mlx5_core_dev *dev) { struct mlx5_devcom_comp_dev *devcom, *pos; @@ -473,27 +576,32 @@ int mlx5_sd_init(struct mlx5_core_dev *dev) if (err) goto err_sd_unregister; + mlx5_sd_for_each_secondary(i, primary, pos) { + err = sd_cmd_set_secondary(pos, primary, alias_key); + if (err) + goto err_unset_secondaries; + } + + sd_lag_init(primary); + primary_sd->dfs = debugfs_create_dir("multi-pf", 
mlx5_debugfs_get_dev_root(primary)); - debugfs_create_x32("group_id", 0400, primary_sd->dfs, - &primary_sd->group_id); - debugfs_create_file("primary", 0400, primary_sd->dfs, primary, - &dev_fops); - mlx5_sd_for_each_secondary(i, primary, pos) { char name[32]; - err = sd_cmd_set_secondary(pos, primary, alias_key); - if (err) - goto err_unset_secondaries; - snprintf(name, sizeof(name), "secondary_%d", i - 1); debugfs_create_file(name, 0400, primary_sd->dfs, pos, &dev_fops); - } + debugfs_create_file("sd_lag_state", 0400, primary_sd->dfs, primary, + &sd_lag_state_fops); + debugfs_create_x32("group_id", 0400, primary_sd->dfs, + &primary_sd->group_id); + debugfs_create_file("primary", 0400, primary_sd->dfs, primary, + &dev_fops); + sd_info(primary, "group id %#x, size %d, combined\n", sd->group_id, mlx5_devcom_comp_get_size(sd->devcom)); sd_print_group(primary); @@ -508,8 +616,6 @@ int mlx5_sd_init(struct mlx5_core_dev *dev) mlx5_sd_for_each_secondary_to(i, primary, to, pos) sd_cmd_unset_secondary(pos); sd_cmd_unset_primary(primary); - debugfs_remove_recursive(primary_sd->dfs); - primary_sd->dfs = NULL; err_sd_unregister: mlx5_sd_for_each_secondary(i, primary, pos) { struct mlx5_sd *peer_sd = mlx5_get_sd(pos); @@ -548,11 +654,12 @@ void mlx5_sd_cleanup(struct mlx5_core_dev *dev) if (primary_sd->state != MLX5_SD_STATE_UP) goto out_clear_peers; + debugfs_remove_recursive(primary_sd->dfs); + primary_sd->dfs = NULL; + sd_lag_cleanup(primary); mlx5_sd_for_each_secondary(i, primary, pos) sd_cmd_unset_secondary(pos); sd_cmd_unset_primary(primary); - debugfs_remove_recursive(primary_sd->dfs); - primary_sd->dfs = NULL; sd_info(primary, "group id %#x, uncombined\n", sd->group_id); primary_sd->state = MLX5_SD_STATE_DOWN; -- 2.44.0 From: Shay Drory Socket Direct devices manage their own LAG via SD LAG infrastructure. Block the standard netdev-event-driven LAG path (RoCE LAG and VF LAG) for SD devices to prevent conflicting LAG configurations. 
Expose mlx5_sd_is_supported() as a public helper that encapsulates all SD eligibility checks. Use it in mlx5_lag_dev_alloc() to skip netdev notifier registration for SD-capable devices at alloc time. Some SD code is reordered to expose the new function; no logic is changed. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan --- .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 13 ++-- .../net/ethernet/mellanox/mlx5/core/lib/sd.c | 60 ++++++++++++++----- .../net/ethernet/mellanox/mlx5/core/lib/sd.h | 11 ++++ 3 files changed, 63 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 3decb49e9f19..a2c7e2927431 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -293,11 +293,14 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev) INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work); INIT_WORK(&ldev->speed_update_work, mlx5_mpesw_speed_update_work); - ldev->nb.notifier_call = mlx5_lag_netdev_event; - write_pnet(&ldev->net, mlx5_core_net(dev)); - if (register_netdevice_notifier_net(read_pnet(&ldev->net), &ldev->nb)) { - ldev->nb.notifier_call = NULL; - mlx5_core_err(dev, "Failed to register LAG netdev notifier\n"); + if (!mlx5_sd_is_supported(dev)) { + ldev->nb.notifier_call = mlx5_lag_netdev_event; + write_pnet(&ldev->net, mlx5_core_net(dev)); + if (register_netdevice_notifier_net(read_pnet(&ldev->net), + &ldev->nb)) { + ldev->nb.notifier_call = NULL; + mlx5_core_err(dev, "Failed to register LAG netdev notifier\n"); + } } ldev->mode = MLX5_LAG_MODE_NONE; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index e341d814873a..8991db3a19cf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -115,7 +115,28 @@ static bool 
ft_create_alias_supported(struct mlx5_core_dev *dev) return true; } -static bool mlx5_sd_is_supported(struct mlx5_core_dev *dev, u8 host_buses) +static int mlx5_query_sd(struct mlx5_core_dev *dev, bool *sdm, + u8 *host_buses) +{ + u32 out[MLX5_ST_SZ_DW(mpir_reg)]; + int err; + + err = mlx5_query_mpir_reg(dev, out); + if (err) + return err; + + *sdm = MLX5_GET(mpir_reg, out, sdm); + *host_buses = MLX5_GET(mpir_reg, out, host_buses); + + return 0; +} + +static u32 mlx5_sd_group_id(struct mlx5_core_dev *dev, u8 sd_group) +{ + return (u32)((MLX5_CAP_GEN(dev, native_port_num) << 8) | sd_group); +} + +static bool mlx5_sd_caps_supported(struct mlx5_core_dev *dev, u8 host_buses) { /* Honor the SW implementation limit */ if (host_buses > MLX5_SD_MAX_GROUP_SZ) @@ -142,25 +163,32 @@ static bool mlx5_sd_is_supported(struct mlx5_core_dev *dev, u8 host_buses) return true; } -static int mlx5_query_sd(struct mlx5_core_dev *dev, bool *sdm, - u8 *host_buses) +bool mlx5_sd_is_supported(struct mlx5_core_dev *dev) { - u32 out[MLX5_ST_SZ_DW(mpir_reg)]; + u8 host_buses, sd_group; + bool sdm; int err; - err = mlx5_query_mpir_reg(dev, out); - if (err) - return err; + /* Feature is currently implemented for PFs only */ + if (!mlx5_core_is_pf(dev)) + return false; - *sdm = MLX5_GET(mpir_reg, out, sdm); - *host_buses = MLX5_GET(mpir_reg, out, host_buses); + /* Block on embedded CPU PFs */ + if (mlx5_core_is_ecpf(dev)) + return false; - return 0; -} + err = mlx5_query_nic_vport_sd_group(dev, &sd_group); + if (err || !sd_group) + return false; -static u32 mlx5_sd_group_id(struct mlx5_core_dev *dev, u8 sd_group) -{ - return (u32)((MLX5_CAP_GEN(dev, native_port_num) << 8) | sd_group); + if (!MLX5_CAP_MCAM_REG(dev, mpir)) + return false; + + err = mlx5_query_sd(dev, &sdm, &host_buses); + if (err || !sdm) + return false; + + return mlx5_sd_caps_supported(dev, host_buses); } static int sd_init(struct mlx5_core_dev *dev) @@ -198,8 +226,8 @@ static int sd_init(struct mlx5_core_dev *dev) group_id = 
mlx5_sd_group_id(dev, sd_group); - if (!mlx5_sd_is_supported(dev, host_buses)) { - sd_warn(dev, "can't support requested netdev combining for group id 0x%x), skipping\n", + if (!mlx5_sd_caps_supported(dev, host_buses)) { + sd_warn(dev, "can't support requested netdev combining for group id 0x%x, skipping\n", group_id); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h index 2ab259095d7e..bf59903ab23f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h @@ -4,6 +4,8 @@ #ifndef __MLX5_LIB_SD_H__ #define __MLX5_LIB_SD_H__ +#include + #define MLX5_SD_MAX_GROUP_SZ 2 struct mlx5_sd; @@ -18,6 +20,15 @@ struct auxiliary_device *mlx5_sd_get_adev(struct mlx5_core_dev *dev, void mlx5_sd_put_adev(struct auxiliary_device *actual_adev, struct auxiliary_device *adev); +#ifdef CONFIG_MLX5_CORE_EN +bool mlx5_sd_is_supported(struct mlx5_core_dev *dev); +#else +static inline bool mlx5_sd_is_supported(struct mlx5_core_dev *dev) +{ + return false; +} +#endif + int mlx5_sd_init(struct mlx5_core_dev *dev); void mlx5_sd_cleanup(struct mlx5_core_dev *dev); -- 2.44.0 From: Shay Drory SD devices are not compatible with multipath LAG since they use dedicated SD LAG for cross-socket connectivity. Add an SD check to the multipath prereq validation to prevent multipath LAG activation on SD-configured ports. 
Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan --- drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c index f42e051fa7e7..65c76bd748c6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c @@ -26,6 +26,10 @@ static bool mlx5_lag_multipath_check_prereq(struct mlx5_lag *ldev) if (__mlx5_lag_is_active(ldev) && !__mlx5_lag_is_multipath(ldev)) return false; + if (__mlx5_lag_is_sd(ldev, mlx5_lag_pf(ldev, idx0)->dev) || + __mlx5_lag_is_sd(ldev, mlx5_lag_pf(ldev, idx1)->dev)) + return false; + if (ldev->ports > MLX5_LAG_MULTIPATH_OFFLOADS_SUPPORTED_PORTS) return false; -- 2.44.0 From: Shay Drory In SD switchdev mode, network device resources such as channels and completion vectors must remain on the same PF rather than being distributed across SD group members. Modify mlx5_sd_ch_ix_get_dev_ix() to return 0 and mlx5_sd_ch_ix_get_vec_ix() to return the channel index directly when in switchdev mode, keeping resources local to the requesting PF. 
Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan --- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index 8991db3a19cf..ec606851feb8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -6,6 +6,7 @@ #include "mlx5_core.h" #include "lib/mlx5.h" #include "fs_cmd.h" +#include #include #include @@ -85,11 +86,17 @@ mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx) int mlx5_sd_ch_ix_get_dev_ix(struct mlx5_core_dev *dev, int ch_ix) { + if (is_mdev_switchdev_mode(dev)) + return 0; + return ch_ix % mlx5_sd_get_host_buses(dev); } int mlx5_sd_ch_ix_get_vec_ix(struct mlx5_core_dev *dev, int ch_ix) { + if (is_mdev_switchdev_mode(dev)) + return ch_ix; + return ch_ix / mlx5_sd_get_host_buses(dev); } -- 2.44.0 From: Shay Drory With SD devices joining the LAG, peer flows are not created for all devcom peers - SD devices skip peers that belong to a different SD group. However, the delete path iterated all devcom peers unconditionally, attempting to delete from slots that were never populated. Track which peer slots are populated using a bitmap in mlx5e_tc_flow. The delete path now iterates only set bits, matching exactly the slots that were set up during flow creation. 
Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 10 +++------- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h index efb34de4cb7a..a0434ceebe69 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h @@ -97,6 +97,9 @@ struct mlx5e_tc_flow { struct mlx5e_hairpin_entry *hpe; /* attached hairpin instance */ struct list_head hairpin; /* flows sharing the same hairpin */ struct list_head peer[MLX5_MAX_PORTS]; /* flows with peer flow */ + DECLARE_BITMAP(peer_used, MLX5_MAX_PORTS); /* tracks populated peer + * slots + */ struct list_head unready; /* flows not ready to be offloaded (e.g * due to missing route) */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 3846c16c3138..2a16368a948e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -2128,6 +2128,7 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow, mutex_lock(&esw->offloads.peer_mutex); list_del(&flow->peer[peer_index]); + clear_bit(peer_index, flow->peer_used); mutex_unlock(&esw->offloads.peer_mutex); list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) { @@ -2147,16 +2148,10 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow, static void mlx5e_tc_del_fdb_peers_flow(struct mlx5e_tc_flow *flow) { - struct mlx5_devcom_comp_dev *devcom; - struct mlx5_devcom_comp_dev *pos; - struct mlx5_eswitch *peer_esw; int i; - devcom = flow->priv->mdev->priv.eswitch->devcom; - mlx5_devcom_for_each_peer_entry(devcom, peer_esw, pos) { - i = mlx5_lag_get_dev_seq(peer_esw->dev); + for_each_set_bit(i, flow->peer_used, MLX5_MAX_PORTS) 
mlx5e_tc_del_fdb_peer_flow(flow, i); - } } static void mlx5e_tc_del_flow(struct mlx5e_priv *priv, @@ -4618,6 +4613,7 @@ static int mlx5e_tc_add_fdb_peer_flow(struct flow_cls_offload *f, flow_flag_set(flow, DUP); mutex_lock(&esw->offloads.peer_mutex); list_add_tail(&flow->peer[i], &esw->offloads.peer_flows[i]); + set_bit(i, flow->peer_used); mutex_unlock(&esw->offloads.peer_mutex); out: -- 2.44.0 From: Shay Drory Enable TC flow steering for SD LAG mode by extending multiport eligibility checks and peer flow handling. SD LAG operates similarly to MPESW for TC offloads - flows on secondary devices need peer flow creation on the primary, and multiport forwarding rules are eligible when either MPESW or SD LAG is active. Add mlx5_lag_is_sd() helper to query SD LAG mode, and mlx5_sd_is_primary() to identify the primary device. Redirect uplink priv/proto_dev queries to the primary device's eswitch in SD configurations. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan --- .../ethernet/mellanox/mlx5/core/en/tc_priv.h | 4 ++ .../net/ethernet/mellanox/mlx5/core/en_tc.c | 53 +++++++++++++++++-- .../mellanox/mlx5/core/eswitch_offloads.c | 8 +++ .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 14 +++++ .../net/ethernet/mellanox/mlx5/core/lag/lag.h | 1 + .../net/ethernet/mellanox/mlx5/core/lib/sd.c | 15 +++++- .../net/ethernet/mellanox/mlx5/core/lib/sd.h | 2 + 7 files changed, 92 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h index a0434ceebe69..28cab4bf525c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h @@ -104,6 +104,10 @@ struct mlx5e_tc_flow { * due to missing route) */ struct list_head peer_flows; /* flows on peer */ + int peer_index; /* peer-flow index pinned at add time, used at del + * time so removal is independent of LAG state + * changes between add and del. 
+ */ struct net_device *orig_dev; /* netdev adding flow first */ int tmp_entry_index; struct list_head tmp_list; /* temporary flow list used by neigh update */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 2a16368a948e..910492eb51f2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -71,6 +71,7 @@ #include #include "lag/lag.h" #include "lag/mp.h" +#include "lib/sd.h" #define MLX5E_TC_TABLE_NUM_GROUPS 4 #define MLX5E_TC_TABLE_MAX_GROUP_SIZE BIT(18) @@ -2132,7 +2133,7 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow, mutex_unlock(&esw->offloads.peer_mutex); list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) { - if (peer_index != mlx5_lag_get_dev_seq(peer_flow->priv->mdev)) + if (peer_index != peer_flow->peer_index) continue; list_del(&peer_flow->peer_flows); @@ -4196,9 +4197,26 @@ static bool is_lag_dev(struct mlx5e_priv *priv, same_hw_reps(priv, peer_netdev)); } +static bool is_sd_eligible(struct mlx5e_priv *priv, + struct net_device *peer_netdev) +{ + struct mlx5e_priv *peer_priv; + + peer_priv = netdev_priv(peer_netdev); + return same_hw_reps(priv, peer_netdev) && + mlx5_lag_is_sd(priv->mdev) && + (mlx5_sd_get_primary(priv->mdev) == + mlx5_sd_get_primary(peer_priv->mdev)); +} + static bool is_multiport_eligible(struct mlx5e_priv *priv, struct net_device *out_dev) { - return same_hw_reps(priv, out_dev) && mlx5_lag_is_mpesw(priv->mdev); + struct mlx5_core_dev *primary = mlx5_sd_get_primary(priv->mdev); + + if (!primary) + return false; + + return same_hw_reps(priv, out_dev) && mlx5_lag_is_mpesw(primary); } bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv, @@ -4207,6 +4225,9 @@ bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv, if (is_merged_eswitch_vfs(priv, out_dev)) return true; + if (is_sd_eligible(priv, out_dev)) + return true; + if (is_multiport_eligible(priv, 
out_dev)) return true; @@ -4351,7 +4372,7 @@ static struct rhashtable *get_tc_ht(struct mlx5e_priv *priv, return &tc->ht; } -static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow) +static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow, bool *is_sd) { struct mlx5_esw_flow_attr *esw_attr = flow->attr->esw_attr; struct mlx5_flow_attr *attr = flow->attr; @@ -4372,6 +4393,13 @@ static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow) if (mlx5_lag_is_mpesw(esw_attr->in_mdev)) return true; + if (mlx5_lag_is_sd(esw_attr->in_mdev) && + !mlx5_sd_is_primary(esw_attr->in_mdev)) { + if (!mlx5_lag_is_mpesw(mlx5_sd_get_primary(esw_attr->in_mdev))) + *is_sd = true; + return true; + } + return false; } @@ -4609,6 +4637,7 @@ static int mlx5e_tc_add_fdb_peer_flow(struct flow_cls_offload *f, goto out; } + peer_flow->peer_index = i; list_add_tail(&peer_flow->peer_flows, &flow->peer_flows); flow_flag_set(flow, DUP); mutex_lock(&esw->offloads.peer_mutex); @@ -4628,19 +4657,26 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv, struct mlx5e_tc_flow **__flow) { struct mlx5_devcom_comp_dev *devcom = priv->mdev->priv.eswitch->devcom, *pos; + struct netlink_ext_ack *extack = f->common.extack; struct mlx5e_rep_priv *rpriv = priv->ppriv; struct mlx5_eswitch_rep *in_rep = rpriv->rep; struct mlx5_core_dev *in_mdev = priv->mdev; struct mlx5_eswitch *peer_esw; struct mlx5e_tc_flow *flow; + bool is_sd = false; int err; + if (mlx5_lag_is_sd(in_mdev) && !mlx5_lag_is_active(in_mdev)) { + NL_SET_ERR_MSG_MOD(extack, "SD shared FDB not yet active"); + return -EOPNOTSUPP; + } + flow = __mlx5e_add_fdb_flow(priv, f, flow_flags, filter_dev, in_rep, in_mdev); if (IS_ERR(flow)) return PTR_ERR(flow); - if (!is_peer_flow_needed(flow)) { + if (!is_peer_flow_needed(flow, &is_sd)) { *__flow = flow; return 0; } @@ -4651,6 +4687,15 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv, } mlx5_devcom_for_each_peer_entry(devcom, peer_esw, pos) { + if (is_sd) { + /* SD shared FDB: only the matching SD primary. 
*/ + if (mlx5_sd_get_primary(in_mdev) != + mlx5_sd_get_primary(peer_esw->dev)) + continue; + } else { + if (!mlx5_sd_is_primary(peer_esw->dev)) + continue; + } err = mlx5e_tc_add_fdb_peer_flow(f, flow, flow_flags, peer_esw); if (err) goto peer_clean; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index d65f30bb2f80..830fc910a080 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -4690,8 +4690,11 @@ EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps_nested); void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type) { + struct mlx5_core_dev *primary = mlx5_sd_get_primary(esw->dev); struct mlx5_eswitch_rep *rep; + if (primary) + esw = primary->priv.eswitch; rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_UPLINK); return rep->rep_data[rep_type].priv; } @@ -4713,6 +4716,11 @@ EXPORT_SYMBOL(mlx5_eswitch_get_proto_dev); void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type) { + struct mlx5_core_dev *primary = mlx5_sd_get_primary(esw->dev); + + if (primary) + esw = primary->priv.eswitch; + return mlx5_eswitch_get_proto_dev(esw, MLX5_VPORT_UPLINK, rep_type); } EXPORT_SYMBOL(mlx5_eswitch_uplink_get_proto_dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index a2c7e2927431..dd3f18f85466 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -2425,6 +2425,20 @@ bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev) } EXPORT_SYMBOL(mlx5_lag_is_sriov); +bool mlx5_lag_is_sd(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + unsigned long flags; + bool res; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + res = ldev && __mlx5_lag_is_sd(ldev, dev); + spin_unlock_irqrestore(&lag_lock, flags); + + return res; +} + bool 
mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev) { struct mlx5_lag *ldev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index cbe201529661..e412bb85027c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -202,6 +202,7 @@ static inline bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev) } #endif bool mlx5_lag_check_prereq(struct mlx5_lag *ldev); +bool mlx5_lag_is_sd(struct mlx5_core_dev *dev); int mlx5_lag_demux_init(struct mlx5_core_dev *dev, struct mlx5_flow_table_attr *ft_attr); void mlx5_lag_demux_cleanup(struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index ec606851feb8..25286ecd724e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -49,13 +49,16 @@ static int mlx5_sd_get_host_buses(struct mlx5_core_dev *dev) return sd->host_buses; } -static struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev) +struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev) { struct mlx5_sd *sd = mlx5_get_sd(dev); if (!sd) return dev; + if (!mlx5_devcom_comp_is_ready(sd->devcom)) + return NULL; + return sd->primary ? 
dev : sd->primary_dev; } @@ -69,6 +72,16 @@ struct mlx5_devcom_comp_dev *mlx5_sd_get_devcom(struct mlx5_core_dev *dev) return sd->devcom; } +bool mlx5_sd_is_primary(struct mlx5_core_dev *dev) +{ + struct mlx5_sd *sd = mlx5_get_sd(dev); + + if (!sd) + return true; + + return sd->primary; +} + struct mlx5_core_dev * mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h index bf59903ab23f..011702ff6f02 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h @@ -10,6 +10,8 @@ struct mlx5_sd; +struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev); +bool mlx5_sd_is_primary(struct mlx5_core_dev *dev); struct mlx5_core_dev *mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx); int mlx5_sd_ch_ix_get_dev_ix(struct mlx5_core_dev *dev, int ch_ix); int mlx5_sd_ch_ix_get_vec_ix(struct mlx5_core_dev *dev, int ch_ix); -- 2.44.0 From: Shay Drory Change verify_num_vhca_ids() to count the number of unique vhca_ids and verify this count doesn't exceed max_num_vhca_id, rather than validating that individual vhca_id values are within a specific range. The previous implementation checked if each vhca_id was in the range [0, max_num_vhca_id - 1], which is overly restrictive. The hardware capability max_rqt_vhca_id represents the maximum number of unique vhca_ids that can be used, not a range constraint on individual IDs.
Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan --- .../net/ethernet/mellanox/mlx5/core/en/rqt.c | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c index a3382f6a6b74..8511363f7bec 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c @@ -8,13 +8,28 @@ static bool verify_num_vhca_ids(struct mlx5_core_dev *mdev, u32 *vhca_ids, unsigned int size) { unsigned int max_num_vhca_id = MLX5_CAP_GEN_2(mdev, max_rqt_vhca_id); - int i; + unsigned int unique_count = 0; + int i, j; + + /* Count unique vhca_ids */ + for (i = 0; i < size; i++) { + bool is_unique = true; + + /* Check if vhca_ids[i] was already seen */ + for (j = 0; j < i; j++) { + if (vhca_ids[j] == vhca_ids[i]) { + is_unique = false; + break; + } + } + if (is_unique) + unique_count++; + } - /* Verify that all vhca_ids are in range [0, max_num_vhca_ids - 1] */ - for (i = 0; i < size; i++) - if (vhca_ids[i] >= max_num_vhca_id) - return false; - return true; + /* Verify that number of unique vhca_ids doesn't exceed + * max_num_vhca_id + */ + return unique_count <= max_num_vhca_id; } static bool rqt_verify_vhca_ids(struct mlx5_core_dev *mdev, u32 *vhca_ids, -- 2.44.0