From: Or Har-Toov Currently, vports report only their parent's uplink speed, which in LAG setups does not reflect the true aggregated bandwidth. This makes it hard for upper-layer software to optimize load balancing decisions based on accurate bandwidth information. Fix the issue by calculating the possible maximum speed of a LAG as the sum of speeds of all active uplinks that are part of the LAG. The aggregated speed is taken from the link speed reported by the bond device; if the bond device reports an unknown speed, the sum of the maximum link speeds of all PFs in the LAG is used instead. Propagate this effective max speed to vports associated with the LAG whenever a relevant event occurs, such as physical port link state changes or LAG creation/modification. When the LAG is disabled, reset the vports of each device to that device's own operational link speed. With this change, upper-layer components receive accurate bandwidth information corresponding to the active members of the LAG and can make better load balancing decisions. Signed-off-by: Or Har-Toov Reviewed-by: Maher Sanalla Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 158 +++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h | 9 ++ .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/port.c | 24 ++++ drivers/net/ethernet/mellanox/mlx5/core/vport.c | 45 ++++++ include/linux/mlx5/vport.h | 4 + 6 files changed, 241 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 1ac933cd8f02..a042612dcde6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -996,6 +996,126 @@ static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond) ldev->mode != MLX5_LAG_MODE_MPESW; } +#ifdef CONFIG_MLX5_ESWITCH +static int +mlx5_lag_sum_devices_speed(struct mlx5_lag *ldev, u32 *sum_speed, + int (*get_speed)(struct mlx5_core_dev *, u32 *)) +{ + struct mlx5_core_dev *pf_mdev; + int pf_idx; + u32 speed; + int ret; + + *sum_speed = 0; + mlx5_ldev_for_each(pf_idx, 0, ldev) { + pf_mdev = ldev->pf[pf_idx].dev; + if (!pf_mdev) + continue; + + ret = 
get_speed(pf_mdev, &speed); + if (ret) { + mlx5_core_dbg(pf_mdev, + "Failed to get device speed using %ps. Device %s speed is not available (err=%d)\n", + get_speed, dev_name(pf_mdev->device), + ret); + return ret; + } + + *sum_speed += speed; + } + + return 0; +} + +static int mlx5_lag_sum_devices_max_speed(struct mlx5_lag *ldev, u32 *max_speed) +{ + return mlx5_lag_sum_devices_speed(ldev, max_speed, + mlx5_port_max_linkspeed); +} + +static void mlx5_lag_modify_device_vports_speed(struct mlx5_core_dev *mdev, + u32 speed) +{ + u16 op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT; + struct mlx5_eswitch *esw = mdev->priv.eswitch; + struct mlx5_vport *vport; + unsigned long i; + int ret; + + if (!esw) + return; + + if (!MLX5_CAP_ESW(mdev, esw_vport_state_max_tx_speed)) + return; + + mlx5_esw_for_each_vport(esw, i, vport) { + if (!vport) + continue; + + if (vport->vport == MLX5_VPORT_UPLINK) + continue; + + ret = mlx5_modify_vport_max_tx_speed(mdev, op_mod, + vport->vport, true, speed); + if (ret) + mlx5_core_dbg(mdev, + "Failed to set vport %d speed %d, err=%d\n", + vport->vport, speed, ret); + } +} + +void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *mdev; + u32 speed; + int pf_idx; + + speed = ldev->tracker.bond_speed_mbps; + + if (speed == SPEED_UNKNOWN) + return; + + /* If speed is not set, use the sum of max speeds of all PFs */ + if (!speed && mlx5_lag_sum_devices_max_speed(ldev, &speed)) + return; + + speed = speed / MLX5_MAX_TX_SPEED_UNIT; + + mlx5_ldev_for_each(pf_idx, 0, ldev) { + mdev = ldev->pf[pf_idx].dev; + if (!mdev) + continue; + + mlx5_lag_modify_device_vports_speed(mdev, speed); + } +} + +void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *mdev; + u32 speed; + int pf_idx; + int ret; + + mlx5_ldev_for_each(pf_idx, 0, ldev) { + mdev = ldev->pf[pf_idx].dev; + if (!mdev) + continue; + + ret = mlx5_port_oper_linkspeed(mdev, &speed); + if (ret) { + mlx5_core_dbg(mdev, + "Failed to reset vports 
speed for device %s. Oper speed is not available (err=%d)\n", + dev_name(mdev->device), ret); + continue; + } + + speed = speed / MLX5_MAX_TX_SPEED_UNIT; + mlx5_lag_modify_device_vports_speed(mdev, speed); + } +} +#endif + static void mlx5_do_bond(struct mlx5_lag *ldev) { int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); @@ -1083,9 +1203,12 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) ndev); dev_put(ndev); } + mlx5_lag_set_vports_agg_speed(ldev); } else if (mlx5_lag_should_modify_lag(ldev, do_bond)) { mlx5_modify_lag(ldev, &tracker); + mlx5_lag_set_vports_agg_speed(ldev); } else if (mlx5_lag_should_disable_lag(ldev, do_bond)) { + mlx5_lag_reset_vports_speed(ldev); mlx5_disable_lag(ldev); } } @@ -1286,6 +1409,38 @@ static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev, return 1; } +static void mlx5_lag_update_tracker_speed(struct lag_tracker *tracker, + struct net_device *ndev) +{ + struct ethtool_link_ksettings lksettings; + struct net_device *bond_dev; + int err; + + if (netif_is_lag_master(ndev)) + bond_dev = ndev; + else + bond_dev = netdev_master_upper_dev_get(ndev); + + if (!bond_dev) { + tracker->bond_speed_mbps = SPEED_UNKNOWN; + return; + } + + err = __ethtool_get_link_ksettings(bond_dev, &lksettings); + if (err) { + netdev_dbg(bond_dev, + "Failed to get speed for bond dev %s, err=%d\n", + bond_dev->name, err); + tracker->bond_speed_mbps = SPEED_UNKNOWN; + return; + } + + if (lksettings.base.speed == SPEED_UNKNOWN) + tracker->bond_speed_mbps = 0; + else + tracker->bond_speed_mbps = lksettings.base.speed; +} + /* this handler is always registered to netdev events */ static int mlx5_lag_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -1317,6 +1472,9 @@ static int mlx5_lag_netdev_event(struct notifier_block *this, break; } + if (changed) + mlx5_lag_update_tracker_speed(&tracker, ndev); + ldev->tracker = tracker; if (changed) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h 
b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 4918eee2b3da..8de5640a0161 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -48,6 +48,7 @@ struct lag_tracker { unsigned int is_bonded:1; unsigned int has_inactive:1; enum netdev_lag_hash hash_type; + u32 bond_speed_mbps; }; /* LAG data of a ConnectX card. @@ -116,6 +117,14 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev); void mlx5_lag_add_devices(struct mlx5_lag *ldev); struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev); +#ifdef CONFIG_MLX5_ESWITCH +void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev); +void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev); +#else +static inline void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) {} +static inline void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) {} +#endif + static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev) { if (!MLX5_CAP_GEN(dev, vport_group_manager) || diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index cfebc110c02f..9fdb9a543cf1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -381,6 +381,7 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev, u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev, struct mlx5_link_info *info, bool force_legacy); +int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); #define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) && \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 85a9e534f442..83044c9b6b41 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -1200,6 +1200,30 @@ u32 
mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev, return link_modes; } +int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) +{ + const struct mlx5_link_info *table; + struct mlx5_port_eth_proto eproto; + u32 oper_speed = 0; + u32 max_size; + bool ext; + int err; + int i; + + ext = mlx5_ptys_ext_supported(mdev); + err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto); + if (err) + return err; + + mlx5e_port_get_link_mode_info_arr(mdev, &table, &max_size, false); + for (i = 0; i < max_size; ++i) + if (eproto.oper & MLX5E_PROT_MASK(i)) + oper_speed = max(oper_speed, table[i].speed); + + *speed = oper_speed; + return 0; +} + int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) { const struct mlx5_link_info *table; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 306affbcfd3b..78b1b291cfa4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -62,6 +62,28 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) return MLX5_GET(query_vport_state_out, out, state); } +static int mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 other_vport, + u8 *admin_state) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + int err; + + MLX5_SET(query_vport_state_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_STATE); + MLX5_SET(query_vport_state_in, in, op_mod, opmod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + MLX5_SET(query_vport_state_in, in, other_vport, other_vport); + + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return err; + + *admin_state = MLX5_GET(query_vport_state_out, out, admin_state); + return 0; +} + int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u8 state) { @@ -77,6 +99,29 @@ int 
mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, return mlx5_cmd_exec_in(mdev, modify_vport_state, in); } +int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 other_vport, u16 max_tx_speed) +{ + u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {}; + u8 admin_state; + int err; + + err = mlx5_query_vport_admin_state(mdev, opmod, vport, other_vport, + &admin_state); + if (err) + return err; + + MLX5_SET(modify_vport_state_in, in, opcode, + MLX5_CMD_OP_MODIFY_VPORT_STATE); + MLX5_SET(modify_vport_state_in, in, op_mod, opmod); + MLX5_SET(modify_vport_state_in, in, vport_number, vport); + MLX5_SET(modify_vport_state_in, in, other_vport, other_vport); + MLX5_SET(modify_vport_state_in, in, admin_state, admin_state); + MLX5_SET(modify_vport_state_in, in, max_tx_speed, max_tx_speed); + + return mlx5_cmd_exec_in(mdev, modify_vport_state, in); +} + static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, bool other_vport, u32 *out) { diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index f876bfc0669c..2acf10e9f60a 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -41,6 +41,8 @@ (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && \ mlx5_core_is_pf(mdev)) +#define MLX5_MAX_TX_SPEED_UNIT 100 + enum { MLX5_CAP_INLINE_MODE_L2, MLX5_CAP_INLINE_MODE_VPORT_CONTEXT, @@ -58,6 +60,8 @@ enum { u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u8 state); +int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 other_vport, u16 max_tx_speed); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, bool other, u8 *addr); int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr); -- 2.47.1