From: Or Har-Toov Introduce the max_tx_speed field to the query and modify_vport_state structures. Add the esw_vport_state_max_tx_speed capability bit, indicating that the firmware supports modifying the max_tx_speed field via the MODIFY_VPORT_STATE command. Signed-off-by: Or Har-Toov Reviewed-by: Maher Sanalla Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji --- include/linux/mlx5/mlx5_ifc.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e9dcd4bf355d..e844cfa4fe0a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1071,7 +1071,9 @@ struct mlx5_ifc_e_switch_cap_bits { u8 esw_shared_ingress_acl[0x1]; u8 esw_uplink_ingress_acl[0x1]; u8 root_ft_on_other_esw[0x1]; - u8 reserved_at_a[0xf]; + u8 reserved_at_a[0x1]; + u8 esw_vport_state_max_tx_speed[0x1]; + u8 reserved_at_c[0xd]; u8 esw_functions_changed[0x1]; u8 reserved_at_1a[0x1]; u8 ecpf_vport_exists[0x1]; @@ -5445,7 +5447,8 @@ struct mlx5_ifc_query_vport_state_out_bits { u8 reserved_at_40[0x20]; - u8 reserved_at_60[0x18]; + u8 max_tx_speed[0x10]; + u8 reserved_at_70[0x8]; u8 admin_state[0x4]; u8 state[0x4]; }; @@ -7778,7 +7781,7 @@ struct mlx5_ifc_modify_vport_state_in_bits { u8 reserved_at_41[0xf]; u8 vport_number[0x10]; - u8 reserved_at_60[0x10]; + u8 max_tx_speed[0x10]; u8 ingress_connect[0x1]; u8 egress_connect[0x1]; u8 ingress_connect_valid[0x1]; -- 2.47.1 From: Or Har-Toov Currently, vports report only their parent's uplink speed, which in LAG setups does not reflect the true aggregated bandwidth. This makes it hard for upper-layer software to optimize load balancing decisions based on accurate bandwidth information. Fix the issue by calculating the possible maximum speed of a LAG as the sum of speeds of all active uplinks that are part of the LAG. 
Propagate this effective max speed to vports associated with the LAG whenever a relevant event occurs, such as physical port link state changes or LAG creation/modification. With this change, upper-layer components receive accurate bandwidth information corresponding to the active members of the LAG and can make better load balancing decisions. Signed-off-by: Or Har-Toov Reviewed-by: Maher Sanalla Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 158 +++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h | 9 ++ .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/port.c | 24 ++++ drivers/net/ethernet/mellanox/mlx5/core/vport.c | 45 ++++++ include/linux/mlx5/vport.h | 4 + 6 files changed, 241 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 1ac933cd8f02..a042612dcde6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -996,6 +996,126 @@ static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond) ldev->mode != MLX5_LAG_MODE_MPESW; } +#ifdef CONFIG_MLX5_ESWITCH +static int +mlx5_lag_sum_devices_speed(struct mlx5_lag *ldev, u32 *sum_speed, + int (*get_speed)(struct mlx5_core_dev *, u32 *)) +{ + struct mlx5_core_dev *pf_mdev; + int pf_idx; + u32 speed; + int ret; + + *sum_speed = 0; + mlx5_ldev_for_each(pf_idx, 0, ldev) { + pf_mdev = ldev->pf[pf_idx].dev; + if (!pf_mdev) + continue; + + ret = get_speed(pf_mdev, &speed); + if (ret) { + mlx5_core_dbg(pf_mdev, + "Failed to get device speed using %ps. 
Device %s speed is not available (err=%d)\n", + get_speed, dev_name(pf_mdev->device), + ret); + return ret; + } + + *sum_speed += speed; + } + + return 0; +} + +static int mlx5_lag_sum_devices_max_speed(struct mlx5_lag *ldev, u32 *max_speed) +{ + return mlx5_lag_sum_devices_speed(ldev, max_speed, + mlx5_port_max_linkspeed); +} + +static void mlx5_lag_modify_device_vports_speed(struct mlx5_core_dev *mdev, + u32 speed) +{ + u16 op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT; + struct mlx5_eswitch *esw = mdev->priv.eswitch; + struct mlx5_vport *vport; + unsigned long i; + int ret; + + if (!esw) + return; + + if (!MLX5_CAP_ESW(mdev, esw_vport_state_max_tx_speed)) + return; + + mlx5_esw_for_each_vport(esw, i, vport) { + if (!vport) + continue; + + if (vport->vport == MLX5_VPORT_UPLINK) + continue; + + ret = mlx5_modify_vport_max_tx_speed(mdev, op_mod, + vport->vport, true, speed); + if (ret) + mlx5_core_dbg(mdev, + "Failed to set vport %d speed %d, err=%d\n", + vport->vport, speed, ret); + } +} + +void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *mdev; + u32 speed; + int pf_idx; + + speed = ldev->tracker.bond_speed_mbps; + + if (speed == SPEED_UNKNOWN) + return; + + /* If speed is not set, use the sum of max speeds of all PFs */ + if (!speed && mlx5_lag_sum_devices_max_speed(ldev, &speed)) + return; + + speed = speed / MLX5_MAX_TX_SPEED_UNIT; + + mlx5_ldev_for_each(pf_idx, 0, ldev) { + mdev = ldev->pf[pf_idx].dev; + if (!mdev) + continue; + + mlx5_lag_modify_device_vports_speed(mdev, speed); + } +} + +void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *mdev; + u32 speed; + int pf_idx; + int ret; + + mlx5_ldev_for_each(pf_idx, 0, ldev) { + mdev = ldev->pf[pf_idx].dev; + if (!mdev) + continue; + + ret = mlx5_port_oper_linkspeed(mdev, &speed); + if (ret) { + mlx5_core_dbg(mdev, + "Failed to reset vports speed for device %s. 
Oper speed is not available (err=%d)\n", + dev_name(mdev->device), ret); + continue; + } + + speed = speed / MLX5_MAX_TX_SPEED_UNIT; + mlx5_lag_modify_device_vports_speed(mdev, speed); + } +} +#endif + static void mlx5_do_bond(struct mlx5_lag *ldev) { int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1); @@ -1083,9 +1203,12 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) ndev); dev_put(ndev); } + mlx5_lag_set_vports_agg_speed(ldev); } else if (mlx5_lag_should_modify_lag(ldev, do_bond)) { mlx5_modify_lag(ldev, &tracker); + mlx5_lag_set_vports_agg_speed(ldev); } else if (mlx5_lag_should_disable_lag(ldev, do_bond)) { + mlx5_lag_reset_vports_speed(ldev); mlx5_disable_lag(ldev); } } @@ -1286,6 +1409,38 @@ static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev, return 1; } +static void mlx5_lag_update_tracker_speed(struct lag_tracker *tracker, + struct net_device *ndev) +{ + struct ethtool_link_ksettings lksettings; + struct net_device *bond_dev; + int err; + + if (netif_is_lag_master(ndev)) + bond_dev = ndev; + else + bond_dev = netdev_master_upper_dev_get(ndev); + + if (!bond_dev) { + tracker->bond_speed_mbps = SPEED_UNKNOWN; + return; + } + + err = __ethtool_get_link_ksettings(bond_dev, &lksettings); + if (err) { + netdev_dbg(bond_dev, + "Failed to get speed for bond dev %s, err=%d\n", + bond_dev->name, err); + tracker->bond_speed_mbps = SPEED_UNKNOWN; + return; + } + + if (lksettings.base.speed == SPEED_UNKNOWN) + tracker->bond_speed_mbps = 0; + else + tracker->bond_speed_mbps = lksettings.base.speed; +} + /* this handler is always registered to netdev events */ static int mlx5_lag_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -1317,6 +1472,9 @@ static int mlx5_lag_netdev_event(struct notifier_block *this, break; } + if (changed) + mlx5_lag_update_tracker_speed(&tracker, ndev); + ldev->tracker = tracker; if (changed) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h 
b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 4918eee2b3da..8de5640a0161 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -48,6 +48,7 @@ struct lag_tracker { unsigned int is_bonded:1; unsigned int has_inactive:1; enum netdev_lag_hash hash_type; + u32 bond_speed_mbps; }; /* LAG data of a ConnectX card. @@ -116,6 +117,14 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev); void mlx5_lag_add_devices(struct mlx5_lag *ldev); struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev); +#ifdef CONFIG_MLX5_ESWITCH +void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev); +void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev); +#else +static inline void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) {} +static inline void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) {} +#endif + static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev) { if (!MLX5_CAP_GEN(dev, vport_group_manager) || diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index cfebc110c02f..9fdb9a543cf1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -381,6 +381,7 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev, u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev, struct mlx5_link_info *info, bool force_legacy); +int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); #define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) && \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 85a9e534f442..83044c9b6b41 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -1200,6 +1200,30 @@ u32 
mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev, return link_modes; } +int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) +{ + const struct mlx5_link_info *table; + struct mlx5_port_eth_proto eproto; + u32 oper_speed = 0; + u32 max_size; + bool ext; + int err; + int i; + + ext = mlx5_ptys_ext_supported(mdev); + err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto); + if (err) + return err; + + mlx5e_port_get_link_mode_info_arr(mdev, &table, &max_size, false); + for (i = 0; i < max_size; ++i) + if (eproto.oper & MLX5E_PROT_MASK(i)) + oper_speed = max(oper_speed, table[i].speed); + + *speed = oper_speed; + return 0; +} + int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) { const struct mlx5_link_info *table; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 306affbcfd3b..78b1b291cfa4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -62,6 +62,28 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) return MLX5_GET(query_vport_state_out, out, state); } +static int mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 other_vport, + u8 *admin_state) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + int err; + + MLX5_SET(query_vport_state_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_STATE); + MLX5_SET(query_vport_state_in, in, op_mod, opmod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + MLX5_SET(query_vport_state_in, in, other_vport, other_vport); + + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return err; + + *admin_state = MLX5_GET(query_vport_state_out, out, admin_state); + return 0; +} + int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u8 state) { @@ -77,6 +99,29 @@ int 
mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, return mlx5_cmd_exec_in(mdev, modify_vport_state, in); } +int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 other_vport, u16 max_tx_speed) +{ + u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {}; + u8 admin_state; + int err; + + err = mlx5_query_vport_admin_state(mdev, opmod, vport, other_vport, + &admin_state); + if (err) + return err; + + MLX5_SET(modify_vport_state_in, in, opcode, + MLX5_CMD_OP_MODIFY_VPORT_STATE); + MLX5_SET(modify_vport_state_in, in, op_mod, opmod); + MLX5_SET(modify_vport_state_in, in, vport_number, vport); + MLX5_SET(modify_vport_state_in, in, other_vport, other_vport); + MLX5_SET(modify_vport_state_in, in, admin_state, admin_state); + MLX5_SET(modify_vport_state_in, in, max_tx_speed, max_tx_speed); + + return mlx5_cmd_exec_in(mdev, modify_vport_state, in); +} + static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, bool other_vport, u32 *out) { diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index f876bfc0669c..2acf10e9f60a 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -41,6 +41,8 @@ (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && \ mlx5_core_is_pf(mdev)) +#define MLX5_MAX_TX_SPEED_UNIT 100 + enum { MLX5_CAP_INLINE_MODE_L2, MLX5_CAP_INLINE_MODE_VPORT_CONTEXT, @@ -58,6 +60,8 @@ enum { u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u8 state); +int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 other_vport, u16 max_tx_speed); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, bool other, u8 *addr); int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr); -- 2.47.1 From: Or Har-Toov Add port change event handling logic for MPESW LAG mode, ensuring VFs are 
updated when the speed of LAG physical ports changes. This triggers a speed update workflow when relevant port state changes occur, enabling consistent and accurate reporting of VF bandwidth. Signed-off-by: Or Har-Toov Reviewed-by: Maher Sanalla Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 38 ++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h | 2 ++ .../net/ethernet/mellanox/mlx5/core/lag/mpesw.c | 39 ++++++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/lag/mpesw.h | 14 ++++++++ drivers/net/ethernet/mellanox/mlx5/core/vport.c | 29 ++++++++++++++++ include/linux/mlx5/driver.h | 1 + include/linux/mlx5/vport.h | 2 ++ 7 files changed, 121 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index a042612dcde6..0b931aaecef8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -233,14 +233,25 @@ static void mlx5_ldev_free(struct kref *ref) { struct mlx5_lag *ldev = container_of(ref, struct mlx5_lag, ref); struct net *net; + int i; if (ldev->nb.notifier_call) { net = read_pnet(&ldev->net); unregister_netdevice_notifier_net(net, &ldev->nb); } + mlx5_ldev_for_each(i, 0, ldev) { + if (ldev->pf[i].dev && + ldev->pf[i].port_change_nb.nb.notifier_call) { + struct mlx5_nb *nb = &ldev->pf[i].port_change_nb; + + mlx5_eq_notifier_unregister(ldev->pf[i].dev, nb); + } + } + mlx5_lag_mp_cleanup(ldev); cancel_delayed_work_sync(&ldev->bond_work); + cancel_work_sync(&ldev->speed_update_work); destroy_workqueue(ldev->wq); mutex_destroy(&ldev->lock); kfree(ldev); @@ -274,6 +285,7 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev) kref_init(&ldev->ref); mutex_init(&ldev->lock); INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work); + INIT_WORK(&ldev->speed_update_work, mlx5_mpesw_speed_update_work); 
ldev->nb.notifier_call = mlx5_lag_netdev_event; write_pnet(&ldev->net, mlx5_core_net(dev)); @@ -1033,6 +1045,13 @@ static int mlx5_lag_sum_devices_max_speed(struct mlx5_lag *ldev, u32 *max_speed) mlx5_port_max_linkspeed); } +static int mlx5_lag_sum_devices_oper_speed(struct mlx5_lag *ldev, + u32 *oper_speed) +{ + return mlx5_lag_sum_devices_speed(ldev, oper_speed, + mlx5_port_oper_linkspeed); +} + static void mlx5_lag_modify_device_vports_speed(struct mlx5_core_dev *mdev, u32 speed) { @@ -1070,10 +1089,14 @@ void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) u32 speed; int pf_idx; - speed = ldev->tracker.bond_speed_mbps; - - if (speed == SPEED_UNKNOWN) - return; + if (ldev->mode == MLX5_LAG_MODE_MPESW) { + if (mlx5_lag_sum_devices_oper_speed(ldev, &speed)) + return; + } else { + speed = ldev->tracker.bond_speed_mbps; + if (speed == SPEED_UNKNOWN) + return; + } /* If speed is not set, use the sum of max speeds of all PFs */ if (!speed && mlx5_lag_sum_devices_max_speed(ldev, &speed)) @@ -1520,6 +1543,10 @@ static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev, ldev->pf[fn].dev = dev; dev->priv.lag = ldev; + + MLX5_NB_INIT(&ldev->pf[fn].port_change_nb, + mlx5_lag_mpesw_port_change_event, PORT_CHANGE); + mlx5_eq_notifier_register(dev, &ldev->pf[fn].port_change_nb); } static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, @@ -1531,6 +1558,9 @@ static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, if (ldev->pf[fn].dev != dev) return; + if (ldev->pf[fn].port_change_nb.nb.notifier_call) + mlx5_eq_notifier_unregister(dev, &ldev->pf[fn].port_change_nb); + ldev->pf[fn].dev = NULL; dev->priv.lag = NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 8de5640a0161..be1afece5fdc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -39,6 +39,7 @@ struct lag_func { struct mlx5_core_dev *dev; struct net_device *netdev; bool 
has_drop; + struct mlx5_nb port_change_nb; }; /* Used for collection of netdev event info. */ @@ -67,6 +68,7 @@ struct mlx5_lag { struct lag_tracker tracker; struct workqueue_struct *wq; struct delayed_work bond_work; + struct work_struct speed_update_work; struct notifier_block nb; possible_net_t net; struct lag_mp lag_mp; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c index aad52d3a90e6..31464343f642 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c @@ -103,6 +103,8 @@ static int enable_mpesw(struct mlx5_lag *ldev) goto err_rescan_drivers; } + mlx5_lag_set_vports_agg_speed(ldev); + return 0; err_rescan_drivers: @@ -216,3 +218,40 @@ bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev) return ldev && ldev->mode == MLX5_LAG_MODE_MPESW; } EXPORT_SYMBOL(mlx5_lag_is_mpesw); + +void mlx5_mpesw_speed_update_work(struct work_struct *work) +{ + struct mlx5_lag *ldev = container_of(work, struct mlx5_lag, + speed_update_work); + + mutex_lock(&ldev->lock); + if (ldev->mode == MLX5_LAG_MODE_MPESW) { + if (ldev->mode_changes_in_progress) + queue_work(ldev->wq, &ldev->speed_update_work); + else + mlx5_lag_set_vports_agg_speed(ldev); + } + + mutex_unlock(&ldev->lock); +} + +int mlx5_lag_mpesw_port_change_event(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct mlx5_nb *mlx5_nb = container_of(nb, struct mlx5_nb, nb); + struct lag_func *lag_func = container_of(mlx5_nb, + struct lag_func, + port_change_nb); + struct mlx5_core_dev *dev = lag_func->dev; + struct mlx5_lag *ldev = dev->priv.lag; + struct mlx5_eqe *eqe = data; + + if (!ldev) + return NOTIFY_DONE; + + if (eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_DOWN || + eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) + queue_work(ldev->wq, &ldev->speed_update_work); + + return NOTIFY_OK; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h 
b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h index 02520f27a033..f5d9b5c97b0d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h @@ -32,4 +32,18 @@ bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev); void mlx5_lag_mpesw_disable(struct mlx5_core_dev *dev); int mlx5_lag_mpesw_enable(struct mlx5_core_dev *dev); +#ifdef CONFIG_MLX5_ESWITCH +void mlx5_mpesw_speed_update_work(struct work_struct *work); +int mlx5_lag_mpesw_port_change_event(struct notifier_block *nb, + unsigned long event, void *data); +#else +static inline void mlx5_mpesw_speed_update_work(struct work_struct *work) {} +static inline int mlx5_lag_mpesw_port_change_event(struct notifier_block *nb, + unsigned long event, + void *data) +{ + return NOTIFY_DONE; +} +#endif /* CONFIG_MLX5_ESWITCH */ + #endif /* __MLX5_LAG_MPESW_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 78b1b291cfa4..cb098d3eb2fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -122,6 +122,35 @@ int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, return mlx5_cmd_exec_in(mdev, modify_vport_state, in); } +int mlx5_query_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 op_mod, + u16 vport, u8 other_vport, u32 *max_tx_speed) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + u32 state; + int err; + + MLX5_SET(query_vport_state_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_STATE); + MLX5_SET(query_vport_state_in, in, op_mod, op_mod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + MLX5_SET(query_vport_state_in, in, other_vport, other_vport); + + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return err; + + state = MLX5_GET(query_vport_state_out, out, state); + if (state == VPORT_STATE_DOWN) { + 
*max_tx_speed = 0; + return 0; + } + + *max_tx_speed = MLX5_GET(query_vport_state_out, out, max_tx_speed); + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_vport_max_tx_speed); + static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, bool other_vport, u32 *out) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1c54aa6f74fb..9e0ab3cfab73 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1149,6 +1149,7 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_roce(struct mlx5_core_dev *dev); bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); +int mlx5_lag_query_bond_speed(struct net_device *bond_dev, u32 *speed); bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev); bool mlx5_lag_is_master(struct mlx5_core_dev *dev); bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 2acf10e9f60a..dfa2fe32217a 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -60,6 +60,8 @@ enum { u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u8 state); +int mlx5_query_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 op_mod, + u16 vport, u8 other_vport, u32 *max_tx_speed); int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 other_vport, u16 max_tx_speed); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, -- 2.47.1 From: Or Har-Toov Add mlx5_lag_query_bond_speed() to query the aggregated speed of lag configurations with a bond device. 
Signed-off-by: Or Har-Toov Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 27 +++++++++++++++++++++++ include/linux/mlx5/driver.h | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 0b931aaecef8..187ea8219ca9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -1464,6 +1464,33 @@ static void mlx5_lag_update_tracker_speed(struct lag_tracker *tracker, tracker->bond_speed_mbps = lksettings.base.speed; } +/* Returns speed in Mbps. */ +int mlx5_lag_query_bond_speed(struct mlx5_core_dev *mdev, u32 *speed) +{ + struct mlx5_lag *ldev; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(mdev); + if (!ldev) { + ret = -ENODEV; + goto unlock; + } + + *speed = ldev->tracker.bond_speed_mbps; + + if (*speed == SPEED_UNKNOWN) { + mlx5_core_dbg(mdev, "Bond speed is unknown\n"); + ret = -EINVAL; + } + +unlock: + spin_unlock_irqrestore(&lag_lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(mlx5_lag_query_bond_speed); + /* this handler is always registered to netdev events */ static int mlx5_lag_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9e0ab3cfab73..e2d067b1e67b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1149,7 +1149,7 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_roce(struct mlx5_core_dev *dev); bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); -int mlx5_lag_query_bond_speed(struct net_device *bond_dev, u32 *speed); +int mlx5_lag_query_bond_speed(struct mlx5_core_dev *dev, u32 *speed); bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev); bool 
mlx5_lag_is_master(struct mlx5_core_dev *dev); bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); -- 2.47.1 From: Or Har-Toov Add IB_EVENT_DEVICE_SPEED_CHANGE for notifying user applications on device's ports speed changes. Signed-off-by: Or Har-Toov Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji --- drivers/infiniband/core/verbs.c | 1 + include/rdma/ib_verbs.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 11b1a194de44..f495a2182c84 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -78,6 +78,7 @@ static const char * const ib_events[] = { [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", [IB_EVENT_CLIENT_REREGISTER] = "client reregister", [IB_EVENT_GID_CHANGE] = "GID changed", + [IB_EVENT_DEVICE_SPEED_CHANGE] = "device speed change" }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6aad66bc5dd7..95f1e557cbb8 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -764,6 +764,7 @@ enum ib_event_type { IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, IB_EVENT_WQ_FATAL, + IB_EVENT_DEVICE_SPEED_CHANGE, }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event); -- 2.47.1 From: Or Har-Toov Introduce ib_port_attr_to_speed_info() to compute the data rate in 100 Mbps units (deci-Gb/sec) from a port's active_speed and active_width attributes. This generic helper removes duplicated speed-to-rate calculations, which are used by sysfs and the upcoming new verb. 
Signed-off-by: Or Har-Toov Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji --- drivers/infiniband/core/verbs.c | 51 +++++++++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 14 +++++++++++ 2 files changed, 65 insertions(+) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index f495a2182c84..8b56b6b62352 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -217,6 +217,57 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) } EXPORT_SYMBOL(ib_rate_to_mbps); +struct ib_speed_attr { + const char *str; + int speed; +}; + +#define IB_SPEED_ATTR(speed_type, _str, _speed) \ + [speed_type] = {.str = _str, .speed = _speed} + +static const struct ib_speed_attr ib_speed_attrs[] = { + IB_SPEED_ATTR(IB_SPEED_SDR, " SDR", 25), + IB_SPEED_ATTR(IB_SPEED_DDR, " DDR", 50), + IB_SPEED_ATTR(IB_SPEED_QDR, " QDR", 100), + IB_SPEED_ATTR(IB_SPEED_FDR10, " FDR10", 100), + IB_SPEED_ATTR(IB_SPEED_FDR, " FDR", 140), + IB_SPEED_ATTR(IB_SPEED_EDR, " EDR", 250), + IB_SPEED_ATTR(IB_SPEED_HDR, " HDR", 500), + IB_SPEED_ATTR(IB_SPEED_NDR, " NDR", 1000), + IB_SPEED_ATTR(IB_SPEED_XDR, " XDR", 2000), +}; + +int ib_port_attr_to_speed_info(struct ib_port_attr *attr, + struct ib_port_speed_info *speed_info) +{ + int speed_idx = attr->active_speed; + + switch (attr->active_speed) { + case IB_SPEED_DDR: + case IB_SPEED_QDR: + case IB_SPEED_FDR10: + case IB_SPEED_FDR: + case IB_SPEED_EDR: + case IB_SPEED_HDR: + case IB_SPEED_NDR: + case IB_SPEED_XDR: + case IB_SPEED_SDR: + break; + default: + speed_idx = IB_SPEED_SDR; /* Default to SDR for invalid rates */ + break; + } + + speed_info->str = ib_speed_attrs[speed_idx].str; + speed_info->rate = ib_speed_attrs[speed_idx].speed; + speed_info->rate *= ib_width_enum_to_int(attr->active_width); + if (speed_info->rate < 0) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(ib_port_attr_to_speed_info); + __attribute_const__ enum rdma_transport_type rdma_node_get_transport(unsigned int 
node_type) { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 95f1e557cbb8..b984f9581a73 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -878,6 +878,20 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate); */ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); +struct ib_port_speed_info { + const char *str; + int rate; /* in deci-Gb/sec (100 Mbps units) */ +}; + +/** + * ib_port_attr_to_speed_info - Convert port attributes to speed information + * @attr: Port attributes containing active_speed and active_width + * @speed_info: Speed information to return + * + * Returns 0 on success, -EINVAL on error. + */ +int ib_port_attr_to_speed_info(struct ib_port_attr *attr, + struct ib_port_speed_info *speed_info); /** * enum ib_mr_type - memory region type -- 2.47.1 From: Or Har-Toov Update sysfs rate_show() to rely on ib_port_attr_to_speed_info() for converting IB port speed and width attributes to data rate and speed string. Signed-off-by: Or Har-Toov Reviewed-by: Maher Sanalla Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji --- drivers/infiniband/core/sysfs.c | 56 ++++++----------------------------------- 1 file changed, 8 insertions(+), 48 deletions(-) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 0ed862b38b44..bfaca07933d8 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -292,62 +292,22 @@ static ssize_t cap_mask_show(struct ib_device *ibdev, u32 port_num, static ssize_t rate_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *unused, char *buf) { + struct ib_port_speed_info speed_info; struct ib_port_attr attr; - char *speed = ""; - int rate; /* in deci-Gb/sec */ ssize_t ret; ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - switch (attr.active_speed) { - case IB_SPEED_DDR: - speed = " DDR"; - rate = 50; - break; - case IB_SPEED_QDR: - speed = " QDR"; - rate = 100; - break; - case 
IB_SPEED_FDR10: - speed = " FDR10"; - rate = 100; - break; - case IB_SPEED_FDR: - speed = " FDR"; - rate = 140; - break; - case IB_SPEED_EDR: - speed = " EDR"; - rate = 250; - break; - case IB_SPEED_HDR: - speed = " HDR"; - rate = 500; - break; - case IB_SPEED_NDR: - speed = " NDR"; - rate = 1000; - break; - case IB_SPEED_XDR: - speed = " XDR"; - rate = 2000; - break; - case IB_SPEED_SDR: - default: /* default to SDR for invalid rates */ - speed = " SDR"; - rate = 25; - break; - } - - rate *= ib_width_enum_to_int(attr.active_width); - if (rate < 0) - return -EINVAL; + ret = ib_port_attr_to_speed_info(&attr, &speed_info); + if (ret) + return ret; - return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", rate / 10, - rate % 10 ? ".5" : "", - ib_width_enum_to_int(attr.active_width), speed); + return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", speed_info.rate / 10, + speed_info.rate % 10 ? ".5" : "", + ib_width_enum_to_int(attr.active_width), + speed_info.str); } static const char *phys_state_to_str(enum ib_port_phys_state phys_state) -- 2.47.1 From: Or Har-Toov Add new ibv_query_port_speed() verb to enable applications to query the effective bandwidth of a port. This verb is particularly useful when the speed is not a multiplication of IB speed and width where width is 2^n. 
Signed-off-by: Or Har-Toov Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji --- drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/uverbs_std_types_device.c | 42 +++++++++++++++++++++++ include/rdma/ib_verbs.h | 2 ++ include/uapi/rdma/ib_user_ioctl_cmds.h | 6 ++++ 4 files changed, 51 insertions(+) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 13e8a1714bbd..04edc57592aa 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2816,6 +2816,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, query_gid); SET_DEVICE_OP(dev_ops, query_pkey); SET_DEVICE_OP(dev_ops, query_port); + SET_DEVICE_OP(dev_ops, query_port_speed); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); SET_DEVICE_OP(dev_ops, query_ucontext); diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c index c0fd283d9d6c..a28f9f21bed8 100644 --- a/drivers/infiniband/core/uverbs_std_types_device.c +++ b/drivers/infiniband/core/uverbs_std_types_device.c @@ -209,6 +209,39 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)( &resp, sizeof(resp)); } +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT_SPEED)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + u32 port_num; + u64 speed; + int ret; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + if (!ib_dev->ops.query_port_speed) + return -EOPNOTSUPP; + + ret = uverbs_get_const(&port_num, attrs, + UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM); + if (ret) + return ret; + + if (!rdma_is_port_valid(ib_dev, port_num)) + return -EINVAL; + + ret = ib_dev->ops.query_port_speed(ib_dev, port_num, &speed); + if (ret) + return ret; + + return uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_PORT_SPEED_RESP, + &speed, sizeof(speed)); +} + 
static int UVERBS_HANDLER(UVERBS_METHOD_GET_CONTEXT)( struct uverbs_attr_bundle *attrs) { @@ -469,6 +502,14 @@ DECLARE_UVERBS_NAMED_METHOD( active_speed_ex), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_PORT_SPEED, + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM, u32, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_PORT_SPEED_RESP, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_QUERY_GID_TABLE, UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE, u64, @@ -498,6 +539,7 @@ DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE, &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE), &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES), &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT_SPEED), &UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT), &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_TABLE), &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_ENTRY)); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b984f9581a73..a4786395328a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2418,6 +2418,8 @@ struct ib_device_ops { int comp_vector); int (*query_port)(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); + int (*query_port_speed)(struct ib_device *device, u32 port_num, + u64 *speed); int (*modify_port)(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index de6f5a94f1e3..35da4026f452 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -73,6 +73,7 @@ enum uverbs_methods_device { UVERBS_METHOD_QUERY_CONTEXT, UVERBS_METHOD_QUERY_GID_TABLE, UVERBS_METHOD_QUERY_GID_ENTRY, + UVERBS_METHOD_QUERY_PORT_SPEED, }; enum uverbs_attrs_invoke_write_cmd_attr_ids { @@ -86,6 +87,11 @@ enum uverbs_attrs_query_port_cmd_attr_ids { UVERBS_ATTR_QUERY_PORT_RESP, }; 
+enum uverbs_attrs_query_port_speed_cmd_attr_ids { + UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM, + UVERBS_ATTR_QUERY_PORT_SPEED_RESP, +}; + enum uverbs_attrs_get_context_attr_ids { UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, -- 2.47.1 From: Or Har-Toov Raise IB_EVENT_DEVICE_SPEED_CHANGE whenever the speed of one of the device's ports changes. Usually all ports of the device change together. This ensures user applications and upper-layer software are immediately notified when bandwidth changes, improving traffic management in dynamic environments. This is especially useful for vports which are part of a LAG configuration, to know if the effective speed of the LAG has changed. Signed-off-by: Or Har-Toov Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji --- drivers/infiniband/hw/mlx5/main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 40284bbb45d6..bea42acbeaad 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2838,6 +2838,14 @@ static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe, case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: case MLX5_PORT_CHANGE_SUBTYPE_DOWN: case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED: + if (ibdev->ib_active) { + struct ib_event speed_event = {}; + + speed_event.device = &ibdev->ib_dev; + speed_event.event = IB_EVENT_DEVICE_SPEED_CHANGE; + ib_dispatch_event(&speed_event); + } + /* In RoCE, port up/down events are handled in * mlx5_netdev_event(). */ -- 2.47.1 From: Or Har-Toov Implement the query_port_speed callback for the mlx5 driver to support querying effective port bandwidth. For LAG configurations, query the aggregated speed from the LAG layer or from the modified vport max_tx_speed.
Signed-off-by: Or Har-Toov Reviewed-by: Mark Bloch Signed-off-by: Edward Srouji --- drivers/infiniband/hw/mlx5/main.c | 124 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 + 2 files changed, 126 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index bea42acbeaad..47c19d527fa2 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1581,6 +1581,129 @@ static int mlx5_ib_rep_query_pkey(struct ib_device *ibdev, u32 port, u16 index, return 0; } +static int mlx5_ib_query_port_speed_from_port(struct mlx5_ib_dev *dev, + u32 port_num, u64 *speed) +{ + struct ib_port_speed_info speed_info; + struct ib_port_attr attr = {}; + int err; + + err = mlx5_ib_query_port(&dev->ib_dev, port_num, &attr); + if (err) + return err; + + if (attr.state == IB_PORT_DOWN) { + *speed = 0; + return 0; + } + + err = ib_port_attr_to_speed_info(&attr, &speed_info); + if (err) + return err; + + *speed = speed_info.rate; + return 0; +} + +static int mlx5_ib_query_port_speed_from_vport(struct mlx5_core_dev *mdev, + u8 op_mod, u16 vport, + u8 other_vport, u64 *speed, + struct mlx5_ib_dev *dev, + u32 port_num) +{ + u32 max_tx_speed; + int err; + + err = mlx5_query_vport_max_tx_speed(mdev, op_mod, vport, other_vport, + &max_tx_speed); + if (err) + return err; + + if (max_tx_speed == 0) + /* Value 0 indicates field not supported, fallback */ + return mlx5_ib_query_port_speed_from_port(dev, port_num, + speed); + + *speed = max_tx_speed; + return 0; +} + +static int mlx5_ib_query_port_speed_from_bond(struct mlx5_ib_dev *dev, + u32 port_num, u64 *speed) +{ + struct mlx5_core_dev *mdev = dev->mdev; + u32 bond_speed; + int err; + + err = mlx5_lag_query_bond_speed(mdev, &bond_speed); + if (err) + return err; + + *speed = bond_speed / MLX5_MAX_TX_SPEED_UNIT; + + return 0; +} + +static int mlx5_ib_query_port_speed_non_rep(struct mlx5_ib_dev *dev, + u32 port_num, u64 *speed) +{ + u16 op_mod = 
MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT; + + if (mlx5_lag_is_roce(dev->mdev)) + return mlx5_ib_query_port_speed_from_bond(dev, port_num, + speed); + + return mlx5_ib_query_port_speed_from_vport(dev->mdev, op_mod, 0, false, + speed, dev, port_num); +} + +static int mlx5_ib_query_port_speed_rep(struct mlx5_ib_dev *dev, u32 port_num, + u64 *speed) +{ + struct mlx5_eswitch_rep *rep; + struct mlx5_core_dev *mdev; + u16 op_mod; + + if (!dev->port[port_num - 1].rep) { + mlx5_ib_warn(dev, "Representor doesn't exist for port %u\n", + port_num); + return -EINVAL; + } + + rep = dev->port[port_num - 1].rep; + mdev = mlx5_eswitch_get_core_dev(rep->esw); + if (!mdev) + return -ENODEV; + + if (rep->vport == MLX5_VPORT_UPLINK) { + if (mlx5_lag_is_sriov(mdev)) + return mlx5_ib_query_port_speed_from_bond(dev, + port_num, + speed); + + return mlx5_ib_query_port_speed_from_port(dev, port_num, + speed); + } + + op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT; + return mlx5_ib_query_port_speed_from_vport(dev->mdev, op_mod, + rep->vport, true, speed, dev, + port_num); +} + +int mlx5_ib_query_port_speed(struct ib_device *ibdev, u32 port_num, u64 *speed) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + if (mlx5_ib_port_link_layer(ibdev, port_num) == + IB_LINK_LAYER_INFINIBAND || mlx5_core_mp_enabled(dev->mdev)) + return mlx5_ib_query_port_speed_from_port(dev, port_num, speed); + else if (!dev->is_rep) + return mlx5_ib_query_port_speed_non_rep(dev, port_num, speed); + else + return mlx5_ib_query_port_speed_rep(dev, port_num, speed); +} + static int mlx5_ib_query_gid(struct ib_device *ibdev, u32 port, int index, union ib_gid *gid) { @@ -4305,6 +4428,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .query_device = mlx5_ib_query_device, .query_gid = mlx5_ib_query_gid, .query_pkey = mlx5_ib_query_pkey, + .query_port_speed = mlx5_ib_query_port_speed, .query_qp = mlx5_ib_query_qp, .query_srq = mlx5_ib_query_srq, .query_ucontext = mlx5_ib_query_ucontext, diff --git 
a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 09d82d5f95e3..cc6b3b6c713c 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1435,6 +1435,8 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props); int mlx5_ib_query_port(struct ib_device *ibdev, u32 port, struct ib_port_attr *props); +int mlx5_ib_query_port_speed(struct ib_device *ibdev, u32 port_num, + u64 *speed); void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas, u64 access_flags); int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); -- 2.47.1