Currently phylink_resolve() protects itself against concurrent phylink_bringup_phy() or phylink_disconnect_phy() calls which modify pl->phydev by relying on pl->state_mutex. The problem is that in phylink_resolve(), pl->state_mutex is in a lock inversion state with pl->phydev->lock. So pl->phydev->lock needs to be acquired prior to pl->state_mutex. But that requires dereferencing pl->phydev in the first place, and without pl->state_mutex, that is racy. Hence the reason for the extra lock. Currently it is redundant, but it will serve a functional purpose once mutex_lock(&phy->lock) will be moved outside of the mutex_lock(&pl->state_mutex) section. Another alternative considered would have been to let phylink_resolve() acquire the rtnl_mutex, which is also held when phylink_bringup_phy() and phylink_disconnect_phy() are called. But since phylink_disconnect_phy() runs under rtnl_lock(), it would deadlock with phylink_resolve() when calling flush_work(&pl->resolve). Additionally, it would have been undesirable because it would have unnecessarily blocked many other call paths as well in the entire kernel, so the smaller-scoped lock was preferred. Link: https://lore.kernel.org/netdev/aLb6puGVzR29GpPx@shell.armlinux.org.uk/ Signed-off-by: Vladimir Oltean --- v2->v3: - clarify in the commit message that rtnl_lock() in phylink_resolve() wouldn't have worked - s/phy_lock/phydev_mutex/ - expand and refactor critical section in phylink_disconnect_phy() - make use of the local "phy" variable in phylink_resolve() v2 at: https://lore.kernel.org/netdev/20250903152348.2998651-1-vladimir.oltean@nxp.com/ v1->v2: patch is new drivers/net/phy/phylink.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index c7f867b361dd..386d37f6bad4 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -67,6 +67,8 @@ struct phylink { struct timer_list link_poll; struct mutex state_mutex; + /* Serialize updates to pl->phydev with phylink_resolve() */ + struct mutex phydev_mutex; struct phylink_link_state phy_state; unsigned int phy_ib_mode; struct work_struct resolve; @@ -1582,8 +1584,11 @@ static void phylink_resolve(struct work_struct *w) struct phylink_link_state link_state; bool mac_config = false; bool retrigger = false; + struct phy_device *phy; bool cur_link_state; + mutex_lock(&pl->phydev_mutex); + phy = pl->phydev; mutex_lock(&pl->state_mutex); cur_link_state = phylink_link_is_up(pl); @@ -1617,11 +1622,11 @@ static void phylink_resolve(struct work_struct *w) /* If we have a phy, the "up" state is the union of both the * PHY and the MAC */ - if (pl->phydev) + if (phy) link_state.link &= pl->phy_state.link; /* Only update if the PHY link is up */ - if (pl->phydev && pl->phy_state.link) { + if (phy && pl->phy_state.link) { /* If the interface has changed, force a link down * event if the link isn't already down, and re-resolve. */ @@ -1685,6 +1690,7 @@ static void phylink_resolve(struct work_struct *w) queue_work(system_power_efficient_wq, &pl->resolve); } mutex_unlock(&pl->state_mutex); + mutex_unlock(&pl->phydev_mutex); } static void phylink_run_resolve(struct phylink *pl) @@ -1820,6 +1826,7 @@ struct phylink *phylink_create(struct phylink_config *config, if (!pl) return ERR_PTR(-ENOMEM); + mutex_init(&pl->phydev_mutex); mutex_init(&pl->state_mutex); INIT_WORK(&pl->resolve, phylink_resolve); @@ -2080,6 +2087,7 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, dev_name(&phy->mdio.dev), phy->drv->name, irq_str); kfree(irq_str); + mutex_lock(&pl->phydev_mutex); mutex_lock(&phy->lock); mutex_lock(&pl->state_mutex); pl->phydev = phy; @@ -2125,6 +2133,7 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, mutex_unlock(&pl->state_mutex); mutex_unlock(&phy->lock); + mutex_unlock(&pl->phydev_mutex); phylink_dbg(pl, "phy: %s setting supported %*pb advertising %*pb\n", @@ -2303,6 +2312,7 @@ void phylink_disconnect_phy(struct phylink *pl) ASSERT_RTNL(); + mutex_lock(&pl->phydev_mutex); phy = pl->phydev; if (phy) { mutex_lock(&phy->lock); @@ -2312,8 +2322,11 @@ void phylink_disconnect_phy(struct phylink *pl) pl->mac_tx_clk_stop = false; mutex_unlock(&pl->state_mutex); mutex_unlock(&phy->lock); - flush_work(&pl->resolve); + } + mutex_unlock(&pl->phydev_mutex); + if (phy) { + flush_work(&pl->resolve); phy_disconnect(phy); } } -- 2.34.1 Problem description =================== Lockdep reports a possible circular locking dependency (AB/BA) between &pl->state_mutex and &phy->lock, as follows. phylink_resolve() // acquires &pl->state_mutex -> phylink_major_config() -> phy_config_inband() // acquires &pl->phydev->lock whereas all the other call sites where &pl->state_mutex and &pl->phydev->lock have the locking scheme reversed. Everywhere else, &pl->phydev->lock is acquired at the top level, and &pl->state_mutex at the lower level. A clear example is phylink_bringup_phy(). The outlier is the newly introduced phy_config_inband() and the existing lock order is the correct one. To understand why it cannot be the other way around, it is sufficient to consider phylink_phy_change(), phylink's callback from the PHY device's phy->phy_link_change() virtual method, invoked by the PHY state machine. phy_link_up() and phy_link_down(), the (indirect) callers of phylink_phy_change(), are called with &phydev->lock acquired. Then phylink_phy_change() acquires its own &pl->state_mutex, to serialize changes made to its pl->phy_state and pl->link_config. So all other instances of &pl->state_mutex and &phydev->lock must be consistent with this order. Problem impact ============== I think the kernel runs a serious deadlock risk if an existing phylink_resolve() thread, which results in a phy_config_inband() call, is concurrent with a phy_link_up() or phy_link_down() call, which will deadlock on &pl->state_mutex in phylink_phy_change(). Practically speaking, the impact may be limited by the slow speed of the medium auto-negotiation protocol, which makes it unlikely for the current state to still be unresolved when a new one is detected, but I think the problem is there. Nonetheless, the problem was discovered using lockdep. Proposed solution ================= Practically speaking, the phy_config_inband() requirement of having phydev->lock acquired must transfer to the caller (phylink is the only caller). There, it must bubble up until immediately before &pl->state_mutex is acquired, for the cases where that takes place. Solution details, considerations, notes ======================================= This is the phy_config_inband() call graph: sfp_upstream_ops :: connect_phy() | v phylink_sfp_connect_phy() | v phylink_sfp_config_phy() | | sfp_upstream_ops :: module_insert() | | | v | phylink_sfp_module_insert() | | | | sfp_upstream_ops :: module_start() | | | | | v | | phylink_sfp_module_start() | | | | v v | phylink_sfp_config_optical() phylink_start() | | | phylink_resume() v v | | phylink_sfp_set_config() | | | v v v phylink_mac_initial_config() | phylink_resolve() | | phylink_ethtool_ksettings_set() v v v phylink_major_config() | v phy_config_inband() phylink_major_config() caller #1, phylink_mac_initial_config(), does not acquire &pl->state_mutex nor do its callers. It must acquire &pl->phydev->lock prior to calling phylink_major_config(). phylink_major_config() caller #2, phylink_resolve() acquires &pl->state_mutex, thus also needs to acquire &pl->phydev->lock. phylink_major_config() caller #3, phylink_ethtool_ksettings_set(), is completely uninteresting, because it only calls phylink_major_config() if pl->phydev is NULL (otherwise it calls phy_ethtool_ksettings_set()). We need to change nothing there. Other solutions =============== The lock inversion between &pl->state_mutex and &pl->phydev->lock has occurred at least once before, as seen in commit c718af2d00a3 ("net: phylink: fix ethtool -A with attached PHYs"). The solution there was to simply not call phy_set_asym_pause() under the &pl->state_mutex. That cannot be extended to our case though, where the phy_config_inband() call is much deeper inside the &pl->state_mutex section. Fixes: 5fd0f1a02e75 ("net: phylink: add negotiation of in-band capabilities") Signed-off-by: Vladimir Oltean --- v2->v3: - remove the patch hunk which replaced "pl->phydev" with the local "phy" variable in phylink_resolve(). It's now in the previous patch. v2 at: https://lore.kernel.org/netdev/20250903152348.2998651-2-vladimir.oltean@nxp.com/ v1->v2: - rebase over new patch which introduces pl->phy_lock - add "Other solutions" section v1 at: https://lore.kernel.org/netdev/20250902134141.2430896-1-vladimir.oltean@nxp.com/ drivers/net/phy/phy.c | 12 ++++-------- drivers/net/phy/phylink.c | 9 +++++++++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 13df28445f02..c02da57a4da5 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1065,23 +1065,19 @@ EXPORT_SYMBOL_GPL(phy_inband_caps); */ int phy_config_inband(struct phy_device *phydev, unsigned int modes) { - int err; + lockdep_assert_held(&phydev->lock); if (!!(modes & LINK_INBAND_DISABLE) + !!(modes & LINK_INBAND_ENABLE) + !!(modes & LINK_INBAND_BYPASS) != 1) return -EINVAL; - mutex_lock(&phydev->lock); if (!phydev->drv) - err = -EIO; + return -EIO; else if (!phydev->drv->config_inband) - err = -EOPNOTSUPP; - else - err = phydev->drv->config_inband(phydev, modes); - mutex_unlock(&phydev->lock); + return -EOPNOTSUPP; - return err; + return phydev->drv->config_inband(phydev, modes); } EXPORT_SYMBOL(phy_config_inband); diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 386d37f6bad4..76cc6f6d671b 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1425,6 +1425,7 @@ static void phylink_get_fixed_state(struct phylink *pl, static void phylink_mac_initial_config(struct phylink *pl, bool force_restart) { struct phylink_link_state link_state; + struct phy_device *phy = pl->phydev; switch (pl->req_link_an_mode) { case MLO_AN_PHY: @@ -1448,7 +1449,11 @@ static void phylink_mac_initial_config(struct phylink *pl, bool force_restart) link_state.link = false; phylink_apply_manual_flow(pl, &link_state); + if (phy) + mutex_lock(&phy->lock); phylink_major_config(pl, force_restart, &link_state); + if (phy) + mutex_unlock(&phy->lock); } static const char *phylink_pause_to_str(int pause) @@ -1589,6 +1594,8 @@ static void phylink_resolve(struct work_struct *w) mutex_lock(&pl->phydev_mutex); phy = pl->phydev; + if (phy) + mutex_lock(&phy->lock); mutex_lock(&pl->state_mutex); cur_link_state = phylink_link_is_up(pl); @@ -1690,6 +1697,8 @@ static void phylink_resolve(struct work_struct *w) queue_work(system_power_efficient_wq, &pl->resolve); } mutex_unlock(&pl->state_mutex); + if (phy) + mutex_unlock(&phy->lock); mutex_unlock(&pl->phydev_mutex); } -- 2.34.1