During 10GBASE-KR link training, the PHY state machine can be corrupted
if device stop or rate change operations are initiated while training is
in progress. This manifests as:

- Link stability issues after interface down/up cycles
- PHY state machine lockups requiring a full driver reset
- Intermittent link failures on Inphi re-driver configurations

The root cause is that the firmware mailbox operations for device stop
and rate changes can interfere with ongoing KR training sequences,
leaving the PHY in an inconsistent state.

Add synchronization to prevent device operations from interrupting
active KR training:

- Introduce a mailbox mutex to serialize firmware command access
- Wait for KR training completion (or timeout) before proceeding with
  stop/rate change operations
- Only wait when KR training is actually active (KR mode with autoneg
  enabled, or an Inphi re-driver present)
- Use a 500ms timeout (50 polls of 10ms) to handle hung training
  sequences

The mailbox mutex protects the critical section of firmware command
submission and completion checking, preventing concurrent mailbox access
from multiple code paths.

Testing on AMD platforms with both direct-attach and Inphi re-driver
configurations shows this eliminates PHY state corruption during
interface operations and link changes.

Fixes: 549b32af9f7c ("amd-xgbe: Simplify mailbox interface rate change code") Signed-off-by: Raju Rangoju --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 2 + drivers/net/ethernet/amd/xgbe/xgbe-main.c | 1 + drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 51 ++++++++++++++++++++- drivers/net/ethernet/amd/xgbe/xgbe.h | 5 ++ 4 files changed, 57 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 23beea48ae26..3913eb7e1da3 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1321,6 +1321,8 @@ static void xgbe_stop(struct xgbe_prv_data *pdata) DBGPR("-->xgbe_stop\n"); + xgbe_check_kr_training_in_progress(pdata); + if (test_bit(XGBE_STOPPED, &pdata->dev_state)) return; diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c index 7d45ea22a02e..5f3ab29707b7 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c @@ -78,6 +78,7 @@ struct xgbe_prv_data *xgbe_alloc_pdata(struct device *dev) spin_lock_init(&pdata->xpcs_lock); mutex_init(&pdata->rss_mutex); + mutex_init(&pdata->mailbox_lock); spin_lock_init(&pdata->tstamp_lock); mutex_init(&pdata->i2c_mutex); init_completion(&pdata->i2c_complete); diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c index b8cf6ccfe641..142eb952a29c 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c @@ -2095,12 +2095,57 @@ static void xgbe_phy_pll_ctrl(struct xgbe_prv_data *pdata, bool enable) usleep_range(100, 200); } +static bool xgbe_phy_port_is_inphi(struct xgbe_prv_data *pdata) +{ + struct xgbe_phy_data *phy_data = pdata->phy_data; + + /* Re-driver models 4223 && 4227 are supported Inphi models */ + return phy_data->redrv && + (phy_data->redrv_model == XGBE_PHY_REDRV_MODEL_4223 || + phy_data->redrv_model == 
XGBE_PHY_REDRV_MODEL_4227); +} + +void xgbe_check_kr_training_in_progress(struct xgbe_prv_data *pdata) +{ + struct xgbe_phy_data *phy_data = pdata->phy_data; + unsigned long kr_timeout; + int wait; + + /* Only wait for KR training in specific conditions: + * - Inphi re-driver is present, OR + * - Currently in KR mode with autoneg enabled + */ + if (!xgbe_phy_port_is_inphi(pdata) && + !(phy_data->cur_mode == XGBE_MODE_KR && + pdata->phy.autoneg == AUTONEG_ENABLE)) + return; + + wait = XGBE_KR_TRAINING_WAIT_ITER; + while (wait--) { + /* Check if we've exceeded the AN timeout window */ + kr_timeout = pdata->kr_start_time + + msecs_to_jiffies(XGBE_AN_MS_TIMEOUT + + XGBE_KR_TRAINING_WAIT_MS); + if (time_after(jiffies, kr_timeout)) + break; + + /* Training is complete - no need to wait */ + if (pdata->an_result == XGBE_AN_COMPLETE) + return; + + usleep_range(10000, 11000); + } +} + static void xgbe_phy_perform_ratechange(struct xgbe_prv_data *pdata, - enum xgbe_mb_cmd cmd, enum xgbe_mb_subcmd sub_cmd) + enum xgbe_mb_cmd cmd, + enum xgbe_mb_subcmd sub_cmd) { unsigned int s0 = 0; unsigned int wait; + xgbe_check_kr_training_in_progress(pdata); + /* Disable PLL re-initialization during FW command processing */ xgbe_phy_pll_ctrl(pdata, false); @@ -2115,7 +2160,9 @@ static void xgbe_phy_perform_ratechange(struct xgbe_prv_data *pdata, XP_SET_BITS(s0, XP_DRIVER_SCRATCH_0, COMMAND, cmd); XP_SET_BITS(s0, XP_DRIVER_SCRATCH_0, SUB_COMMAND, sub_cmd); - /* Issue the command */ + /* Acquire mailbox lock for firmware command */ + guard(mutex)(&pdata->mailbox_lock); + XP_IOWRITE(pdata, XP_DRIVER_SCRATCH_0, s0); XP_IOWRITE(pdata, XP_DRIVER_SCRATCH_1, 0); XP_IOWRITE_BITS(pdata, XP_DRIVER_INT_REQ, REQUEST, 1); diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index 438033a71523..238eeee0d422 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -202,6 +202,7 @@ #define XGBE_AN_MS_TIMEOUT 500 #define 
XGBE_LINK_TIMEOUT 5 #define XGBE_KR_TRAINING_WAIT_ITER 50 +#define XGBE_KR_TRAINING_WAIT_MS 100 #define XGBE_SGMII_AN_LINK_DUPLEX BIT(1) #define XGBE_SGMII_AN_LINK_SPEED (BIT(2) | BIT(3)) @@ -1015,6 +1016,9 @@ struct xgbe_prv_data { /* RSS addressing mutex */ struct mutex rss_mutex; + /* Firmware mailbox mutex */ + struct mutex mailbox_lock; + /* Flags representing xgbe_state */ unsigned long dev_state; @@ -1252,6 +1256,7 @@ struct xgbe_prv_data { }; /* Function prototypes*/ +void xgbe_check_kr_training_in_progress(struct xgbe_prv_data *pdata); struct xgbe_prv_data *xgbe_alloc_pdata(struct device *); void xgbe_free_pdata(struct xgbe_prv_data *); void xgbe_set_counts(struct xgbe_prv_data *); -- 2.34.1