From: Weili Qian If device error occurs during live migration, qemu will reset the VF. At this time, VF reset and device reset are performed simultaneously. The VF reset will timeout. Therefore, the QM_RESETTING flag is used to ensure that VF reset and device reset are performed serially. Fixes: b0eed085903e ("hisi_acc_vfio_pci: Add support for VFIO live migration") Signed-off-by: Weili Qian --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 24 +++++++++++++++++++ .../vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 2 ++ 2 files changed, 26 insertions(+) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index fe2ffcd00d6e..d55365b21f78 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1188,14 +1188,37 @@ hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev, return 0; } +static void hisi_acc_vf_pci_reset_prepare(struct pci_dev *pdev) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev); + struct hisi_qm *qm = hisi_acc_vdev->pf_qm; + struct device *dev = &qm->pdev->dev; + u32 delay = 0; + + /* All reset requests need to be queued for processing */ + while (test_and_set_bit(QM_RESETTING, &qm->misc_ctl)) { + msleep(1); + if (++delay > QM_RESET_WAIT_TIMEOUT) { + dev_err(dev, "reset prepare failed\n"); + return; + } + } + + hisi_acc_vdev->set_reset_flag = true; +} + static void hisi_acc_vf_pci_aer_reset_done(struct pci_dev *pdev) { struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev); + struct hisi_qm *qm = hisi_acc_vdev->pf_qm; if (hisi_acc_vdev->core_device.vdev.migration_flags != VFIO_MIGRATION_STOP_COPY) return; + if (hisi_acc_vdev->set_reset_flag) + clear_bit(QM_RESETTING, &qm->misc_ctl); + mutex_lock(&hisi_acc_vdev->state_mutex); hisi_acc_vf_reset(hisi_acc_vdev); mutex_unlock(&hisi_acc_vdev->state_mutex); @@ -1746,6 +1769,7 @@ static const struct pci_device_id hisi_acc_vfio_pci_table[] = { MODULE_DEVICE_TABLE(pci, hisi_acc_vfio_pci_table); static const struct pci_error_handlers hisi_acc_vf_err_handlers = { + .reset_prepare = hisi_acc_vf_pci_reset_prepare, .reset_done = hisi_acc_vf_pci_aer_reset_done, .error_detected = vfio_pci_core_aer_err_detected, }; diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index cd55eba64dfb..a3d91a31e3d8 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -27,6 +27,7 @@ #define ERROR_CHECK_TIMEOUT 100 #define CHECK_DELAY_TIME 100 +#define QM_RESET_WAIT_TIMEOUT 60000 #define QM_SQC_VFT_BASE_SHIFT_V2 28 #define QM_SQC_VFT_BASE_MASK_V2 GENMASK(15, 0) @@ -128,6 +129,7 @@ struct hisi_acc_vf_migration_file { struct hisi_acc_vf_core_device { struct vfio_pci_core_device core_device; u8 match_done; + bool set_reset_flag; /* * io_base is only valid when dev_opened is true, * which is protected by open_mutex. -- 2.24.0 After a RAS error occurs on the accelerator device, the accelerator device will be reset. The live migration state will be abnormal after reset, and the original state needs to be restored during the reset process. Therefore, reset processing needs to be performed in a live migration scenario. Signed-off-by: Longfang Liu --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index d55365b21f78..e782c2274871 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1212,8 +1212,7 @@ static void hisi_acc_vf_pci_aer_reset_done(struct pci_dev *pdev) struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev); struct hisi_qm *qm = hisi_acc_vdev->pf_qm; - if (hisi_acc_vdev->core_device.vdev.migration_flags != - VFIO_MIGRATION_STOP_COPY) + if (!hisi_acc_vdev->core_device.vdev.mig_ops) return; if (hisi_acc_vdev->set_reset_flag) -- 2.24.0 In special scenarios involving duplicate migrations, after the first migration is completed, if the original VF device is used again and then migrated to another destination, the state indicating data migration completion for the VF device is not reset. This results in the second migration to the destination being skipped without performing data migration. After the modification, it ensures that a complete data migration is performed after the subsequent migration. Signed-off-by: Longfang Liu --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index e782c2274871..394f1952a7ed 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1583,6 +1583,7 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev) } hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; hisi_acc_vdev->dev_opened = true; + hisi_acc_vdev->match_done = 0; mutex_unlock(&hisi_acc_vdev->open_mutex); } -- 2.24.0 When the number of QPs initialized by the device, as read via vft, is zero, it indicates either an abnormal device configuration or an abnormal read result. Returning 0 directly in this case would allow the live migration operation to complete successfully, leading to incorrect parameter configuration after migration and preventing the service from recovering normal functionality. Therefore, in such situations, an error should be returned to roll back the live migration operation. Signed-off-by: Longfang Liu --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 394f1952a7ed..e0cc20f5f38b 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -406,7 +406,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm; struct device *dev = &vf_qm->pdev->dev; u32 que_iso_state; - int ret; + int qp_num, ret; if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done) return 0; @@ -423,18 +423,18 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, } /* VF qp num check */ - ret = qm_get_vft(vf_qm, &vf_qm->qp_base); - if (ret <= 0) { + qp_num = qm_get_vft(vf_qm, &vf_qm->qp_base); + if (qp_num <= 0) { dev_err(dev, "failed to get vft qp nums\n"); - return ret; + return -EINVAL; } - if (ret != vf_data->qp_num) { + if (qp_num != vf_data->qp_num) { dev_err(dev, "failed to match VF qp num\n"); return -EINVAL; } - vf_qm->qp_num = ret; + vf_qm->qp_num = qp_num; /* VF isolation state check */ ret = qm_read_regs(pf_qm, QM_QUE_ISO_CFG_V, &que_iso_state, 1); -- 2.24.0