All of the necessary building blocks are now in place to support SR-IOV VF migration. Flip the enable/disable logic to match VF code and disable the feature only for platforms that don't meet the necessary prerequisites. To allow more testing and experiments, on DEBUG builds any missing prerequisites will be ignored. Signed-off-by: Michał Winiarski Reviewed-by: Michal Wajdeczko --- drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c | 9 +++++ drivers/gpu/drm/xe/xe_sriov_pf_migration.c | 35 ++++++++++++++++--- drivers/gpu/drm/xe/xe_sriov_pf_migration.h | 1 + .../gpu/drm/xe/xe_sriov_pf_migration_types.h | 4 +-- 4 files changed, 42 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c index d5d918ddce4fe..3174a8dee779e 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c @@ -17,6 +17,7 @@ #include "xe_gt_sriov_pf_helpers.h" #include "xe_gt_sriov_pf_migration.h" #include "xe_gt_sriov_printk.h" +#include "xe_guc.h" #include "xe_guc_buf.h" #include "xe_guc_ct.h" #include "xe_migrate.h" @@ -1023,6 +1024,12 @@ static void action_ring_cleanup(void *arg) ptr_ring_cleanup(r, destroy_pf_packet); } +static void pf_gt_migration_check_support(struct xe_gt *gt) +{ + if (GUC_FIRMWARE_VER(&gt->uc.guc) < MAKE_GUC_VER(70, 54, 0)) + xe_sriov_pf_migration_disable(gt_to_xe(gt), "requires GuC version >= 70.54.0"); +} + /** * xe_gt_sriov_pf_migration_init() - Initialize support for VF migration. * @gt: the &xe_gt @@ -1039,6 +1046,8 @@ int xe_gt_sriov_pf_migration_init(struct xe_gt *gt) xe_gt_assert(gt, IS_SRIOV_PF(xe)); + pf_gt_migration_check_support(gt); + if (!pf_migration_supported(gt)) return 0; diff --git a/drivers/gpu/drm/xe/xe_sriov_pf_migration.c b/drivers/gpu/drm/xe/xe_sriov_pf_migration.c index de06cc690fc81..6c4b16409cc9a 100644 --- a/drivers/gpu/drm/xe/xe_sriov_pf_migration.c +++ b/drivers/gpu/drm/xe/xe_sriov_pf_migration.c @@ -46,13 +46,37 @@ bool xe_sriov_pf_migration_supported(struct xe_device *xe) { xe_assert(xe, IS_SRIOV_PF(xe)); - return xe->sriov.pf.migration.supported; + return IS_ENABLED(CONFIG_DRM_XE_DEBUG) || !xe->sriov.pf.migration.disabled; } -static bool pf_check_migration_support(struct xe_device *xe) +/** + * xe_sriov_pf_migration_disable() - Turn off SR-IOV VF migration support on PF. + * @xe: the &xe_device instance. + * @fmt: format string for the log message, to be combined with following VAs. + */ +void xe_sriov_pf_migration_disable(struct xe_device *xe, const char *fmt, ...) +{ + struct va_format vaf; + va_list va_args; + + xe_assert(xe, IS_SRIOV_PF(xe)); + + va_start(va_args, fmt); + vaf.fmt = fmt; + vaf.va = &va_args; + xe_sriov_notice(xe, "migration %s: %pV\n", + IS_ENABLED(CONFIG_DRM_XE_DEBUG) ? 
+ "missing prerequisite" : "disabled", + &vaf); + va_end(va_args); + + xe->sriov.pf.migration.disabled = true; +} + +static void pf_migration_check_support(struct xe_device *xe) { - /* XXX: for now this is for feature enabling only */ - return IS_ENABLED(CONFIG_DRM_XE_DEBUG); + if (!xe_device_has_memirq(xe)) + xe_sriov_pf_migration_disable(xe, "requires memory-based IRQ support"); } static void pf_migration_cleanup(void *arg) @@ -77,7 +101,8 @@ int xe_sriov_pf_migration_init(struct xe_device *xe) xe_assert(xe, IS_SRIOV_PF(xe)); - xe->sriov.pf.migration.supported = pf_check_migration_support(xe); + pf_migration_check_support(xe); + if (!xe_sriov_pf_migration_supported(xe)) return 0; diff --git a/drivers/gpu/drm/xe/xe_sriov_pf_migration.h b/drivers/gpu/drm/xe/xe_sriov_pf_migration.h index b806298a0bb62..f8f408df84813 100644 --- a/drivers/gpu/drm/xe/xe_sriov_pf_migration.h +++ b/drivers/gpu/drm/xe/xe_sriov_pf_migration.h @@ -14,6 +14,7 @@ struct xe_sriov_packet; int xe_sriov_pf_migration_init(struct xe_device *xe); bool xe_sriov_pf_migration_supported(struct xe_device *xe); +void xe_sriov_pf_migration_disable(struct xe_device *xe, const char *fmt, ...); int xe_sriov_pf_migration_restore_produce(struct xe_device *xe, unsigned int vfid, struct xe_sriov_packet *data); struct xe_sriov_packet * diff --git a/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h b/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h index 363d673ee1dd5..7d9a8a278d915 100644 --- a/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h +++ b/drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h @@ -14,8 +14,8 @@ * struct xe_sriov_pf_migration - Xe device level VF migration data */ struct xe_sriov_pf_migration { - /** @supported: indicates whether VF migration feature is supported */ - bool supported; + /** @disabled: indicates whether VF migration feature is disabled */ + bool disabled; }; /** -- 2.51.2 In certain scenarios (such as VF migration), VF driver needs to interact with PF driver. Add a helper to allow VF driver access to PF xe_device. Signed-off-by: Michał Winiarski Reviewed-by: Michal Wajdeczko --- drivers/gpu/drm/xe/xe_pci.c | 17 +++++++++++++++++ drivers/gpu/drm/xe/xe_pci.h | 3 +++ 2 files changed, 20 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index 1cb30efe27ef1..9859e85c4f3a6 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -1232,6 +1232,23 @@ static struct pci_driver xe_pci_driver = { #endif }; +/** + * xe_pci_to_pf_device() - Get PF &xe_device. + * @pdev: the VF &pci_dev device + * + * Return: pointer to PF &xe_device, NULL otherwise. + */ +struct xe_device *xe_pci_to_pf_device(struct pci_dev *pdev) +{ + struct drm_device *drm; + + drm = pci_iov_get_pf_drvdata(pdev, &xe_pci_driver); + if (IS_ERR(drm)) + return NULL; + + return to_xe_device(drm); +} + int xe_register_pci_driver(void) { return pci_register_driver(&xe_pci_driver); diff --git a/drivers/gpu/drm/xe/xe_pci.h b/drivers/gpu/drm/xe/xe_pci.h index 611c1209b14cc..11bcc5fe2c5b9 100644 --- a/drivers/gpu/drm/xe/xe_pci.h +++ b/drivers/gpu/drm/xe/xe_pci.h @@ -6,7 +6,10 @@ #ifndef _XE_PCI_H_ #define _XE_PCI_H_ +struct pci_dev; + int xe_register_pci_driver(void); void xe_unregister_pci_driver(void); +struct xe_device *xe_pci_to_pf_device(struct pci_dev *pdev); #endif -- 2.51.2 Device specific VFIO driver variant for Xe will implement VF migration. Export everything that's needed for migration ops. 
Signed-off-by: Michał Winiarski Reviewed-by: Michal Wajdeczko --- drivers/gpu/drm/xe/Makefile | 4 + drivers/gpu/drm/xe/xe_sriov_vfio.c | 80 ++++++++++++++++ include/drm/intel/xe_sriov_vfio.h | 143 +++++++++++++++++++++++++++++ 3 files changed, 227 insertions(+) create mode 100644 drivers/gpu/drm/xe/xe_sriov_vfio.c create mode 100644 include/drm/intel/xe_sriov_vfio.h diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index c9b60e19cecc6..a7e13a676f7d9 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -185,6 +185,10 @@ xe-$(CONFIG_PCI_IOV) += \ xe_sriov_pf_sysfs.o \ xe_tile_sriov_pf_debugfs.o +ifeq ($(CONFIG_PCI_IOV),y) + xe-$(CONFIG_XE_VFIO_PCI) += xe_sriov_vfio.o +endif + # include helpers for tests even when XE is built-in ifdef CONFIG_DRM_XE_KUNIT_TEST xe-y += tests/xe_kunit_helpers.o diff --git a/drivers/gpu/drm/xe/xe_sriov_vfio.c b/drivers/gpu/drm/xe/xe_sriov_vfio.c new file mode 100644 index 0000000000000..e9a7615bb5c51 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sriov_vfio.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2025 Intel Corporation + */ + +#include +#include + +#include "xe_pci.h" +#include "xe_pm.h" +#include "xe_sriov_pf_control.h" +#include "xe_sriov_pf_helpers.h" +#include "xe_sriov_pf_migration.h" + +struct xe_device *xe_sriov_vfio_get_pf(struct pci_dev *pdev) +{ + return xe_pci_to_pf_device(pdev); +} +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_get_pf, "xe-vfio-pci"); + +bool xe_sriov_vfio_migration_supported(struct xe_device *xe) +{ + if (!IS_SRIOV_PF(xe)) + return -EPERM; + + return xe_sriov_pf_migration_supported(xe); +} +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_migration_supported, "xe-vfio-pci"); + +#define DEFINE_XE_SRIOV_VFIO_FUNCTION(_type, _func, _impl) \ +_type xe_sriov_vfio_##_func(struct xe_device *xe, unsigned int vfid) \ +{ \ + if (!IS_SRIOV_PF(xe)) \ + return -EPERM; \ + if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) \ + return -EINVAL; \ + \ + guard(xe_pm_runtime_noresume)(xe); \ + \ + return xe_sriov_pf_##_impl(xe, vfid); \ +} \ +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_##_func, "xe-vfio-pci") + +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, wait_flr_done, control_wait_flr); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, suspend_device, control_pause_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_device, control_resume_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_enter, control_trigger_save_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_exit, control_finish_save_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_enter, control_trigger_restore_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_exit, control_finish_restore_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(int, error, control_stop_vf); +DEFINE_XE_SRIOV_VFIO_FUNCTION(ssize_t, stop_copy_size, migration_size); + +ssize_t xe_sriov_vfio_data_read(struct xe_device *xe, unsigned int vfid, + char __user *buf, size_t len) +{ + if (!IS_SRIOV_PF(xe)) + return -EPERM; + if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) + return -EINVAL; + + guard(xe_pm_runtime_noresume)(xe); + + return xe_sriov_pf_migration_read(xe, vfid, buf, len); +} +EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_read, "xe-vfio-pci"); + +ssize_t xe_sriov_vfio_data_write(struct xe_device *xe, unsigned int vfid, + const char __user *buf, size_t len) +{ + if (!IS_SRIOV_PF(xe)) + return -EPERM; + if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) + return -EINVAL; + + guard(xe_pm_runtime_noresume)(xe); + + return xe_sriov_pf_migration_write(xe, vfid, buf, len); +} 
+EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_write, "xe-vfio-pci"); diff --git a/include/drm/intel/xe_sriov_vfio.h b/include/drm/intel/xe_sriov_vfio.h new file mode 100644 index 0000000000000..e9814e8149fd5 --- /dev/null +++ b/include/drm/intel/xe_sriov_vfio.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2025 Intel Corporation + */ + +#ifndef _XE_SRIOV_VFIO_H_ +#define _XE_SRIOV_VFIO_H_ + +#include + +struct pci_dev; +struct xe_device; + +/** + * xe_sriov_vfio_get_pf() - Get PF &xe_device. + * @pdev: the VF &pci_dev device + * + * Return: pointer to PF &xe_device, NULL otherwise. + */ +struct xe_device *xe_sriov_vfio_get_pf(struct pci_dev *pdev); + +/** + * xe_sriov_vfio_migration_supported() - Check if migration is supported. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * + * Return: true if migration is supported, false otherwise. + */ +bool xe_sriov_vfio_migration_supported(struct xe_device *xe); + +/** + * xe_sriov_vfio_wait_flr_done() - Wait for VF FLR completion. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * This function will wait until VF FLR is processed by PF on all tiles (or + * until timeout occurs). + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_wait_flr_done(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_suspend_device() - Suspend VF. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * This function will pause VF on all tiles/GTs. + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_suspend_device(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_resume_device() - Resume VF. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * This function will resume VF on all tiles. + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_resume_device(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_stop_copy_enter() - Initiate a VF device migration data save. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_stop_copy_enter(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_stop_copy_exit() - Finish a VF device migration data save. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_stop_copy_exit(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_resume_data_enter() - Initiate a VF device migration data restore. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_resume_data_enter(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_resume_data_exit() - Finish a VF device migration data restore. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: 0 on success or a negative error code on failure. 
+ */ +int xe_sriov_vfio_resume_data_exit(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_error() - Move VF device to error state. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Reset is needed to move it out of error state. + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_error(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_data_read() - Read migration data from the VF device. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * @buf: start address of userspace buffer + * @len: requested read size from userspace + * + * Return: number of bytes that has been successfully read, + * 0 if no more migration data is available, -errno on failure. + */ +ssize_t xe_sriov_vfio_data_read(struct xe_device *xe, unsigned int vfid, + char __user *buf, size_t len); +/** + * xe_sriov_vfio_data_write() - Write migration data to the VF device. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * @buf: start address of userspace buffer + * @len: requested write size from userspace + * + * Return: number of bytes that has been successfully written, -errno on failure. + */ +ssize_t xe_sriov_vfio_data_write(struct xe_device *xe, unsigned int vfid, + const char __user *buf, size_t len); +/** + * xe_sriov_vfio_stop_copy_size() - Get a size estimate of VF device migration data. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: migration data size in bytes or a negative error code on failure. + */ +ssize_t xe_sriov_vfio_stop_copy_size(struct xe_device *xe, unsigned int vfid); + +#endif -- 2.51.2 In addition to generic VFIO PCI functionality, the driver implements VFIO migration uAPI, allowing userspace to enable migration for Intel Graphics SR-IOV Virtual Functions. The driver binds to VF device and uses API exposed by Xe driver to transfer the VF migration data under the control of PF device. 
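
For reference, a rough userspace sketch (not part of the patch) of how a VMM drives the standard VFIO migration uAPI that this driver implements. The wrapper struct follows the usual layout for VFIO_DEVICE_FEATURE requests, and the helper name set_mig_state is an assumption.

#include <linux/vfio.h>
#include <sys/ioctl.h>

/*
 * Move the device to a new migration state; returns the migration data fd
 * (valid for STOP_COPY/RESUMING transitions) or -1 on error.
 */
static int set_mig_state(int device_fd, __u32 device_state)
{
	struct {
		struct vfio_device_feature feature;
		struct vfio_device_feature_mig_state mig;
	} req = {
		.feature.argsz = sizeof(req),
		.feature.flags = VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE,
		.mig.device_state = device_state,
	};

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, &req))
		return -1;

	return req.mig.data_fd;
}

On the source, moving STOP -> STOP_COPY returns a data fd that is read() until it reports 0; on the destination, moving STOP -> RESUMING returns a data fd that the saved bytes are written into. Those reads and writes are serviced by the save/resume file_operations below, which forward to xe_sriov_vfio_data_read() and xe_sriov_vfio_data_write().
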
Signed-off-by: Michał Winiarski Acked-by: Rodrigo Vivi Reviewed-by: Kevin Tian --- MAINTAINERS | 7 + drivers/vfio/pci/Kconfig | 2 + drivers/vfio/pci/Makefile | 2 + drivers/vfio/pci/xe/Kconfig | 12 + drivers/vfio/pci/xe/Makefile | 3 + drivers/vfio/pci/xe/main.c | 573 +++++++++++++++++++++++++++++++++++ 6 files changed, 599 insertions(+) create mode 100644 drivers/vfio/pci/xe/Kconfig create mode 100644 drivers/vfio/pci/xe/Makefile create mode 100644 drivers/vfio/pci/xe/main.c diff --git a/MAINTAINERS b/MAINTAINERS index acc951f122eaf..adb5aa9cd29e9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27025,6 +27025,13 @@ L: virtualization@lists.linux.dev S: Maintained F: drivers/vfio/pci/virtio +VFIO XE PCI DRIVER +M: Michał Winiarski +L: kvm@vger.kernel.org +L: intel-xe@lists.freedesktop.org +S: Supported +F: drivers/vfio/pci/xe + VGA_SWITCHEROO R: Lukas Wunner S: Maintained diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 2b0172f546652..c100f0ab87f2d 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -67,4 +67,6 @@ source "drivers/vfio/pci/nvgrace-gpu/Kconfig" source "drivers/vfio/pci/qat/Kconfig" +source "drivers/vfio/pci/xe/Kconfig" + endmenu diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index cf00c0a7e55c8..f5d46aa9347b9 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -19,3 +19,5 @@ obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/ obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/ obj-$(CONFIG_QAT_VFIO_PCI) += qat/ + +obj-$(CONFIG_XE_VFIO_PCI) += xe/ diff --git a/drivers/vfio/pci/xe/Kconfig b/drivers/vfio/pci/xe/Kconfig new file mode 100644 index 0000000000000..cc9b6dac6ed39 --- /dev/null +++ b/drivers/vfio/pci/xe/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +config XE_VFIO_PCI + tristate "VFIO support for Intel Graphics" + depends on DRM_XE && PCI_IOV + select VFIO_PCI_CORE + help + This option enables device specific VFIO driver variant for Intel Graphics. + In addition to generic VFIO PCI functionality, it implements VFIO + migration uAPI allowing userspace to enable migration for + Intel Graphics SR-IOV Virtual Functions supported by the Xe driver. + + If you don't know what to do here, say N. 
diff --git a/drivers/vfio/pci/xe/Makefile b/drivers/vfio/pci/xe/Makefile new file mode 100644 index 0000000000000..13aa0fd192cd4 --- /dev/null +++ b/drivers/vfio/pci/xe/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_XE_VFIO_PCI) += xe-vfio-pci.o +xe-vfio-pci-y := main.o diff --git a/drivers/vfio/pci/xe/main.c b/drivers/vfio/pci/xe/main.c new file mode 100644 index 0000000000000..0156b53c678b7 --- /dev/null +++ b/drivers/vfio/pci/xe/main.c @@ -0,0 +1,573 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright © 2025 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct xe_vfio_pci_migration_file { + struct file *filp; + /* serializes accesses to migration data */ + struct mutex lock; + struct xe_vfio_pci_core_device *xe_vdev; + u8 disabled:1; +}; + +struct xe_vfio_pci_core_device { + struct vfio_pci_core_device core_device; + struct xe_device *xe; + /* PF internal control uses vfid index starting from 1 */ + unsigned int vfid; + u8 deferred_reset:1; + /* protects migration state */ + struct mutex state_mutex; + enum vfio_device_mig_state mig_state; + /* protects the reset_done flow */ + spinlock_t reset_lock; + struct xe_vfio_pci_migration_file *migf; +}; + +#define xe_vdev_to_dev(xe_vdev) (&(xe_vdev)->core_device.pdev->dev) + +static void xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file *migf) +{ + mutex_lock(&migf->lock); + migf->disabled = true; + mutex_unlock(&migf->lock); +} + +static void xe_vfio_pci_put_file(struct xe_vfio_pci_core_device *xe_vdev) +{ + xe_vfio_pci_disable_file(xe_vdev->migf); + fput(xe_vdev->migf->filp); + xe_vdev->migf = NULL; +} + +static void xe_vfio_pci_reset(struct xe_vfio_pci_core_device *xe_vdev) +{ + if (xe_vdev->migf) + xe_vfio_pci_put_file(xe_vdev); + + xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; +} + +static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device *xe_vdev) +{ + mutex_lock(&xe_vdev->state_mutex); +} + +/* + * This function is called in all state_mutex unlock cases to + * handle a 'deferred_reset' if exists. + */ +static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device *xe_vdev) +{ +again: + spin_lock(&xe_vdev->reset_lock); + if (xe_vdev->deferred_reset) { + xe_vdev->deferred_reset = false; + spin_unlock(&xe_vdev->reset_lock); + xe_vfio_pci_reset(xe_vdev); + goto again; + } + mutex_unlock(&xe_vdev->state_mutex); + spin_unlock(&xe_vdev->reset_lock); +} + +static void xe_vfio_pci_reset_done(struct pci_dev *pdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); + int ret; + + if (!pdev->is_virtfn) + return; + + /* + * VF FLR requires additional processing done by PF driver. + * The processing is done after FLR is already finished from PCIe + * perspective. + * In order to avoid a scenario where VF is used while PF processing + * is still in progress, additional synchronization point is needed. + */ + ret = xe_sriov_vfio_wait_flr_done(xe_vdev->xe, xe_vdev->vfid); + if (ret) + dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret); + + if (!xe_vdev->vfid) + return; + + /* + * As the higher VFIO layers are holding locks across reset and using + * those same locks with the mm_lock we need to prevent ABBA deadlock + * with the state_mutex and mm_lock. + * In case the state_mutex was taken already we defer the cleanup work + * to the unlock flow of the other running context. 
+ */ + spin_lock(&xe_vdev->reset_lock); + xe_vdev->deferred_reset = true; + if (!mutex_trylock(&xe_vdev->state_mutex)) { + spin_unlock(&xe_vdev->reset_lock); + return; + } + spin_unlock(&xe_vdev->reset_lock); + xe_vfio_pci_state_mutex_unlock(xe_vdev); + + xe_vfio_pci_reset(xe_vdev); +} + +static const struct pci_error_handlers xe_vfio_pci_err_handlers = { + .reset_done = xe_vfio_pci_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + +static int xe_vfio_pci_open_device(struct vfio_device *core_vdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &xe_vdev->core_device; + int ret; + + ret = vfio_pci_core_enable(vdev); + if (ret) + return ret; + + xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + + vfio_pci_core_finish_enable(vdev); + + return 0; +} + +static void xe_vfio_pci_close_device(struct vfio_device *core_vdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + + xe_vfio_pci_state_mutex_lock(xe_vdev); + xe_vfio_pci_reset(xe_vdev); + xe_vfio_pci_state_mutex_unlock(xe_vdev); + vfio_pci_core_close_device(core_vdev); +} + +static int xe_vfio_pci_release_file(struct inode *inode, struct file *filp) +{ + struct xe_vfio_pci_migration_file *migf = filp->private_data; + + mutex_destroy(&migf->lock); + kfree(migf); + + return 0; +} + +static ssize_t xe_vfio_pci_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) +{ + struct xe_vfio_pci_migration_file *migf = filp->private_data; + ssize_t ret; + + if (pos) + return -ESPIPE; + + mutex_lock(&migf->lock); + if (migf->disabled) { + mutex_unlock(&migf->lock); + return -ENODEV; + } + + ret = xe_sriov_vfio_data_read(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); + mutex_unlock(&migf->lock); + + return ret; +} + +static const struct file_operations xe_vfio_pci_save_fops = { + .owner = THIS_MODULE, + .read = xe_vfio_pci_save_read, + .release = xe_vfio_pci_release_file, + .llseek = noop_llseek, +}; + +static ssize_t xe_vfio_pci_resume_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct xe_vfio_pci_migration_file *migf = filp->private_data; + ssize_t ret; + + if (pos) + return -ESPIPE; + + mutex_lock(&migf->lock); + if (migf->disabled) { + mutex_unlock(&migf->lock); + return -ENODEV; + } + + ret = xe_sriov_vfio_data_write(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); + mutex_unlock(&migf->lock); + + return ret; +} + +static const struct file_operations xe_vfio_pci_resume_fops = { + .owner = THIS_MODULE, + .write = xe_vfio_pci_resume_write, + .release = xe_vfio_pci_release_file, + .llseek = noop_llseek, +}; + +static const char *vfio_dev_state_str(u32 state) +{ + switch (state) { + case VFIO_DEVICE_STATE_RUNNING: return "running"; + case VFIO_DEVICE_STATE_RUNNING_P2P: return "running_p2p"; + case VFIO_DEVICE_STATE_STOP_COPY: return "stopcopy"; + case VFIO_DEVICE_STATE_STOP: return "stop"; + case VFIO_DEVICE_STATE_RESUMING: return "resuming"; + case VFIO_DEVICE_STATE_ERROR: return "error"; + default: return ""; + } +} + +enum xe_vfio_pci_file_type { + XE_VFIO_FILE_SAVE = 0, + XE_VFIO_FILE_RESUME, +}; + +static struct xe_vfio_pci_migration_file * +xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev, + enum xe_vfio_pci_file_type type) +{ + struct xe_vfio_pci_migration_file *migf; + const struct file_operations *fops; + int flags; + + migf = kzalloc(sizeof(*migf), 
GFP_KERNEL_ACCOUNT); + if (!migf) + return ERR_PTR(-ENOMEM); + + fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops; + flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY; + migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags); + if (IS_ERR(migf->filp)) { + kfree(migf); + return ERR_CAST(migf->filp); + } + + mutex_init(&migf->lock); + migf->xe_vdev = xe_vdev; + xe_vdev->migf = migf; + + stream_open(migf->filp->f_inode, migf->filp); + + return migf; +} + +static struct file * +xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new) +{ + u32 cur = xe_vdev->mig_state; + int ret; + + dev_dbg(xe_vdev_to_dev(xe_vdev), + "state: %s->%s\n", vfio_dev_state_str(cur), vfio_dev_state_str(new)); + + /* + * "STOP" handling is reused for "RUNNING_P2P", as the device doesn't + * have the capability to selectively block outgoing p2p DMA transfers. + * While the device is allowing BAR accesses when the VF is stopped, it + * is not processing any new workload requests, effectively stopping + * any outgoing DMA transfers (not just p2p). + * Any VRAM / MMIO accesses occurring during "RUNNING_P2P" are kept and + * will be migrated to target VF during stop-copy. + */ + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { + ret = xe_sriov_vfio_suspend_device(xe_vdev->xe, xe_vdev->vfid); + if (ret) + goto err; + + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P)) + return NULL; + + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { + ret = xe_sriov_vfio_resume_device(xe_vdev->xe, xe_vdev->vfid); + if (ret) + goto err; + + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct xe_vfio_pci_migration_file *migf; + + migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE); + if (IS_ERR(migf)) { + ret = PTR_ERR(migf); + goto err; + } + get_file(migf->filp); + + ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->xe, xe_vdev->vfid); + if (ret) { + fput(migf->filp); + goto err; + } + + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) { + if (xe_vdev->migf) + xe_vfio_pci_put_file(xe_vdev); + + ret = xe_sriov_vfio_stop_copy_exit(xe_vdev->xe, xe_vdev->vfid); + if (ret) + goto err; + + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { + struct xe_vfio_pci_migration_file *migf; + + migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME); + if (IS_ERR(migf)) { + ret = PTR_ERR(migf); + goto err; + } + get_file(migf->filp); + + ret = xe_sriov_vfio_resume_data_enter(xe_vdev->xe, xe_vdev->vfid); + if (ret) { + fput(migf->filp); + goto err; + } + + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { + if (xe_vdev->migf) + xe_vfio_pci_put_file(xe_vdev); + + ret = xe_sriov_vfio_resume_data_exit(xe_vdev->xe, xe_vdev->vfid); + if (ret) + goto err; + + return NULL; + } + + WARN(true, "Unknown state transition %d->%d", cur, new); + return ERR_PTR(-EINVAL); + +err: + dev_dbg(xe_vdev_to_dev(xe_vdev), + "Failed to transition state: %s->%s err=%d\n", + vfio_dev_state_str(cur), vfio_dev_state_str(new), ret); + return ERR_PTR(ret); +} + +static struct file * +xe_vfio_pci_set_device_state(struct vfio_device *core_vdev, + enum vfio_device_mig_state new_state) +{ + struct xe_vfio_pci_core_device *xe_vdev = + 
container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + enum vfio_device_mig_state next_state; + struct file *f = NULL; + int ret; + + xe_vfio_pci_state_mutex_lock(xe_vdev); + while (new_state != xe_vdev->mig_state) { + ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state, + new_state, &next_state); + if (ret) { + xe_sriov_vfio_error(xe_vdev->xe, xe_vdev->vfid); + f = ERR_PTR(ret); + break; + } + f = xe_vfio_set_state(xe_vdev, next_state); + if (IS_ERR(f)) + break; + + xe_vdev->mig_state = next_state; + + /* Multiple state transitions with non-NULL file in the middle */ + if (f && new_state != xe_vdev->mig_state) { + fput(f); + f = ERR_PTR(-EINVAL); + break; + } + } + xe_vfio_pci_state_mutex_unlock(xe_vdev); + + return f; +} + +static int xe_vfio_pci_get_device_state(struct vfio_device *core_vdev, + enum vfio_device_mig_state *curr_state) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + + xe_vfio_pci_state_mutex_lock(xe_vdev); + *curr_state = xe_vdev->mig_state; + xe_vfio_pci_state_mutex_unlock(xe_vdev); + + return 0; +} + +static int xe_vfio_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(vdev, struct xe_vfio_pci_core_device, core_device.vdev); + + xe_vfio_pci_state_mutex_lock(xe_vdev); + *stop_copy_length = xe_sriov_vfio_stop_copy_size(xe_vdev->xe, xe_vdev->vfid); + xe_vfio_pci_state_mutex_unlock(xe_vdev); + + return 0; +} + +static const struct vfio_migration_ops xe_vfio_pci_migration_ops = { + .migration_set_state = xe_vfio_pci_set_device_state, + .migration_get_state = xe_vfio_pci_get_device_state, + .migration_get_data_size = xe_vfio_pci_get_data_size, +}; + +static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev) +{ + struct vfio_device *core_vdev = &xe_vdev->core_device.vdev; + struct pci_dev *pdev = to_pci_dev(core_vdev->dev); + struct xe_device *xe = xe_sriov_vfio_get_pf(pdev); + + if (!xe) + return; + if (!xe_sriov_vfio_migration_supported(xe)) + return; + + mutex_init(&xe_vdev->state_mutex); + spin_lock_init(&xe_vdev->reset_lock); + + /* PF internal control uses vfid index starting from 1 */ + xe_vdev->vfid = pci_iov_vf_id(pdev) + 1; + xe_vdev->xe = xe; + + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; + core_vdev->mig_ops = &xe_vfio_pci_migration_ops; +} + +static void xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device *xe_vdev) +{ + if (!xe_vdev->vfid) + return; + + mutex_destroy(&xe_vdev->state_mutex); +} + +static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + + xe_vfio_pci_migration_init(xe_vdev); + + return vfio_pci_core_init_dev(core_vdev); +} + +static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); + + xe_vfio_pci_migration_fini(xe_vdev); +} + +static const struct vfio_device_ops xe_vfio_pci_ops = { + .name = "xe-vfio-pci", + .init = xe_vfio_pci_init_dev, + .release = xe_vfio_pci_release_dev, + .open_device = xe_vfio_pci_open_device, + .close_device = xe_vfio_pci_close_device, + .ioctl = vfio_pci_core_ioctl, + .device_feature = vfio_pci_core_ioctl_feature, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + 
.request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +static int xe_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct xe_vfio_pci_core_device *xe_vdev; + int ret; + + xe_vdev = vfio_alloc_device(xe_vfio_pci_core_device, core_device.vdev, &pdev->dev, + &xe_vfio_pci_ops); + if (IS_ERR(xe_vdev)) + return PTR_ERR(xe_vdev); + + dev_set_drvdata(&pdev->dev, &xe_vdev->core_device); + + ret = vfio_pci_core_register_device(&xe_vdev->core_device); + if (ret) { + vfio_put_device(&xe_vdev->core_device.vdev); + return ret; + } + + return 0; +} + +static void xe_vfio_pci_remove(struct pci_dev *pdev) +{ + struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); + + vfio_pci_core_unregister_device(&xe_vdev->core_device); + vfio_put_device(&xe_vdev->core_device.vdev); +} + +#define INTEL_PCI_VFIO_DEVICE(_id) { \ + PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, (_id)) \ +} + +static const struct pci_device_id xe_vfio_pci_table[] = { + INTEL_PTL_IDS(INTEL_PCI_VFIO_DEVICE), + INTEL_WCL_IDS(INTEL_PCI_VFIO_DEVICE), + INTEL_BMG_IDS(INTEL_PCI_VFIO_DEVICE), + {} +}; +MODULE_DEVICE_TABLE(pci, xe_vfio_pci_table); + +static struct pci_driver xe_vfio_pci_driver = { + .name = "xe-vfio-pci", + .id_table = xe_vfio_pci_table, + .probe = xe_vfio_pci_probe, + .remove = xe_vfio_pci_remove, + .err_handler = &xe_vfio_pci_err_handlers, + .driver_managed_dma = true, +}; +module_pci_driver(xe_vfio_pci_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michał Winiarski "); +MODULE_DESCRIPTION("VFIO PCI driver with migration support for Intel Graphics"); -- 2.51.2