From: Vipin Sharma Implement the live update file handler callbacks to preserve a vfio-pci device across a Live Update. Subsequent commits will enable userspace to then retrieve this file after the Live Update. Live Update support is scoped only to cdev files (i.e. not VFIO_GROUP_GET_DEVICE_FD files). State about each device is serialized into a new ABI struct vfio_pci_core_device_ser. The contents of this struct are preserved across the Live Update to the next kernel using a combination of Kexec-Handover (KHO) to preserve the page(s) holding the struct and the Live Update Orchestrator (LUO) to preserve the physical address of the struct. For now the only contents of struct vfio_pci_core_device_ser are the device's PCI segment number and BDF, so that the device can be uniquely identified after the Live Update. Require that userspace disables interrupts on the device prior to freeze() so that the device does not send any interrupts until new interrupt handlers have been set up by the next kernel. Reset the device and restore its state in the freeze() callback. This ensures the device can be received by the next kernel in a consistent state. Eventually this will be dropped and the device can be preserved across a Live Update in a running state, but that requires further work in VFIO and the core PCI layer. Note that LUO holds a reference to this file when it is preserved. So VFIO is guaranteed that vfio_df_device_last_close() will not be called on this device no matter what userspace does.
Signed-off-by: Vipin Sharma Co-developed-by: David Matlack Signed-off-by: David Matlack --- drivers/vfio/pci/vfio_pci.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 57 +++++---- drivers/vfio/pci/vfio_pci_liveupdate.c | 156 ++++++++++++++++++++++++- drivers/vfio/pci/vfio_pci_priv.h | 4 + drivers/vfio/vfio_main.c | 3 +- include/linux/kho/abi/vfio_pci.h | 15 +++ include/linux/vfio.h | 2 + 7 files changed, 213 insertions(+), 26 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 41dcbe4ace67..351480d13f6e 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -125,7 +125,7 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev) return 0; } -static const struct vfio_device_ops vfio_pci_ops = { +const struct vfio_device_ops vfio_pci_ops = { .name = "vfio-pci", .init = vfio_pci_core_init_dev, .release = vfio_pci_core_release_dev, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index d43745fe4c84..81f941323641 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -585,9 +585,42 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) } EXPORT_SYMBOL_GPL(vfio_pci_core_enable); +void vfio_pci_core_try_reset(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + struct pci_dev *bridge = pci_upstream_bridge(pdev); + + lockdep_assert_held(&vdev->vdev.dev_set->lock); + + if (!vdev->reset_works) + return; + + /* + * Try to get the locks ourselves to prevent a deadlock. The + * success of this is dependent on being able to lock the device, + * which is not always possible. + * + * We cannot use the "try" reset interface here, since that will + * overwrite the previously restored configuration information. 
+ */ + if (bridge && !pci_dev_trylock(bridge)) + return; + + if (!pci_dev_trylock(pdev)) + goto out; + + if (!__pci_reset_function_locked(pdev)) + vdev->needs_reset = false; + + pci_dev_unlock(pdev); +out: + if (bridge) + pci_dev_unlock(bridge); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_try_reset); + void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) { - struct pci_dev *bridge; struct pci_dev *pdev = vdev->pdev; struct vfio_pci_dummy_resource *dummy_res, *tmp; struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; @@ -687,27 +720,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) */ pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); - /* - * Try to get the locks ourselves to prevent a deadlock. The - * success of this is dependent on being able to lock the device, - * which is not always possible. - * We can not use the "try" reset interface here, which will - * overwrite the previously restored configuration information. - */ - if (vdev->reset_works) { - bridge = pci_upstream_bridge(pdev); - if (bridge && !pci_dev_trylock(bridge)) - goto out_restore_state; - if (pci_dev_trylock(pdev)) { - if (!__pci_reset_function_locked(pdev)) - vdev->needs_reset = false; - pci_dev_unlock(pdev); - } - if (bridge) - pci_dev_unlock(bridge); - } - -out_restore_state: + vfio_pci_core_try_reset(vdev); pci_restore_state(pdev); out: pci_disable_device(pdev); diff --git a/drivers/vfio/pci/vfio_pci_liveupdate.c b/drivers/vfio/pci/vfio_pci_liveupdate.c index 5ea5af46b159..c4ebc7c486e5 100644 --- a/drivers/vfio/pci/vfio_pci_liveupdate.c +++ b/drivers/vfio/pci/vfio_pci_liveupdate.c @@ -6,27 +6,178 @@ * David Matlack */ +/** + * DOC: VFIO PCI Preservation via LUO + * + * VFIO PCI devices can be preserved over a kexec using the Live Update + * Orchestrator (LUO) file preservation. This allows userspace (such as a VMM) + * to transfer an in-use device to the next kernel. + * + * .. 
note:: + * The support for preserving VFIO PCI devices is currently *partial* and + * should be considered *experimental*. It should only be used by developers + * working on expanding the support for the time being. + * + * To avoid accidental usage while the support is still experimental, this + * support is hidden behind a default-disabled config option + * ``CONFIG_VFIO_PCI_LIVEUPDATE``. Once the kernel support has stabilized and + * become complete, this option will be enabled by default when + * ``CONFIG_VFIO_PCI`` and ``CONFIG_LIVEUPDATE`` are enabled. + * + * Usage Example + * ============= + * + * VFIO PCI devices can be preserved across a kexec by preserving the file + * associated with the device in a LUO session:: + * + * device_fd = open("/dev/vfio/devices/X"); + * ... + * ioctl(session_fd, LIVEUPDATE_SESSION_PRESERVE_FD, { ..., device_fd, ...}); + * + * .. note:: + * LUO will hold an extra reference to the device file for as long as it is + * preserved, so there is no way for the file to be destroyed or the device + * to be unbound from the vfio-pci driver while it is preserved. + * + * Retrieving the file after kexec is not yet supported. + * + * Restrictions + * ============ + * + * The kernel imposes the following restrictions when preserving VFIO devices: + * + * * The device must be bound to the ``vfio-pci`` driver. + * + * * ``CONFIG_VFIO_PCI_ZDEV_KVM`` must not be enabled. This may be relaxed in + * the future. + * + * * The device must not be an Intel display device. This may be relaxed in the + * future. + * + * * The device file must have been acquired from the VFIO character device, + * not ``VFIO_GROUP_GET_DEVICE_FD``. + * + * * The device must have interrupts disabled prior to kexec. Failure to disable + * interrupts on the device will cause the ``reboot(LINUX_REBOOT_CMD_KEXEC)`` + * syscall (to initiate the kexec) to fail.
+ * + * Preservation Behavior + * ===================== + * + * The eventual goal of this support is to avoid disrupting the workload, state, + * or configuration of each preserved device during a Live Update. This would + * include allowing the device to perform DMA to preserved memory buffers and + * perform P2P DMA to other preserved devices. However, there are many pieces + * that still need to land in the kernel. + * + * For now, VFIO only preserves the following state for devices: + * + * * The PCI Segment, Bus, Device, and Function numbers of the device. The + * kernel guarantees that these will not change across a kexec when a device + * is preserved. + * + * Since the kernel is not yet prepared to preserve all parts of the device and + * its dependencies (such as DMA mappings), VFIO currently resets and restores + * preserved devices back into an idle state during kexec, before handing off + * control to the next kernel. This will be relaxed in future versions of the + * kernel once it is safe to allow the device to keep running across kexec. + */ + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include +#include #include "vfio_pci_priv.h" static bool vfio_pci_liveupdate_can_preserve(struct liveupdate_file_handler *handler, struct file *file) { - return false; + struct vfio_device *device = vfio_device_from_file(file); + struct vfio_pci_core_device *vdev; + struct pci_dev *pdev; + + if (!device) + return false; + + /* Live Update support is limited to cdev files. */ + if (!vfio_device_cdev_opened(device)) + return false; + + if (device->ops != &vfio_pci_ops) + return false; + + vdev = container_of(device, struct vfio_pci_core_device, vdev); + pdev = vdev->pdev; + + /* + * Don't support specialized vfio-pci devices for now since they haven't + * been tested.
+ */ + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) || vfio_pci_is_intel_display(pdev)) + return false; + + return true; } static int vfio_pci_liveupdate_preserve(struct liveupdate_file_op_args *args) { - return -EOPNOTSUPP; + struct vfio_device *device = vfio_device_from_file(args->file); + struct vfio_pci_core_device_ser *ser; + struct vfio_pci_core_device *vdev; + struct pci_dev *pdev; + + vdev = container_of(device, struct vfio_pci_core_device, vdev); + pdev = vdev->pdev; + + ser = kho_alloc_preserve(sizeof(*ser)); + if (IS_ERR(ser)) + return PTR_ERR(ser); + + ser->bdf = pci_dev_id(pdev); + ser->domain = pci_domain_nr(pdev->bus); + + args->serialized_data = virt_to_phys(ser); + return 0; } static void vfio_pci_liveupdate_unpreserve(struct liveupdate_file_op_args *args) { + kho_unpreserve_free(phys_to_virt(args->serialized_data)); +} + +static int vfio_pci_liveupdate_freeze(struct liveupdate_file_op_args *args) +{ + struct vfio_device *device = vfio_device_from_file(args->file); + struct vfio_pci_core_device *vdev; + struct pci_dev *pdev; + int ret; + + vdev = container_of(device, struct vfio_pci_core_device, vdev); + pdev = vdev->pdev; + + guard(mutex)(&device->dev_set->lock); + + /* + * Userspace must disable interrupts on the device prior to freeze so + * that the device does not send any interrupts until new interrupt + * handlers have been established by the next kernel. + */ + if (vdev->irq_type != VFIO_PCI_NUM_IRQS) { + pci_err(pdev, "Freeze failed! 
Interrupts are still enabled.\n"); + return -EINVAL; + } + + ret = pci_load_saved_state(pdev, vdev->pci_saved_state); + if (ret) + return ret; + + vfio_pci_core_try_reset(vdev); + pci_restore_state(pdev); + return 0; } static int vfio_pci_liveupdate_retrieve(struct liveupdate_file_op_args *args) @@ -42,6 +193,7 @@ static const struct liveupdate_file_ops vfio_pci_liveupdate_file_ops = { .can_preserve = vfio_pci_liveupdate_can_preserve, .preserve = vfio_pci_liveupdate_preserve, .unpreserve = vfio_pci_liveupdate_unpreserve, + .freeze = vfio_pci_liveupdate_freeze, .retrieve = vfio_pci_liveupdate_retrieve, .finish = vfio_pci_liveupdate_finish, .owner = THIS_MODULE, diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index cbf46e09da30..fa5c7f544f8a 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -11,6 +11,10 @@ /* Cap maximum number of ioeventfds per device (arbitrary) */ #define VFIO_PCI_IOEVENTFD_MAX 1000 +extern const struct vfio_device_ops vfio_pci_ops; + +void vfio_pci_core_try_reset(struct vfio_pci_core_device *vdev); + struct vfio_pci_ioeventfd { struct list_head next; struct vfio_pci_core_device *vdev; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 742477546b15..8b222f71bbab 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1436,7 +1436,7 @@ const struct file_operations vfio_device_fops = { #endif }; -static struct vfio_device *vfio_device_from_file(struct file *file) +struct vfio_device *vfio_device_from_file(struct file *file) { struct vfio_device_file *df = file->private_data; @@ -1444,6 +1444,7 @@ static struct vfio_device *vfio_device_from_file(struct file *file) return NULL; return df->device; } +EXPORT_SYMBOL_GPL(vfio_device_from_file); /** * vfio_file_is_valid - True if the file is valid vfio file diff --git a/include/linux/kho/abi/vfio_pci.h b/include/linux/kho/abi/vfio_pci.h index e2412b455e61..876aaf81dd92 100644 --- 
a/include/linux/kho/abi/vfio_pci.h +++ b/include/linux/kho/abi/vfio_pci.h @@ -9,6 +9,9 @@ #ifndef _LINUX_LIVEUPDATE_ABI_VFIO_PCI_H #define _LINUX_LIVEUPDATE_ABI_VFIO_PCI_H +#include +#include + /** * DOC: VFIO PCI Live Update ABI * @@ -25,4 +28,16 @@ #define VFIO_PCI_LUO_FH_COMPATIBLE "vfio-pci-v1" +/** + * struct vfio_pci_core_device_ser - Serialized state of a single VFIO PCI + * device. + * + * @domain: The device's PCI domain number (segment). + * @bdf: The device's PCI bus, device, and function number. + */ +struct vfio_pci_core_device_ser { + u32 domain; + u16 bdf; +} __packed; + #endif /* _LINUX_LIVEUPDATE_ABI_VFIO_PCI_H */ diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e90859956514..e9d3ddb715c5 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -81,6 +81,8 @@ struct vfio_device { #endif }; +struct vfio_device *vfio_device_from_file(struct file *file); + /** * struct vfio_device_ops - VFIO bus driver device callbacks * -- 2.53.0.983.g0bb29b3bc5-goog