Introduce a function zpci_fmb_reenable_device() that checks for the state of the FMB and reuses the same buffer where appropriate. If FMB was not previously enabled, it enables it for the device. Call this function during a zPCI device re-enablement, which in turn implicitly ensures that the FMB is enabled for host devices during their KVM registration. This function also clears out the software counters, so that a program resetting an FMB would see all its counters restart from zero as expected. The function to clear the software counters is also separated into a static function as it is now reused in both zpci_fmb_enable_device() and zpci_fmb_reenable_device(). Signed-off-by: Omar Elghoul --- arch/s390/include/asm/pci.h | 1 + arch/s390/pci/pci.c | 75 +++++++++++++++++++++++++++++-------- 2 files changed, 61 insertions(+), 15 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 5dcf35f0f325..65014e52d559 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -323,6 +323,7 @@ void zpci_remove_parent_msi_domain(struct zpci_bus *zbus); /* FMB */ int zpci_fmb_enable_device(struct zpci_dev *); int zpci_fmb_disable_device(struct zpci_dev *); +int zpci_fmb_reenable_device(struct zpci_dev *zdev); /* Debug */ int zpci_debug_init(void); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 39bd2adfc240..56cabb2dc291 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -164,22 +164,10 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) return cc; } -/* Modify PCI: Set PCI function measurement parameters */ -int zpci_fmb_enable_device(struct zpci_dev *zdev) +static void zpci_fmb_clear_iommu_ctrs(struct zpci_dev *zdev) { - u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_SET_MEASURE); struct zpci_iommu_ctrs *ctrs; - struct zpci_fib fib = {0}; - unsigned long flags; - u8 cc, status; - - if (zdev->fmb || sizeof(*zdev->fmb) < zdev->fmb_length) - return -EINVAL; - - zdev->fmb = 
kmem_cache_zalloc(zdev_fmb_cache, GFP_KERNEL); - if (!zdev->fmb) - return -ENOMEM; - WARN_ON((u64) zdev->fmb & 0xf); + unsigned long flags = 0; /* reset software counters */ spin_lock_irqsave(&zdev->dom_lock, flags); @@ -192,7 +180,24 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev) atomic64_set(&ctrs->sync_rpcits, 0); } spin_unlock_irqrestore(&zdev->dom_lock, flags); +} + +/* Modify PCI: Set PCI function measurement parameters */ +int zpci_fmb_enable_device(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_SET_MEASURE); + struct zpci_fib fib = {0}; + u8 cc, status; + + if (zdev->fmb || sizeof(*zdev->fmb) < zdev->fmb_length) + return -EINVAL; + + zdev->fmb = kmem_cache_zalloc(zdev_fmb_cache, GFP_KERNEL); + if (!zdev->fmb) + return -ENOMEM; + WARN_ON((u64) zdev->fmb & 0xf); + zpci_fmb_clear_iommu_ctrs(zdev); fib.fmb_addr = virt_to_phys(zdev->fmb); fib.gd = zdev->gisa; @@ -227,6 +232,41 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev) } return cc ? -EIO : 0; } +EXPORT_SYMBOL_GPL(zpci_fmb_disable_device); + +int zpci_fmb_reenable_device(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_SET_MEASURE); + struct zpci_fib fib = {0}; + u8 cc, status; + + lockdep_assert_held(&zdev->fmb_lock); + + if (!zdev->fmb) + return zpci_fmb_enable_device(zdev); + + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, &status); /* Disable function measurement */ + + /* Unlike in zpci_fmb_disable_device(), cc == 3 is not a valid state here + * because we are re-enabling function measurement for the same function + * handle. 
+ */ + if (cc) + return -EIO; + + zpci_fmb_clear_iommu_ctrs(zdev); + + fib.fmb_addr = virt_to_phys(zdev->fmb); + cc = zpci_mod_fc(req, &fib, &status); /* Re-enable function measurement */ + if (cc) { + kmem_cache_free(zdev_fmb_cache, zdev->fmb); + zdev->fmb = NULL; + return -EIO; + } + return 0; +} +EXPORT_SYMBOL_GPL(zpci_fmb_reenable_device); static int zpci_cfg_load(struct zpci_dev *zdev, int offset, u32 *val, u8 len) { @@ -729,9 +769,14 @@ int zpci_reenable_device(struct zpci_dev *zdev) } rc = zpci_iommu_register_ioat(zdev, &status); - if (rc) + if (rc) { zpci_disable_device(zdev); + return rc; + } + mutex_lock(&zdev->fmb_lock); + zpci_fmb_reenable_device(zdev); + mutex_unlock(&zdev->fmb_lock); return rc; } EXPORT_SYMBOL_GPL(zpci_reenable_device); -- 2.52.0 Set up a new VFIO feature for zPCI devices to share the latest FMB snapshot with userspace. This feature supports the same 4 FMB formats (0 through 3) that are already supported by the kernel. With VFIO_DEVICE_FEATURE_GET, allow the userspace driver to read the latest FMB snapshot and to query whether the FMB is currently enabled on the function, which in turn indicates whether the FMB snapshot is valid. With VFIO_DEVICE_FEATURE_SET, on the other hand, the userspace driver can enable or disable the FMB.
Signed-off-by: Omar Elghoul --- drivers/vfio/pci/vfio_pci_core.c | 2 + drivers/vfio/pci/vfio_pci_priv.h | 9 ++++ drivers/vfio/pci/vfio_pci_zdev.c | 77 ++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 43 ++++++++++++++++++ 4 files changed, 131 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 050e7542952e..07e13667d66a 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1569,6 +1569,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, return vfio_pci_core_feature_token(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_DMA_BUF: return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_ZPCI_FMB: + return vfio_pci_zdev_feature_fmb(vdev, flags, arg, argsz); default: return -ENOTTY; } diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index fca9d0dfac90..208e05942b48 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -93,6 +93,8 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, struct vfio_info_cap *caps); int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev); void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev); +int vfio_pci_zdev_feature_fmb(struct vfio_pci_core_device *vdev, u32 flags, + void __user *arg, size_t argsz); #else static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, struct vfio_info_cap *caps) @@ -107,6 +109,13 @@ static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev) static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) {} + +static inline int vfio_pci_zdev_feature_fmb(struct vfio_pci_core_device *vdev, + u32 flags, void __user *arg, + size_t argsz) +{ + return -ENOTTY; +} #endif static inline bool vfio_pci_is_vga(struct pci_dev *pdev) diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c 
index 0990fdb146b7..1e9efe2bee69 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -167,3 +167,80 @@ void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) if (zpci_kvm_hook.kvm_unregister) zpci_kvm_hook.kvm_unregister(zdev); } + +int vfio_pci_zdev_feature_fmb(struct vfio_pci_core_device *vdev, u32 flags, + void __user *arg, size_t argsz) +{ + struct zpci_dev *zdev; + struct vfio_device_feature_zpci_fmb fmb = {0}; + u32 ops = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_SET; + int ret; + + ret = vfio_check_feature(flags, argsz, ops, sizeof(fmb)); + if (ret != 1) + return ret; + + zdev = to_zpci(vdev->pdev); + if (!zdev) + return -ENODEV; + + mutex_lock(&zdev->fmb_lock); + if (flags & VFIO_DEVICE_FEATURE_SET) { + if (copy_from_user(&fmb, arg, sizeof(fmb))) { + ret = -EFAULT; + goto release_lock; + } + + if (fmb.flags & VFIO_DEVICE_FEATURE_ZPCI_FMB_FLAGS_ENABLED) + ret = zpci_fmb_reenable_device(zdev); + else + ret = zpci_fmb_disable_device(zdev); + goto release_lock; + } + + ret = 0; + if (zdev->fmb) { + fmb.flags |= VFIO_DEVICE_FEATURE_ZPCI_FMB_FLAGS_ENABLED; + } else { + fmb.flags &= ~VFIO_DEVICE_FEATURE_ZPCI_FMB_FLAGS_ENABLED; + goto release_lock; + } + + fmb.format = zdev->fmb->format; + fmb.fmt_ind = zdev->fmb->fmt_ind; + fmb.samples = zdev->fmb->samples; + fmb.last_update = zdev->fmb->last_update; + fmb.ld_ops = zdev->fmb->ld_ops; + fmb.st_ops = zdev->fmb->st_ops; + fmb.stb_ops = zdev->fmb->stb_ops; + fmb.rpcit_ops = zdev->fmb->rpcit_ops; + + switch (zdev->fmb->format) { + case 0: + if (zdev->fmb->fmt_ind & ZPCI_FMB_DMA_COUNTER_VALID) { + fmb.fmt0.dma_rbytes = zdev->fmb->fmt0.dma_rbytes; + fmb.fmt0.dma_wbytes = zdev->fmb->fmt0.dma_wbytes; + } + break; + case 1: + fmb.fmt1.rx_bytes = zdev->fmb->fmt1.rx_bytes; + fmb.fmt1.rx_packets = zdev->fmb->fmt1.rx_packets; + fmb.fmt1.tx_bytes = zdev->fmb->fmt1.tx_bytes; + fmb.fmt1.tx_packets = zdev->fmb->fmt1.tx_packets; + break; + case 2: + fmb.fmt2.consumed_work_units = 
zdev->fmb->fmt2.consumed_work_units; + fmb.fmt2.max_work_units = zdev->fmb->fmt2.max_work_units; + break; + case 3: + fmb.fmt3.tx_bytes = zdev->fmb->fmt3.tx_bytes; + break; + } + + if (copy_to_user(arg, &fmb, sizeof(fmb))) + ret = -EFAULT; + +release_lock: + mutex_unlock(&zdev->fmb_lock); + return ret; +} diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 5de618a3a5ee..6cbc34ff063e 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1534,6 +1534,49 @@ struct vfio_device_feature_dma_buf { */ #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12 +/** + * Upon VFIO_DEVICE_FEATURE_GET, provide FMB passthrough for VFIO zPCI devices. + * + * Upon VFIO_DEVICE_FEATURE_SET, only the flags field is read while the + * remainder of the structure is ignored. This allows the driver to enable or + * disable the FMB while also leaving reserved bits for future flag expansion. + * All reserved fields should be zero for future compatibility. + */ +#define VFIO_DEVICE_FEATURE_ZPCI_FMB 13 +#define VFIO_DEVICE_FEATURE_ZPCI_FMB_FLAGS_ENABLED 0x1 + +struct vfio_device_feature_zpci_fmb { + __u64 flags; + __u32 format: 8; + __u32 fmt_ind: 24; + __u32 samples; + __u64 last_update; + __u64 ld_ops; + __u64 st_ops; + __u64 stb_ops; + __u64 rpcit_ops; + union { + struct { + __u64 dma_rbytes; + __u64 dma_wbytes; + } fmt0; + struct { + __u64 rx_bytes; + __u64 rx_packets; + __u64 tx_bytes; + __u64 tx_packets; + } fmt1; + struct { + __u64 consumed_work_units; + __u64 max_work_units; + } fmt2; + struct { + __u64 tx_bytes; + } fmt3; + }; + __u64 reserved[16]; +}; + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- 2.52.0 Introduce a fence over enabling or disabling FMB via debugfs when the zPCI device is associated with a KVM.
This allows a KVM guest to use FMB passthrough and avoids the edge case where the host disables the FMB while the guest is still using it, which could cause partial counter resets and inconsistent reads that have no parallel in the architecture. With this patch, the userspace driver (most likely QEMU) can still enable or disable the FMB through the VFIO device feature introduced in the previous patch, which keeps the FMB state tied to the VM and isolated from other processes on the host. For VFIO devices that are not associated with a KVM (i.e., for userspace drivers other than QEMU), this fence does not take effect. Signed-off-by: Omar Elghoul --- arch/s390/pci/pci_debug.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index c7ed7bf254b5..2601614b919b 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -149,9 +149,15 @@ static ssize_t pci_perf_seq_write(struct file *file, const char __user *ubuf, if (!zdev) return 0; + mutex_lock(&zdev->kzdev_lock); + if (zdev->kzdev) { + rc = -EPERM; + goto release_kzdev_and_out; + } + rc = kstrtoul_from_user(ubuf, count, 10, &val); if (rc) - return rc; + goto release_kzdev_and_out; mutex_lock(&zdev->fmb_lock); switch (val) { @@ -163,6 +169,9 @@ static ssize_t pci_perf_seq_write(struct file *file, const char __user *ubuf, break; } mutex_unlock(&zdev->fmb_lock); + +release_kzdev_and_out: + mutex_unlock(&zdev->kzdev_lock); return rc ? rc : count; } -- 2.52.0