Expand the VFIO DMABUF revocation state to three states: Not revoked, temporarily revoked, and permanently revoked. The first two are for existing transient revocation, e.g. across a function reset, and the DMABUF is put into the last in response to a new VFIO feature VFIO_DEVICE_FEATURE_DMA_BUF. VFIO_DEVICE_FEATURE_DMA_BUF passes a DMABUF by fd and requests that the DMABUF is permanently revoked. On success, it's guaranteed that the buffer can never be imported/attached/mmap()ed in future, that dynamic imports have been cleanly detached, and that all mappings have been made inaccessible/PTEs zapped. This is useful for lifecycle management, to reclaim VFIO PCI BAR ranges previously delegated to a subordinate client process: The driver process can ensure that the loaned resources are revoked when the client is deemed "done", and exported ranges can be safely re-used elsewhere. Refactor the revocation code out of vfio_pci_dma_buf_move() to a function common to move and the new feature request path. Signed-off-by: Matt Evans --- drivers/vfio/pci/vfio_pci_core.c | 6 +- drivers/vfio/pci/vfio_pci_dmabuf.c | 169 ++++++++++++++++++++++------- drivers/vfio/pci/vfio_pci_priv.h | 19 +++- include/uapi/linux/vfio.h | 20 ++++ 4 files changed, 173 insertions(+), 41 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 508a5eca910a..064906b25467 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1573,6 +1573,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, return vfio_pci_core_feature_token(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_DMA_BUF: return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_DMA_BUF_REVOKE: + return vfio_pci_core_feature_dma_buf_revoke(vdev, flags, arg, argsz); default: return -ENOTTY; } @@ -1784,7 +1786,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, dma_resv_lock(priv->dmabuf->resv, NULL); - if (priv->revoked) { + if (priv->status != VFIO_PCI_DMABUF_OK) { pr_debug_ratelimited("%s VA 0x%lx, pgoff 0x%lx: DMABUF revoked/cleaned up\n", __func__, vmf->address, vma->vm_pgoff); dma_resv_unlock(priv->dmabuf->resv); @@ -1809,7 +1811,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, scoped_guard(rwsem_read, &vdev->memory_lock) { /* Revocation status must be re-read, under memory_lock */ - if (!priv->revoked) { + if (priv->status == VFIO_PCI_DMABUF_OK) { int pres = vfio_pci_dma_buf_find_pfn(priv, vma, vmf->address, order, &pfn); diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index 2fb09a2c0f6b..b47411992ab6 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -19,7 +19,7 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, if (!attachment->peer2peer) return -EOPNOTSUPP; - if (priv->revoked) + if (priv->status != VFIO_PCI_DMABUF_OK) return -ENODEV; if (!dma_buf_attach_revocable(attachment)) @@ -41,7 +41,7 @@ static int vfio_pci_dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct * * still safe because the fault handler ultimately prevents * access to a revoked buffer if it isn't caught here. */ - if (READ_ONCE(priv->revoked)) + if (READ_ONCE(priv->status) != VFIO_PCI_DMABUF_OK) return -ENODEV; if ((vma->vm_flags & VM_SHARED) == 0) return -EINVAL; @@ -81,7 +81,7 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, dma_resv_assert_held(priv->dmabuf->resv); - if (priv->revoked) + if (priv->status != VFIO_PCI_DMABUF_OK) return ERR_PTR(-ENODEV); ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider, @@ -291,7 +291,8 @@ static int vfio_pci_dmabuf_export(struct vfio_pci_core_device *vdev, INIT_LIST_HEAD(&priv->dmabufs_elm); down_write(&vdev->memory_lock); dma_resv_lock(priv->dmabuf->resv, NULL); - priv->revoked = !__vfio_pci_memory_enabled(vdev); + priv->status = __vfio_pci_memory_enabled(vdev) ? VFIO_PCI_DMABUF_OK : + VFIO_PCI_DMABUF_TEMP_REVOKED; list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs); dma_resv_unlock(priv->dmabuf->resv); up_write(&vdev->memory_lock); @@ -322,7 +323,7 @@ int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, return -EOPNOTSUPP; priv = attachment->dmabuf->priv; - if (priv->revoked) + if (priv->status != VFIO_PCI_DMABUF_OK) return -ENODEV; /* More than one range to iommufd will require proper DMABUF support */ @@ -591,6 +592,64 @@ int vfio_pci_core_mmap_prep_dmabuf(struct vfio_pci_core_device *vdev, return ret; } +/* Set the DMABUF's revocation status (OK or temporarily/permanently revoked) */ +static void vfio_pci_dma_buf_set_status(struct vfio_pci_dma_buf *priv, + enum vfio_pci_dma_buf_status new_status) +{ + bool was_revoked; + + lockdep_assert_held_write(&priv->vdev->memory_lock); + + if (priv->status == VFIO_PCI_DMABUF_PERM_REVOKED || + priv->status == new_status) { + return; + } + + dma_resv_lock(priv->dmabuf->resv, NULL); + was_revoked = (priv->status == VFIO_PCI_DMABUF_TEMP_REVOKED); + + if (new_status != VFIO_PCI_DMABUF_OK) { + priv->status = new_status; /* Temp or permanently revoked */ + + if (was_revoked) { + /* + * TEMP_REVOKED is being upgraded to + * PERM_REVOKED. The buffer is already gone, + * don't wait on it again. + */ + dma_resv_unlock(priv->dmabuf->resv); + return; + } + } + + dma_buf_invalidate_mappings(priv->dmabuf); + dma_resv_wait_timeout(priv->dmabuf->resv, + DMA_RESV_USAGE_BOOKKEEP, false, + MAX_SCHEDULE_TIMEOUT); + dma_resv_unlock(priv->dmabuf->resv); + if (new_status != VFIO_PCI_DMABUF_OK) { + kref_put(&priv->kref, vfio_pci_dma_buf_done); + wait_for_completion(&priv->comp); + unmap_mapping_range(priv->dmabuf->file->f_mapping, + 0, priv->size, 1); + /* + * Re-arm the registered kref reference and the + * completion so the post-revoke state matches the + * post-creation state. An un-revoke followed by a + * new mapping needs the kref to be non-zero before + * kref_get(), and vfio_pci_dma_buf_cleanup() + * delegates its drain back through this revoke + * path on a possibly-already-revoked dma-buf. + */ + kref_init(&priv->kref); + reinit_completion(&priv->comp); + } else { + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->status = VFIO_PCI_DMABUF_OK; + dma_resv_unlock(priv->dmabuf->resv); + } +} + void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) { struct vfio_pci_dma_buf *priv; @@ -599,44 +658,15 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) lockdep_assert_held_write(&vdev->memory_lock); /* * Holding memory_lock ensures a racing VMA fault observes - * priv->revoked properly. + * priv->status properly. */ list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { if (!get_file_active(&priv->dmabuf->file)) continue; - - if (priv->revoked != revoked) { - dma_resv_lock(priv->dmabuf->resv, NULL); - if (revoked) - priv->revoked = true; - dma_buf_invalidate_mappings(priv->dmabuf); - dma_resv_wait_timeout(priv->dmabuf->resv, - DMA_RESV_USAGE_BOOKKEEP, false, - MAX_SCHEDULE_TIMEOUT); - dma_resv_unlock(priv->dmabuf->resv); - if (revoked) { - kref_put(&priv->kref, vfio_pci_dma_buf_done); - wait_for_completion(&priv->comp); - unmap_mapping_range(priv->dmabuf->file->f_mapping, - 0, priv->size, 1); - /* - * Re-arm the registered kref reference and the - * completion so the post-revoke state matches the - * post-creation state. An un-revoke followed by a - * new mapping needs the kref to be non-zero before - * kref_get(), and vfio_pci_dma_buf_cleanup() - * delegates its drain back through this revoke - * path on a possibly-already-revoked dma-buf. - */ - kref_init(&priv->kref); - reinit_completion(&priv->comp); - } else { - dma_resv_lock(priv->dmabuf->resv, NULL); - priv->revoked = false; - dma_resv_unlock(priv->dmabuf->resv); - } - } + vfio_pci_dma_buf_set_status(priv, revoked ? + VFIO_PCI_DMABUF_TEMP_REVOKED : + VFIO_PCI_DMABUF_OK); fput(priv->dmabuf->file); } } @@ -668,3 +698,66 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) } up_write(&vdev->memory_lock); } + +#ifdef CONFIG_VFIO_PCI_DMABUF +int vfio_pci_core_feature_dma_buf_revoke( + struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf_revoke __user *arg, + size_t argsz) +{ + struct vfio_device_feature_dma_buf_revoke db_revoke; + struct vfio_pci_dma_buf *priv; + struct dma_buf *dmabuf; + int ret; + + if (!vdev->pci_ops || !vdev->pci_ops->get_dmabuf_phys) + return -EOPNOTSUPP; + + ret = vfio_check_feature(flags, argsz, + VFIO_DEVICE_FEATURE_SET, + sizeof(db_revoke)); + if (ret != 1) + return ret; + + if (copy_from_user(&db_revoke, arg, sizeof(db_revoke))) + return -EFAULT; + + dmabuf = dma_buf_get(db_revoke.dmabuf_fd); + if (IS_ERR(dmabuf)) + return PTR_ERR(dmabuf); + + priv = dmabuf->priv; + /* + * Sanity-check the DMABUF is really a vfio_pci_dma_buf _and_ + * relates to the VFIO device it was provided with. + * + * If the DMABUF relates to this vdev then priv->vdev is + * stable because this open fd prevents cleanup. + * + * If it relates to a different vdev, reading priv->vdev might + * race with a concurrent cleanup on that device. But if so, + * it points to a non-matching vdev or NULL and is unusable + * either way. + */ + if (dmabuf->ops != &vfio_pci_dmabuf_ops || + READ_ONCE(priv->vdev) != vdev) { + ret = -ENODEV; + goto out_put_buf; + } + + scoped_guard(rwsem_write, &vdev->memory_lock) { + if (priv->status == VFIO_PCI_DMABUF_PERM_REVOKED) { + ret = -EBADFD; + } else { + vfio_pci_dma_buf_set_status(priv, + VFIO_PCI_DMABUF_PERM_REVOKED); + ret = 0; + } + } + +out_put_buf: + dma_buf_put(dmabuf); + + return ret; +} +#endif /* CONFIG_VFIO_PCI_DMABUF */ diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index db2e2aeae88f..3c2f2575b670 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -23,6 +23,12 @@ struct vfio_pci_ioeventfd { bool test_mem; }; +enum vfio_pci_dma_buf_status { + VFIO_PCI_DMABUF_OK = 0, + VFIO_PCI_DMABUF_TEMP_REVOKED = 1, + VFIO_PCI_DMABUF_PERM_REVOKED = 2, +}; + struct vfio_pci_dma_buf { struct dma_buf *dmabuf; struct vfio_pci_core_device *vdev; @@ -35,7 +41,7 @@ struct vfio_pci_dma_buf { struct kref kref; struct completion comp; unsigned long vma_pgoff_adjust; - u8 revoked : 1; + enum vfio_pci_dma_buf_status status; }; extern const struct vm_operations_struct vfio_pci_mmap_ops; @@ -148,6 +154,10 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, struct vfio_device_feature_dma_buf __user *arg, size_t argsz); +int vfio_pci_core_feature_dma_buf_revoke( + struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf_revoke __user *arg, + size_t argsz); #else static inline int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, @@ -156,6 +166,13 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, { return -ENOTTY; } +static inline int vfio_pci_core_feature_dma_buf_revoke( + struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf_revoke __user *arg, + size_t argsz) +{ + return -ENOTTY; +} #endif #endif diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 5de618a3a5ee..697c0bb4b9bc 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1534,6 +1534,26 @@ struct vfio_device_feature_dma_buf { */ #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12 +/** + * Given a dma_buf fd previously created by + * VFIO_DEVICE_FEATURE_DMA_BUF, a SET of this feature requests that + * access to the corresponding DMABUF is immediately and permanently + * revoked. On successful return, the buffer is not accessible + * through any mmap() or dma-buf import. The buffer is permanently + * disabled, and VFIO refuses all map, mmap, attach, etc. requests. + * + * Return: 0 on success, -1 and errno is set on failure: + * + * EBADF, EINVAL: dmabuf_fd is not a DMABUF fd. + * ENODEV: The dmabuf_fd does not match this VFIO device. + * EBADFD: The DMABUF is already revoked. + */ +#define VFIO_DEVICE_FEATURE_DMA_BUF_REVOKE 13 + +struct vfio_device_feature_dma_buf_revoke { + __s32 dmabuf_fd; +}; + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- 2.50.1 (Apple Git-155)