A VFIO DMABUF can export a subset of a BAR to userspace by fd; add support for mmap() of this fd. This provides another route for a process to map BARs, in which the process can only map a specific subset of a BAR represented by the exported DMABUF. mmap() support enables userspace driver designs that safely delegate access to BAR sub-ranges to other client processes by sharing a DMABUF fd, without having to share the (omnipotent) VFIO device fd with them. Since the main VFIO BAR mmap() is now DMABUF-aware, the new mmap() reuses the existing vm_ops. The lifecycle of an exported DMABUF remains decoupled from that of the device fd it came from, i.e. the device fd could be closed with DMABUF VMAs present, meaning a fault on a VMA could happen concurrently with vfio_pci_dma_buf_cleanup(). To deal with this scenario, the fault handler now temporarily takes a VFIO device registration to ensure the vdev remains valid, and then vdev->memory_lock can be taken on it. Signed-off-by: Matt Evans --- drivers/vfio/pci/vfio_pci_core.c | 80 ++++++++++++++++++++++++++---- drivers/vfio/pci/vfio_pci_dmabuf.c | 36 ++++++++++++++ drivers/vfio/pci/vfio_pci_priv.h | 2 + 3 files changed, 109 insertions(+), 9 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 5ea0bd4e7876..508a5eca910a 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -12,6 +12,8 @@ #include #include +#include +#include #include #include #include @@ -1735,19 +1737,78 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, vm_fault_t ret = VM_FAULT_SIGBUS; /* - * We can rely on the existence of both a DMABUF (priv) and - * the VFIO device it was exported from (vdev). This fault's - * VMA was established using vfio_pci_core_mmap_prep_dmabuf() - * which transfers ownership of the VFIO device fd to the - * DMABUF, and so the VFIO device is held open because the - * VMA's vm_file (DMABUF) is open. + * The only thing this can rely on is that the DMABUF relating + * to the VMA's vm_file exists (priv). * - * Since vfio_pci_dma_buf_cleanup() cannot have happened, - * vdev must be valid; we can take memory_lock. + * A DMABUF for a VFIO device fd mmap() holds a reference to + * the original VFIO device fd, but an explicitly-exported + * DMABUF does not. The original fd might have closed, + * meaning this fault can race with + * vfio_pci_dma_buf_cleanup(), meaning the buffer could have + * been revoked (in which case priv->vdev might be NULL), and + * the VFIO device registration might have been dropped. + * + * With the goal of taking vdev->memory_lock in a world where + * vdev might not still exist: + * + * 1. Take the resv lock on the DMABUF: + * - If racing cleanup got in first, the buffer is revoked; + * stop/exit if so. + * - If we got in first, the buffer is not revoked so vdev is + * non-NULL, accessible, and cleanup _has not yet put the + * VFIO device registration_. So, the device refcount must + * be >0. + * + * 2. Take vfio_device registration (refcount guaranteed >0 + * hereafter). + * + * 3. Unlock the DMABUF's resv lock: + * - A racing cleanup can now complete. + * - But, the device refcount >0, meaning the vfio_device + * (and vfio_pcie_core device vdev) have not yet been + * freed. vdev is accessible, even if the DMABUF has been + * revoked or cleanup has happened, because + * vfio_unregister_group_dev() can't complete. + * + * 4. Take the vdev->memory_lock + * - Either the DMABUF is usable, or has been cleaned up. + * Whichever, it can no longer change under us. + * - Test the DMABUF revocation status again: if it was + * revoked between 1 and 4 return a SIGBUS. Otherwise, + * return a PFN. + * - It's not necessary to also take the resv lock, because + * the status/vdev can't change while memory_lock is held. + * + * 5. Unlock, done. */ + + dma_resv_lock(priv->dmabuf->resv, NULL); + + if (priv->revoked) { + pr_debug_ratelimited("%s VA 0x%lx, pgoff 0x%lx: DMABUF revoked/cleaned up\n", + __func__, vmf->address, vma->vm_pgoff); + dma_resv_unlock(priv->dmabuf->resv); + return VM_FAULT_SIGBUS; + } + + /* If the buffer isn't revoked, vdev is valid */ vdev = priv->vdev; + if (!vfio_device_try_get_registration(&vdev->vdev)) { + /* + * If vdev != NULL (above), the registration should + * already be >0 and so this try_get should never + * fail. + */ + dev_warn(&vdev->pdev->dev, "%s: Unexpected registration failure\n", + __func__); + dma_resv_unlock(priv->dmabuf->resv); + return VM_FAULT_SIGBUS; + } + dma_resv_unlock(priv->dmabuf->resv); + scoped_guard(rwsem_read, &vdev->memory_lock) { + /* Revocation status must be re-read, under memory_lock */ if (!priv->revoked) { int pres = vfio_pci_dma_buf_find_pfn(priv, vma, vmf->address, @@ -1766,6 +1827,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, __func__, order, pfn, vmf->address, vma->vm_pgoff, (unsigned int)ret); + vfio_device_put_registration(&vdev->vdev); return ret; } @@ -1774,7 +1836,7 @@ static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf) return vfio_pci_mmap_huge_fault(vmf, 0); } -static const struct vm_operations_struct vfio_pci_mmap_ops = { +const struct vm_operations_struct vfio_pci_mmap_ops = { .fault = vfio_pci_mmap_page_fault, #ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP .huge_fault = vfio_pci_mmap_huge_fault, diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index 8f7f1b909b94..2fb09a2c0f6b 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -27,6 +27,41 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, return 0; } + +static int vfio_pci_dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + /* + * If we observe that the buffer is revoked now then refuse + * the mmap(). This is a belt-and-braces early failure to + * ease debugging a revoked buffer being used. Userspace + * might also race an mmap() against an explicit revocation, + * or an action doing a temporary revoke; race scenarios are + * still safe because the fault handler ultimately prevents + * access to a revoked buffer if it isn't caught here. + */ + if (READ_ONCE(priv->revoked)) + return -ENODEV; + if ((vma->vm_flags & VM_SHARED) == 0) + return -EINVAL; + + /* + * dma_buf_mmap_internal() has asserted that the VMA is + * contained within the DMABUF size before calling this. + */ + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); + + /* See comments in vfio_pci_core_mmap() re VM_ALLOW_ANY_UNCACHED. */ + vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP | + VM_DONTEXPAND | VM_DONTDUMP); + vma->vm_private_data = priv; + vma->vm_ops = &vfio_pci_mmap_ops; + + return 0; +} #endif /* CONFIG_VFIO_PCI_DMABUF */ static void vfio_pci_dma_buf_done(struct kref *kref) @@ -94,6 +129,7 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) static const struct dma_buf_ops vfio_pci_dmabuf_ops = { #ifdef CONFIG_VFIO_PCI_DMABUF .attach = vfio_pci_dma_buf_attach, + .mmap = vfio_pci_dma_buf_mmap, #endif .map_dma_buf = vfio_pci_dma_buf_map, .unmap_dma_buf = vfio_pci_dma_buf_unmap, diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 10833aabd7fb..db2e2aeae88f 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -38,6 +38,8 @@ struct vfio_pci_dma_buf { u8 revoked : 1; }; +extern const struct vm_operations_struct vfio_pci_mmap_ops; + bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); -- 2.50.1 (Apple Git-155)