CoCo VMs get their private memory allocated from guest_memfd ("gmemfd"), which is a KVM facility similar to memfd. The gmemfd does not allow mapping private memory to userspace, so the IOMMU_IOAS_MAP ioctl does not work. Use the existing IOMMU_IOAS_MAP_FILE ioctl to allow mapping from fd + offset. Detect the gmemfd case in pfn_reader_user_pin(). For the new guest_memfd type, no additional reference is taken as pinning is guaranteed by the KVM guest_memfd library. There is no KVM-GMEMFD->IOMMUFD direct notification mechanism; the assumption is that: 1) page state change events will be handled by the VMM, which is going to call IOMMUFD to remap pages; 2) shrinking a gmemfd is equivalent to VM memory unplug, and the VMM is going to handle it. Signed-off-by: Alexey Kardashevskiy --- This is for Trusted IO == TEE-IO == PCIe TDISP, etc. Previously posted here: https://lore.kernel.org/r/20250218111017.491719-13-aik@amd.com The main comment was "what is the lifetime of those folios()" and GMEMFD + QEMU should take care of it. 
And horrendous stuff like this is not really useful: https://github.com/AMDESE/linux-kvm/commit/7d73fd2cccb8489b1 --- include/linux/kvm_host.h | 4 + drivers/iommu/iommufd/pages.c | 80 ++++++++++++++++++-- virt/kvm/guest_memfd.c | 36 +++++++++ 3 files changed, 113 insertions(+), 7 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 995db7a7ba57..9369cf22b24e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2673,4 +2673,8 @@ unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn); int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, struct kvm_memory_attributes2 *attrs); +bool kvm_is_gmemfd(struct file *file); +struct folio *kvm_gmemfd_get_pfn(struct file *file, unsigned long index, + unsigned long *pfn, int *max_order); + #endif diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index dbe51ecb9a20..4c07e39e17d0 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -56,6 +56,9 @@ #include #include #include +#include +#include +#include #include "double_span.h" #include "io_pagetable.h" @@ -660,7 +663,8 @@ static void batch_from_pages(struct pfn_batch *batch, struct page **pages, } static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p, - unsigned long *offset_p, unsigned long npages) + unsigned long *offset_p, unsigned long npages, + bool do_pin) { int rc = 0; struct folio **folios = *folios_p; @@ -676,7 +680,7 @@ static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p, if (!batch_add_pfn_num(batch, pfn, nr, BATCH_CPU_MEMORY)) break; - if (nr > 1) { + if (nr > 1 && do_pin) { rc = folio_add_pins(folio, nr - 1); if (rc) { batch_remove_pfn_num(batch, nr); @@ -697,6 +701,7 @@ static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p, static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages, unsigned int first_page_off, size_t npages) { + bool do_unpin = 
!kvm_is_gmemfd(pages->file); unsigned int cur = 0; while (first_page_off) { @@ -710,9 +715,12 @@ static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages, size_t to_unpin = min_t(size_t, npages, batch->npfns[cur] - first_page_off); - unpin_user_page_range_dirty_lock( - pfn_to_page(batch->pfns[cur] + first_page_off), - to_unpin, pages->writable); + /* Do nothing for guest_memfd */ + if (do_unpin) + unpin_user_page_range_dirty_lock( + pfn_to_page(batch->pfns[cur] + first_page_off), + to_unpin, pages->writable); + iopt_pages_sub_npinned(pages, to_unpin); cur++; first_page_off = 0; @@ -872,6 +880,57 @@ static long pin_memfd_pages(struct pfn_reader_user *user, unsigned long start, return npages_out; } +static long pin_guest_memfd_pages(struct pfn_reader_user *user, loff_t start, unsigned long npages) +{ + struct page **upages = user->upages; + unsigned long offset = 0; + loff_t uptr = start; + long rc = 0; + + for (unsigned long i = 0; (uptr - start) < (npages << PAGE_SHIFT); ++i) { + unsigned long gfn = 0, pfn = 0; + int max_order = 0; + struct folio *folio; + + folio = kvm_gmemfd_get_pfn(user->file, uptr >> PAGE_SHIFT, &pfn, &max_order); + if (IS_ERR(folio)) + rc = PTR_ERR(folio); + + if (rc == -EINVAL && i == 0) { + pr_err_once("Must be vfio mmio at gfn=%lx pfn=%lx, skipping\n", gfn, pfn); + return rc; + } + + if (rc) { + pr_err("%s: %ld %ld %lx -> %lx\n", __func__, + rc, i, (unsigned long) uptr, (unsigned long) pfn); + break; + } + + if (i == 0) + offset = offset_in_folio(folio, start) >> PAGE_SHIFT; + + user->ufolios[i] = folio; + + if (upages) { + unsigned long np = (1UL << (max_order + PAGE_SHIFT)) - offset_in_folio(folio, uptr); + + for (unsigned long j = 0; j < np; ++j) + *upages++ = folio_page(folio, offset + j); + } + + uptr += (1UL << (max_order + PAGE_SHIFT)) - offset_in_folio(folio, uptr); + } + + if (!rc) { + rc = npages; + user->ufolios_next = user->ufolios; + user->ufolios_offset = offset; + } + + return rc; +} + static int 
pfn_reader_user_pin(struct pfn_reader_user *user, struct iopt_pages *pages, unsigned long start_index, @@ -925,7 +984,13 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, if (user->file) { start = pages->start + (start_index * PAGE_SIZE); - rc = pin_memfd_pages(user, start, npages); + if (kvm_is_gmemfd(pages->file)) { + rc = pin_guest_memfd_pages(user, start, npages); + } else { + pr_err("UNEXP PINFD start=%lx sz=%lx file=%lx", + start, npages << PAGE_SHIFT, (ulong) pages->file); + rc = pin_memfd_pages(user, start, npages); + } } else if (!remote_mm) { uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); rc = pin_user_pages_fast(uptr, npages, user->gup_flags, @@ -1221,7 +1286,8 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) npages); else rc = batch_from_folios(&pfns->batch, &user->ufolios_next, - &user->ufolios_offset, npages); + &user->ufolios_offset, npages, + !kvm_is_gmemfd(pfns->pages->file)); return rc; } diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index e4e21068cf2a..2a313888c21b 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -1794,3 +1794,39 @@ void kvm_gmem_exit(void) rcu_barrier(); kmem_cache_destroy(kvm_gmem_inode_cachep); } + +bool kvm_is_gmemfd(struct file *file) +{ + if (!file) + return false; + + if (file->f_op != &kvm_gmem_fops) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(kvm_is_gmemfd); + +struct folio *kvm_gmemfd_get_pfn(struct file *file, unsigned long index, + unsigned long *pfn, int *max_order) +{ + struct inode *inode = file_inode(file); + struct folio *folio; + + if (!inode || !kvm_is_gmemfd(file)) + return NULL; + + folio = kvm_gmem_get_folio(inode, index); + if (!folio) + return NULL; + + + *pfn = folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); + *max_order = folio_order(folio); + + folio_put(folio); + folio_unlock(folio); + + return folio; +} +EXPORT_SYMBOL_GPL(kvm_gmemfd_get_pfn); -- 2.52.0