gfn_to_pfn_cache currently relies on hva_to_pfn(), which resolves PFNs
through GUP. GUP assumes that the page has a valid direct-map PTE, which
is not true for a guest_memfd created with GUEST_MEMFD_FLAG_NO_DIRECT_MAP,
whose direct-map PTEs are explicitly invalidated via
set_direct_map_valid_noflush().

Introduce a helper function, gpc_to_pfn(), that routes PFN lookup to
kvm_gmem_get_pfn() for guest_memfd-backed memslots (regardless of whether
GUEST_MEMFD_FLAG_NO_DIRECT_MAP is set), and otherwise falls back to the
existing hva_to_pfn() path. Rename hva_to_pfn_retry() to
gpc_to_pfn_retry() accordingly.

Signed-off-by: Takahiro Itazuri
---
 virt/kvm/pfncache.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 728d2c1b488a..bf8d6090e283 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -152,22 +152,34 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s
 	return kvm->mmu_invalidate_seq != mmu_seq;
 }
 
-static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
+static kvm_pfn_t gpc_to_pfn(struct gfn_to_pfn_cache *gpc, struct page **page)
 {
-	/* Note, the new page offset may be different than the old! */
-	void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
-	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
-	void *new_khva = NULL;
-	unsigned long mmu_seq;
-	struct page *page;
+	if (kvm_slot_has_gmem(gpc->memslot)) {
+		kvm_pfn_t pfn;
+
+		kvm_gmem_get_pfn(gpc->kvm, gpc->memslot, gpa_to_gfn(gpc->gpa),
+				 &pfn, page, NULL);
+		return pfn;
+	}
 
 	struct kvm_follow_pfn kfp = {
 		.slot = gpc->memslot,
 		.gfn = gpa_to_gfn(gpc->gpa),
 		.flags = FOLL_WRITE,
 		.hva = gpc->uhva,
-		.refcounted_page = &page,
+		.refcounted_page = page,
 	};
+
+	return hva_to_pfn(&kfp);
+}
+
+static kvm_pfn_t gpc_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
+{
+	/* Note, the new page offset may be different than the old! */
+	void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
+	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
+	void *new_khva = NULL;
+	unsigned long mmu_seq;
+	struct page *page;
 
 	lockdep_assert_held(&gpc->refresh_lock);
 
@@ -206,7 +218,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 			cond_resched();
 		}
 
-		new_pfn = hva_to_pfn(&kfp);
+		new_pfn = gpc_to_pfn(gpc, &page);
 		if (is_error_noslot_pfn(new_pfn))
 			goto out_error;
 
@@ -319,7 +331,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l
 		}
 	}
 
-	/* Note: the offset must be correct before calling hva_to_pfn_retry() */
+	/* Note: the offset must be correct before calling gpc_to_pfn_retry() */
 	gpc->uhva += page_offset;
 
 	/*
@@ -327,7 +339,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l
 	 * drop the lock and do the HVA to PFN lookup again.
 	 */
 	if (!gpc->valid || hva_change) {
-		ret = hva_to_pfn_retry(gpc);
+		ret = gpc_to_pfn_retry(gpc);
 	} else {
 		/*
 		 * If the HVA→PFN mapping was already valid, don't unmap it.
--
2.50.1
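For context, gpc_to_pfn() takes the kvm_gmem_get_pfn() path only for memslots
bound to a guest_memfd. Below is a minimal userspace sketch of setting up such
a slot with the direct map removed; the KVM_CREATE_GUEST_MEMFD and
KVM_SET_USER_MEMORY_REGION2 UAPI already exist, while the
GUEST_MEMFD_FLAG_NO_DIRECT_MAP bit value is only a placeholder for the flag
introduced by this series.

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

#ifndef GUEST_MEMFD_FLAG_NO_DIRECT_MAP
#define GUEST_MEMFD_FLAG_NO_DIRECT_MAP	(1ULL << 0)	/* placeholder bit */
#endif

static int setup_no_direct_map_slot(int vm_fd, uint64_t gpa, uint64_t size)
{
	struct kvm_create_guest_memfd gmem = {
		.size = size,
		.flags = GUEST_MEMFD_FLAG_NO_DIRECT_MAP,
	};
	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);
	void *hva;

	if (gmem_fd < 0)
		return -1;

	/* Shared (userspace-visible) backing for the slot. */
	hva = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (hva == MAP_FAILED)
		return -1;

	struct kvm_userspace_memory_region2 region = {
		.slot = 0,
		.flags = KVM_MEM_GUEST_MEMFD,
		.guest_phys_addr = gpa,
		.memory_size = size,
		.userspace_addr = (uint64_t)(uintptr_t)hva,
		.guest_memfd = (uint32_t)gmem_fd,
		.guest_memfd_offset = 0,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
}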
gfn_to_pfn_cache currently maps RAM PFNs with kmap(), which relies on the
direct map. A guest_memfd created with GUEST_MEMFD_FLAG_NO_DIRECT_MAP has
its direct-map PTEs invalidated via set_direct_map_valid_noflush(), so the
linear address returned by kmap()/page_address() will fault if dereferenced.
In some cases, gfn_to_pfn_cache dereferences the cached kernel host virtual
address (khva) from atomic contexts where page faults cannot be tolerated.
Therefore khva must always refer to a fault-free kernel mapping.

Since mapping and unmapping happen exclusively in the refresh path, which
may sleep, using vmap()/vunmap() for these pages is safe and sufficient.
Introduce kvm_slot_no_direct_map() to detect guest_memfd slots without the
direct map, and make gpc_map()/gpc_unmap() use vmap()/vunmap() for such
pages. This allows features based on gfn_to_pfn_cache (e.g. kvm-clock) to
work correctly with guest_memfd regardless of whether its direct-map PTEs
are valid.

Signed-off-by: Takahiro Itazuri
---
 include/linux/kvm_host.h |  7 +++++++
 virt/kvm/pfncache.c      | 26 ++++++++++++++++++++------
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 70e6a5210ceb..793d98f97928 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -628,6 +629,12 @@ static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *sl
 	return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
 }
 
+static inline bool kvm_slot_no_direct_map(const struct kvm_memory_slot *slot)
+{
+	return slot && kvm_slot_has_gmem(slot) &&
+	       mapping_no_direct_map(slot->gmem.file->f_mapping);
+}
+
 static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
 {
 	return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index bf8d6090e283..87167d7f3feb 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -96,10 +96,16 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
 	return true;
 }
 
-static void *gpc_map(kvm_pfn_t pfn)
+static void *gpc_map(struct gfn_to_pfn_cache *gpc, kvm_pfn_t pfn)
 {
-	if (pfn_valid(pfn))
-		return kmap(pfn_to_page(pfn));
+	if (pfn_valid(pfn)) {
+		struct page *page = pfn_to_page(pfn);
+
+		if (kvm_slot_no_direct_map(gpc->memslot))
+			return vmap(&page, 1, VM_MAP, PAGE_KERNEL);
+
+		return kmap(page);
+	}
 
 #ifdef CONFIG_HAS_IOMEM
 	return memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
@@ -115,6 +121,11 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 		return;
 
 	if (pfn_valid(pfn)) {
+		if (is_vmalloc_addr(khva)) {
+			vunmap(khva);
+			return;
+		}
+
 		kunmap(pfn_to_page(pfn));
 		return;
 	}
@@ -224,13 +235,16 @@ static kvm_pfn_t gpc_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 
 		/*
 		 * Obtain a new kernel mapping if KVM itself will access the
-		 * pfn. Note, kmap() and memremap() can both sleep, so this
-		 * too must be done outside of gpc->lock!
+		 * pfn. Note, kmap(), vmap() and memremap() can sleep, so this
+		 * too must be done outside of gpc->lock! Note that even though
+		 * the rwlock is dropped, it's still fine to read gpc->pfn and
+		 * other fields because gpc->refresh_lock mutex prevents those
+		 * from being changed.
 		 */
 		if (new_pfn == gpc->pfn)
 			new_khva = old_khva;
 		else
-			new_khva = gpc_map(new_pfn);
+			new_khva = gpc_map(gpc, new_pfn);
 
 		if (!new_khva) {
 			kvm_release_page_unused(page);
--
2.50.1
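For context, the atomic-context requirement above comes from consumers such as
kvm-clock, which dereference gpc->khva under the read lock with interrupts
disabled. The sketch below models that pattern on the existing kvm_gpc_* API;
write_guest_record() and its payload are illustrative names, not part of this
series.

#include <linux/kvm_host.h>

static void write_guest_record(struct gfn_to_pfn_cache *gpc,
			       const void *data, unsigned long len)
{
	unsigned long flags;

	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, len)) {
		read_unlock_irqrestore(&gpc->lock, flags);

		/* Refresh may sleep; it (re)maps the pfn via gpc_map(). */
		if (kvm_gpc_refresh(gpc, len))
			return;

		read_lock_irqsave(&gpc->lock, flags);
	}

	/* Atomic context: this dereference must not take a page fault. */
	memcpy(gpc->khva, data, len);

	read_unlock_irqrestore(&gpc->lock, flags);
}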