Invalidate pfncaches when guest_memfd invalidation or memory attribute updates render cached PFN resolutions stale. Reuse active_invalidate_count to synchronize with the existing retry logic and preserve ordering against mmu_invalidate_seq. Invalidation needs to be performed using HVA ranges so that both GPA-based and HVA-based pfncaches are covered. Internally GPA-based ones translate GPA to memslot/UHVA first and then resolve PFN, while HVA-based ones only resolve PFN and do not store memslot/GPA context. Technically, it is possible to make HVA-based pfncaches search the corresponding memslot/GPA when activated / refreshed, but it would add overhead to a greater or lesser extent, regardless of guest_memfd-backed or not. At the time of writing, only Xen uses HVA-based pfncaches. Signed-off-by: Takahiro Itazuri Suggested-by: David Hildenbrand (Red Hat) --- virt/kvm/guest_memfd.c | 50 ++++++++++++++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 45 +++++++++++++++++++++++++++++++++++++ virt/kvm/pfncache.c | 4 ++-- 3 files changed, 97 insertions(+), 2 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 79f34dad0c2f..eb2f1a7e54dc 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -215,6 +215,33 @@ static void __kvm_gmem_invalidate_start(struct gmem_file *f, pgoff_t start, struct kvm *kvm = f->kvm; unsigned long index; + /* + * Prevent pfncaches from being activated / refreshed using stale PFN + * resolutions. To invalidate pfncaches _before_ invalidating the + * secondary MMUs (i.e. without acquiring mmu_lock), pfncaches must use + * active_invalidate_count instead of mmu_invalidate_in_progress. + */ + spin_lock(&kvm->invalidate_lock); + kvm->active_invalidate_count++; + spin_unlock(&kvm->invalidate_lock); + + /* + * Invalidation of pfncaches must be done using a HVA range. pfncaches + * can be either GPA-based or HVA-based, and all pfncaches store uhva + * while HVA-based pfncaches do not have gpa/memslot info.
Thus, + * using GFN ranges would miss invalidating HVA-based ones. + */ + xa_for_each_range(&f->bindings, index, slot, start, end - 1) { + pgoff_t pgoff = slot->gmem.pgoff; + gfn_t gfn_start = slot->base_gfn + max(pgoff, start) - pgoff; + gfn_t gfn_end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff; + + unsigned long hva_start = gfn_to_hva_memslot(slot, gfn_start); + unsigned long hva_end = gfn_to_hva_memslot(slot, gfn_end); + + gpc_invalidate_hva_range_start(kvm, hva_start, hva_end); + } + xa_for_each_range(&f->bindings, index, slot, start, end - 1) { pgoff_t pgoff = slot->gmem.pgoff; @@ -259,12 +286,35 @@ static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start, pgoff_t end) { struct kvm *kvm = f->kvm; + bool wake; if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) { KVM_MMU_LOCK(kvm); kvm_mmu_invalidate_end(kvm); KVM_MMU_UNLOCK(kvm); } + + /* + * This must be done after the increment of mmu_invalidate_seq and + * smp_wmb() in kvm_mmu_invalidate_end() to guarantee that + * gpc_invalidate_retry() observes either the old (non-zero) + * active_invalidate_count or the new (incremented) mmu_invalidate_seq. + */ + spin_lock(&kvm->invalidate_lock); + if (!WARN_ON_ONCE(!kvm->active_invalidate_count)) + kvm->active_invalidate_count--; + wake = !kvm->active_invalidate_count; + spin_unlock(&kvm->invalidate_lock); + + /* + * guest_memfd invalidation itself doesn't need to block active memslots + * swap as bindings updates are serialized by filemap_invalidate_lock(). + * However, active_invalidate_count is shared with the MMU notifier + * path, so the waiter must be woken when active_invalidate_count drops + * to zero.
+ */ + if (wake) + rcuwait_wake_up(&kvm->memslots_update_rcuwait); } static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f51056e971d0..f56b98c85175 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2583,6 +2583,8 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, .on_lock = kvm_mmu_invalidate_end, .may_block = true, }; + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *slot; unsigned long i; void *entry; int r = 0; @@ -2609,6 +2611,34 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, cond_resched(); } + /* + * Prevent pfncaches from being activated / refreshed using stale PFN + * resolutions. To invalidate pfncaches _before_ invalidating the + * secondary MMUs (i.e. without acquiring mmu_lock), pfncaches must use + * active_invalidate_count instead of mmu_invalidate_in_progress. + */ + spin_lock(&kvm->invalidate_lock); + kvm->active_invalidate_count++; + spin_unlock(&kvm->invalidate_lock); + + /* + * Invalidation of pfncaches must be done using a HVA range. pfncaches + * can be either GPA-based or HVA-based, and all pfncaches store uhva + * while HVA-based pfncaches do not have gpa/memslot info. Thus, + * using GFN ranges would miss invalidating HVA-based ones. 
+ */ + kvm_for_each_memslot(slot, slots) { + gfn_t gfn_start = max(start, slot->base_gfn); + gfn_t gfn_end = min(end, slot->base_gfn + slot->npages); + + if (gfn_start < gfn_end) { + unsigned long hva_start = gfn_to_hva_memslot(slot, gfn_start); + unsigned long hva_end = gfn_to_hva_memslot(slot, gfn_end); + + gpc_invalidate_hva_range_start(kvm, hva_start, hva_end); + } + } + kvm_handle_gfn_range(kvm, &pre_set_range); for (i = start; i < end; i++) { @@ -2620,6 +2650,21 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, kvm_handle_gfn_range(kvm, &post_set_range); + /* + * This must be done after the increment of mmu_invalidate_seq and + * smp_wmb() in kvm_mmu_invalidate_end() to guarantee that + * gpc_invalidate_retry() observes either the old (non-zero) + * active_invalidate_count or the new (incremented) mmu_invalidate_seq. + * + * memslots_update_rcuwait does not need to be woken when + * active_invalidate_count drops to zero because active memslots swap is + * also done while holding slots_lock. + */ + spin_lock(&kvm->invalidate_lock); + if (!WARN_ON_ONCE(!kvm->active_invalidate_count)) + kvm->active_invalidate_count--; + spin_unlock(&kvm->invalidate_lock); + out_unlock: mutex_unlock(&kvm->slots_lock); diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 2880a36257c2..2b44da46d2ab 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -144,7 +144,7 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva) #endif } -static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq) +static inline bool gpc_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) { /* * active_invalidate_count acts for all intents and purposes like @@ -274,7 +274,7 @@ static kvm_pfn_t gpc_to_pfn_retry(struct gfn_to_pfn_cache *gpc) * attempting to refresh.
*/ WARN_ON_ONCE(gpc->valid); - } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq)); + } while (gpc_invalidate_retry(gpc->kvm, mmu_seq)); gpc->valid = true; gpc->pfn = new_pfn; -- 2.50.1