From: Fred Griffoul

Replace kvm_host_map usage with persistent gfn_to_pfn_cache structures
for the L1 APIC virtualization pages (APIC access, virtual APIC, and
posted interrupt descriptor pages) to improve performance with
unmanaged guest memory.

The conversion involves several key changes:

- Page loading in nested_get_vmcs12_pages(): load the vmcs02 fields
  with pfncache PFNs after each cache has been checked and, if
  necessary, activated or refreshed, while the vCPU is in
  OUTSIDE_GUEST_MODE.

- Invalidation window handling: since nested_get_vmcs12_pages() runs
  in OUTSIDE_GUEST_MODE, there is a window during which the caches can
  be invalidated by MMU notifier events before the vCPU enters
  IN_GUEST_MODE. Implement the is_nested_state_invalid() callback to
  monitor cache validity across the OUTSIDE_GUEST_MODE to
  IN_GUEST_MODE transition; it triggers KVM_REQ_GET_NESTED_STATE_PAGES
  when a cache has been invalidated.

- Cache access in event callbacks: the virtual APIC and posted
  interrupt descriptor pages are accessed by KVM in the has_events()
  and check_events() nested_ops callbacks. These use the kernel HVA
  following the usual pfncache check/refresh pattern; both callbacks
  may sleep if a cache refresh is required. (See the sketch after the
  diffstat below.)

This eliminates expensive memremap()/memunmap() cycles on each L2 VM
entry/exit, providing substantial performance improvements when using
unmanaged memory such as guest_memfd or memory passed with the mem=
kernel parameter. The persistent caching approach maintains
correctness through proper invalidation detection while avoiding the
overhead of repeated mapping operations.

Signed-off-by: Fred Griffoul
---
 arch/x86/kvm/vmx/nested.c | 169 +++++++++++++++++++++++++++++---------
 arch/x86/kvm/vmx/vmx.h    |   8 +-
 include/linux/kvm_host.h  |   5 ++
 3 files changed, 139 insertions(+), 43 deletions(-)
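Note to reviewers: the event-callback helpers follow the standard
pfncache access pattern (compare the existing users, e.g. in
arch/x86/kvm/xen.c). A minimal sketch of that check/refresh loop, with
a made-up helper name and all failures collapsed into a NULL return:

	/*
	 * Sketch only: return the kernel mapping of a cached page with
	 * gpc->lock held for read, or NULL on refresh failure.
	 */
	static void *gpc_lock_khva(struct gfn_to_pfn_cache *gpc)
	{
		read_lock(&gpc->lock);
		while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
			/* Refresh must run without the read lock held. */
			read_unlock(&gpc->lock);
			/*
			 * kvm_gpc_refresh() may sleep, which is why the
			 * has_events()/check_events() callers must be
			 * allowed to sleep too.
			 */
			if (kvm_gpc_refresh(gpc, PAGE_SIZE))
				return NULL;
			read_lock(&gpc->lock);
		}
		return gpc->khva;
	}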
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 06187b8baa19..0cb66314d58b 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -329,8 +329,18 @@ static int nested_gpc_lock(struct gfn_to_pfn_cache *gpc, gpa_t gpa)
 	if (!kvm_gpc_check(gpc, PAGE_SIZE) || (gpc->gpa != gpa)) {
 		read_unlock(&gpc->lock);
 		err = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
-		if (err)
+		if (err) {
+			/*
+			 * Deactivate nested state caches to prevent
+			 * kvm_gpc_invalid() from returning true in subsequent
+			 * is_nested_state_invalid() calls. This prevents an
+			 * infinite loop while entering guest mode.
+			 */
+			if (gpc->vcpu)
+				kvm_gpc_deactivate(gpc);
+
 			return err;
+		}
 
 		goto retry;
 	}
@@ -343,14 +353,17 @@ static void nested_gpc_unlock(struct gfn_to_pfn_cache *gpc)
 	read_unlock(&gpc->lock);
 }
 
-static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
+static int nested_gpc_hpa(struct gfn_to_pfn_cache *gpc, gpa_t gpa, hpa_t *hpa)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int err;
+
+	err = nested_gpc_lock(gpc, gpa);
+	if (err)
+		return err;
 
-	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
-	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
-	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
-	vmx->nested.pi_desc = NULL;
+	*hpa = pfn_to_hpa(gpc->pfn);
+	nested_gpc_unlock(gpc);
+	return 0;
 }
 
 /*
@@ -373,6 +386,9 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	vmx->nested.smm.vmxon = false;
 	vmx->nested.vmxon_ptr = INVALID_GPA;
 
+	kvm_gpc_deactivate(&vmx->nested.pi_desc_cache);
+	kvm_gpc_deactivate(&vmx->nested.virtual_apic_cache);
+	kvm_gpc_deactivate(&vmx->nested.apic_access_page_cache);
 	kvm_gpc_deactivate(&vmx->nested.msr_bitmap_cache);
 
 	free_vpid(vmx->nested.vpid02);
@@ -389,8 +405,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
 	kfree(vmx->nested.cached_shadow_vmcs12);
 	vmx->nested.cached_shadow_vmcs12 = NULL;
 
-	nested_put_vmcs12_pages(vcpu);
-
 	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
 
 	nested_release_evmcs(vcpu);
@@ -3361,7 +3375,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct kvm_host_map *map;
+	struct gfn_to_pfn_cache *gpc;
+	hpa_t hpa;
 
 	if (!vcpu->arch.pdptrs_from_userspace &&
 	    !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
@@ -3376,10 +3391,10 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 
 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-		map = &vmx->nested.apic_access_page_map;
+		gpc = &vmx->nested.apic_access_page_cache;
 
-		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
-			vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
+		if (!nested_gpc_hpa(gpc, vmcs12->apic_access_addr, &hpa)) {
+			vmcs_write64(APIC_ACCESS_ADDR, hpa);
 		} else {
 			pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
 					     __func__);
@@ -3392,10 +3407,10 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 	}
 
 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-		map = &vmx->nested.virtual_apic_map;
+		gpc = &vmx->nested.virtual_apic_cache;
 
-		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
-			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
+		if (!nested_gpc_hpa(gpc, vmcs12->virtual_apic_page_addr, &hpa)) {
+			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
 		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
 			   nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
 			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -3418,14 +3433,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 	}
 
 	if (nested_cpu_has_posted_intr(vmcs12)) {
-		map = &vmx->nested.pi_desc_map;
+		gpc = &vmx->nested.pi_desc_cache;
 
-		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
-			vmx->nested.pi_desc =
-				(struct pi_desc *)(((void *)map->hva) +
-				offset_in_page(vmcs12->posted_intr_desc_addr));
+		if (!nested_gpc_hpa(gpc, vmcs12->posted_intr_desc_addr & PAGE_MASK, &hpa)) {
+			vmx->nested.pi_desc_offset = offset_in_page(vmcs12->posted_intr_desc_addr);
 			vmcs_write64(POSTED_INTR_DESC_ADDR,
-				     pfn_to_hpa(map->pfn) +
-				     offset_in_page(vmcs12->posted_intr_desc_addr));
+				     hpa + offset_in_page(vmcs12->posted_intr_desc_addr));
 		} else {
 			/*
 			 * Defer the KVM_INTERNAL_EXIT until KVM tries to
@@ -3433,7 +3446,6 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 			 * descriptor. (Note that KVM may do this when it
 			 * should not, per the architectural specification.)
 			 */
-			vmx->nested.pi_desc = NULL;
 			pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
 		}
 	}
@@ -3474,7 +3486,16 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
 
 static bool vmx_is_nested_state_invalid(struct kvm_vcpu *vcpu)
 {
-	return false;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/*
+	 * @vcpu is in IN_GUEST_MODE, eliminating the need for individual gpc
+	 * locks. Since kvm_gpc_invalid() doesn't verify gpc memslot
+	 * generation, we can also skip acquiring the srcu lock.
+	 */
+	return kvm_gpc_invalid(&vmx->nested.apic_access_page_cache) ||
+	       kvm_gpc_invalid(&vmx->nested.virtual_apic_cache) ||
+	       kvm_gpc_invalid(&vmx->nested.pi_desc_cache);
 }
 
 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
@@ -3969,9 +3990,55 @@ void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void *nested_gpc_lock_if_active(struct gfn_to_pfn_cache *gpc)
+{
+retry:
+	read_lock(&gpc->lock);
+	if (!gpc->active) {
+		read_unlock(&gpc->lock);
+		return NULL;
+	}
+
+	if (!kvm_gpc_check(gpc, PAGE_SIZE)) {
+		read_unlock(&gpc->lock);
+		if (kvm_gpc_refresh(gpc, PAGE_SIZE))
+			return NULL;
+		goto retry;
+	}
+
+	return gpc->khva;
+}
+
+static struct pi_desc *nested_lock_pi_desc(struct vcpu_vmx *vmx)
+{
+	u8 *pi_desc_page;
+
+	pi_desc_page = nested_gpc_lock_if_active(&vmx->nested.pi_desc_cache);
+	if (!pi_desc_page)
+		return NULL;
+
+	return (struct pi_desc *)(pi_desc_page + vmx->nested.pi_desc_offset);
+}
+
+static void nested_unlock_pi_desc(struct vcpu_vmx *vmx)
+{
+	nested_gpc_unlock(&vmx->nested.pi_desc_cache);
+}
+
+static void *nested_lock_vapic(struct vcpu_vmx *vmx)
+{
+	return nested_gpc_lock_if_active(&vmx->nested.virtual_apic_cache);
+}
+
+static void nested_unlock_vapic(struct vcpu_vmx *vmx)
+{
+	nested_gpc_unlock(&vmx->nested.virtual_apic_cache);
+}
+
 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct pi_desc *pi_desc;
 	int max_irr;
 	void *vapic_page;
 	u16 status;
@@ -3979,22 +4046,29 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 	if (!vmx->nested.pi_pending)
 		return 0;
 
-	if (!vmx->nested.pi_desc)
+	pi_desc = nested_lock_pi_desc(vmx);
+	if (!pi_desc)
 		goto mmio_needed;
 
 	vmx->nested.pi_pending = false;
-	if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+	if (!pi_test_and_clear_on(pi_desc)) {
+		nested_unlock_pi_desc(vmx);
 		return 0;
+	}
 
-	max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+	max_irr = pi_find_highest_vector(pi_desc);
 	if (max_irr > 0) {
-		vapic_page = vmx->nested.virtual_apic_map.hva;
-		if (!vapic_page)
+		vapic_page = nested_lock_vapic(vmx);
+		if (!vapic_page) {
+			nested_unlock_pi_desc(vmx);
 			goto mmio_needed;
+		}
+
+		__kvm_apic_update_irr(pi_desc->pir, vapic_page, &max_irr);
+
+		nested_unlock_vapic(vmx);
 
-		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
-				      vapic_page, &max_irr);
 		status = vmcs_read16(GUEST_INTR_STATUS);
 		if ((u8)max_irr > ((u8)status & 0xff)) {
 			status &= ~0xff;
@@ -4003,6 +4077,7 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 		}
 	}
 
+	nested_unlock_pi_desc(vmx);
 	nested_mark_vmcs12_pages_dirty(vcpu);
 
 	return 0;
@@ -4122,8 +4197,10 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	void *vapic = vmx->nested.virtual_apic_map.hva;
+	struct pi_desc *pi_desc;
 	int max_irr, vppr;
+	void *vapic;
+	bool res = false;
 
 	if (nested_vmx_preemption_timer_pending(vcpu) ||
 	    vmx->nested.mtf_pending)
@@ -4142,23 +4219,33 @@ static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
 	    __vmx_interrupt_blocked(vcpu))
 		return false;
 
+	vapic = nested_lock_vapic(vmx);
 	if (!vapic)
 		return false;
 
 	vppr = *((u32 *)(vapic + APIC_PROCPRI));
 
+	nested_unlock_vapic(vmx);
+
 	max_irr = vmx_get_rvi();
 	if ((max_irr & 0xf0) > (vppr & 0xf0))
 		return true;
 
-	if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
-	    pi_test_on(vmx->nested.pi_desc)) {
-		max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
-		if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
-			return true;
+	if (vmx->nested.pi_pending) {
+		pi_desc = nested_lock_pi_desc(vmx);
+		if (!pi_desc)
+			return false;
+
+		if (pi_test_on(pi_desc)) {
+			max_irr = pi_find_highest_vector(pi_desc);
+			if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
+				res = true;
+		}
+
+		nested_unlock_pi_desc(vmx);
 	}
 
-	return false;
+	return res;
 }
 
 /*
@@ -5106,7 +5193,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 		vmx_update_cpu_dirty_logging(vcpu);
 	}
 
-	nested_put_vmcs12_pages(vcpu);
+	nested_mark_vmcs12_pages_dirty(vcpu);
 
 	if (vmx->nested.reload_vmcs01_apic_access_page) {
 		vmx->nested.reload_vmcs01_apic_access_page = false;
@@ -5391,6 +5478,10 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 
 	kvm_gpc_init(&vmx->nested.msr_bitmap_cache, vcpu->kvm);
 
+	kvm_gpc_init_for_vcpu(&vmx->nested.apic_access_page_cache, vcpu);
+	kvm_gpc_init_for_vcpu(&vmx->nested.virtual_apic_cache, vcpu);
+	kvm_gpc_init_for_vcpu(&vmx->nested.pi_desc_cache, vcpu);
+
 	vmx->nested.vmcs02_initialized = false;
 	vmx->nested.vmxon = true;
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 3a6983222841..2c74c65d3383 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -158,11 +158,11 @@ struct nested_vmx {
 	 * Guest pages referred to in the vmcs02 with host-physical
 	 * pointers, so we must keep them pinned while L2 runs.
	 */
-	struct kvm_host_map apic_access_page_map;
-	struct kvm_host_map virtual_apic_map;
-	struct kvm_host_map pi_desc_map;
+	struct gfn_to_pfn_cache apic_access_page_cache;
+	struct gfn_to_pfn_cache virtual_apic_cache;
+	struct gfn_to_pfn_cache pi_desc_cache;
 
-	struct pi_desc *pi_desc;
+	u64 pi_desc_offset;
 	bool pi_pending;
 	u16 posted_intr_nv;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2eb551a11818..dc622adb561f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1526,6 +1526,11 @@ static inline bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc)
 	return gpc->active && kvm_is_error_gpa(gpc->gpa);
 }
 
+static inline bool kvm_gpc_invalid(struct gfn_to_pfn_cache *gpc)
+{
+	return gpc->active && !gpc->valid;
+}
+
 void kvm_sigset_activate(struct kvm_vcpu *vcpu);
 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
 
-- 
2.51.0
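For context, a rough illustration of how the entry path is expected to
consume is_nested_state_invalid() to close the invalidation window (the
enter_guest_checked() wrapper is invented for this sketch; the real
wiring lives in the x86 entry code):

	/* Illustrative sketch only, not part of this patch. */
	static int enter_guest_checked(struct kvm_vcpu *vcpu)
	{
		/* vmcs02 fields were loaded while in OUTSIDE_GUEST_MODE. */
		WRITE_ONCE(vcpu->mode, IN_GUEST_MODE);

		/*
		 * An MMU notifier may have invalidated a cache after the
		 * pages were loaded but before the mode switch. From this
		 * point on invalidators kick the vCPU, so a clean re-check
		 * here means the cached PFNs stay valid while L2 runs.
		 */
		if (kvm_x86_ops.nested_ops->is_nested_state_invalid(vcpu)) {
			kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
			WRITE_ONCE(vcpu->mode, OUTSIDE_GUEST_MODE);
			return -EAGAIN;	/* reload pages and retry entry */
		}

		return 0;
	}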