Handle Machine Checks (#MC) that happen in the guest (by forwarding them
to the host) outside of KVM's fastpath so that as much host state as
possible is re-loaded before invoking the kernel's #MC handler.  The only
requirement is that KVM invokes the #MC handler before enabling IRQs (and
even that could _probably_ be relaxed to handling #MCs before enabling
preemption).

Waiting to handle #MCs until "more" host state is loaded hardens KVM
against flaws in the #MC handler, which has historically been quite
brittle.  E.g. prior to commit 5567d11c21a1 ("x86/mce: Send #MC singal
from task work"), the #MC code could trigger a schedule() with IRQs and
preemption disabled.  That led to a KVM hack-a-fix in commit 1811d979c716
("x86/kvm: move kvm_load/put_guest_xcr0 into atomic context").

Note, except for #MCs on VM-Enter, VMX already handles #MCs outside of
the fastpath.

Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/svm.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index f14709a511aa..e8b158f73c79 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4335,14 +4335,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 
 	vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
 
-	/*
-	 * We need to handle MC intercepts here before the vcpu has a chance to
-	 * change the physical cpu
-	 */
-	if (unlikely(svm->vmcb->control.exit_code ==
-		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
-		svm_handle_mce(vcpu);
-
 	trace_kvm_exit(vcpu, KVM_ISA_SVM);
 
 	svm_complete_interrupts(vcpu);
@@ -4631,8 +4623,16 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
 
 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
-	if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
+	switch (to_svm(vcpu)->vmcb->control.exit_code) {
+	case SVM_EXIT_EXCP_BASE + MC_VECTOR:
+		svm_handle_mce(vcpu);
+		break;
+	case SVM_EXIT_INTR:
 		vcpu->arch.at_instruction_boundary = true;
+		break;
+	default:
+		break;
+	}
 }
 
 static void svm_setup_mce(struct kvm_vcpu *vcpu)
-- 
2.51.1.930.gacf6e81ea2-goog
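
A standalone model of the ordering the patch above establishes may help:
the #MC is forwarded from the irqoff exit callback, i.e. after the
fastpath has run and more host state has been restored, but before IRQs
are enabled.  This is plain userspace C with illustrative names, not
kernel code:

#include <stdio.h>

enum exit_code { EXIT_INTR, EXIT_MC, EXIT_OTHER };

static void handle_mce(void)
{
	puts("forward #MC to the host handler");
}

/* Mirrors the switch-based dispatch svm_handle_exit_irqoff() gains. */
static void handle_exit_irqoff(enum exit_code exit)
{
	switch (exit) {
	case EXIT_MC:
		handle_mce();
		break;
	case EXIT_INTR:
		puts("note instruction boundary");
		break;
	default:
		break;
	}
}

int main(void)
{
	enum exit_code exit = EXIT_MC;	/* pretend the guest took a #MC */

	puts("VM-Exit: fastpath runs with minimal host state loaded");
	puts("reload more host state (XCR0/XSS, debug registers, ...)");
	handle_exit_irqoff(exit);	/* IRQs are still disabled here */
	puts("local_irq_enable()");
	return 0;
}

The switch-based dispatch also leaves room for future exit reasons that
need irqoff handling, without stacking up if-statements.
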
Handle Machine Checks (#MC) that happen on VM-Enter (VMX or TDX) outside
of KVM's fastpath so that as much host state as possible is re-loaded
before invoking the kernel's #MC handler.  The only requirement is that
KVM invokes the #MC handler before enabling IRQs (and even that could
_probably_ be relaxed to handling #MCs before enabling preemption).

Waiting to handle #MCs until "more" host state is loaded hardens KVM
against flaws in the #MC handler, which has historically been quite
brittle.  E.g. prior to commit 5567d11c21a1 ("x86/mce: Send #MC singal
from task work"), the #MC code could trigger a schedule() with IRQs and
preemption disabled.  That led to a KVM hack-a-fix in commit 1811d979c716
("x86/kvm: move kvm_load/put_guest_xcr0 into atomic context").

Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/vmx/main.c | 13 ++++++++++++-
 arch/x86/kvm/vmx/tdx.c  |  3 ---
 arch/x86/kvm/vmx/vmx.c  |  3 ---
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 0eb2773b2ae2..1beaec5b9727 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -608,6 +608,17 @@ static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
 	vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level);
 }
 
+static void vt_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+{
+	if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
+		kvm_machine_check();
+
+	if (is_td_vcpu(vcpu))
+		return;
+
+	return vmx_handle_exit_irqoff(vcpu);
+}
+
 static void vt_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 {
 	if (is_td_vcpu(vcpu))
@@ -969,7 +980,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 	.load_mmu_pgd = vt_op(load_mmu_pgd),
 
 	.check_intercept = vmx_check_intercept,
-	.handle_exit_irqoff = vmx_handle_exit_irqoff,
+	.handle_exit_irqoff = vt_op(handle_exit_irqoff),
 
 	.update_cpu_dirty_logging = vt_op(update_cpu_dirty_logging),
 
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 326db9b9c567..a2f6ba3268d1 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1069,9 +1069,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 	if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
 		return EXIT_FASTPATH_NONE;
 
-	if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
-		kvm_machine_check();
-
 	trace_kvm_exit(vcpu, KVM_ISA_VMX);
 
 	if (unlikely(tdx_failed_vmentry(vcpu)))
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1021d3b65ea0..123dae8cf46b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7527,9 +7527,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 	if (unlikely(vmx->fail))
 		return EXIT_FASTPATH_NONE;
 
-	if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
-		kvm_machine_check();
-
 	trace_kvm_exit(vcpu, KVM_ISA_VMX);
 
 	if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
-- 
2.51.1.930.gacf6e81ea2-goog
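
For context, the kvm_machine_check() helper invoked by
vt_handle_exit_irqoff() forwards the event to the host's #MC handler with
fabricated register state.  The sketch below paraphrases the helper from
arch/x86/kvm/x86.h, with stubbed types so it compiles standalone; treat
it as an approximation, not the authoritative source:

#include <stdio.h>

struct pt_regs { unsigned long cs, flags; };
#define X86_EFLAGS_IF 0x200

/* Stub standing in for the host's do_machine_check(). */
static void do_machine_check(struct pt_regs *regs)
{
	printf("#MC handled, cs=%lu flags=%#lx\n", regs->cs, regs->flags);
}

static void kvm_machine_check(void)
{
	/*
	 * Fabricate register state: CS=3 so the host handler treats the
	 * event as if it arrived from user mode, regardless of what the
	 * guest was running, and IF set so the handler sees IRQs-enabled
	 * context.
	 */
	struct pt_regs regs = {
		.cs = 3,
		.flags = X86_EFLAGS_IF,
	};

	do_machine_check(&regs);
}

int main(void)
{
	kvm_machine_check();	/* what the irqoff exit callback invokes */
	return 0;
}
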
Move KVM's swapping of XFEATURE masks, i.e. XCR0 and XSS, out of the
fastpath loop now that the guts of the #MC handler runs in task context,
i.e. won't invoke schedule() with preemption disabled and clobber state
(or crash the kernel) due to trying to context switch XSTATE with a mix
of host and guest state.

For all intents and purposes, this reverts commit 1811d979c716
("x86/kvm: move kvm_load/put_guest_xcr0 into atomic context"), which
papered over an egregious bug/flaw in the #MC handler where it would do
schedule() even though IRQs are disabled.  E.g. the call stack from the
commit:

  kvm_load_guest_xcr0
  ...
  kvm_x86_ops->run(vcpu)
    vmx_vcpu_run
     vmx_complete_atomic_exit
      kvm_machine_check
       do_machine_check
        do_memory_failure
         memory_failure
          lock_page

Commit 1811d979c716 "fixed" the immediate issue of XRSTORS exploding,
but completely ignored that scheduling out a vCPU task while IRQs and
preemption are disabled is wildly broken.  Thankfully, commit
5567d11c21a1 ("x86/mce: Send #MC singal from task work") (somewhat
incidentally?) fixed that flaw by pushing the meat of the work to the
user-return path, i.e. to task context.

KVM has also hardened itself against #MC goofs by moving #MC forwarding
to kvm_x86_ops.handle_exit_irqoff(), i.e. out of the fastpath.  While
that's by no means a robust fix, restoring as much state as possible
before handling the #MC will hopefully provide some measure of
protection in the event that #MC handling goes off the rails again.

Note, KVM always intercepts XCR0 writes for vCPUs without protected
state, e.g. there's no risk of consuming a stale XCR0 when determining
if a PKRU update is needed; kvm_load_host_xfeatures() only reads, and
never writes, vcpu->arch.xcr0.

Deferring the XCR0 and XSS loads shaves ~300 cycles off the fastpath for
Intel, and ~500 cycles for AMD.  E.g. using INVD in KVM-Unit-Tests'
vmexit.c, with an extra hack to enable CR4.OSXSAVE, latency numbers for
AMD Turin go from ~2000 => ~1500, and for Intel Emerald Rapids, go from
~1300 => ~1000.

Cc: Jon Kohler
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/x86.c | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4b5d2d09634..b5c2879e3330 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1203,13 +1203,12 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw);
 
-void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+static void kvm_load_guest_xfeatures(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.guest_state_protected)
 		return;
 
 	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
-
 		if (vcpu->arch.xcr0 != kvm_host.xcr0)
 			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
 
@@ -1217,6 +1216,27 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
 		    vcpu->arch.ia32_xss != kvm_host.xss)
 			wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss);
 	}
+}
+
+static void kvm_load_host_xfeatures(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.guest_state_protected)
+		return;
+
+	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
+		if (vcpu->arch.xcr0 != kvm_host.xcr0)
+			xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
+
+		if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
+		    vcpu->arch.ia32_xss != kvm_host.xss)
+			wrmsrq(MSR_IA32_XSS, kvm_host.xss);
+	}
+}
+
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.guest_state_protected)
+		return;
 
 	if (cpu_feature_enabled(X86_FEATURE_PKU) &&
 	    vcpu->arch.pkru != vcpu->arch.host_pkru &&
@@ -1238,17 +1258,6 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
 		if (vcpu->arch.pkru != vcpu->arch.host_pkru)
 			wrpkru(vcpu->arch.host_pkru);
 	}
-
-	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
-
-		if (vcpu->arch.xcr0 != kvm_host.xcr0)
-			xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
-
-		if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
-		    vcpu->arch.ia32_xss != kvm_host.xss)
-			wrmsrq(MSR_IA32_XSS, kvm_host.xss);
-	}
-
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state);
 
@@ -11292,6 +11301,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.guest_fpu.xfd_err)
 		wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
 
+	kvm_load_guest_xfeatures(vcpu);
+
 	if (unlikely(vcpu->arch.switch_db_regs &&
 		     !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
 		set_debugreg(DR7_FIXED_1, 7);
@@ -11378,6 +11389,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
 
+	kvm_load_host_xfeatures(vcpu);
+
 	/*
 	 * Sync xfd before calling handle_exit_irqoff() which may
 	 * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
-- 
2.51.1.930.gacf6e81ea2-goog
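
A userspace model (all names hypothetical) of the lazy swap that
kvm_load_guest_xfeatures() and kvm_load_host_xfeatures() perform: XCR0
and XSS are written only when the guest and host values differ, so
deferring the loads costs nothing on the common path where they already
match:

#include <stdint.h>
#include <stdio.h>

struct xfeatures { uint64_t xcr0, xss; };

static unsigned int hw_writes;	/* stands in for xsetbv()/wrmsr() cost */

static void load_xfeatures(const struct xfeatures *cur,
			   const struct xfeatures *next)
{
	if (cur->xcr0 != next->xcr0)
		hw_writes++;	/* would be an xsetbv() of the new XCR0 */
	if (cur->xss != next->xss)
		hw_writes++;	/* would be a wrmsr() of the new XSS */
}

int main(void)
{
	struct xfeatures host = { .xcr0 = 0x7, .xss = 0x0 };
	struct xfeatures guest = { .xcr0 = 0x7, .xss = 0x0 };	/* == host */

	load_xfeatures(&host, &guest);	/* VM-Enter */
	load_xfeatures(&guest, &host);	/* VM-Exit */
	printf("register writes: %u\n", hw_writes);	/* 0: nothing to swap */
	return 0;
}
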
Move KVM's swapping of PKRU outside of the fastpath loop, as there is no
KVM code anywhere in the fastpath that accesses guest/userspace memory,
i.e. that can consume protection keys.  As documented by commit
1be0e61c1f25 ("KVM, pkeys: save/restore PKRU when guest/host switches"),
KVM just needs to ensure the host's PKRU is loaded when KVM (or the
kernel at-large) may access userspace memory.  And at the time of commit
1be0e61c1f25, KVM didn't have a fastpath, and PKU was strictly contained
to VMX, i.e. there was no reason to swap PKRU outside of vmx_vcpu_run().

Over time, the "need" to swap PKRU close to VM-Enter was likely falsely
solidified by the association with XFEATUREs in commit 37486135d3a7
("KVM: x86: Fix pkru save/restore when guest CR4.PKE=0, move it to
x86.c"), and XFEATURE swapping was in turn moved close to
VM-Enter/VM-Exit as a KVM hack-a-fix solution for an #MC handler bug by
commit 1811d979c716 ("x86/kvm: move kvm_load/put_guest_xcr0 into atomic
context").

Deferring the PKRU loads shaves ~40 cycles off the fastpath for Intel,
and ~60 cycles for AMD.  E.g. using INVD in KVM-Unit-Tests' vmexit.c,
with extra hacks to enable CR4.PKE and PKRU=(-1u & ~0x3), latency
numbers for AMD Turin go from ~1560 => ~1500, and for Intel Emerald
Rapids, go from ~810 => ~770.

Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/svm/svm.c |  2 --
 arch/x86/kvm/vmx/vmx.c |  4 ----
 arch/x86/kvm/x86.c     | 14 ++++++++++----
 arch/x86/kvm/x86.h     |  2 --
 4 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e8b158f73c79..e1fb853c263c 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4260,7 +4260,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 		svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
 
 	clgi();
-	kvm_load_guest_xsave_state(vcpu);
 
 	/*
 	 * Hardware only context switches DEBUGCTL if LBR virtualization is
@@ -4303,7 +4302,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 	    vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
 		update_debugctlmsr(vcpu->arch.host_debugctl);
 
-	kvm_load_host_xsave_state(vcpu);
 	stgi();
 
 	/* Any pending NMI will happen here */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 123dae8cf46b..55d637cea84a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7465,8 +7465,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmx_set_interrupt_shadow(vcpu, 0);
 
-	kvm_load_guest_xsave_state(vcpu);
-
 	pt_guest_enter(vmx);
 
 	atomic_switch_perf_msrs(vmx);
@@ -7510,8 +7508,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 
 	pt_guest_exit(vmx);
 
-	kvm_load_host_xsave_state(vcpu);
-
 	if (is_guest_mode(vcpu)) {
 		/*
 		 * Track VMLAUNCH/VMRESUME that have made past guest state
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b5c2879e3330..6924006f0796 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1233,7 +1233,7 @@ static void kvm_load_host_xfeatures(struct kvm_vcpu *vcpu)
 	}
 }
 
-void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+static void kvm_load_guest_pkru(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.guest_state_protected)
 		return;
@@ -1244,9 +1244,8 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
 	    kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
 		wrpkru(vcpu->arch.pkru);
 }
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_guest_xsave_state);
 
-void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
+static void kvm_load_host_pkru(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.guest_state_protected)
 		return;
@@ -1259,7 +1258,6 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
 			wrpkru(vcpu->arch.host_pkru);
 	}
 }
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state);
 
 #ifdef CONFIG_X86_64
 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
@@ -11331,6 +11329,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	guest_timing_enter_irqoff();
 
+	/*
+	 * Swap PKRU with hardware breakpoints disabled to minimize the number
+	 * of flows where non-KVM code can run with guest state loaded.
+	 */
+	kvm_load_guest_pkru(vcpu);
+
 	for (;;) {
 		/*
 		 * Assert that vCPU vs. VM APICv state is consistent.  An APICv
@@ -11359,6 +11363,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		++vcpu->stat.exits;
 	}
 
+	kvm_load_host_pkru(vcpu);
+
 	/*
 	 * Do this here before restoring debug registers on the host. And
 	 * since we do this before handling the vmexit, a DR access vmexit
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f3dc77f006f9..24c754b0db2e 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -622,8 +622,6 @@ static inline void kvm_machine_check(void)
 #endif
 }
 
-void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
-void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
 int kvm_spec_ctrl_test_value(u64 value);
 int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
 			      struct x86_exception *e);
-- 
2.51.1.930.gacf6e81ea2-goog
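
To illustrate the property the PKRU patch relies on, i.e. that PKRU
gates access to pages tagged with a protection key, the following
userspace demo (a protection-keys-capable CPU and glibc >= 2.27
assumed; unrelated to KVM internals) disables and restores access much
like KVM swaps guest and host PKRU values:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int pkey = pkey_alloc(0, 0);

	if (buf == MAP_FAILED || pkey < 0) {
		perror("setup");
		return 1;
	}

	buf[0] = 'x';
	pkey_mprotect(buf, page, PROT_READ | PROT_WRITE, pkey);

	/* Clearing access rights in PKRU makes any load of buf[0] fault. */
	pkey_set(pkey, PKEY_DISABLE_ACCESS);
	puts("access disabled; touching buf would now take a fault");

	/* Restoring rights, analogous to loading the host's PKRU value. */
	pkey_set(pkey, 0);
	printf("buf[0] = %c\n", buf[0]);
	return 0;
}
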