Move the logic for handling the guest toggling EFER.SVME out of
svm_set_efer() and into a new helper, svm_set_efer_svme(). This makes it
possible to check the skip conditions separately (and add more later),
and reduces the indentation level. No functional change intended.

Signed-off-by: Yosry Ahmed
---
 arch/x86/kvm/svm/svm.c | 72 ++++++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5f0136dbdde6b..4575a6a7d6c4e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -200,11 +200,49 @@ static int get_npt_level(void)
 #endif
 }
 
+static int svm_set_efer_svme(struct kvm_vcpu *vcpu, u64 old_efer, u64 new_efer)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	int r;
+
+	if ((old_efer & EFER_SVME) == (new_efer & EFER_SVME))
+		return 0;
+
+	if (new_efer & EFER_SVME) {
+		r = svm_allocate_nested(svm);
+		if (r)
+			return r;
+
+		/*
+		 * Never intercept #GP for SEV guests, KVM can't decrypt guest
+		 * memory to workaround the erratum.
+		 */
+		if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
+			set_exception_intercept(svm, GP_VECTOR);
+	} else {
+
+		svm_leave_nested(vcpu);
+		/* #GP intercept is still needed for vmware backdoor */
+		if (!enable_vmware_backdoor)
+			clr_exception_intercept(svm, GP_VECTOR);
+
+		/*
+		 * Free the nested guest state, unless we are in SMM. In this
+		 * case we will return to the nested guest as soon as we leave
+		 * SMM.
+		 */
+		if (!is_smm(vcpu))
+			svm_free_nested(svm);
+	}
+	return 0;
+}
+
 int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 old_efer = vcpu->arch.efer;
+	int r;
 	vcpu->arch.efer = efer;
 
 	if (!npt_enabled) {
 		/* Shadow paging assumes NX to be available. */
@@ -214,36 +252,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 			efer &= ~EFER_LME;
 	}
 
-	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
-		if (!(efer & EFER_SVME)) {
-			svm_leave_nested(vcpu);
-			/* #GP intercept is still needed for vmware backdoor */
-			if (!enable_vmware_backdoor)
-				clr_exception_intercept(svm, GP_VECTOR);
-
-			/*
-			 * Free the nested guest state, unless we are in SMM.
-			 * In this case we will return to the nested guest
-			 * as soon as we leave SMM.
-			 */
-			if (!is_smm(vcpu))
-				svm_free_nested(svm);
-
-		} else {
-			int ret = svm_allocate_nested(svm);
-
-			if (ret) {
-				vcpu->arch.efer = old_efer;
-				return ret;
-			}
-
-			/*
-			 * Never intercept #GP for SEV guests, KVM can't
-			 * decrypt guest memory to workaround the erratum.
-			 */
-			if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
-				set_exception_intercept(svm, GP_VECTOR);
-		}
+	r = svm_set_efer_svme(vcpu, old_efer, efer);
+	if (r) {
+		vcpu->arch.efer = old_efer;
+		return r;
 	}
 
 	svm->vmcb->save.efer = efer | EFER_SVME;
-- 
2.53.0.rc1.225.gd81095ad13-goog


KVM tracks when EFER.SVME is set and cleared in order to initialize and
tear down nested state. However, it doesn't differentiate between
EFER.SVME being toggled by L1 or by L2 (or a deeper level). Toggling
EFER.SVME in L2+ is inconsequential from KVM's perspective, as the vCPU
is obviously still using nested virtualization. This causes a problem if
L2 sets and then clears EFER.SVME without L1 intercepting the writes:
KVM exits guest mode and tears down nested state while L2 is running,
effectively executing L1 without injecting a proper #VMEXIT.

Technically, it's not a bug, as the APM states that an L1 hypervisor
should intercept EFER writes:

  The effect of turning off EFER.SVME while a guest is running is
  undefined; therefore, the VMM should always prevent guests from
  writing EFER.

However, it would be nice if KVM handled this more gracefully.
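
For illustration, the problematic flow boils down to an L2 guest
executing something like the following (a minimal sketch mirroring the
selftest added later in this series; rdmsr()/wrmsr() here are the
selftest-style wrappers, not the kernel helpers), with L1 not
intercepting EFER writes:

	unsigned long efer = rdmsr(MSR_EFER);

	/* Old behavior: KVM leaves guest mode and frees nested state here... */
	wrmsr(MSR_EFER, efer & ~EFER_SVME);

	/* ...so this write no longer executes in L2's context. */
	wrmsr(MSR_EFER, efer);
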
Signed-off-by: Yosry Ahmed
---
 arch/x86/kvm/svm/svm.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 4575a6a7d6c4e..eaf0f8053fbfb 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -208,6 +208,13 @@ static int svm_set_efer_svme(struct kvm_vcpu *vcpu, u64 old_efer, u64 new_efer)
 	if ((old_efer & EFER_SVME) == (new_efer & EFER_SVME))
 		return 0;
 
+	/*
+	 * An L2 guest setting or clearing EFER_SVME does not change whether or
+	 * not the vCPU can use nested from KVM's perspective.
+	 */
+	if (is_guest_mode(vcpu))
+		return 0;
+
 	if (new_efer & EFER_SVME) {
 		r = svm_allocate_nested(svm);
 		if (r)
-- 
2.53.0.rc1.225.gd81095ad13-goog


Add a test to make sure that L2 toggling EFER.SVME works as intended.
This is a regression test for KVM mistakenly exiting guest mode and
tearing down nested state when L2 disables EFER.SVME, even though L1 is
still using nested virtualization.

Signed-off-by: Yosry Ahmed
---
 tools/testing/selftests/kvm/Makefile.kvm      |  1 +
 .../kvm/x86/svm_nested_toggle_efer_svme.c     | 76 +++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86/svm_nested_toggle_efer_svme.c

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 58eee0474db6a..7bd2297b5494e 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -112,6 +112,7 @@ TEST_GEN_PROGS_x86 += x86/svm_vmcall_test
 TEST_GEN_PROGS_x86 += x86/svm_int_ctl_test
 TEST_GEN_PROGS_x86 += x86/svm_nested_shutdown_test
 TEST_GEN_PROGS_x86 += x86/svm_nested_soft_inject_test
+TEST_GEN_PROGS_x86 += x86/svm_nested_toggle_efer_svme
 TEST_GEN_PROGS_x86 += x86/tsc_scaling_sync
 TEST_GEN_PROGS_x86 += x86/sync_regs_test
 TEST_GEN_PROGS_x86 += x86/ucna_injection_test
diff --git a/tools/testing/selftests/kvm/x86/svm_nested_toggle_efer_svme.c b/tools/testing/selftests/kvm/x86/svm_nested_toggle_efer_svme.c
new file mode 100644
index 0000000000000..5267fbdbf01cc
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/svm_nested_toggle_efer_svme.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2026, Google LLC.
+ */
+#include "kvm_util.h"
+#include "vmx.h"
+#include "svm_util.h"
+#include "kselftest.h"
+
+
+#define L2_GUEST_STACK_SIZE 64
+
+static void l2_guest_code(void)
+{
+	unsigned long efer = rdmsr(MSR_EFER);
+
+	/* generic_svm_setup() initializes L2's EFER with EFER_SVME set */
+	GUEST_ASSERT(efer & EFER_SVME);
+
+	wrmsr(MSR_EFER, efer & ~EFER_SVME);
+	GUEST_ASSERT(rdmsr(MSR_EFER) == (efer & ~EFER_SVME));
+
+	wrmsr(MSR_EFER, efer);
+	GUEST_ASSERT(rdmsr(MSR_EFER) == efer);
+
+	vmmcall();
+}
+
+static void l1_guest_code(struct svm_test_data *svm)
+{
+	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+	generic_svm_setup(svm, l2_guest_code,
+			  &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+	run_guest(svm->vmcb, svm->vmcb_gpa);
+	GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+
+	GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	vm_vaddr_t nested_gva = 0;
+
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
+
+	vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+
+	vcpu_alloc_svm(vm, &nested_gva);
+	vcpu_args_set(vcpu, 1, nested_gva);
+
+	for (;;) {
+		struct ucall uc;
+
+		vcpu_run(vcpu);
+		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+		case UCALL_SYNC:
+			break;
+		case UCALL_DONE:
+			goto done;
+		default:
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+	}
+
+done:
+	kvm_vm_free(vm);
+	return 0;
+}
-- 
2.53.0.rc1.225.gd81095ad13-goog
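
For reference, the new test should build and run like the other KVM
selftests; assuming an in-tree build with the kernel headers already
installed for selftests, something along these lines is expected to work
(the exact invocation may differ per tree):

	$ make -C tools/testing/selftests/kvm
	$ ./tools/testing/selftests/kvm/x86/svm_nested_toggle_efer_svme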