After VMRUN in guest mode, nested_sync_control_from_vmcb02() syncs fields written by the CPU from vmcb02 to the cached vmcb12. This is because the cached vmcb12 is used as the authoritative copy of some of the controls, and is the payload when saving/restoring nested state. NextRIP is also written by the CPU (in some cases) after VMRUN, but is not sync'd to the cached vmcb12. As a result, it is corrupted after save/restore (replaced by the original value written by L1 on nested VMRUN). This could cause problems for both KVM (e.g. when injecting a soft IRQ) or L1 (e.g. when using NextRIP to advance RIP after emulating an instruction). Fix this by sync'ing NextRIP to the cache after VMRUN of L2, but only after completing interrupts (not in nested_sync_control_from_vmcb02()), as KVM may update NextRIP (e.g. when re-injecting a soft IRQ). Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE") CC: stable@vger.kernel.org Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Yosry Ahmed --- arch/x86/kvm/svm/svm.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8f8bc863e2143..07f096758f34f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4435,6 +4435,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) svm_complete_interrupts(vcpu); + /* + * Update the cache after completing interrupts to get an accurate + * NextRIP, e.g. when re-injecting a soft interrupt. + * + * FIXME: Rework svm_get_nested_state() to not pull data from the + * cache (except for maybe int_ctl). + */ + if (is_guest_mode(vcpu)) + svm->nested.ctl.next_rip = svm->vmcb->control.next_rip; + return svm_exit_handlers_fastpath(vcpu); } -- 2.53.0.414.gf7e9f6c205-goog After VMRUN in guest mode, nested_sync_control_from_vmcb02() syncs fields written by the CPU from vmcb02 to the cached vmcb12. 
This is because the cached vmcb12 is used as the authoritative copy of some of the controls, and is the payload when saving/restoring nested state. int_state is also written by the CPU, specifically bit 0 (i.e. SVM_INTERRUPT_SHADOW_MASK) for nested VMs, but it is not sync'd to cached vmcb12. This does not cause a problem if KVM_SET_NESTED_STATE precedes KVM_SET_VCPU_EVENTS in the restore path, as an interrupt shadow would be correctly restored to vmcb02 (KVM_SET_VCPU_EVENTS overwrites what KVM_SET_NESTED_STATE restored in int_state). However, if KVM_SET_VCPU_EVENTS precedes KVM_SET_NESTED_STATE, an interrupt shadow would be restored into vmcb01 instead of vmcb02. This would mostly be benign for L1 (delays an interrupt), but not for L2. For L2, the vCPU could hang (e.g. if a wakeup interrupt is delivered before a HLT that should have been in an interrupt shadow). Sync int_state to the cached vmcb12 in nested_sync_control_from_vmcb02() to avoid this problem. With that, KVM_SET_NESTED_STATE restores the correct interrupt shadow state, and if KVM_SET_VCPU_EVENTS follows it would overwrite it with the same value. Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE") CC: stable@vger.kernel.org Signed-off-by: Yosry Ahmed --- arch/x86/kvm/svm/nested.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index de90b104a0dd5..9909ff237e5ca 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -521,6 +521,7 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm) u32 mask; svm->nested.ctl.event_inj = svm->vmcb->control.event_inj; svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err; + svm->nested.ctl.int_state = svm->vmcb->control.int_state; /* Only a few fields of int_ctl are written by the processor. 
*/ mask = V_IRQ_MASK | V_TPR_MASK; -- 2.53.0.414.gf7e9f6c205-goog V_GIF_MASK is one of the fields written by the CPU after VMRUN, and sync'd by KVM from vmcb02 to cached vmcb12 after running L2. Part of the reason is to make sure V_GIF_MASK is saved/restored correctly, as the cached vmcb12 is the payload of nested state. Verify that V_GIF_MASK is saved/restored correctly in state_test by enabling vGIF in vmcb12, toggling GIF in L2 at different GUEST_SYNC() points, and verifying that V_GIF_MASK is correctly propagated to the nested state. Signed-off-by: Yosry Ahmed --- tools/testing/selftests/kvm/x86/state_test.c | 24 ++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tools/testing/selftests/kvm/x86/state_test.c b/tools/testing/selftests/kvm/x86/state_test.c index f2c7a1c297e37..57c7546f3d7c5 100644 --- a/tools/testing/selftests/kvm/x86/state_test.c +++ b/tools/testing/selftests/kvm/x86/state_test.c @@ -26,7 +26,9 @@ void svm_l2_guest_code(void) GUEST_SYNC(4); /* Exit to L1 */ vmcall(); + clgi(); GUEST_SYNC(6); + stgi(); /* Done, exit to L1 and never come back. 
*/ vmcall(); } @@ -41,6 +43,8 @@ static void svm_l1_guest_code(struct svm_test_data *svm) generic_svm_setup(svm, svm_l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + vmcb->control.int_ctl |= (V_GIF_ENABLE_MASK | V_GIF_MASK); + GUEST_SYNC(3); run_guest(vmcb, svm->vmcb_gpa); GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); @@ -222,6 +226,24 @@ static void __attribute__((__flatten__)) guest_code(void *arg) GUEST_DONE(); } +void svm_check_nested_state(int stage, struct kvm_x86_state *state) +{ + struct vmcb *vmcb = (struct vmcb *)state->nested.data.svm; + + if (kvm_cpu_has(X86_FEATURE_VGIF)) { + if (stage == 4) + TEST_ASSERT_EQ(!!(vmcb->control.int_ctl & V_GIF_MASK), 1); + if (stage == 6) + TEST_ASSERT_EQ(!!(vmcb->control.int_ctl & V_GIF_MASK), 0); + } +} + +void check_nested_state(int stage, struct kvm_x86_state *state) +{ + if (kvm_has_cap(KVM_CAP_NESTED_STATE) && kvm_cpu_has(X86_FEATURE_SVM)) + svm_check_nested_state(stage, state); +} + int main(int argc, char *argv[]) { uint64_t *xstate_bv, saved_xstate_bv; @@ -278,6 +300,8 @@ int main(int argc, char *argv[]) kvm_vm_release(vm); + check_nested_state(stage, state); + /* Restore state in a new VM. */ vcpu = vm_recreate_with_one_vcpu(vm); vcpu_load_state(vcpu, state); -- 2.53.0.414.gf7e9f6c205-goog Similar to vGIF, extend state_test to make sure that next_rip is saved correctly in nested state. GUEST_SYNC() in L2 causes IO emulation by KVM, which advances the RIP to the value of next_rip. Hence, if next_rip is saved correctly, its value should match the saved RIP value. 
Signed-off-by: Yosry Ahmed --- tools/testing/selftests/kvm/x86/state_test.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/kvm/x86/state_test.c b/tools/testing/selftests/kvm/x86/state_test.c index 57c7546f3d7c5..992a52504a4ab 100644 --- a/tools/testing/selftests/kvm/x86/state_test.c +++ b/tools/testing/selftests/kvm/x86/state_test.c @@ -236,6 +236,17 @@ void svm_check_nested_state(int stage, struct kvm_x86_state *state) if (stage == 6) TEST_ASSERT_EQ(!!(vmcb->control.int_ctl & V_GIF_MASK), 0); } + + if (kvm_cpu_has(X86_FEATURE_NRIPS)) { + /* + * GUEST_SYNC() causes IO emulation in KVM, in which case the + * RIP is advanced before exiting to userspace. Hence, the RIP + * in the saved state should be the same as nRIP saved by the + * CPU in the VMCB. + */ + if (stage == 6) + TEST_ASSERT_EQ(vmcb->control.next_rip, state->regs.rip); + } } void check_nested_state(int stage, struct kvm_x86_state *state) -- 2.53.0.414.gf7e9f6c205-goog For guests with NRIPS disabled, L1 does not provide NextRIP when running an L2 with an injected soft interrupt, instead it advances the current RIP before running it. KVM uses the current RIP as the NextRIP in vmcb02 to emulate a CPU without NRIPS. However, after L2 runs the first time, NextRIP will be updated by the CPU and/or KVM, and the current RIP is no longer the correct value to use in vmcb02. Hence, after save/restore, use the current RIP if and only if a nested run is pending, otherwise use NextRIP. 
Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE") CC: stable@vger.kernel.org Signed-off-by: Yosry Ahmed --- arch/x86/kvm/svm/nested.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 9909ff237e5ca..f3ed1bdbe76c9 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -845,17 +845,24 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err; /* - * next_rip is consumed on VMRUN as the return address pushed on the + * NextRIP is consumed on VMRUN as the return address pushed on the * stack for injected soft exceptions/interrupts. If nrips is exposed - * to L1, take it verbatim from vmcb12. If nrips is supported in - * hardware but not exposed to L1, stuff the actual L2 RIP to emulate - * what a nrips=0 CPU would do (L1 is responsible for advancing RIP - * prior to injecting the event). + * to L1, take it verbatim from vmcb12. + * + * If nrips is supported in hardware but not exposed to L1, stuff the + * actual L2 RIP to emulate what a nrips=0 CPU would do (L1 is + * responsible for advancing RIP prior to injecting the event). This is + * only the case for the first L2 run after VMRUN. After that (e.g. + * during save/restore), NextRIP is updated by the CPU and/or KVM, and + * the value of the L2 RIP from vmcb12 should not be used. 
*/ - if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) - vmcb02->control.next_rip = svm->nested.ctl.next_rip; - else if (boot_cpu_has(X86_FEATURE_NRIPS)) - vmcb02->control.next_rip = vmcb12_rip; + if (boot_cpu_has(X86_FEATURE_NRIPS)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) || + !svm->nested.nested_run_pending) + vmcb02->control.next_rip = svm->nested.ctl.next_rip; + else + vmcb02->control.next_rip = vmcb12_rip; + } svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj); if (is_evtinj_soft(vmcb02->control.event_inj)) { -- 2.53.0.414.gf7e9f6c205-goog For guests with NRIPS disabled, L1 does not provide NextRIP when running an L2 with an injected soft interrupt, instead it advances L2's RIP before running it. KVM uses L2's current RIP as the NextRIP in vmcb02 to emulate a CPU without NRIPS. However, in svm_set_nested_state(), the value used for L2's current RIP comes from vmcb02, which is just whatever the vCPU had in vmcb02 before restoring nested state (zero on a freshly created vCPU). Passing the cached RIP value instead (i.e. kvm_rip_read()) would only fix the issue if registers are restored before nested state. Instead, split the logic of setting NextRIP in vmcb02. Handle the 'normal' case of initializing vmcb02's NextRIP using NextRIP from vmcb12 (or KVM_GET_NESTED_STATE's payload) in nested_vmcb02_prepare_control(). Delay the special case of stuffing L2's current RIP into vmcb02's NextRIP until shortly before the vCPU is run, to make sure the most up-to-date value of RIP is used regardless of KVM_SET_REGS and KVM_SET_NESTED_STATE's relative ordering. 
Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE") CC: stable@vger.kernel.org Suggested-by: Sean Christopherson Signed-off-by: Yosry Ahmed --- arch/x86/kvm/svm/nested.c | 25 ++++++++----------------- arch/x86/kvm/svm/svm.c | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index f3ed1bdbe76c9..dcd4a8eb156f2 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -845,24 +845,15 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err; /* - * NextRIP is consumed on VMRUN as the return address pushed on the - * stack for injected soft exceptions/interrupts. If nrips is exposed - * to L1, take it verbatim from vmcb12. - * - * If nrips is supported in hardware but not exposed to L1, stuff the - * actual L2 RIP to emulate what a nrips=0 CPU would do (L1 is - * responsible for advancing RIP prior to injecting the event). This is - * only the case for the first L2 run after VMRUN. After that (e.g. - * during save/restore), NextRIP is updated by the CPU and/or KVM, and - * the value of the L2 RIP from vmcb12 should not be used. + * If nrips is exposed to L1, take NextRIP as-is. Otherwise, L1 + * advances L2's RIP before VMRUN instead of using NextRIP. KVM will + * stuff the current RIP as vmcb02's NextRIP before L2 is run. After + * the first run of L2 (e.g. after save+restore), NextRIP is updated by + * the CPU and/or KVM and should be used regardless of L1's support. 
*/ - if (boot_cpu_has(X86_FEATURE_NRIPS)) { - if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) || - !svm->nested.nested_run_pending) - vmcb02->control.next_rip = svm->nested.ctl.next_rip; - else - vmcb02->control.next_rip = vmcb12_rip; - } + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) || + !svm->nested.nested_run_pending) + vmcb02->control.next_rip = svm->nested.ctl.next_rip; svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj); if (is_evtinj_soft(vmcb02->control.event_inj)) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 07f096758f34f..ded4372f2d499 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3660,6 +3660,23 @@ static int pre_svm_run(struct kvm_vcpu *vcpu) if (svm->current_vmcb->asid_generation != sd->asid_generation) new_asid(svm, sd); + /* + * If nrips is supported in hardware but not exposed to L1, stuff the + * actual L2 RIP to emulate what a nrips=0 CPU would do (L1 is + * responsible for advancing RIP prior to injecting the event). Once L2 + * runs after L1 executes VMRUN, NextRIP is updated by the CPU and/or + * KVM, and this is no longer needed. + * + * This is done here (as opposed to when preparing vmcb02) to use the + * most up-to-date value of RIP regardless of the order of restoring + * registers and nested state in the vCPU save+restore path. + */ + if (is_guest_mode(vcpu) && svm->nested.nested_run_pending) { + if (boot_cpu_has(X86_FEATURE_NRIPS) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) + svm->vmcb->control.next_rip = kvm_rip_read(vcpu); + } + return 0; } -- 2.53.0.414.gf7e9f6c205-goog In the save+restore path, when restoring nested state, the values of RIP and CS base passed into nested_vmcb02_prepare_control() are mostly incorrect. They are both pulled from the vmcb02. For CS base, the value is only correct if system regs are restored before nested state. The value of RIP is whatever the vCPU had in vmcb02 before restoring nested state (zero on a freshly created vCPU). 
Instead, take a similar approach to NextRIP, and delay initializing the RIP tracking fields until shortly before the vCPU is run, to make sure the most up-to-date values of RIP and CS base are used regardless of KVM_SET_SREGS, KVM_SET_REGS, and KVM_SET_NESTED_STATE's relative ordering. Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE") CC: stable@vger.kernel.org Suggested-by: Sean Christopherson Signed-off-by: Yosry Ahmed --- arch/x86/kvm/svm/nested.c | 17 ++++++++--------- arch/x86/kvm/svm/svm.c | 10 ++++++++++ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index dcd4a8eb156f2..4499241b4e401 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -742,9 +742,7 @@ static bool is_evtinj_nmi(u32 evtinj) return type == SVM_EVTINJ_TYPE_NMI; } -static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, - unsigned long vmcb12_rip, - unsigned long vmcb12_csbase) +static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK; u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; @@ -856,14 +854,15 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.next_rip = svm->nested.ctl.next_rip; svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj); + + /* + * soft_int_csbase, soft_int_old_rip, and soft_int_next_rip (if L1 + * doesn't have NRIPS) are initialized later, before the vCPU is run. 
+ */ if (is_evtinj_soft(vmcb02->control.event_inj)) { svm->soft_int_injected = true; - svm->soft_int_csbase = vmcb12_csbase; - svm->soft_int_old_rip = vmcb12_rip; if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) svm->soft_int_next_rip = svm->nested.ctl.next_rip; - else - svm->soft_int_next_rip = vmcb12_rip; } /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */ @@ -961,7 +960,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base); + nested_vmcb02_prepare_control(svm); nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, @@ -1906,7 +1905,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base); + nested_vmcb02_prepare_control(svm); /* * While the nested guest CR3 is already checked and set by diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ded4372f2d499..7948e601ea784 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3670,11 +3670,21 @@ static int pre_svm_run(struct kvm_vcpu *vcpu) * This is done here (as opposed to when preparing vmcb02) to use the * most up-to-date value of RIP regardless of the order of restoring * registers and nested state in the vCPU save+restore path. + * + * Similarly, initialize svm->soft_int_* fields here to use the most + * up-to-date values of RIP and CS base, regardless of restore order. 
*/ if (is_guest_mode(vcpu) && svm->nested.nested_run_pending) { if (boot_cpu_has(X86_FEATURE_NRIPS) && !guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) svm->vmcb->control.next_rip = kvm_rip_read(vcpu); + + if (svm->soft_int_injected) { + svm->soft_int_csbase = svm->vmcb->save.cs.base; + svm->soft_int_old_rip = kvm_rip_read(vcpu); + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) + svm->soft_int_next_rip = kvm_rip_read(vcpu); + } } return 0; } -- 2.53.0.414.gf7e9f6c205-goog Update svm_nested_soft_inject_test such that L1 syncs to userspace before running L2. The test then enables single-stepping and steps through guest code until VMRUN is executed, and saves/restores the VM immediately after (before L2 runs). This reproduces a bug in save/restore where L2's RIP is not used correctly to construct the vmcb02 at the destination. Signed-off-by: Yosry Ahmed --- .../testing/selftests/kvm/lib/x86/processor.c | 8 +- .../kvm/x86/svm_nested_soft_inject_test.c | 74 +++++++++++++++---- 2 files changed, 65 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index fab18e9be66c9..7e0213a88697d 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -1275,6 +1275,8 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu) return state; } +#define LOAD_REGS_BEFORE_NESTED 1 + void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state) { vcpu_sregs_set(vcpu, &state->sregs); @@ -1287,10 +1289,14 @@ void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state) vcpu_events_set(vcpu, &state->events); vcpu_mp_state_set(vcpu, &state->mp_state); vcpu_debugregs_set(vcpu, &state->debugregs); - vcpu_regs_set(vcpu, &state->regs); + if (LOAD_REGS_BEFORE_NESTED) + vcpu_regs_set(vcpu, &state->regs); if (state->nested.size) vcpu_nested_state_set(vcpu, &state->nested); + + if (!LOAD_REGS_BEFORE_NESTED) + vcpu_regs_set(vcpu, 
&state->regs); } void kvm_x86_state_cleanup(struct kvm_x86_state *state) diff --git a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c index 4bd1655f9e6d0..dfefd8eed392a 100644 --- a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c @@ -101,6 +101,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i vmcb->control.next_rip = vmcb->save.rip; } + GUEST_SYNC(true); run_guest(vmcb, svm->vmcb_gpa); __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL, "Expected VMMCAL #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'", @@ -131,6 +132,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i /* The return address pushed on stack, skip over UD2 */ vmcb->control.next_rip = vmcb->save.rip + 2; + GUEST_SYNC(true); run_guest(vmcb, svm->vmcb_gpa); __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_HLT, "Expected HLT #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'", @@ -140,6 +142,24 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i GUEST_DONE(); } +static struct kvm_vcpu *save_and_restore_vm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + struct kvm_x86_state *state = vcpu_save_state(vcpu); + + kvm_vm_release(vm); + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + return vcpu; +} + +static bool is_nested_run_pending(struct kvm_vcpu *vcpu) +{ + struct kvm_x86_state *state = vcpu_save_state(vcpu); + + return state->nested.size && (state->nested.flags & KVM_STATE_NESTED_RUN_PENDING); +} + static void run_test(bool is_nmi) { struct kvm_vcpu *vcpu; @@ -173,22 +193,44 @@ static void run_test(bool is_nmi) memset(&debug, 0, sizeof(debug)); vcpu_guest_debug_set(vcpu, &debug); - struct ucall uc; - - alarm(2); - vcpu_run(vcpu); - alarm(0); - 
TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - - switch (get_ucall(vcpu, &uc)) { - case UCALL_ABORT: - REPORT_GUEST_ASSERT(uc); - break; - /* NOT REACHED */ - case UCALL_DONE: - goto done; - default: - TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + for (;;) { + struct kvm_guest_debug debug; + struct ucall uc; + + alarm(2); + vcpu_run(vcpu); + alarm(0); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + /* + * L1 syncs before calling run_guest(), single-step over + * all instructions until VMRUN, and save+restore right + * after it (before L2 actually runs). + */ + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; + vcpu_guest_debug_set(vcpu, &debug); + + do { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG); + } while (!is_nested_run_pending(vcpu)); + + memset(&debug, 0, sizeof(debug)); + vcpu_guest_debug_set(vcpu, &debug); + vcpu = save_and_restore_vm(vm, vcpu); + break; + + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } } done: kvm_vm_free(vm); -- 2.53.0.414.gf7e9f6c205-goog