In the save/restore path, if KVM_SET_NESTED_STATE is performed before restoring REGS and/or SREGS, the values of CS and RIP used to initialize the vmcb02's NextRIP and soft interrupt tracking RIPs are incorrect. Recalculate them after CS is set or REGS are restored. This is only needed when a nested run is pending during restore. After L2 runs for the first time, any soft interrupts injected by L1 are already delivered or tracked by KVM separately for re-injection, so the CS and RIP values are no longer relevant. If KVM_SET_NESTED_STATE is performed after both REGS and SREGS are restored, it will just overwrite the fields. Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE") CC: stable@vger.kernel.org Signed-off-by: Yosry Ahmed --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/nested.c | 4 +++- arch/x86/kvm/svm/svm.c | 21 +++++++++++++++++++++ arch/x86/kvm/x86.c | 2 ++ 5 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index de709fb5bd76..7221517ea3e6 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -54,6 +54,7 @@ KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) KVM_X86_OP(set_rflags) KVM_X86_OP(get_if_flag) +KVM_X86_OP_OPTIONAL(post_user_set_regs) KVM_X86_OP(flush_tlb_all) KVM_X86_OP(flush_tlb_current) #if IS_ENABLED(CONFIG_HYPERV) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ff07c45e3c73..feadd9579159 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1789,6 +1789,7 @@ struct kvm_x86_ops { unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); bool (*get_if_flag)(struct kvm_vcpu *vcpu); + void (*post_user_set_regs)(struct kvm_vcpu *vcpu); void (*flush_tlb_all)(struct kvm_vcpu *vcpu); void (*flush_tlb_current)(struct 
kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index af7a0113f269..22680aa31c28 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -766,7 +766,9 @@ void nested_vmcb02_prepare_rips(struct kvm_vcpu *vcpu, unsigned long csbase, else if (boot_cpu_has(X86_FEATURE_NRIPS)) svm->vmcb->control.next_rip = rip; - if (!is_evtinj_soft(svm->nested.ctl.event_inj)) + /* L1's injected events should be cleared after the first run of L2 */ + if (!is_evtinj_soft(svm->nested.ctl.event_inj) || + WARN_ON_ONCE(!svm->nested.nested_run_pending)) return; svm->soft_int_injected = true; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8f8bc863e214..5729da2b300d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1477,6 +1477,24 @@ static bool svm_get_if_flag(struct kvm_vcpu *vcpu) : kvm_get_rflags(vcpu) & X86_EFLAGS_IF; } +static void svm_fixup_nested_rips(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * In the save/restore path, if nested state is restored before + * RIP or CS, then fixing up the vmcb02 (and soft IRQ tracking) is + * needed. This is only the case if a nested run is pending (i.e. L2 + * is yet to run after L1's VMRUN). Otherwise, any soft IRQ injected by + * L1 should have been delivered to L2 or is being tracked separately by + * KVM for re-injection. Similarly, NextRIP would have already been + * updated by the CPU and/or KVM. 
+ */ + if (svm->nested.nested_run_pending) + nested_vmcb02_prepare_rips(vcpu, svm->vmcb->save.cs.base, + kvm_rip_read(vcpu)); +} + static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { kvm_register_mark_available(vcpu, reg); @@ -1826,6 +1844,8 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, if (seg == VCPU_SREG_SS) /* This is symmetric with svm_get_segment() */ svm->vmcb->save.cpl = (var->dpl & 3); + else if (seg == VCPU_SREG_CS) + svm_fixup_nested_rips(vcpu); vmcb_mark_dirty(svm->vmcb, VMCB_SEG); } @@ -5172,6 +5192,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = { .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, .get_if_flag = svm_get_if_flag, + .post_user_set_regs = svm_fixup_nested_rips, .flush_tlb_all = svm_flush_tlb_all, .flush_tlb_current = svm_flush_tlb_current, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index db3f393192d9..35fe1d337273 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12112,6 +12112,8 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvm_rip_write(vcpu, regs->rip); kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED); + kvm_x86_call(post_user_set_regs)(vcpu); + vcpu->arch.exception.pending = false; vcpu->arch.exception_vmexit.pending = false; -- 2.53.0.273.g2a3d683680-goog