To end an ongoing game of whack-a-mole between KVM and syzkaller, WARN on illegally cancelling a pending nested VM-Enter if and only if userspace has NOT gained control of the vCPU since the nested run was initiated. As proven time and time again by syzkaller, userspace can clobber vCPU state so as to force a VM-Exit that violates KVM's architectural modelling of VMRUN/VMLAUNCH/VMRESUME. To detect that userspace has gained control, while minimizing the risk of operating on stale data, convert nested_run_pending from a pure boolean to a tri-state of sorts, where '0' is still "not pending", '1' is "pending" (and trusted), and '2' is "pending but untrusted". Then on KVM_RUN, if the flag is in the trusted "pending" state, move it to "pending but untrusted". Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 8 +++++++- arch/x86/kvm/svm/nested.c | 11 +++++++---- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/nested.c | 12 +++++++----- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 7 +++++++ arch/x86/kvm/x86.h | 10 ++++++++++ 7 files changed, 40 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a1bf0aaedad8..acec03d7bb1e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1105,8 +1105,14 @@ struct kvm_vcpu_arch { * can only occur at instruction boundaries. The only exception is * VMX's "notify" exits, which exist in large part to break the CPU out * of infinite ucode loops, but can corrupt vCPU state in the process! + * + * For all intents and purposes, this is a boolean, but it's tracked as + * a u8 so that KVM can detect when userspace may have stuffed vCPU + * state and generated an architecturally-impossible VM-Exit. 
*/ - bool nested_run_pending; +#define KVM_NESTED_RUN_PENDING 1 +#define KVM_NESTED_RUN_PENDING_UNTRUSTED 2 + u8 nested_run_pending; #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 782d6a34d173..af741823c08c 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1131,7 +1131,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (!npt_enabled) vmcb01->save.cr3 = kvm_read_cr3(vcpu); - vcpu->arch.nested_run_pending = 1; + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) || !nested_svm_merge_msrpm(vcpu)) { @@ -1277,7 +1277,8 @@ void nested_svm_vmexit(struct vcpu_svm *svm) /* Exit Guest-Mode */ leave_guest_mode(vcpu); svm->nested.vmcb12_gpa = 0; - WARN_ON_ONCE(vcpu->arch.nested_run_pending); + + kvm_warn_on_nested_run_pending(vcpu); kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); @@ -1984,8 +1985,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); - vcpu->arch.nested_run_pending = - !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED; + else + vcpu->arch.nested_run_pending = 0; svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 112731515ee3..7de2c6621b98 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5035,7 +5035,7 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) if (ret) goto unmap_save; - vcpu->arch.nested_run_pending = 1; + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; unmap_save: kvm_vcpu_unmap(vcpu, &map_save); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 65e45cee871c..6a107025e7e2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ 
-3844,7 +3844,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * We're finally done with prerequisite checking, and can start with * the nested entry. */ - vcpu->arch.nested_run_pending = 1; + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; vmx->nested.has_preemption_timer_deadline = false; status = nested_vmx_enter_non_root_mode(vcpu, true); if (unlikely(status != NVMX_VMENTRY_SUCCESS)) @@ -5056,7 +5056,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx->nested.mtf_pending = false; /* trying to cancel vmlaunch/vmresume is a bug */ - WARN_ON_ONCE(vcpu->arch.nested_run_pending); + kvm_warn_on_nested_run_pending(vcpu); #ifdef CONFIG_KVM_HYPERV if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { @@ -6679,7 +6679,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) unsigned long exit_qual; u32 exit_intr_info; - WARN_ON_ONCE(vcpu->arch.nested_run_pending); + kvm_warn_on_nested_run_pending(vcpu); /* * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM @@ -6987,8 +6987,10 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) return 0; - vcpu->arch.nested_run_pending = - !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED; + else + vcpu->arch.nested_run_pending = 0; vmx->nested.mtf_pending = !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0e7e0a17bb75..dbe0f234aefb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8423,7 +8423,7 @@ int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) if (ret) return ret; - vcpu->arch.nested_run_pending = 1; + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; vmx->nested.smm.guest_mode = false; } return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 
879cdeb6adde..cad16c83dcff 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12090,6 +12090,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (r <= 0) goto out; + /* + * If userspace may have modified vCPU state, mark nested_run_pending + * as "untrusted" to avoid triggering false-positive WARNs. + */ + if (vcpu->arch.nested_run_pending == KVM_NESTED_RUN_PENDING) + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED; + r = vcpu_run(vcpu); out: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 44a28d343d40..38a905fa86de 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -188,6 +188,16 @@ static inline bool kvm_can_set_cpuid_and_feature_msrs(struct kvm_vcpu *vcpu) return vcpu->arch.last_vmentry_cpu == -1 && !is_guest_mode(vcpu); } +/* + * WARN if a nested VM-Enter is pending completion, and userspace hasn't gained + * control since the nested VM-Enter was initiated (in which case, userspace + * may have modified vCPU state to induce an architecturally invalid VM-Exit). + */ +static inline void kvm_warn_on_nested_run_pending(struct kvm_vcpu *vcpu) +{ + WARN_ON_ONCE(vcpu->arch.nested_run_pending == KVM_NESTED_RUN_PENDING); +} + static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state) { vcpu->arch.mp_state = mp_state; -- 2.53.0.473.g4a7958ca14-goog