Explicitly finalize kvm_cpu_caps as part of each vendor's setup flow to fix a bug where clearing SHSTK and IBT due to lack of CET XFEATURE support prevents kvm-intel.ko from loading when nested=1. The late clearing results in nested_vmx_setup_{entry,exit}_ctls() clearing VM_{ENTRY,EXIT}_LOAD_CET_STATE when nested_vmx_setup_ctls_msrs() runs during the CPU compatibility checks, ultimately leading to a mismatched VMCS config due to the reference config having the CET bits set, but every CPU's "local" config having the bits cleared. Note, kvm_caps.supported_{xcr0,xss} are unconditionally initialized by kvm_x86_vendor_init(), before calling into vendor code, and are not referenced between ops->hardware_setup() and their current/old location. Fixes: 69cc3e886582 ("KVM: x86: Add XSS support for CET_KERNEL and CET_USER") Cc: stable@vger.kernel.org Cc: Mathias Krause Cc: John Allen Cc: Rick Edgecombe Cc: Chao Gao Cc: Binbin Wu Cc: Xiaoyao Li Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 21 +++++++++++++++++++-- arch/x86/kvm/cpuid.h | 3 ++- arch/x86/kvm/svm/svm.c | 4 +++- arch/x86/kvm/vmx/vmx.c | 4 +++- arch/x86/kvm/x86.c | 14 -------------- arch/x86/kvm/x86.h | 2 ++ 6 files changed, 29 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 575244af9c9f..267e59b405c1 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -826,7 +826,7 @@ do { \ /* DS is defined by ptrace-abi.h on 32-bit builds. 
*/ #undef DS -void kvm_set_cpu_caps(void) +void kvm_initialize_cpu_caps(void) { memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); @@ -1289,7 +1289,24 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_RDPID); } } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cpu_caps); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_initialize_cpu_caps); + +void kvm_finalize_cpu_caps(void) +{ + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + kvm_caps.supported_xss = 0; + + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + + if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + } +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_finalize_cpu_caps); #undef F #undef SCATTERED_F diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d3f5ae15a7ca..3b0b4b1adb97 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -8,7 +8,8 @@ #include extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; -void kvm_set_cpu_caps(void); +void kvm_initialize_cpu_caps(void); +void kvm_finalize_cpu_caps(void); void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7803d2781144..0c23fcaedcc5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5305,7 +5305,7 @@ static __init void svm_adjust_mmio_mask(void) static __init void svm_set_cpu_caps(void) { - kvm_set_cpu_caps(); + kvm_initialize_cpu_caps(); kvm_caps.supported_perf_cap = 0; @@ -5387,6 +5387,8 @@ static __init void svm_set_cpu_caps(void) */ kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); + + kvm_finalize_cpu_caps(); } static __init int svm_hardware_setup(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c 
index 27acafd03381..7d373e32ea9c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8173,7 +8173,7 @@ static __init u64 vmx_get_perf_capabilities(void) static __init void vmx_set_cpu_caps(void) { - kvm_set_cpu_caps(); + kvm_initialize_cpu_caps(); /* CPUID 0x1 */ if (nested) @@ -8230,6 +8230,8 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_SHSTK); kvm_cpu_cap_clear(X86_FEATURE_IBT); } + + kvm_finalize_cpu_caps(); } static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8acfdfc583a1..36385e6aebfa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -220,7 +220,6 @@ static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs); | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) -#define XFEATURE_MASK_CET_ALL (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL) /* * Note, KVM supports exposing PT to the guest, but does not support context * switching PT via XSTATE (KVM's PT virtualization relies on perf; swapping @@ -10138,19 +10137,6 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (!tdp_enabled) kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; - if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) - kvm_caps.supported_xss = 0; - - if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && - !kvm_cpu_cap_has(X86_FEATURE_IBT)) - kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; - - if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) { - kvm_cpu_cap_clear(X86_FEATURE_SHSTK); - kvm_cpu_cap_clear(X86_FEATURE_IBT); - kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; - } - if (kvm_caps.has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 70e81f008030..9edfac5d5ffb 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -483,6 +483,8 @@ extern struct kvm_host_values kvm_host; extern bool enable_pmu; 
extern bool enable_mediated_pmu; +#define XFEATURE_MASK_CET_ALL (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL) + /* * Get a filtered version of KVM's supported XCR0 that strips out dynamic * features for which the current process doesn't (yet) have permission to use. -- 2.52.0.457.g6b5491de43-goog Add a flag to track when KVM is actively configuring its CPU caps, and WARN if a cap is set or cleared if KVM isn't in its configuration stage. Modifying CPU caps after {svm,vmx}_set_cpu_caps() can be fatal to KVM, as vendor setup code expects the CPU caps to be frozen at that point, e.g. will do additional configuration based on the caps. Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 8 ++++++++ arch/x86/kvm/cpuid.h | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 267e59b405c1..2f01511135c2 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -36,6 +36,9 @@ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_caps); +bool kvm_is_configuring_cpu_caps __read_mostly; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_configuring_cpu_caps); + struct cpuid_xstate_sizes { u32 eax; u32 ebx; @@ -830,6 +833,9 @@ void kvm_initialize_cpu_caps(void) { memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); + WARN_ON_ONCE(kvm_is_configuring_cpu_caps); + kvm_is_configuring_cpu_caps = true; + BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > sizeof(boot_cpu_data.x86_capability)); @@ -1305,6 +1311,8 @@ void kvm_finalize_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_IBT); kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; } + + kvm_is_configuring_cpu_caps = false; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_finalize_cpu_caps); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 3b0b4b1adb97..07175dff24d6 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -8,6 +8,8 @@ #include extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; +extern bool 
kvm_is_configuring_cpu_caps __read_mostly; + void kvm_initialize_cpu_caps(void); void kvm_finalize_cpu_caps(void); @@ -189,6 +191,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); } @@ -196,6 +199,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); } -- 2.52.0.457.g6b5491de43-goog When kvm-intel.ko refuses to load due to a mismatched VMCS config, print all mismatching offsets+values to make it easier to debug goofs during development, and to make it at least feasible to triage failures that occur during production. E.g. if a physical core is flaky or is running with the "wrong" microcode patch loaded, then a CPU can get a legitimate mismatch even without KVM bugs. Print the mismatches as 32-bit values as a compromise between hand coding every field (to provide precise information) and printing individual bytes (requires more effort to deduce the mismatched bit(s)). All fields in the VMCS config are either 32-bit or 64-bit values, i.e. in many cases, printing 32-bit values will be 100% precise, and in the others it's close enough, especially when considering that MSR values are split into EDX:EAX anyways. E.g. 
on mismatched CET entry/exit controls, KVM will print: kvm_intel: VMCS config on CPU 0 doesn't match reference config: Offset 76 REF = 0x107fffff, CPU0 = 0x007fffff, mismatch = 0x10000000 Offset 84 REF = 0x0010f3ff, CPU0 = 0x0000f3ff, mismatch = 0x00100000 Opportunistically tweak the wording on the initial error message to say "mismatch" instead of "inconsistent", as the VMCS config itself isn't inconsistent, and the wording conflates the cross-CPU compatibility check with the error_on_inconsistent_vmcs_config knob that treats inconsistent VMCS configurations as errors (e.g. if a CPU supports CET entry controls but no CET exit controls). Cc: Jim Mattson Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7d373e32ea9c..700a8c47b4ca 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2962,8 +2962,22 @@ int vmx_check_processor_compat(void) } if (nested) nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { - pr_err("Inconsistent VMCS config on CPU %d\n", cpu); + u32 *gold = (void *)&vmcs_config; + u32 *mine = (void *)&vmcs_conf; + int i; + + BUILD_BUG_ON(sizeof(struct vmcs_config) % sizeof(u32)); + + pr_err("VMCS config on CPU %d doesn't match reference config:\n", cpu); + for (i = 0; i < sizeof(struct vmcs_config) / sizeof(u32); i++) { + if (gold[i] == mine[i]) + continue; + + pr_cont(" Offset %lu REF = 0x%08x, CPU%u = 0x%08x, mismatch = 0x%08x\n", + i * sizeof(u32), gold[i], cpu, mine[i], gold[i] ^ mine[i]); + } return -EIO; } return 0; -- 2.52.0.457.g6b5491de43-goog