Move the call to nested_vmx_setup_ctls_msrs() from vmx_hardware_setup() to nested_vmx_hardware_setup() so that the nested code can deal with ordering dependencies without having to straddle vmx_hardware_setup() and nested_vmx_hardware_setup(). Specifically, an upcoming change will sanitize the vmcs12 fields based on hardware support, and that code needs to run _before_ the MSRs are configured, because the lovely vmcs_enum MSR depends on the max supported vmcs12 field. No functional change intended. Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6137e5307d0f..61113ead3d7b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7407,6 +7407,8 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; + nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); + if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6b96f7aea20b..5bb67566e43a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8670,8 +8670,6 @@ __init int vmx_hardware_setup(void) * can hide/show features based on kvm_cpu_cap_has(). */ if (nested) { - nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); - r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); if (r) return r; -- 2.52.0.457.g6b5491de43-goog Add a wrapper macro, ENC_TO_VMCS12_IDX(), to get a vmcs12 index given a field encoding in anticipation of adding a macro to get from a vmcs12 index back to the field encoding. And because open coding ROL16(n, 6) everywhere is gross. No functional change intended. 
Suggested-by: Xiaoyao Li Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/hyperv_evmcs.c | 2 +- arch/x86/kvm/vmx/hyperv_evmcs.h | 2 +- arch/x86/kvm/vmx/vmcs.h | 1 + arch/x86/kvm/vmx/vmcs12.c | 4 ++-- arch/x86/kvm/vmx/vmcs12.h | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.c b/arch/x86/kvm/vmx/hyperv_evmcs.c index 904bfcd1519b..cc728c9a3de5 100644 --- a/arch/x86/kvm/vmx/hyperv_evmcs.c +++ b/arch/x86/kvm/vmx/hyperv_evmcs.c @@ -7,7 +7,7 @@ #include "hyperv_evmcs.h" #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) -#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ +#define EVMCS1_FIELD(number, name, clean_field)[ENC_TO_VMCS12_IDX(number)] = \ {EVMCS1_OFFSET(name), clean_field} const struct evmcs_field vmcs_field_to_evmcs_1[] = { diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h index 6536290f4274..fc7c4e7bd1bf 100644 --- a/arch/x86/kvm/vmx/hyperv_evmcs.h +++ b/arch/x86/kvm/vmx/hyperv_evmcs.h @@ -130,7 +130,7 @@ static __always_inline int evmcs_field_offset(unsigned long field, u16 *clean_field) { const struct evmcs_field *evmcs_field; - unsigned int index = ROL16(field, 6); + unsigned int index = ENC_TO_VMCS12_IDX(field); if (unlikely(index >= nr_evmcs_1_fields)) return -ENOENT; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index b25625314658..9aa204c87661 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -12,6 +12,7 @@ #include "capabilities.h" #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) +#define ENC_TO_VMCS12_IDX(enc) ROL16(enc, 6) struct vmcs_hdr { u32 revision_id:31; diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 4233b5ca9461..c2ac9e1a50b3 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -4,10 +4,10 @@ #include "vmcs12.h" #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) -#define FIELD(number, name) [ROL16(number, 6)] = 
VMCS12_OFFSET(name) +#define FIELD(number, name) [ENC_TO_VMCS12_IDX(number)] = VMCS12_OFFSET(name) #define FIELD64(number, name) \ FIELD(number, name), \ - [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) + [ENC_TO_VMCS12_IDX(number##_HIGH)] = VMCS12_OFFSET(name) + sizeof(u32) const unsigned short vmcs12_field_offsets[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 4ad6b16525b9..7a5fdd9b27ba 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -385,7 +385,7 @@ static inline short get_vmcs12_field_offset(unsigned long field) if (field >> 15) return -ENOENT; - index = ROL16(field, 6); + index = ENC_TO_VMCS12_IDX(field); if (index >= nr_vmcs12_fields) return -ENOENT; -- 2.52.0.457.g6b5491de43-goog Disallow access (VMREAD/VMWRITE), both emulated and via a shadow VMCS, to VMCS fields that the loaded incarnation of KVM doesn't support, e.g. due to lack of hardware support, as a middle ground between allowing access to any vmcs12 field defined by KVM (current behavior) and gating access based on the userspace-defined vCPU model (the most functionally correct, but very costly, implementation). Disallowing access to unsupported fields helps a tiny bit in terms of closing the virtualization hole (see below), but the main motivation is to avoid having to weed out unsupported fields when synchronizing between vmcs12 and a shadow VMCS. Because shadow VMCS accesses are done via VMREAD and VMWRITE, KVM _must_ filter out unsupported fields (or eat VMREAD/VMWRITE failures), and filtering out just shadow VMCS fields is about the same amount of effort, and arguably much more confusing. 
As a bonus, this also fixes a KVM-Unit-Test failure bug when running on _hardware_ without support for TSC Scaling, which fails with the same signature as the bug fixed by commit ba1f82456ba8 ("KVM: nVMX: Dynamically compute max VMCS index for vmcs12"): FAIL: VMX_VMCS_ENUM.MAX_INDEX expected: 19, actual: 17 Dynamically computing the max VMCS index only resolved the issue where KVM was hardcoding max index, but for CPUs with TSC Scaling, that was "good enough". Reviewed-by: Chao Gao Reviewed-by: Xin Li Cc: Yosry Ahmed Link: https://lore.kernel.org/all/20251026201911.505204-22-xin@zytor.com Link: https://lore.kernel.org/all/YR2Tf9WPNEzrE7Xg@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 15 +++++---- arch/x86/kvm/vmx/vmcs.h | 8 +++++ arch/x86/kvm/vmx/vmcs12.c | 70 +++++++++++++++++++++++++++++++++++++-- arch/x86/kvm/vmx/vmcs12.h | 6 ++-- 4 files changed, 89 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 61113ead3d7b..ac7a17560c8f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -111,6 +111,9 @@ static void init_vmcs_shadow_fields(void) field <= GUEST_TR_AR_BYTES, "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); + if (get_vmcs12_field_offset(field) < 0) + continue; + /* * PML and the preemption timer can be emulated, but the * processor cannot vmwrite to fields that don't exist @@ -7074,12 +7077,6 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void) } } -/* - * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo - * that madness to get the encoding for comparison. - */ -#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) - static u64 nested_vmx_calc_vmcs_enum_msr(void) { /* @@ -7407,6 +7404,12 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; + /* + * Note! The set of supported vmcs12 fields is consumed by both VMX + * MSR and shadow VMCS setup. 
+ */ + nested_vmx_setup_vmcs12_fields(); + nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); if (!cpu_has_vmx_shadow_vmcs()) diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 9aa204c87661..66d747e265b1 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -11,7 +11,15 @@ #include "capabilities.h" +/* + * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6 as a very + * rudimentary compression of the range of indices. The compression ratio is + * good enough to allow KVM to use a (very sparsely populated) array without + * wasting too much memory, while the "algorithm" is fast enough to be used to + * lookup vmcs12 fields on-demand, e.g. for emulation. + */ #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) +#define VMCS12_IDX_TO_ENC(idx) ROL16(idx, 10) #define ENC_TO_VMCS12_IDX(enc) ROL16(enc, 6) struct vmcs_hdr { diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index c2ac9e1a50b3..1ebe67c384ad 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -9,7 +9,7 @@ FIELD(number, name), \ [ENC_TO_VMCS12_IDX(number##_HIGH)] = VMCS12_OFFSET(name) + sizeof(u32) -const unsigned short vmcs12_field_offsets[] = { +static const u16 kvm_supported_vmcs12_field_offsets[] __initconst = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), FIELD(POSTED_INTR_NV, posted_intr_nv), FIELD(GUEST_ES_SELECTOR, guest_es_selector), @@ -158,4 +158,70 @@ const unsigned short vmcs12_field_offsets[] = { FIELD(HOST_SSP, host_ssp), FIELD(HOST_INTR_SSP_TABLE, host_ssp_tbl), }; -const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets); + +u16 vmcs12_field_offsets[ARRAY_SIZE(kvm_supported_vmcs12_field_offsets)] __ro_after_init; +unsigned int nr_vmcs12_fields __ro_after_init; + +#define VMCS12_CASE64(enc) case enc##_HIGH: case enc + +static __init bool cpu_has_vmcs12_field(unsigned int idx) +{ + switch (VMCS12_IDX_TO_ENC(idx)) { + case VIRTUAL_PROCESSOR_ID: + 
return cpu_has_vmx_vpid(); + case POSTED_INTR_NV: + return cpu_has_vmx_posted_intr(); + VMCS12_CASE64(TSC_MULTIPLIER): + return cpu_has_vmx_tsc_scaling(); + case TPR_THRESHOLD: + VMCS12_CASE64(VIRTUAL_APIC_PAGE_ADDR): + return cpu_has_vmx_tpr_shadow(); + VMCS12_CASE64(APIC_ACCESS_ADDR): + return cpu_has_vmx_virtualize_apic_accesses(); + VMCS12_CASE64(POSTED_INTR_DESC_ADDR): + return cpu_has_vmx_posted_intr(); + case GUEST_INTR_STATUS: + return cpu_has_vmx_virtual_intr_delivery(); + VMCS12_CASE64(VM_FUNCTION_CONTROL): + VMCS12_CASE64(EPTP_LIST_ADDRESS): + return cpu_has_vmx_vmfunc(); + VMCS12_CASE64(EPT_POINTER): + return cpu_has_vmx_ept(); + VMCS12_CASE64(XSS_EXIT_BITMAP): + return cpu_has_vmx_xsaves(); + VMCS12_CASE64(ENCLS_EXITING_BITMAP): + return cpu_has_vmx_encls_vmexit(); + VMCS12_CASE64(GUEST_IA32_PERF_GLOBAL_CTRL): + VMCS12_CASE64(HOST_IA32_PERF_GLOBAL_CTRL): + return cpu_has_load_perf_global_ctrl(); + case SECONDARY_VM_EXEC_CONTROL: + return cpu_has_secondary_exec_ctrls(); + case GUEST_S_CET: + case GUEST_SSP: + case GUEST_INTR_SSP_TABLE: + case HOST_S_CET: + case HOST_SSP: + case HOST_INTR_SSP_TABLE: + return cpu_has_load_cet_ctrl(); + + /* KVM always emulates PML and the VMX preemption timer in software. 
*/ + case GUEST_PML_INDEX: + case VMX_PREEMPTION_TIMER_VALUE: + default: + return true; + } +} + +void __init nested_vmx_setup_vmcs12_fields(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(kvm_supported_vmcs12_field_offsets); i++) { + if (!kvm_supported_vmcs12_field_offsets[i] || + !cpu_has_vmcs12_field(i)) + continue; + + vmcs12_field_offsets[i] = kvm_supported_vmcs12_field_offsets[i]; + nr_vmcs12_fields = i + 1; + } +} diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 7a5fdd9b27ba..21cd1b75e4fd 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -374,8 +374,10 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(guest_pml_index, 996); } -extern const unsigned short vmcs12_field_offsets[]; -extern const unsigned int nr_vmcs12_fields; +extern u16 vmcs12_field_offsets[] __ro_after_init; +extern unsigned int nr_vmcs12_fields __ro_after_init; + +void __init nested_vmx_setup_vmcs12_fields(void); static inline short get_vmcs12_field_offset(unsigned long field) { -- 2.52.0.457.g6b5491de43-goog Drop KVM's filtering of GUEST_INTR_STATUS when generating the shadow VMCS bitmap now that KVM drops GUEST_INTR_STATUS from the set of supported vmcs12 fields if the field isn't supported by hardware, and initialization of the shadow VMCS fields omits unsupported vmcs12 fields. Note, there is technically a small functional change here, as the vmcs12 filtering only requires support for Virtual Interrupt Delivery, whereas the shadow VMCS code being removed required "full" APICv support, i.e. required Virtual Interrupt Delivery *and* APIC Register Virtualization *and* Posted Interrupt support. Opportunistically tweak the comment to more precisely explain why the PML and VMX preemption timer fields need to be explicitly checked. 
Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ac7a17560c8f..3ef4d7ab5723 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -115,9 +115,10 @@ static void init_vmcs_shadow_fields(void) continue; /* - * PML and the preemption timer can be emulated, but the - * processor cannot vmwrite to fields that don't exist - * on bare metal. + * KVM emulates PML and the VMX preemption timer irrespective + * of hardware support, but shadowing their related VMCS fields + * requires hardware support as the CPU will reject VMWRITEs to + * fields that don't exist. */ switch (field) { case GUEST_PML_INDEX: @@ -128,10 +129,6 @@ static void init_vmcs_shadow_fields(void) if (!cpu_has_vmx_preemption_timer()) continue; break; - case GUEST_INTR_STATUS: - if (!cpu_has_vmx_apicv()) - continue; - break; default: break; } -- 2.52.0.457.g6b5491de43-goog