From: Dapeng Mi Implement the PMU "world switch" between host perf and guest mediated PMU. When loading guest state, call into perf to switch from host to guest, and then load guest state into hardware, and then reverse those actions when putting guest state. On the KVM side, when loading guest state, zero PERF_GLOBAL_CTRL to ensure all counters are disabled, then load selectors and counters, and finally call into vendor code to load control/status information. While VMX and SVM use different mechanisms to avoid counting host activity while guest controls are loaded, both implementations require PERF_GLOBAL_CTRL to be zeroed when the event selectors are in flux. When putting guest state, reverse the order, and save and zero controls and status prior to saving+zeroing selectors and counters. Defer clearing PERF_GLOBAL_CTRL to vendor code, as only SVM needs to manually clear the MSR; VMX configures PERF_GLOBAL_CTRL to be atomically cleared by the CPU on VM-Exit. Handle the difference in MSR layouts between Intel and AMD by communicating the bases and stride via kvm_pmu_ops. Because KVM requires Intel v4 (and full-width writes) and AMD v2, the MSRs to load/save are constant for a given vendor, i.e. do not vary based on the guest PMU, and do not vary based on host PMU (because KVM will simply disable mediated PMU support if the necessary MSRs are unsupported). Except for retrieving the guest's PERF_GLOBAL_CTRL, which needs to be read before invoking any fastpath handler (spoiler alert), perform the context switch around KVM's inner run loop. State only needs to be synchronized from hardware before KVM can access the software "caches". Note, VMX already grabs the guest's PERF_GLOBAL_CTRL immediately after VM-Exit, as hardware saves value into the VMCS. Co-developed-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Co-developed-by: Sandipan Das Signed-off-by: Sandipan Das Signed-off-by: Dapeng Mi Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-pmu-ops.h | 2 + arch/x86/include/asm/msr-index.h | 1 + arch/x86/kvm/pmu.c | 111 +++++++++++++++++++++++++ arch/x86/kvm/pmu.h | 10 +++ arch/x86/kvm/svm/pmu.c | 34 ++++++++ arch/x86/kvm/svm/svm.c | 3 + arch/x86/kvm/vmx/pmu_intel.c | 45 ++++++++++ arch/x86/kvm/x86.c | 4 + 8 files changed, 210 insertions(+) diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index ad2cc82abf79..f0aa6996811f 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -24,6 +24,8 @@ KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl) +KVM_X86_PMU_OP(mediated_load) +KVM_X86_PMU_OP(mediated_put) #undef KVM_X86_PMU_OP #undef KVM_X86_PMU_OP_OPTIONAL diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 06bd7aaae931..0f50a74797a4 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1171,6 +1171,7 @@ #define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e #define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 +#define MSR_CORE_PERF_GLOBAL_STATUS_SET 0x00000391 #define MSR_PERF_METRICS 0x00000329 diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b4c6a7704a01..77042cad3155 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -1234,3 +1234,114 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) kfree(filter); return r; } + +static __always_inline u32 fixed_counter_msr(u32 idx) +{ + return kvm_pmu_ops.FIXED_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE; +} + +static __always_inline u32 gp_counter_msr(u32 idx) +{ + return kvm_pmu_ops.GP_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE; +} + +static __always_inline u32 gp_eventsel_msr(u32 idx) +{ + return kvm_pmu_ops.GP_EVENTSEL_BASE + idx * kvm_pmu_ops.MSR_STRIDE; +} + +static void kvm_pmu_load_guest_pmcs(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 i; + + /* + * No need to zero out unexposed GP/fixed counters/selectors since RDPMC + * is intercepted if hardware has counters that aren't visible to the + * guest (KVM will inject #GP as appropriate). + */ + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + pmc = &pmu->gp_counters[i]; + + wrmsrl(gp_counter_msr(i), pmc->counter); + wrmsrl(gp_eventsel_msr(i), pmc->eventsel_hw); + } + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + + wrmsrl(fixed_counter_msr(i), pmc->counter); + } +} + +void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu) +{ + if (!kvm_vcpu_has_mediated_pmu(vcpu) || + KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) + return; + + lockdep_assert_irqs_disabled(); + + perf_load_guest_context(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTPC)); + + /* + * Disable all counters before loading event selectors and PMCs so that + * KVM doesn't enable or load guest counters while host events are + * active. VMX will enable/disabled counters at VM-Enter/VM-Exit by + * atomically loading PERF_GLOBAL_CONTROL. SVM effectively performs + * the switch by configuring all events to be GUEST_ONLY. + */ + wrmsrl(kvm_pmu_ops.PERF_GLOBAL_CTRL, 0); + + kvm_pmu_load_guest_pmcs(vcpu); + + kvm_pmu_call(mediated_load)(vcpu); +} + +static void kvm_pmu_put_guest_pmcs(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 i; + + /* + * Clear selectors and counters to ensure hardware doesn't count using + * guest controls when the host (perf) restores its state. + */ + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + pmc = &pmu->gp_counters[i]; + + pmc->counter = rdpmc(i); + if (pmc->counter) + wrmsrl(gp_counter_msr(i), 0); + if (pmc->eventsel_hw) + wrmsrl(gp_eventsel_msr(i), 0); + } + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + + pmc->counter = rdpmc(INTEL_PMC_FIXED_RDPMC_BASE | i); + if (pmc->counter) + wrmsrl(fixed_counter_msr(i), 0); + } +} + +void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu) +{ + if (!kvm_vcpu_has_mediated_pmu(vcpu) || + KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) + return; + + lockdep_assert_irqs_disabled(); + + /* + * Defer handling of PERF_GLOBAL_CTRL to vendor code. On Intel, it's + * atomically cleared on VM-Exit, i.e. doesn't need to be clear here. + */ + kvm_pmu_call(mediated_put)(vcpu); + + kvm_pmu_put_guest_pmcs(vcpu); + + perf_put_guest_context(); +} diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 1c9d26d60a60..e2e2d8476a3f 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -38,11 +38,19 @@ struct kvm_pmu_ops { void (*cleanup)(struct kvm_vcpu *vcpu); bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu); + void (*mediated_load)(struct kvm_vcpu *vcpu); + void (*mediated_put)(struct kvm_vcpu *vcpu); void (*write_global_ctrl)(u64 global_ctrl); const u64 EVENTSEL_EVENT; const int MAX_NR_GP_COUNTERS; const int MIN_NR_GP_COUNTERS; + + const u32 PERF_GLOBAL_CTRL; + const u32 GP_EVENTSEL_BASE; + const u32 GP_COUNTER_BASE; + const u32 FIXED_COUNTER_BASE; + const u32 MSR_STRIDE; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); @@ -240,6 +248,8 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); +void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu); +void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu); bool is_vmware_backdoor_pmc(u32 pmc_idx); bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index a5e70a4e7647..c03720b30785 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -233,6 +233,32 @@ static bool amd_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pm return host_pmu->version >= 2; } +static void amd_mediated_pmu_load(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 global_status; + + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, global_status); + /* Clear host global_status MSR if non-zero. */ + if (global_status) + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, global_status); + + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, pmu->global_status); + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, pmu->global_ctrl); +} + +static void amd_mediated_pmu_put(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, pmu->global_status); + + /* Clear global status bits if non-zero */ + if (pmu->global_status) + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, pmu->global_status); +} + struct kvm_pmu_ops amd_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, @@ -244,8 +270,16 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .init = amd_pmu_init, .is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported, + .mediated_load = amd_mediated_pmu_load, + .mediated_put = amd_mediated_pmu_put, .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS, .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, + + .PERF_GLOBAL_CTRL = MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + .GP_EVENTSEL_BASE = MSR_F15H_PERF_CTL0, + .GP_COUNTER_BASE = MSR_F15H_PERF_CTR0, + .FIXED_COUNTER_BASE = 0, + .MSR_STRIDE = 2, }; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index add50b64256c..ca6f453cc160 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4416,6 +4416,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; + if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL)) + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl); + /* * We need to handle MC intercepts here before the vcpu has a chance to * change the physical cpu diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 6874c522577e..41a845de789e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -786,6 +786,43 @@ static void intel_pmu_write_global_ctrl(u64 global_ctrl) vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, global_ctrl); } + +static void intel_mediated_pmu_load(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 global_status, toggle; + + rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, global_status); + toggle = pmu->global_status ^ global_status; + if (global_status & toggle) + wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle); + if (pmu->global_status & toggle) + wrmsrq(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle); + + wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl_hw); +} + +static void intel_mediated_pmu_put(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + /* MSR_CORE_PERF_GLOBAL_CTRL is already saved at VM-exit. */ + rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); + + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */ + if (pmu->global_status) + wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); + + /* + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and + * also to avoid accidentally enabling fixed counters (based on guest + * state) while running in the host, e.g. when setting global ctrl. + */ + if (pmu->fixed_ctr_ctrl_hw) + wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); +} + + struct kvm_pmu_ops intel_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, @@ -799,9 +836,17 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .cleanup = intel_pmu_cleanup, .is_mediated_pmu_supported = intel_pmu_is_mediated_pmu_supported, + .mediated_load = intel_mediated_pmu_load, + .mediated_put = intel_mediated_pmu_put, .write_global_ctrl = intel_pmu_write_global_ctrl, .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS, .MIN_NR_GP_COUNTERS = 1, + + .PERF_GLOBAL_CTRL = MSR_CORE_PERF_GLOBAL_CTRL, + .GP_EVENTSEL_BASE = MSR_P6_EVNTSEL0, + .GP_COUNTER_BASE = MSR_IA32_PMC0, + .FIXED_COUNTER_BASE = MSR_CORE_PERF_FIXED_CTR0, + .MSR_STRIDE = 1, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b8014435c988..7fb94ef64e18 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10906,6 +10906,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) run_flags |= KVM_RUN_LOAD_DEBUGCTL; vcpu->arch.host_debugctl = debug_ctl; + kvm_mediated_pmu_load(vcpu); + guest_timing_enter_irqoff(); for (;;) { @@ -10936,6 +10938,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; } + kvm_mediated_pmu_put(vcpu); + /* * Do this here before restoring debug registers on the host. And * since we do this before handling the vmexit, a DR access vmexit -- 2.50.1.565.gc32cd1483b-goog