Make use of the extended Mediated PMU framework to introduce support for
PMC virtualization (X86_FEATURE_PERFCTR_VIRT). Since the guest PMU state
is saved in the VMCB State Save Area, extend the MSR access PMU ops to
handle host-initiated requests from the code responsible for event
filtering and for incrementing counters due to instruction emulation.

The underlying implementation depends on the availability of either VNMI
or AVIC for guest interrupt delivery. Synthesized overflows, such as
those resulting from incrementing counters in software due to instruction
emulation, are still injected through the KVM_REQ_PMI path.

If both VNMI and AVIC are enabled, the hardware automatically chooses
AVIC. The advantage of using AVIC is that it lets the guest change the
delivery mode in the LVTPC. Unlike AVIC, VNMI ignores the LVTPC, as the
APIC is emulated, and always presents the overflow interrupts as NMIs.
This is demonstrated by some failures in KUT's x86/pmu test.

The feature is enabled by default if the host supports it and Mediated
PMU is enabled, but it can also be toggled manually using the new "vpmc"
parameter of the kvm_amd module.

Signed-off-by: Sandipan Das
---
Note: an illustrative guest-side sketch of the counter programming that
PMC virtualization accelerates is appended after the patch.

 arch/x86/include/asm/svm.h |   1 +
 arch/x86/kvm/svm/pmu.c     | 100 +++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.c     |  52 +++++++++++++++++++
 arch/x86/kvm/svm/svm.h     |   1 +
 4 files changed, 154 insertions(+)

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index a80df935b580..adc30a9f950f 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -221,6 +221,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 
 #define LBR_CTL_ENABLE_MASK BIT_ULL(0)
 #define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1)
+#define PMC_VIRTUALIZATION_ENABLE_MASK BIT_ULL(3)
 
 #define SVM_INTERRUPT_SHADOW_MASK BIT_ULL(0)
 #define SVM_GUEST_INTERRUPT_MASK BIT_ULL(1)
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 8a32e1a9c07d..63d177df4daf 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -124,12 +124,50 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
 	return amd_msr_idx_to_pmc(vcpu, msr);
 }
 
+static int amd_virtualized_pmu_get_msr(struct kvm_vcpu *vcpu,
+				       struct msr_data *msr_info)
+{
+	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
+	u32 msr = msr_info->index;
+
+	/* MSR_PERF_CNTR_GLOBAL_* */
+	switch (msr) {
+	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+		msr_info->data = save->perf_cntr_global_status;
+		return 0;
+	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+		msr_info->data = save->perf_cntr_global_control;
+		return 0;
+	}
+
+	/* MSR_PERFCTRn */
+	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+	if (pmc) {
+		msr_info->data = save->pmc[pmc->idx].perf_ctr;
+		return 0;
+	}
+
+	/* MSR_EVNTSELn */
+	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+	if (pmc) {
+		msr_info->data = save->pmc[pmc->idx].perf_ctl;
+		return 0;
+	}
+
+	return 1;
+}
+
 static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 	struct kvm_pmc *pmc;
 	u32 msr = msr_info->index;
 
+	if (msr_info->host_initiated && kvm_vcpu_has_virtualized_pmu(vcpu))
+		return amd_virtualized_pmu_get_msr(vcpu, msr_info);
+
 	/* MSR_PERFCTRn */
 	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
 	if (pmc) {
@@ -146,6 +184,44 @@ static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	return 1;
 }
 
+static int amd_virtualized_pmu_set_msr(struct kvm_vcpu *vcpu,
+				       struct msr_data *msr_info)
+{
+	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
+	u32 msr = msr_info->index;
+	u64 data = msr_info->data;
+
+	/* MSR_PERF_CNTR_GLOBAL_* */
+	switch (msr) {
+	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+		save->perf_cntr_global_status = data;
+		return 0;
+	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+		save->perf_cntr_global_control = data;
+		return 0;
+	}
+
+	/* MSR_PERFCTRn */
+	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+	if (pmc) {
+		data &= pmc_bitmask(pmc);
+		save->pmc[pmc->idx].perf_ctr = data;
+		return 0;
+	}
+
+	/* MSR_EVNTSELn */
+	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+	if (pmc) {
+		data &= ~pmu->reserved_bits;
+		save->pmc[pmc->idx].perf_ctl = data;
+		return 0;
+	}
+
+	return 1;
+}
+
 static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -153,6 +229,9 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	u32 msr = msr_info->index;
 	u64 data = msr_info->data;
 
+	if (msr_info->host_initiated && kvm_vcpu_has_virtualized_pmu(vcpu))
+		return amd_virtualized_pmu_set_msr(vcpu, msr_info);
+
 	/* MSR_PERFCTRn */
 	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
 	if (pmc) {
@@ -167,6 +246,8 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		pmc->eventsel = data;
 		pmc->eventsel_hw = (data & ~AMD64_EVENTSEL_HOSTONLY) |
 				   AMD64_EVENTSEL_GUESTONLY;
+		if (kvm_vcpu_has_virtualized_pmu(vcpu))
+			pmc->eventsel_hw = data;
 		kvm_pmu_request_counter_reprogram(pmc);
 		}
 		return 0;
@@ -228,6 +309,24 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void amd_pmu_reset(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+	int i;
+
+	if (!kvm_vcpu_has_virtualized_pmu(vcpu))
+		return;
+
+	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+		save->pmc[i].perf_ctl = 0;
+		save->pmc[i].perf_ctr = 0;
+	}
+
+	save->perf_cntr_global_control = 0;
+	save->perf_cntr_global_status = 0;
+}
+
 static bool amd_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pmu)
 {
 	return host_pmu->version >= 2;
 }
@@ -268,6 +367,7 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
 	.set_msr = amd_pmu_set_msr,
 	.refresh = amd_pmu_refresh,
 	.init = amd_pmu_init,
+	.reset = amd_pmu_reset,
 	.is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported,
 
 	.mediated_load = amd_mediated_pmu_load,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 2797c3ab7854..425462f10266 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -180,6 +180,9 @@ module_param(vnmi, bool, 0444);
 
 module_param(enable_mediated_pmu, bool, 0444);
 
+bool vpmc = true;
+module_param(vpmc, bool, 0444);
+
 static bool svm_gp_erratum_intercept = true;
 
 static u8 rsm_ins_bytes[] = "\x0f\xaa";
@@ -1263,6 +1266,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
 	if (vcpu->kvm->arch.bus_lock_detection_enabled)
 		svm_set_intercept(svm, INTERCEPT_BUSLOCK);
 
+	if (vpmc)
+		control->virt_ext |= PMC_VIRTUALIZATION_ENABLE_MASK;
+
 	if (sev_guest(vcpu->kvm))
 		sev_init_vmcb(svm);
 
@@ -3467,6 +3473,30 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
 	       "excp_from:", save->last_excp_from,
 	       "excp_to:", save->last_excp_to);
 
+	if (kvm_vcpu_has_virtualized_pmu(vcpu)) {
+		pr_err("%-15s %016llx %-13s %016llx\n",
+		       "perf_ctl0:", save->pmc[0].perf_ctl,
+		       "perf_ctr0:", save->pmc[0].perf_ctr);
+		pr_err("%-15s %016llx %-13s %016llx\n",
+		       "perf_ctl1:", save->pmc[1].perf_ctl,
+		       "perf_ctr1:", save->pmc[1].perf_ctr);
+		pr_err("%-15s %016llx %-13s %016llx\n",
+		       "perf_ctl2:", save->pmc[2].perf_ctl,
+		       "perf_ctr2:", save->pmc[2].perf_ctr);
+		pr_err("%-15s %016llx %-13s %016llx\n",
+		       "perf_ctl3:", save->pmc[3].perf_ctl,
+		       "perf_ctr3:", save->pmc[3].perf_ctr);
+		pr_err("%-15s %016llx %-13s %016llx\n",
+		       "perf_ctl4:", save->pmc[4].perf_ctl,
+		       "perf_ctr4:", save->pmc[4].perf_ctr);
+		pr_err("%-15s %016llx %-13s %016llx\n",
+		       "perf_ctl5:", save->pmc[5].perf_ctl,
+		       "perf_ctr5:", save->pmc[5].perf_ctr);
+		pr_err("%-15s %016llx %-13s %016llx\n",
+		       "perf_cntr_global_control:", save->perf_cntr_global_control,
+		       "perf_cntr_global_status:", save->perf_cntr_global_status);
+	}
+
 	if (sev_es_guest(vcpu->kvm)) {
 		struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save;
 
@@ -4273,6 +4303,15 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
 
 	amd_clear_divider();
 
+	/*
+	 * The save slot for PerfCntrGlobalCtl is of Swap Type C, which means
+	 * that on VM-Exit, the state of this MSR is reset, i.e. all counter
+	 * enable bits are set. According to the APM, the next VMRUN will fail
+	 * with a VMEXIT_INVALID_PMC error code unless it is cleared.
+	 */
+	if (kvm_vcpu_has_virtualized_pmu(vcpu))
+		wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);
+
 	if (sev_es_guest(vcpu->kvm))
 		__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted,
 				      sev_es_host_save_area(sd));
@@ -5506,6 +5545,19 @@ static __init int svm_hardware_setup(void)
 	if (!enable_pmu)
 		pr_info("PMU virtualization is disabled\n");
 
+	enable_virtualized_pmu = enable_mediated_pmu && kvm_pmu_cap.virtualized;
+
+	/*
+	 * Virtualized PMCs do not raise host interrupts on overflow. Instead,
+	 * they require either VNMI or AVIC as an interrupt delivery mechanism
+	 * for guests.
+	 */
+	vpmc = vpmc && (vnmi || avic) && enable_virtualized_pmu;
+	if (vpmc)
+		pr_info("PMC virtualization supported\n");
+	else
+		enable_virtualized_pmu = false;
+
 	svm_set_cpu_caps();
 
 	/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 58b9d168e0c8..346bbbbd0882 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -51,6 +51,7 @@ extern bool intercept_smi;
 extern bool x2avic_enabled;
 extern bool vnmi;
 extern int lbrv;
+extern bool vpmc;
 
 /*
  * Clean bits in VMCB.
-- 
2.43.0
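
For context, here is a rough guest-side sketch (not part of this patch) of
the counter programming that PMC virtualization services directly from the
VMCB save area instead of through MSR intercepts. It assumes a PerfMonV2
guest running a recent Linux kernel; the MSR and event-select constants are
the standard <asm/msr-index.h> and <asm/perf_event.h> definitions, while the
function name, the workload callback and the event encoding (0xc0, retired
instructions) are purely illustrative.

/* Illustrative only: how a PerfMonV2 guest exercises a virtualized PMC. */
#include <linux/bits.h>
#include <asm/msr.h>
#include <asm/perf_event.h>

static u64 count_retired_instructions(void (*workload)(void))
{
	u64 count;

	/* Program counter 0 to count retired instructions (event 0xc0). */
	wrmsrq(MSR_F15H_PERF_CTL0, ARCH_PERFMON_EVENTSEL_ENABLE |
				   ARCH_PERFMON_EVENTSEL_USR |
				   ARCH_PERFMON_EVENTSEL_OS | 0xc0);
	wrmsrq(MSR_F15H_PERF_CTR0, 0);

	/* PerfMonV2: globally enable counter 0, run the workload, disable. */
	wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, BIT_ULL(0));
	workload();
	wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);

	rdmsrq(MSR_F15H_PERF_CTR0, count);
	return count;
}

On the host side, the new "vpmc" parameter defaults to on, so a plain
"modprobe kvm_amd" picks it up automatically when the hardware and the
Mediated PMU allow it; "modprobe kvm_amd vpmc=0" turns PMC virtualization
off without disabling the Mediated PMU itself.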