Introduce PERF_PMU_CAP_VIRTUALIZED_VPMU as an extension to the existing
PERF_PMU_CAP_MEDIATED_VPMU to indicate support for a hardware-virtualized
PMU, where the guest counter state is automatically saved and restored
during world switches. Pass the new capability on through x86_pmu_cap so
that other entities, such as KVM, can query whether the host supports
hardware PMU virtualization.

Signed-off-by: Sandipan Das
---
 arch/x86/events/core.c            | 1 +
 arch/x86/include/asm/perf_event.h | 1 +
 include/linux/perf_event.h        | 1 +
 3 files changed, 3 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index d58aa316b65a..8a8111762cc5 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -3131,6 +3131,7 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
 	cap->events_mask_len	= x86_pmu.events_mask_len;
 	cap->pebs_ept		= x86_pmu.pebs_ept;
 	cap->mediated		= !!(pmu.capabilities & PERF_PMU_CAP_MEDIATED_VPMU);
+	cap->virtualized	= !!(pmu.capabilities & PERF_PMU_CAP_VIRTUALIZED_VPMU);
 }
 EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 74db361a53d3..6c7d3b7623ad 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -293,6 +293,7 @@ struct x86_pmu_capability {
 	int		events_mask_len;
 	unsigned int	pebs_ept	:1;
 	unsigned int	mediated	:1;
+	unsigned int	virtualized	:1;
 };

 /*
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3a9bd9c4c90e..3d7bec2b918d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -306,6 +306,7 @@ struct perf_event_pmu_context;
 #define PERF_PMU_CAP_AUX_PAUSE		0x0200
 #define PERF_PMU_CAP_AUX_PREFER_LARGE	0x0400
 #define PERF_PMU_CAP_MEDIATED_VPMU	0x0800
+#define PERF_PMU_CAP_VIRTUALIZED_VPMU	0x1000

 /**
  * pmu::scope
--
2.43.0

Define a feature flag for bit 8 of CPUID leaf 0x8000000A EDX, which
indicates support for virtualization of the core performance monitoring
counters. When this feature is available, a hypervisor can rely on
hardware to save and restore the guest counter state when exiting from
or entering guest context.

Signed-off-by: Sandipan Das
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 286d509f9363..0a81e9631234 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -370,6 +370,7 @@
 #define X86_FEATURE_VMCBCLEAN		(15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
 #define X86_FEATURE_FLUSHBYASID	(15*32+ 6) /* "flushbyasid" Flush-by-ASID support */
 #define X86_FEATURE_DECODEASSISTS	(15*32+ 7) /* "decodeassists" Decode Assists support */
+#define X86_FEATURE_PERFCTR_VIRT	(15*32+ 8) /* "perfctr_virt" PMC virtualization support */
 #define X86_FEATURE_PAUSEFILTER	(15*32+10) /* "pausefilter" Filtered pause intercept */
 #define X86_FEATURE_PFTHRESHOLD	(15*32+12) /* "pfthreshold" Pause filter threshold */
 #define X86_FEATURE_AVIC		(15*32+13) /* "avic" Virtual Interrupt Controller */
--
2.43.0

Set the PERF_PMU_CAP_VIRTUALIZED_VPMU flag for processors that support
X86_FEATURE_PERFCTR_VIRT.
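
For reference, once the flag is set, a consumer such as KVM can detect the
capability along these lines (a minimal sketch; host_has_hw_vpmu() is a
hypothetical wrapper used only for illustration, not part of this series):

	static bool host_has_hw_vpmu(void)
	{
		struct x86_pmu_capability cap;

		/* Fills in the capability bits exported by the core PMU driver. */
		perf_get_x86_pmu_capability(&cap);

		/* Non-zero only when PERF_PMU_CAP_VIRTUALIZED_VPMU is advertised. */
		return cap.virtualized;
	}
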
Signed-off-by: Sandipan Das
---
 arch/x86/events/amd/core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 8179fb5f1ee3..0cd2f54778c0 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -1443,6 +1443,9 @@ static int __init amd_core_pmu_init(void)
 		static_call_update(amd_pmu_test_overflow, amd_pmu_test_overflow_status);
 	}

+	if (cpu_feature_enabled(X86_FEATURE_PERFCTR_VIRT))
+		x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_VIRTUALIZED_VPMU;
+
 	/*
 	 * AMD Core perfctr has separate MSRs for the NB events, see
 	 * the amd/uncore.c driver.
--
2.43.0

Extend the Mediated PMU framework to support hardware-virtualized PMUs.
The key differences from the Mediated PMU are listed below.

* Hardware saves and restores the guest PMU state on world switches.

* The guest PMU state is saved in vendor-specific structures (such as the
  VMCB or VMCS) instead of struct kvm_pmu.

* Hardware relies on interrupt virtualization (such as VNMI or AVIC) to
  notify guests about counter overflows instead of receiving interrupts in
  host context after switching the delivery mode in the LVTPC and then
  injecting them back into the guest (KVM_REQ_PMI).

Parts of the original PMU load and put functionality are reused, as the
active host events still need to be scheduled in and out in preparation
for world switches.

Event filtering and instruction emulation require the ability to change
the guest PMU state in software. Since struct kvm_pmu no longer holds the
authoritative state, use host-initiated MSR accesses to read and write the
MSR state directly in the vendor-specific structures.

RDPMC is intercepted for legacy guests that do not have access to all
counters. Host-initiated MSR accesses are used in that case as well to
read the latest counter value from the vendor-specific structures.
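
In essence, the host-initiated access path boils down to something like the
following (a sketch only; the hypothetical pmu_read_vendor_state() here
corresponds to the kvm_pmu_get_msr_state() helper added below):

	static u64 pmu_read_vendor_state(struct kvm_vcpu *vcpu, u32 index)
	{
		struct msr_data msr_info = {
			.index		= index,	/* e.g. a general purpose counter MSR */
			.host_initiated	= true,		/* answered from the VMCB/VMCS-backed state */
		};

		/* A failure here indicates a KVM bug, not a guest error. */
		KVM_BUG_ON(kvm_pmu_call(get_msr)(vcpu, &msr_info), vcpu->kvm);

		return msr_info.data;
	}
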
Signed-off-by: Sandipan Das
---
 arch/x86/kvm/pmu.c           | 94 +++++++++++++++++++++++++++++-------
 arch/x86/kvm/pmu.h           |  6 +++
 arch/x86/kvm/svm/pmu.c       |  1 +
 arch/x86/kvm/vmx/pmu_intel.c |  1 +
 arch/x86/kvm/x86.c           |  4 ++
 arch/x86/kvm/x86.h           |  1 +
 6 files changed, 89 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 0e5048ae86fa..1453fb3a60a2 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -168,6 +168,43 @@ void kvm_handle_guest_mediated_pmi(void)
 	kvm_make_request(KVM_REQ_PMI, vcpu);
 }

+static __always_inline u32 fixed_counter_msr(u32 idx)
+{
+	return kvm_pmu_ops.FIXED_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE;
+}
+
+static __always_inline u32 gp_counter_msr(u32 idx)
+{
+	return kvm_pmu_ops.GP_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE;
+}
+
+static __always_inline u32 gp_eventsel_msr(u32 idx)
+{
+	return kvm_pmu_ops.GP_EVENTSEL_BASE + idx * kvm_pmu_ops.MSR_STRIDE;
+}
+
+static void kvm_pmu_get_msr_state(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+	struct msr_data msr_info;
+
+	msr_info.index = index;
+	msr_info.host_initiated = true;
+
+	KVM_BUG_ON(kvm_pmu_call(get_msr)(vcpu, &msr_info), vcpu->kvm);
+	*data = msr_info.data;
+}
+
+static void kvm_pmu_set_msr_state(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+	struct msr_data msr_info;
+
+	msr_info.data = data;
+	msr_info.index = index;
+	msr_info.host_initiated = true;
+
+	KVM_BUG_ON(kvm_pmu_call(set_msr)(vcpu, &msr_info), vcpu->kvm);
+}
+
 static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
 {
 	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -520,19 +557,22 @@ static bool pmc_is_event_allowed(struct kvm_pmc *pmc)

 static void kvm_mediated_pmu_refresh_event_filter(struct kvm_pmc *pmc)
 {
-	bool allowed = pmc_is_event_allowed(pmc);
 	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+	struct kvm_vcpu *vcpu = pmc->vcpu;

 	if (pmc_is_gp(pmc)) {
 		pmc->eventsel_hw &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-		if (allowed)
+		if (pmc_is_event_allowed(pmc))
 			pmc->eventsel_hw |= pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
+
+		if (kvm_vcpu_has_virtualized_pmu(vcpu))
+			kvm_pmu_set_msr_state(vcpu, gp_eventsel_msr(pmc->idx), pmc->eventsel_hw);
 	} else {
 		u64 mask = intel_fixed_bits_by_idx(pmc->idx - KVM_FIXED_PMC_BASE_IDX, 0xf);

 		pmu->fixed_ctr_ctrl_hw &= ~mask;
-		if (allowed)
+		if (pmc_is_event_allowed(pmc))
 			pmu->fixed_ctr_ctrl_hw |= pmu->fixed_ctr_ctrl & mask;
 	}
 }
@@ -740,6 +780,9 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
 	    kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
 		return 1;

+	if (kvm_vcpu_has_virtualized_pmu(pmc->vcpu))
+		kvm_pmu_get_msr_state(pmc->vcpu, gp_counter_msr(pmc->idx), &pmc->counter);
+
 	*data = pmc_read_counter(pmc) & mask;
 	return 0;
 }
@@ -974,6 +1017,9 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
 	    (kvm_pmu_has_perf_global_ctrl(pmu) || kvm_vcpu_has_mediated_pmu(vcpu)))
 		pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);

+	if (kvm_vcpu_has_virtualized_pmu(vcpu))
+		kvm_pmu_set_msr_state(vcpu, kvm_pmu_ops.PERF_GLOBAL_CTRL, pmu->global_ctrl);
+
 	if (kvm_vcpu_has_mediated_pmu(vcpu))
 		kvm_pmu_call(write_global_ctrl)(pmu->global_ctrl);

@@ -1099,6 +1145,11 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu,
 	if (bitmap_empty(event_pmcs, X86_PMC_IDX_MAX))
 		return;

+	if (kvm_vcpu_has_virtualized_pmu(vcpu)) {
+		kvm_pmu_get_msr_state(vcpu, kvm_pmu_ops.PERF_GLOBAL_CTRL, &pmu->global_ctrl);
+		kvm_pmu_get_msr_state(vcpu, kvm_pmu_ops.PERF_GLOBAL_STATUS, &pmu->global_status);
+	}
+
 	if (!kvm_pmu_has_perf_global_ctrl(pmu))
 		bitmap_copy(bitmap, event_pmcs, X86_PMC_IDX_MAX);
 	else if (!bitmap_and(bitmap, event_pmcs,
@@ -1107,11 +1158,21 @@ static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu,

 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	kvm_for_each_pmc(pmu, pmc, i, bitmap) {
+		if (kvm_vcpu_has_virtualized_pmu(vcpu))
+			kvm_pmu_get_msr_state(vcpu, gp_counter_msr(pmc->idx), &pmc->counter);
+
 		if (!pmc_is_event_allowed(pmc) || !cpl_is_matched(pmc))
 			continue;

 		kvm_pmu_incr_counter(pmc);
+
+		if (kvm_vcpu_has_virtualized_pmu(vcpu))
+			kvm_pmu_set_msr_state(vcpu, gp_counter_msr(pmc->idx), pmc->counter);
 	}
+
+	if (kvm_vcpu_has_virtualized_pmu(vcpu))
+		kvm_pmu_set_msr_state(vcpu, kvm_pmu_ops.PERF_GLOBAL_STATUS, pmu->global_status);
+
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 }

@@ -1270,21 +1331,6 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
 	return r;
 }

-static __always_inline u32 fixed_counter_msr(u32 idx)
-{
-	return kvm_pmu_ops.FIXED_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE;
-}
-
-static __always_inline u32 gp_counter_msr(u32 idx)
-{
-	return kvm_pmu_ops.GP_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE;
-}
-
-static __always_inline u32 gp_eventsel_msr(u32 idx)
-{
-	return kvm_pmu_ops.GP_EVENTSEL_BASE + idx * kvm_pmu_ops.MSR_STRIDE;
-}
-
 static void kvm_pmu_load_guest_pmcs(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -1319,6 +1365,12 @@ void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu)

 	lockdep_assert_irqs_disabled();

+	/* Guest PMU state is restored by hardware at VM-Entry */
+	if (kvm_vcpu_has_virtualized_pmu(vcpu)) {
+		perf_load_guest_context(0);
+		return;
+	}
+
 	perf_load_guest_context(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTPC));

 	/*
@@ -1372,6 +1424,12 @@ void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu)

 	lockdep_assert_irqs_disabled();

+	/* Guest PMU state is saved by hardware at VM-Exit */
+	if (kvm_vcpu_has_virtualized_pmu(vcpu)) {
+		perf_put_guest_context();
+		return;
+	}
+
 	/*
 	 * Defer handling of PERF_GLOBAL_CTRL to vendor code. On Intel, it's
 	 * atomically cleared on VM-Exit, i.e. doesn't need to be clear here.
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index a0cd42cbea9d..55f0679b522d 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -47,6 +47,7 @@ struct kvm_pmu_ops {
 	const int MIN_NR_GP_COUNTERS;

 	const u32 PERF_GLOBAL_CTRL;
+	const u32 PERF_GLOBAL_STATUS;
 	const u32 GP_EVENTSEL_BASE;
 	const u32 GP_COUNTER_BASE;
 	const u32 FIXED_COUNTER_BASE;
@@ -76,6 +77,11 @@ static inline bool kvm_vcpu_has_mediated_pmu(struct kvm_vcpu *vcpu)
 	return enable_mediated_pmu && vcpu_to_pmu(vcpu)->version;
 }

+static inline bool kvm_vcpu_has_virtualized_pmu(struct kvm_vcpu *vcpu)
+{
+	return enable_virtualized_pmu && vcpu_to_pmu(vcpu)->version;
+}
+
 /*
  * KVM tracks all counters in 64-bit bitmaps, with general purpose counters
  * mapped to bits 31:0 and fixed counters mapped to 63:32, e.g. fixed counter 0
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index c03720b30785..8a32e1a9c07d 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -278,6 +278,7 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
 	.MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS,

 	.PERF_GLOBAL_CTRL = MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
+	.PERF_GLOBAL_STATUS = MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
 	.GP_EVENTSEL_BASE = MSR_F15H_PERF_CTL0,
 	.GP_COUNTER_BASE = MSR_F15H_PERF_CTR0,
 	.FIXED_COUNTER_BASE = 0,
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 41a845de789e..9685af27c15c 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -845,6 +845,7 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
 	.MIN_NR_GP_COUNTERS = 1,

 	.PERF_GLOBAL_CTRL = MSR_CORE_PERF_GLOBAL_CTRL,
+	.PERF_GLOBAL_STATUS = MSR_CORE_PERF_GLOBAL_STATUS,
 	.GP_EVENTSEL_BASE = MSR_P6_EVNTSEL0,
 	.GP_COUNTER_BASE = MSR_IA32_PMC0,
 	.FIXED_COUNTER_BASE = MSR_CORE_PERF_FIXED_CTR0,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6bdf7ef0b535..750535a53a30 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -191,6 +191,10 @@ module_param(enable_pmu, bool, 0444);
 bool __read_mostly enable_mediated_pmu;
 EXPORT_SYMBOL_GPL(enable_mediated_pmu);

+/* Enable/disable hardware PMU virtualization. */
+bool __read_mostly enable_virtualized_pmu;
+EXPORT_SYMBOL_GPL(enable_virtualized_pmu);
+
 bool __read_mostly eager_page_split = true;
 module_param(eager_page_split, bool, 0644);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index bd1149768acc..8cca48d1eed7 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -446,6 +446,7 @@ extern struct kvm_host_values kvm_host;

 extern bool enable_pmu;
 extern bool enable_mediated_pmu;
+extern bool enable_virtualized_pmu;

 /*
  * Get a filtered version of KVM's supported XCR0 that strips out dynamic
--
2.43.0

When PMC virtualization (X86_FEATURE_PERFCTR_VIRT) is supported and the
feature is enabled, additional save slots are available in the VMCB for
the following MSRs.

* Performance Counter Global Control (MSR 0xc0000301) (Swap Type C)
* Performance Counter Global Status (MSR 0xc0000300) (Swap Type A)
* Performance Event Select (MSR 0xc0010200..0xc001020a) (Swap Type C)
* Performance Event Counter (MSR 0xc0010201..0xc001020b) (Swap Type C)

Define the additional VMCB fields that will be used by hardware to save
and restore the guest PMU state.

Signed-off-by: Sandipan Das
---
 arch/x86/include/asm/svm.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index ffc27f676243..a80df935b580 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -327,7 +327,12 @@ struct vmcb_save_area {
 	u8 cpl;
 	u8 reserved_0xcc[4];
 	u64 efer;
-	u8 reserved_0xd8[112];
+	u8 reserved_0xd8[8];
+	struct {
+		u64 perf_ctl;
+		u64 perf_ctr;
+	} __packed pmc[6];
+	u8 reserved_0x140[8];
 	u64 cr4;
 	u64 cr3;
 	u64 cr0;
@@ -335,7 +340,9 @@ struct vmcb_save_area {
 	u64 dr6;
 	u64 rflags;
 	u64 rip;
-	u8 reserved_0x180[88];
+	u8 reserved_0x180[72];
+	u64 perf_cntr_global_status;
+	u64 perf_cntr_global_control;
 	u64 rsp;
 	u64 s_cet;
 	u64 ssp;
--
2.43.0

Make use of the extended Mediated PMU framework to introduce support for
PMC virtualization (X86_FEATURE_PERFCTR_VIRT). Since the guest PMU state
is saved in the VMCB State Save Area, extend the MSR access PMU ops to
handle host-initiated requests from code responsible for event filtering
and incrementing counters due to instruction emulation.
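
For reference, the reserved-field names used in the previous patch imply the
following save-area offsets for the new fields (an illustrative compile-time
check derived from reserved_0xd8[8], pmc[6], reserved_0x140[8] and
reserved_0x180[72]; not part of this series):

	/* The perf_ctl/perf_ctr pairs start right after the 8 reserved bytes at 0xd8. */
	BUILD_BUG_ON(offsetof(struct vmcb_save_area, pmc) != 0xe0);
	/* 72 reserved bytes from 0x180 place the global status/control slots at 0x1c8/0x1d0. */
	BUILD_BUG_ON(offsetof(struct vmcb_save_area, perf_cntr_global_status) != 0x1c8);
	BUILD_BUG_ON(offsetof(struct vmcb_save_area, perf_cntr_global_control) != 0x1d0);
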
The underlying implementation depends on the availability of either VNMI
or AVIC for guest interrupt delivery. Synthesized overflows, such as those
resulting from counters being incremented in software during instruction
emulation, are still injected through the KVM_REQ_PMI path. If both VNMI
and AVIC are enabled, the hardware automatically chooses AVIC. The
advantage of AVIC is that it lets the guest change the delivery mode in
the LVTPC. VNMI, by contrast, ignores the LVTPC since the APIC is emulated
and always presents overflow interrupts as NMIs, which shows up as
failures in the kvm-unit-tests (KUT) x86/pmu test.

The feature is enabled by default if the host supports it and the Mediated
PMU is enabled, but it can also be toggled manually using the new "vpmc"
parameter of the kvm_amd module.

Signed-off-by: Sandipan Das
---
 arch/x86/include/asm/svm.h |   1 +
 arch/x86/kvm/svm/pmu.c     | 100 +++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.c     |  52 +++++++++++++++++++
 arch/x86/kvm/svm/svm.h     |   1 +
 4 files changed, 154 insertions(+)

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index a80df935b580..adc30a9f950f 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -221,6 +221,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {

 #define LBR_CTL_ENABLE_MASK BIT_ULL(0)
 #define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1)
+#define PMC_VIRTUALIZATION_ENABLE_MASK BIT_ULL(3)

 #define SVM_INTERRUPT_SHADOW_MASK BIT_ULL(0)
 #define SVM_GUEST_INTERRUPT_MASK BIT_ULL(1)
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 8a32e1a9c07d..63d177df4daf 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -124,12 +124,50 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
 	return amd_msr_idx_to_pmc(vcpu, msr);
 }

+static int amd_virtualized_pmu_get_msr(struct kvm_vcpu *vcpu,
+				       struct msr_data *msr_info)
+{
+	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
+	u32 msr = msr_info->index;
+
+	/* MSR_PERF_CNTR_GLOBAL_* */
+	switch (msr) {
+	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+		msr_info->data = save->perf_cntr_global_status;
+		return 0;
+	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+		msr_info->data = save->perf_cntr_global_control;
+		return 0;
+	}
+
+	/* MSR_PERFCTRn */
+	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+	if (pmc) {
+		msr_info->data = save->pmc[pmc->idx].perf_ctr;
+		return 0;
+	}
+
+	/* MSR_EVNTSELn */
+	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+	if (pmc) {
+		msr_info->data = save->pmc[pmc->idx].perf_ctl;
+		return 0;
+	}
+
+	return 1;
+}
+
 static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 	struct kvm_pmc *pmc;
 	u32 msr = msr_info->index;

+	if (msr_info->host_initiated && kvm_vcpu_has_virtualized_pmu(vcpu))
+		return amd_virtualized_pmu_get_msr(vcpu, msr_info);
+
 	/* MSR_PERFCTRn */
 	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
 	if (pmc) {
@@ -146,6 +184,44 @@ static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	return 1;
 }

+static int amd_virtualized_pmu_set_msr(struct kvm_vcpu *vcpu,
+				       struct msr_data *msr_info)
+{
+	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
+	u32 msr = msr_info->index;
+	u64 data = msr_info->data;
+
+	/* MSR_PERF_CNTR_GLOBAL_* */
+	switch (msr) {
+	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+		save->perf_cntr_global_status = data;
+		return 0;
+	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+		save->perf_cntr_global_control = data;
+		return 0;
+	}
+
+	/* MSR_PERFCTRn */
+	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+	if (pmc) {
+		data &= pmc_bitmask(pmc);
+		save->pmc[pmc->idx].perf_ctr = data;
+		return 0;
+	}
+
+	/* MSR_EVNTSELn */
+	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+	if (pmc) {
+		data &= ~pmu->reserved_bits;
+		save->pmc[pmc->idx].perf_ctl = data;
+		return 0;
+	}
+
+	return 1;
+}
+
 static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -153,6 +229,9 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	u32 msr = msr_info->index;
 	u64 data = msr_info->data;

+	if (msr_info->host_initiated && kvm_vcpu_has_virtualized_pmu(vcpu))
+		return amd_virtualized_pmu_set_msr(vcpu, msr_info);
+
 	/* MSR_PERFCTRn */
 	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
 	if (pmc) {
@@ -167,6 +246,8 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		pmc->eventsel = data;
 		pmc->eventsel_hw = (data & ~AMD64_EVENTSEL_HOSTONLY) |
 				   AMD64_EVENTSEL_GUESTONLY;
+		if (kvm_vcpu_has_virtualized_pmu(vcpu))
+			pmc->eventsel_hw = data;
 		kvm_pmu_request_counter_reprogram(pmc);
 	}
 	return 0;
@@ -228,6 +309,24 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu)
 	}
 }

+static void amd_pmu_reset(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+	int i;
+
+	if (!kvm_vcpu_has_virtualized_pmu(vcpu))
+		return;
+
+	for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
+		save->pmc[i].perf_ctl = 0;
+		save->pmc[i].perf_ctr = 0;
+	}
+
+	save->perf_cntr_global_control = 0;
+	save->perf_cntr_global_status = 0;
+}
+
 static bool amd_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pmu)
 {
 	return host_pmu->version >= 2;
@@ -268,6 +367,7 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
 	.set_msr = amd_pmu_set_msr,
 	.refresh = amd_pmu_refresh,
 	.init = amd_pmu_init,
+	.reset = amd_pmu_reset,

 	.is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported,
 	.mediated_load = amd_mediated_pmu_load,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 2797c3ab7854..425462f10266 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -180,6 +180,9 @@ module_param(vnmi, bool, 0444);

 module_param(enable_mediated_pmu, bool, 0444);

+bool vpmc = true;
+module_param(vpmc, bool, 0444);
+
 static bool svm_gp_erratum_intercept = true;

 static u8 rsm_ins_bytes[] = "\x0f\xaa";
@@ -1263,6 +1266,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
 	if (vcpu->kvm->arch.bus_lock_detection_enabled)
 		svm_set_intercept(svm, INTERCEPT_BUSLOCK);

+	if (vpmc)
+		control->virt_ext |= PMC_VIRTUALIZATION_ENABLE_MASK;
+
 	if (sev_guest(vcpu->kvm))
 		sev_init_vmcb(svm);

@@ -3467,6 +3473,30 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
 	       "excp_from:", save->last_excp_from,
 	       "excp_to:", save->last_excp_to);

+	if (kvm_vcpu_has_virtualized_pmu(vcpu)) {
+		pr_err("%-15s %016llx %-13s %016llx\n",
+		       "perf_ctl0:", save->pmc[0].perf_ctl,
+		       "perf_ctr0:", save->pmc[0].perf_ctr);
+		pr_err("%-15s %016llx %-13s %016llx\n",
+		       "perf_ctl1:", save->pmc[1].perf_ctl,
+		       "perf_ctr1:", save->pmc[1].perf_ctr);
+		pr_err("%-15s %016llx %-13s %016llx\n",
+		       "perf_ctl2:", save->pmc[2].perf_ctl,
+		       "perf_ctr2:", save->pmc[2].perf_ctr);
+		pr_err("%-15s %016llx %-13s %016llx\n",
+		       "perf_ctl3:", save->pmc[3].perf_ctl,
+		       "perf_ctr3:", save->pmc[3].perf_ctr);
+		pr_err("%-15s %016llx %-13s %016llx\n",
+		       "perf_ctl4:", save->pmc[4].perf_ctl,
"perf_ctr4:", save->pmc[4].perf_ctr); + pr_err("%-15s %016llx %-13s %016llx\n", + "perf_ctl5:", save->pmc[5].perf_ctl, + "perf_ctr5:", save->pmc[5].perf_ctr); + pr_err("%-15s %016llx %-13s %016llx\n", + "perf_cntr_global_control:", save->perf_cntr_global_control, + "perf_cntr_global_status:", save->perf_cntr_global_status); + } + if (sev_es_guest(vcpu->kvm)) { struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; @@ -4273,6 +4303,15 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in amd_clear_divider(); + /* + * The save slot for PerfCntrGlobalCtl is of Swap Type C which means + * that on VM-Exit, the state of this MSR is reset i.e. all counter + * enable bits are set. According to the APM, the next VMRUN will fail + * with a VMEXIT_INVALID_PMC error code unless it is cleared. + */ + if (kvm_vcpu_has_virtualized_pmu(vcpu)) + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + if (sev_es_guest(vcpu->kvm)) __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted, sev_es_host_save_area(sd)); @@ -5506,6 +5545,19 @@ static __init int svm_hardware_setup(void) if (!enable_pmu) pr_info("PMU virtualization is disabled\n"); + enable_virtualized_pmu = enable_mediated_pmu && kvm_pmu_cap.virtualized; + + /* + * Virtualized PMCs do not raise host interrupts on overflow. Instead, + * they require either VNMI or AVIC as an interrupt delivery mechanism + * for guests. + */ + vpmc = vpmc && (vnmi || avic) && enable_virtualized_pmu; + if (vpmc) + pr_info("PMC virtualization supported\n"); + else + enable_virtualized_pmu = false; + svm_set_cpu_caps(); /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 58b9d168e0c8..346bbbbd0882 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -51,6 +51,7 @@ extern bool intercept_smi; extern bool x2avic_enabled; extern bool vnmi; extern int lbrv; +extern bool vpmc; /* * Clean bits in VMCB. -- 2.43.0 For legacy guests, the host-initiated requests from the hypervisor to access PMC-related MSRs will always be for MSR_F15H_PERF_CTLx and MSR_F15H_PERF_CTRx instead of MSR_K7_EVNTSELx and MSR_K7_PERFCTRx because of how GP_EVENTSEL_BASE and GP_COUNTER_BASE are set in the PMU ops. In such cases, translate the index to the equivalent legacy MSR as get_gp_pmc_amd() will otherwise return NULL. Signed-off-by: Sandipan Das --- arch/x86/kvm/svm/pmu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 63d177df4daf..c893c1bef131 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -124,6 +124,16 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) return amd_msr_idx_to_pmc(vcpu, msr); } +static inline u32 amd_pmu_adjust_msr_idx(struct kvm_vcpu *vcpu, u32 msr) +{ + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE) && + msr >= MSR_F15H_PERF_CTL0 && msr <= MSR_F15H_PERF_CTR5) + msr = ((msr & 0x1) ? MSR_K7_PERFCTR0 : MSR_K7_EVNTSEL0) + + ((msr - MSR_F15H_PERF_CTL0) / 2); + + return msr; +} + static int amd_virtualized_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { @@ -142,6 +152,8 @@ static int amd_virtualized_pmu_get_msr(struct kvm_vcpu *vcpu, return 0; } + msr = amd_pmu_adjust_msr_idx(vcpu, msr); + /* MSR_PERFCTRn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { @@ -203,6 +215,8 @@ static int amd_virtualized_pmu_set_msr(struct kvm_vcpu *vcpu, return 0; } + msr = amd_pmu_adjust_msr_idx(vcpu, msr); + /* MSR_PERFCTRn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { -- 2.43.0