From: dongsheng

For Intel Atom CPUs, the PMU events "Instruction Retired" or "Branch
Instruction Retired" may be overcounted for certain instructions, like
FAR CALL/JMP, RETF, IRET, VMENTRY/VMEXIT/VMPTRLD and complex
SGX/SMX/CSTATE instructions/flows.

The detailed information can be found in the errata (section SRF7):
https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/sierra-forest/xeon-6700-series-processor-with-e-cores-specification-update/errata-details/

On Atom platforms up to and including Sierra Forest, both the
"Instruction Retired" and "Branch Instruction Retired" events are
overcounted for these instructions; on Clearwater Forest, only the
"Instruction Retired" event is overcounted.

Add a helper to detect whether the platform is affected by the
overcount erratum; later patches relax the precise count checks based
on the detected errata flags.

Signed-off-by: dongsheng
[Rewrite comments and commit message - Dapeng]
Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
[sean: put errata detection and tracking in pmu_init()]
Signed-off-by: Sean Christopherson
---
 lib/x86/pmu.c       | 39 +++++++++++++++++++++++++++++++++++++++
 lib/x86/pmu.h       |  5 +++++
 lib/x86/processor.h | 26 ++++++++++++++++++++++++++
 3 files changed, 70 insertions(+)

diff --git a/lib/x86/pmu.c b/lib/x86/pmu.c
index fb46b196..67f3b23e 100644
--- a/lib/x86/pmu.c
+++ b/lib/x86/pmu.c
@@ -2,11 +2,50 @@
 
 struct pmu_caps pmu;
 
+/*
+ * For Intel Atom CPUs, the PMU events "Instruction Retired" or
+ * "Branch Instruction Retired" may be overcounted for certain
+ * instructions, like FAR CALL/JMP, RETF, IRET, VMENTRY/VMEXIT/VMPTRLD
+ * and complex SGX/SMX/CSTATE instructions/flows.
+ *
+ * The detailed information can be found in the errata (section SRF7):
+ * https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/sierra-forest/xeon-6700-series-processor-with-e-cores-specification-update/errata-details/
+ *
+ * On Atom platforms up to and including Sierra Forest, both the
+ * "Instruction Retired" and "Branch Instruction Retired" events are
+ * overcounted for these instructions, while on Clearwater Forest only
+ * the "Instruction Retired" event is overcounted.
+ */
+static void pmu_detect_intel_overcount_errata(void)
+{
+	struct cpuid c = cpuid(1);
+
+	if (x86_family(c.a) == 0x6) {
+		switch (x86_model(c.a)) {
+		case 0xDD: /* Clearwater Forest */
+			pmu.errata.instructions_retired_overcount = true;
+			break;
+
+		case 0xAF: /* Sierra Forest */
+		case 0x4D: /* Avoton, Rangeley */
+		case 0x5F: /* Denverton */
+		case 0x86: /* Jacobsville */
+			pmu.errata.instructions_retired_overcount = true;
+			pmu.errata.branches_retired_overcount = true;
+			break;
+		default:
+			break;
+		}
+	}
+}
+
 void pmu_init(void)
 {
 	pmu.is_intel = is_intel();
 
 	if (pmu.is_intel) {
+		pmu_detect_intel_overcount_errata();
+
 		pmu.version = this_cpu_property(X86_PROPERTY_PMU_VERSION);
 
 		if (pmu.version > 1) {
diff --git a/lib/x86/pmu.h b/lib/x86/pmu.h
index c7dc68c1..e84b37dc 100644
--- a/lib/x86/pmu.h
+++ b/lib/x86/pmu.h
@@ -73,6 +73,11 @@ struct pmu_caps {
 	u32 msr_global_status_clr;
 
 	u64 perf_cap;
+
+	struct {
+		bool instructions_retired_overcount;
+		bool branches_retired_overcount;
+	} errata;
 };
 
 extern struct pmu_caps pmu;
diff --git a/lib/x86/processor.h b/lib/x86/processor.h
index 8a73af5e..68bd774b 100644
--- a/lib/x86/processor.h
+++ b/lib/x86/processor.h
@@ -226,6 +226,32 @@ static inline bool is_intel(void)
 	return strcmp((char *)name, "GenuineIntel") == 0;
 }
 
+static inline u32 x86_family(u32 sig)
+{
+	u32 x86;
+
+	x86 = (sig >> 8) & 0xf;
+
+	if (x86 == 0xf)
+		x86 += (sig >> 20) & 0xff;
+
+	return x86;
+}
+
+static inline u32 x86_model(u32 sig)
+{
+	u32 fam, model;
+
+	fam = x86_family(sig);
+
+	model = (sig >> 4) & 0xf;
+
+	if (fam >= 0x6)
+		model += ((sig >> 16) & 0xf) << 4;
+
+	return model;
+}
+
 /*
  * Pack the information into a 64-bit value so that each X86_FEATURE_XXX can be
  * passed by value with no overhead.
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: dongsheng

Due to the VM-Exit/VM-Entry overcount issue on Intel Atom platforms,
there is no way to validate the precise count of the "instructions" and
"branches" events on the affected platforms. Relax the precise count
validation on those platforms.

Signed-off-by: dongsheng
Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
Signed-off-by: Sean Christopherson
---
 x86/pmu.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/x86/pmu.c b/x86/pmu.c
index f932ccab..bd16211d 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -229,10 +229,15 @@ static void adjust_events_range(struct pmu_event *gp_events,
 	 * occur while running the measured code, e.g. if the host takes IRQs.
 	 */
 	if (pmu.is_intel && this_cpu_has_perf_global_ctrl()) {
-		gp_events[instruction_idx].min = LOOP_INSNS;
-		gp_events[instruction_idx].max = LOOP_INSNS;
-		gp_events[branch_idx].min = LOOP_BRANCHES;
-		gp_events[branch_idx].max = LOOP_BRANCHES;
+		if (!pmu.errata.instructions_retired_overcount) {
+			gp_events[instruction_idx].min = LOOP_INSNS;
+			gp_events[instruction_idx].max = LOOP_INSNS;
+		}
+
+		if (!pmu.errata.branches_retired_overcount) {
+			gp_events[branch_idx].min = LOOP_BRANCHES;
+			gp_events[branch_idx].max = LOOP_BRANCHES;
+		}
 	}
 
 	/*
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: dongsheng

The current implementation mistakenly limits the width of fixed
counters to the width of GP counters. Correct the logic to ensure
fixed counters are properly masked according to their own width.

Opportunistically refine the GP counter bit-width handling code.
Signed-off-by: dongsheng
Co-developed-by: Dapeng Mi
Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
[sean: keep measure_for_overflow() for fixed counter (see commit 7ec3b67a)]
Signed-off-by: Sean Christopherson
---
 x86/pmu.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/x86/pmu.c b/x86/pmu.c
index bd16211d..96b76d04 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -547,19 +547,19 @@ static void check_counter_overflow(void)
 		uint64_t status;
 		int idx;
 
-		cnt.count = overflow_preset;
-		if (pmu_use_full_writes())
-			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
-
 		if (i == pmu.nr_gp_counters) {
 			if (!pmu.is_intel)
 				break;
 
 			cnt.ctr = fixed_events[0].unit_sel;
 			cnt.count = measure_for_overflow(&cnt);
-			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
+			cnt.count &= (1ull << pmu.fixed_counter_width) - 1;
 		} else {
 			cnt.ctr = MSR_GP_COUNTERx(i);
+
+			cnt.count = overflow_preset;
+			if (pmu_use_full_writes())
+				cnt.count &= (1ull << pmu.gp_counter_width) - 1;
 		}
 
 		if (i % 2)
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: dongsheng

During the execution of __measure(), VM exits (e.g., due to
WRMSR/EXTERNAL_INTERRUPT) may occur. On systems affected by the
instruction overcount issue, each VM-Exit/VM-Entry can erroneously
increment the instruction count by one, leading to false failures in
overflow tests.

To address this, introduce a range-based validation in place of the
precise instruction count checks. Additionally, statically set
overflow_preset to 1 - LOOP_INSNS rather than determining it
dynamically via measure_for_overflow(). These changes ensure consistent
and predictable behavior aligned with the intended loop instruction
count, while avoiding modifications to the subsequent status and
status-clear testing logic.

The chosen validation range is empirically derived to maintain test
reliability across hardware variations.

Signed-off-by: dongsheng
Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
Signed-off-by: Sean Christopherson
---
 x86/pmu.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/x86/pmu.c b/x86/pmu.c
index 96b76d04..e1e98959 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -510,6 +510,21 @@ static void check_counters_many(void)
 
 static uint64_t measure_for_overflow(pmu_counter_t *cnt)
 {
+	/*
+	 * During the execution of __measure(), VM exits (e.g., due to
+	 * WRMSR/EXTERNAL_INTERRUPT) may occur. On systems affected by the
+	 * instruction overcount issue, each VM-Exit/VM-Entry can erroneously
+	 * increment the instruction count by one, leading to false failures
+	 * in overflow tests.
+	 *
+	 * To mitigate this, if the overcount issue is detected, hardcode the
+	 * overflow preset to (1 - LOOP_INSNS) instead of calculating it
+	 * dynamically. This ensures that an overflow will reliably occur,
+	 * regardless of any overcounting caused by VM exits.
+	 */
+	if (pmu.errata.instructions_retired_overcount)
+		return 1 - LOOP_INSNS;
+
 	__measure(cnt, 0);
 	/*
 	 * To generate overflow, i.e. roll over to '0', the initial count just
@@ -568,8 +583,12 @@ static void check_counter_overflow(void)
 			cnt.config &= ~EVNTSEL_INT;
 		idx = event_to_global_idx(&cnt);
 		__measure(&cnt, cnt.count);
-		if (pmu.is_intel)
-			report(cnt.count == 1, "cntr-%d", i);
+		if (pmu.is_intel) {
+			if (pmu.errata.instructions_retired_overcount)
+				report(cnt.count < 14, "cntr-%d", i);
+			else
+				report(cnt.count == 1, "cntr-%d", i);
+		}
 		else
 			report(cnt.count == 0xffffffffffff || cnt.count < 7,
 			       "cntr-%d", i);
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: Dapeng Mi

Relax the precise count checks in the emulated instruction test on
platforms with the HW overcount issues.

Signed-off-by: Dapeng Mi
[sean: handle errata independently]
Signed-off-by: Sean Christopherson
---
 x86/pmu.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/x86/pmu.c b/x86/pmu.c
index e1e98959..ccf4ee63 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -756,6 +756,8 @@ static void check_emulated_instr(void)
 		/* instructions */
 		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[instruction_idx].unit_sel,
 	};
+	const bool has_perf_global_ctrl = this_cpu_has_perf_global_ctrl();
+
 	report_prefix_push("emulated instruction");
 
 	if (this_cpu_has_perf_global_status())
@@ -769,7 +771,7 @@ static void check_emulated_instr(void)
 	wrmsr(MSR_GP_COUNTERx(0), brnch_start & gp_counter_width);
 	wrmsr(MSR_GP_COUNTERx(1), instr_start & gp_counter_width);
 
-	if (this_cpu_has_perf_global_ctrl()) {
+	if (has_perf_global_ctrl) {
 		eax = BIT(0) | BIT(1);
 		ecx = pmu.msr_global_ctl;
 		edx = 0;
@@ -784,17 +786,15 @@ static void check_emulated_instr(void)
 
 	// Check that the end count - start count is at least the expected
 	// number of instructions and branches.
-	if (this_cpu_has_perf_global_ctrl()) {
-		report(instr_cnt.count - instr_start == KVM_FEP_INSNS,
-		       "instruction count");
-		report(brnch_cnt.count - brnch_start == KVM_FEP_BRANCHES,
-		       "branch count");
-	} else {
-		report(instr_cnt.count - instr_start >= KVM_FEP_INSNS,
-		       "instruction count");
-		report(brnch_cnt.count - brnch_start >= KVM_FEP_BRANCHES,
-		       "branch count");
-	}
+	if (has_perf_global_ctrl && !pmu.errata.instructions_retired_overcount)
+		report(instr_cnt.count - instr_start == KVM_FEP_INSNS, "instruction count");
+	else
+		report(instr_cnt.count - instr_start >= KVM_FEP_INSNS, "instruction count");
+
+	if (has_perf_global_ctrl && !pmu.errata.branches_retired_overcount)
+		report(brnch_cnt.count - brnch_start == KVM_FEP_BRANCHES, "branch count");
+	else
+		report(brnch_cnt.count - brnch_start >= KVM_FEP_BRANCHES, "branch count");
 
 	if (this_cpu_has_perf_global_status()) {
 		// Additionally check that those counters overflowed properly.
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: dongsheng

Increase the upper limit of the "llc references" test to accommodate
results observed on additional Intel CPU models, including CWF and SRF.
These CPUs exhibited higher reference counts that previously caused the
test to fail.
Signed-off-by: dongsheng
Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
Signed-off-by: Sean Christopherson
---
 x86/pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/x86/pmu.c b/x86/pmu.c
index ccf4ee63..b262ea59 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -116,7 +116,7 @@ struct pmu_event {
 	{"core cycles", 0x003c, 1*N, 50*N},
 	{"instructions", 0x00c0, 10*N, 10.2*N},
 	{"ref cycles", 0x013c, 1*N, 30*N},
-	{"llc references", 0x4f2e, 1, 2*N},
+	{"llc references", 0x4f2e, 1, 2.5*N},
 	{"llc misses", 0x412e, 1, 1*N},
 	{"branches", 0x00c4, 1*N, 1.1*N},
 	{"branch misses", 0x00c5, 1, 0.1*N},
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: Dapeng Mi

Remove the redundant data_cfg_match calculation.

Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
Signed-off-by: Sean Christopherson
---
 x86/pmu_pebs.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/x86/pmu_pebs.c b/x86/pmu_pebs.c
index 6e73fc34..2848cc1e 100644
--- a/x86/pmu_pebs.c
+++ b/x86/pmu_pebs.c
@@ -296,7 +296,6 @@ static void check_pebs_records(u64 bitmask, u64 pebs_data_cfg, bool use_adaptive
 		pebs_record_size = pebs_rec->format_size >> RECORD_SIZE_OFFSET;
 		pebs_idx_match = pebs_rec->applicable_counters & bitmask;
 		pebs_size_match = pebs_record_size == get_pebs_record_size(pebs_data_cfg, use_adaptive);
-		data_cfg_match = (pebs_rec->format_size & GENMASK_ULL(47, 0)) == pebs_data_cfg;
 		data_cfg_match = (pebs_rec->format_size & GENMASK_ULL(47, 0)) ==
 				 (use_adaptive ? pebs_data_cfg : 0);
 		expected = pebs_idx_match && pebs_size_match && data_cfg_match;
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: Dapeng Mi

Timed PEBS is introduced on Intel GNR/SRF platforms. Timed PEBS adds a
new "retired latency" field to the basic info group to report timing
information, and IA32_PERF_CAPABILITIES.PEBS_TIMING_INFO[bit 17]
indicates whether timed PEBS is supported.

With timed PEBS, the PEBS record format field shrinks to bits[31:0] and
bits[47:32] are used to record the retired latency. Shrink the record
format mask to bits[31:0] accordingly so that the retired latency field
is not treated as part of the record format in the comparison, which
would otherwise cause failures on GNR/SRF.

Detailed information about timed PEBS can be found in section 8.4.1
"Timed Processor Event Based Sampling" of "Intel Architecture
Instruction Set Extensions and Future Features".
Reviewed-by: Kan Liang
Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
Signed-off-by: Sean Christopherson
---
 lib/x86/pmu.h  | 6 ++++++
 x86/pmu_pebs.c | 8 +++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/lib/x86/pmu.h b/lib/x86/pmu.h
index e84b37dc..cd6091af 100644
--- a/lib/x86/pmu.h
+++ b/lib/x86/pmu.h
@@ -20,6 +20,7 @@
 #define PMU_CAP_LBR_FMT		0x3f
 #define PMU_CAP_FW_WRITES	(1ULL << 13)
 #define PMU_CAP_PEBS_BASELINE	(1ULL << 14)
+#define PMU_CAP_PEBS_TIMING_INFO	(1ULL << 17)
 #define PERF_CAP_PEBS_FORMAT	0xf00
 
 #define EVNSEL_EVENT_SHIFT	0
@@ -193,4 +194,9 @@ static inline bool pmu_has_pebs_baseline(void)
 	return pmu.perf_cap & PMU_CAP_PEBS_BASELINE;
 }
 
+static inline bool pmu_has_pebs_timing_info(void)
+{
+	return pmu.perf_cap & PMU_CAP_PEBS_TIMING_INFO;
+}
+
 #endif /* _X86_PMU_H_ */
diff --git a/x86/pmu_pebs.c b/x86/pmu_pebs.c
index 2848cc1e..bc37e8e3 100644
--- a/x86/pmu_pebs.c
+++ b/x86/pmu_pebs.c
@@ -277,6 +277,7 @@ static void check_pebs_records(u64 bitmask, u64 pebs_data_cfg, bool use_adaptive
 	unsigned int count = 0;
 	bool expected, pebs_idx_match, pebs_size_match, data_cfg_match;
 	void *cur_record;
+	u64 format_mask;
 
 	expected = (ds->pebs_index == ds->pebs_buffer_base) && !pebs_rec->format_size;
 	if (!(rdmsr(MSR_CORE_PERF_GLOBAL_STATUS) & GLOBAL_STATUS_BUFFER_OVF)) {
@@ -289,6 +290,8 @@ static void check_pebs_records(u64 bitmask, u64 pebs_data_cfg, bool use_adaptive
 		return;
 	}
 
+	/* Record format shrinks to bits[31:0] after timed PEBS is introduced. */
+	format_mask = pmu_has_pebs_timing_info() ? GENMASK_ULL(31, 0) : GENMASK_ULL(47, 0);
 	expected = ds->pebs_index >= ds->pebs_interrupt_threshold;
 	cur_record = (void *)pebs_buffer;
 	do {
@@ -296,8 +299,7 @@ static void check_pebs_records(u64 bitmask, u64 pebs_data_cfg, bool use_adaptive
 		pebs_record_size = pebs_rec->format_size >> RECORD_SIZE_OFFSET;
 		pebs_idx_match = pebs_rec->applicable_counters & bitmask;
 		pebs_size_match = pebs_record_size == get_pebs_record_size(pebs_data_cfg, use_adaptive);
-		data_cfg_match = (pebs_rec->format_size & GENMASK_ULL(47, 0)) ==
-				 (use_adaptive ? pebs_data_cfg : 0);
+		data_cfg_match = (pebs_rec->format_size & format_mask) == (use_adaptive ? pebs_data_cfg : 0);
 		expected = pebs_idx_match && pebs_size_match && data_cfg_match;
 		report(expected,
 		       "PEBS record (written seq %d) is verified (including size, counters and cfg).", count);
@@ -327,7 +329,7 @@ static void check_pebs_records(u64 bitmask, u64 pebs_data_cfg, bool use_adaptive
 				pebs_record_size, get_pebs_record_size(pebs_data_cfg, use_adaptive));
 		if (!data_cfg_match)
 			printf("FAIL: The pebs_data_cfg (0x%lx) doesn't match with the effective MSR_PEBS_DATA_CFG (0x%lx).\n",
-			       pebs_rec->format_size & 0xffffffffffff, use_adaptive ? pebs_data_cfg : 0);
+			       pebs_rec->format_size & format_mask, use_adaptive ? pebs_data_cfg : 0);
 		}
 	}
-- 
2.52.0.rc2.455.g230fcf2819-goog
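
For anyone who wants to sanity-check the CPUID family/model decoding that
pmu_detect_intel_overcount_errata() relies on without building the tests,
below is a minimal standalone sketch in plain C. It is not part of the
series; the raw signature value and the main() wrapper are assumed purely
for illustration, while the decode rules mirror the x86_family()/x86_model()
helpers added in the first patch.

/*
 * Illustrative only: decode a raw CPUID.1:EAX signature the same way the
 * x86_family()/x86_model() helpers in the series do, then check it against
 * the affected-model list used by pmu_detect_intel_overcount_errata().
 * The signature value below is an assumed example, not taken from the patches.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t x86_family(uint32_t sig)
{
	uint32_t x86 = (sig >> 8) & 0xf;

	if (x86 == 0xf)
		x86 += (sig >> 20) & 0xff;	/* extended family only when base family == 0xf */

	return x86;
}

static uint32_t x86_model(uint32_t sig)
{
	uint32_t model = (sig >> 4) & 0xf;

	if (x86_family(sig) >= 0x6)
		model += ((sig >> 16) & 0xf) << 4;	/* extended model for family >= 6 */

	return model;
}

int main(void)
{
	/* Assumed example: family 6, ext. model 0xA, model 0xF => model 0xAF (Sierra Forest). */
	uint32_t sig = 0x000a06f1;
	bool insns = false, branches = false;

	if (x86_family(sig) == 0x6) {
		switch (x86_model(sig)) {
		case 0xDD:					/* Clearwater Forest */
			insns = true;
			break;
		case 0xAF: case 0x4D: case 0x5F: case 0x86:	/* SRF, Avoton/Rangeley, Denverton, Jacobsville */
			insns = true;
			branches = true;
			break;
		}
	}

	printf("family 0x%x, model 0x%x: insn overcount %d, branch overcount %d\n",
	       x86_family(sig), x86_model(sig), insns, branches);
	return 0;
}

Running the sketch prints family 0x6, model 0xaf with both overcount flags
set, which matches the SRF7 erratum behavior described in the first patch.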