From: dongsheng

For Intel Atom CPUs, the PMU events "Instruction Retired" or "Branch
Instruction Retired" may be overcounted for certain instructions, like
FAR CALL/JMP, RETF, IRET, VMENTRY/VMEXIT/VMPTRLD and complex
SGX/SMX/CSTATE instructions/flows.

The detailed information can be found in the errata (section SRF7):
https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/sierra-forest/xeon-6700-series-processor-with-e-cores-specification-update/errata-details/

On Atom platforms up to and including Sierra Forest, both the
"Instruction Retired" and "Branch Instruction Retired" events are
overcounted for these instructions; on Clearwater Forest, only the
"Instruction Retired" event is overcounted.

Add a helper to detect whether the platform is affected by the
overcount erratum; later patches relax the precise count checks based
on the detected errata flags.

Signed-off-by: dongsheng
[Rewrite comments and commit message - Dapeng]
Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
[sean: put errata detection and tracking in pmu_init()]
Signed-off-by: Sean Christopherson
---
 lib/x86/pmu.c       | 39 +++++++++++++++++++++++++++++++++++++++
 lib/x86/pmu.h       |  5 +++++
 lib/x86/processor.h | 26 ++++++++++++++++++++++++++
 3 files changed, 70 insertions(+)

diff --git a/lib/x86/pmu.c b/lib/x86/pmu.c
index fb46b196..67f3b23e 100644
--- a/lib/x86/pmu.c
+++ b/lib/x86/pmu.c
@@ -2,11 +2,50 @@
 
 struct pmu_caps pmu;
 
+/*
+ * For Intel Atom CPUs, the PMU events "Instruction Retired" or
+ * "Branch Instruction Retired" may be overcounted for certain
+ * instructions, like FAR CALL/JMP, RETF, IRET, VMENTRY/VMEXIT/VMPTRLD
+ * and complex SGX/SMX/CSTATE instructions/flows.
+ *
+ * The detailed information can be found in the errata (section SRF7):
+ * https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/sierra-forest/xeon-6700-series-processor-with-e-cores-specification-update/errata-details/
+ *
+ * On Atom platforms up to and including Sierra Forest, both the
+ * "Instruction Retired" and "Branch Instruction Retired" events are
+ * overcounted for these instructions, while on Clearwater Forest only
+ * the "Instruction Retired" event is overcounted.
+ */
+static void pmu_detect_intel_overcount_errata(void)
+{
+	struct cpuid c = cpuid(1);
+
+	if (x86_family(c.a) == 0x6) {
+		switch (x86_model(c.a)) {
+		case 0xDD: /* Clearwater Forest */
+			pmu.errata.instructions_retired_overcount = true;
+			break;
+
+		case 0xAF: /* Sierra Forest */
+		case 0x4D: /* Avoton, Rangeley */
+		case 0x5F: /* Denverton */
+		case 0x86: /* Jacobsville */
+			pmu.errata.instructions_retired_overcount = true;
+			pmu.errata.branches_retired_overcount = true;
+			break;
+		default:
+			break;
+		}
+	}
+}
+
 void pmu_init(void)
 {
 	pmu.is_intel = is_intel();
 
 	if (pmu.is_intel) {
+		pmu_detect_intel_overcount_errata();
+
 		pmu.version = this_cpu_property(X86_PROPERTY_PMU_VERSION);
 
 		if (pmu.version > 1) {
diff --git a/lib/x86/pmu.h b/lib/x86/pmu.h
index c7dc68c1..e84b37dc 100644
--- a/lib/x86/pmu.h
+++ b/lib/x86/pmu.h
@@ -73,6 +73,11 @@ struct pmu_caps {
 	u32 msr_global_status_clr;
 
 	u64 perf_cap;
+
+	struct {
+		bool instructions_retired_overcount;
+		bool branches_retired_overcount;
+	} errata;
 };
 
 extern struct pmu_caps pmu;
diff --git a/lib/x86/processor.h b/lib/x86/processor.h
index 8a73af5e..68bd774b 100644
--- a/lib/x86/processor.h
+++ b/lib/x86/processor.h
@@ -226,6 +226,32 @@ static inline bool is_intel(void)
 	return strcmp((char *)name, "GenuineIntel") == 0;
 }
 
+static inline u32 x86_family(u32 sig)
+{
+	u32 x86;
+
+	x86 = (sig >> 8) & 0xf;
+
+	if (x86 == 0xf)
+		x86 += (sig >> 20) & 0xff;
+
+	return x86;
+}
+
+static inline u32 x86_model(u32 sig)
+{
+	u32 fam, model;
+
+	fam = x86_family(sig);
+
+	model = (sig >> 4) & 0xf;
+
+	if (fam >= 0x6)
+		model += ((sig >> 16) & 0xf) << 4;
+
+	return model;
+}
+
 /*
  * Pack the information into a 64-bit value so that each X86_FEATURE_XXX can be
  * passed by value with no overhead.
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: dongsheng

Due to the VM-Exit/VM-Entry overcount issue on Intel Atom platforms,
there is no way to validate the precise count of the "instructions" and
"branches" events on the affected platforms. Relax the precise count
validation on those platforms.

Signed-off-by: dongsheng
Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
Signed-off-by: Sean Christopherson
---
 x86/pmu.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/x86/pmu.c b/x86/pmu.c
index f932ccab..bd16211d 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -229,10 +229,15 @@ static void adjust_events_range(struct pmu_event *gp_events,
 	 * occur while running the measured code, e.g. if the host takes IRQs.
 	 */
 	if (pmu.is_intel && this_cpu_has_perf_global_ctrl()) {
-		gp_events[instruction_idx].min = LOOP_INSNS;
-		gp_events[instruction_idx].max = LOOP_INSNS;
-		gp_events[branch_idx].min = LOOP_BRANCHES;
-		gp_events[branch_idx].max = LOOP_BRANCHES;
+		if (!pmu.errata.instructions_retired_overcount) {
+			gp_events[instruction_idx].min = LOOP_INSNS;
+			gp_events[instruction_idx].max = LOOP_INSNS;
+		}
+
+		if (!pmu.errata.branches_retired_overcount) {
+			gp_events[branch_idx].min = LOOP_BRANCHES;
+			gp_events[branch_idx].max = LOOP_BRANCHES;
+		}
 	}
 
 	/*
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: dongsheng

The current implementation mistakenly limits the width of fixed
counters to the width of GP counters. Correct the logic to ensure
fixed counters are properly masked according to their own width.

Opportunistically refine the GP counter bit-width handling code.
Signed-off-by: dongsheng
Co-developed-by: Dapeng Mi
Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
[sean: keep measure_for_overflow() for fixed counter (see commit 7ec3b67a)]
Signed-off-by: Sean Christopherson
---
 x86/pmu.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/x86/pmu.c b/x86/pmu.c
index bd16211d..96b76d04 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -547,19 +547,19 @@ static void check_counter_overflow(void)
 		uint64_t status;
 		int idx;
 
-		cnt.count = overflow_preset;
-		if (pmu_use_full_writes())
-			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
-
 		if (i == pmu.nr_gp_counters) {
 			if (!pmu.is_intel)
 				break;
 
 			cnt.ctr = fixed_events[0].unit_sel;
 			cnt.count = measure_for_overflow(&cnt);
-			cnt.count &= (1ull << pmu.gp_counter_width) - 1;
+			cnt.count &= (1ull << pmu.fixed_counter_width) - 1;
 		} else {
 			cnt.ctr = MSR_GP_COUNTERx(i);
+
+			cnt.count = overflow_preset;
+			if (pmu_use_full_writes())
+				cnt.count &= (1ull << pmu.gp_counter_width) - 1;
 		}
 
 		if (i % 2)
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: dongsheng

During the execution of __measure(), VM exits (e.g., due to
WRMSR/EXTERNAL_INTERRUPT) may occur. On systems affected by the
instruction overcount issue, each VM-Exit/VM-Entry can erroneously
increment the instruction count by one, leading to false failures in
overflow tests.

To address this, introduce a range-based validation in place of the
precise instruction count checks. Additionally, statically set
overflow_preset to 1 - LOOP_INSNS rather than determining it
dynamically via measure_for_overflow(). These changes ensure consistent
and predictable behavior aligned with the intended loop instruction
count, while avoiding modifications to the subsequent status and
status-clear testing logic.

The chosen validation range is empirically derived to maintain test
reliability across hardware variations.

Signed-off-by: dongsheng
Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
Signed-off-by: Sean Christopherson
---
 x86/pmu.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/x86/pmu.c b/x86/pmu.c
index 96b76d04..e1e98959 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -510,6 +510,21 @@ static void check_counters_many(void)
 
 static uint64_t measure_for_overflow(pmu_counter_t *cnt)
 {
+	/*
+	 * During the execution of __measure(), VM exits (e.g., due to
+	 * WRMSR/EXTERNAL_INTERRUPT) may occur. On systems affected by the
+	 * instruction overcount issue, each VM-Exit/VM-Entry can erroneously
+	 * increment the instruction count by one, leading to false failures
+	 * in overflow tests.
+	 *
+	 * To mitigate this, if the overcount issue is detected, hardcode the
+	 * overflow preset to (1 - LOOP_INSNS) instead of calculating it
+	 * dynamically. This ensures that an overflow will reliably occur,
+	 * regardless of any overcounting caused by VM exits.
+	 */
+	if (pmu.errata.instructions_retired_overcount)
+		return 1 - LOOP_INSNS;
+
 	__measure(cnt, 0);
 	/*
 	 * To generate overflow, i.e. roll over to '0', the initial count just
@@ -568,8 +583,12 @@ static void check_counter_overflow(void)
 			cnt.config &= ~EVNTSEL_INT;
 		idx = event_to_global_idx(&cnt);
 		__measure(&cnt, cnt.count);
-		if (pmu.is_intel)
-			report(cnt.count == 1, "cntr-%d", i);
+		if (pmu.is_intel) {
+			if (pmu.errata.instructions_retired_overcount)
+				report(cnt.count < 14, "cntr-%d", i);
+			else
+				report(cnt.count == 1, "cntr-%d", i);
+		}
 		else
 			report(cnt.count == 0xffffffffffff || cnt.count < 7,
 			       "cntr-%d", i);
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: Dapeng Mi

Relax the precise count checks in the emulated instruction test on
platforms with the HW overcount issues.

Signed-off-by: Dapeng Mi
[sean: handle errata independently]
Signed-off-by: Sean Christopherson
---
 x86/pmu.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/x86/pmu.c b/x86/pmu.c
index e1e98959..ccf4ee63 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -756,6 +756,8 @@ static void check_emulated_instr(void)
 		/* instructions */
 		.config = EVNTSEL_OS | EVNTSEL_USR | gp_events[instruction_idx].unit_sel,
 	};
+	const bool has_perf_global_ctrl = this_cpu_has_perf_global_ctrl();
+
 	report_prefix_push("emulated instruction");
 
 	if (this_cpu_has_perf_global_status())
@@ -769,7 +771,7 @@ static void check_emulated_instr(void)
 	wrmsr(MSR_GP_COUNTERx(0), brnch_start & gp_counter_width);
 	wrmsr(MSR_GP_COUNTERx(1), instr_start & gp_counter_width);
 
-	if (this_cpu_has_perf_global_ctrl()) {
+	if (has_perf_global_ctrl) {
 		eax = BIT(0) | BIT(1);
 		ecx = pmu.msr_global_ctl;
 		edx = 0;
@@ -784,17 +786,15 @@ static void check_emulated_instr(void)
 
 	// Check that the end count - start count is at least the expected
 	// number of instructions and branches.
-	if (this_cpu_has_perf_global_ctrl()) {
-		report(instr_cnt.count - instr_start == KVM_FEP_INSNS,
-		       "instruction count");
-		report(brnch_cnt.count - brnch_start == KVM_FEP_BRANCHES,
-		       "branch count");
-	} else {
-		report(instr_cnt.count - instr_start >= KVM_FEP_INSNS,
-		       "instruction count");
-		report(brnch_cnt.count - brnch_start >= KVM_FEP_BRANCHES,
-		       "branch count");
-	}
+	if (has_perf_global_ctrl && !pmu.errata.instructions_retired_overcount)
+		report(instr_cnt.count - instr_start == KVM_FEP_INSNS, "instruction count");
+	else
+		report(instr_cnt.count - instr_start >= KVM_FEP_INSNS, "instruction count");
+
+	if (has_perf_global_ctrl && !pmu.errata.branches_retired_overcount)
+		report(brnch_cnt.count - brnch_start == KVM_FEP_BRANCHES, "branch count");
+	else
+		report(brnch_cnt.count - brnch_start >= KVM_FEP_BRANCHES, "branch count");
 
 	if (this_cpu_has_perf_global_status()) {
 		// Additionally check that those counters overflowed properly.
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: dongsheng

Increase the upper limit of the "llc references" test to accommodate
results observed on additional Intel CPU models, including CWF and SRF.
These CPUs exhibited higher reference counts that previously caused the
test to fail.
Signed-off-by: dongsheng
Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
Signed-off-by: Sean Christopherson
---
 x86/pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/x86/pmu.c b/x86/pmu.c
index ccf4ee63..b262ea59 100644
--- a/x86/pmu.c
+++ b/x86/pmu.c
@@ -116,7 +116,7 @@ struct pmu_event {
 	{"core cycles", 0x003c, 1*N, 50*N},
 	{"instructions", 0x00c0, 10*N, 10.2*N},
 	{"ref cycles", 0x013c, 1*N, 30*N},
-	{"llc references", 0x4f2e, 1, 2*N},
+	{"llc references", 0x4f2e, 1, 2.5*N},
 	{"llc misses", 0x412e, 1, 1*N},
 	{"branches", 0x00c4, 1*N, 1.1*N},
 	{"branch misses", 0x00c5, 1, 0.1*N},
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: Dapeng Mi

Remove the redundant data_cfg_match calculation.

Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
Signed-off-by: Sean Christopherson
---
 x86/pmu_pebs.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/x86/pmu_pebs.c b/x86/pmu_pebs.c
index 6e73fc34..2848cc1e 100644
--- a/x86/pmu_pebs.c
+++ b/x86/pmu_pebs.c
@@ -296,7 +296,6 @@ static void check_pebs_records(u64 bitmask, u64 pebs_data_cfg, bool use_adaptive
 		pebs_record_size = pebs_rec->format_size >> RECORD_SIZE_OFFSET;
 		pebs_idx_match = pebs_rec->applicable_counters & bitmask;
 		pebs_size_match = pebs_record_size == get_pebs_record_size(pebs_data_cfg, use_adaptive);
-		data_cfg_match = (pebs_rec->format_size & GENMASK_ULL(47, 0)) == pebs_data_cfg;
 		data_cfg_match = (pebs_rec->format_size & GENMASK_ULL(47, 0)) ==
 				 (use_adaptive ? pebs_data_cfg : 0);
 		expected = pebs_idx_match && pebs_size_match && data_cfg_match;
-- 
2.52.0.rc2.455.g230fcf2819-goog


From: Dapeng Mi

Timed PEBS is introduced on Intel GNR/SRF platforms. Timed PEBS adds a
new "retired latency" field to the basic info group to report timing
information, and IA32_PERF_CAPABILITIES.PEBS_TIMING_INFO[bit 17]
indicates whether timed PEBS is supported.

With timed PEBS, the PEBS record format field shrinks to bits[31:0] and
bits[47:32] are used to record the retired latency. Shrink the record
format mask to bits[31:0] accordingly so that the retired latency field
is not treated as part of the record format in the comparison, which
would otherwise cause failures on GNR/SRF.

Detailed information about timed PEBS can be found in section 8.4.1
"Timed Processor Event Based Sampling" of "Intel Architecture
Instruction Set Extensions and Future Features".
Reviewed-by: Kan Liang
Signed-off-by: Dapeng Mi
Tested-by: Yi Lai
Signed-off-by: Sean Christopherson
---
 lib/x86/pmu.h  | 6 ++++++
 x86/pmu_pebs.c | 8 +++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/lib/x86/pmu.h b/lib/x86/pmu.h
index e84b37dc..cd6091af 100644
--- a/lib/x86/pmu.h
+++ b/lib/x86/pmu.h
@@ -20,6 +20,7 @@
 #define PMU_CAP_LBR_FMT		0x3f
 #define PMU_CAP_FW_WRITES	(1ULL << 13)
 #define PMU_CAP_PEBS_BASELINE	(1ULL << 14)
+#define PMU_CAP_PEBS_TIMING_INFO	(1ULL << 17)
 #define PERF_CAP_PEBS_FORMAT	0xf00
 
 #define EVNSEL_EVENT_SHIFT	0
@@ -193,4 +194,9 @@ static inline bool pmu_has_pebs_baseline(void)
 	return pmu.perf_cap & PMU_CAP_PEBS_BASELINE;
 }
 
+static inline bool pmu_has_pebs_timing_info(void)
+{
+	return pmu.perf_cap & PMU_CAP_PEBS_TIMING_INFO;
+}
+
 #endif /* _X86_PMU_H_ */
diff --git a/x86/pmu_pebs.c b/x86/pmu_pebs.c
index 2848cc1e..bc37e8e3 100644
--- a/x86/pmu_pebs.c
+++ b/x86/pmu_pebs.c
@@ -277,6 +277,7 @@ static void check_pebs_records(u64 bitmask, u64 pebs_data_cfg, bool use_adaptive
 	unsigned int count = 0;
 	bool expected, pebs_idx_match, pebs_size_match, data_cfg_match;
 	void *cur_record;
+	u64 format_mask;
 
 	expected = (ds->pebs_index == ds->pebs_buffer_base) && !pebs_rec->format_size;
 	if (!(rdmsr(MSR_CORE_PERF_GLOBAL_STATUS) & GLOBAL_STATUS_BUFFER_OVF)) {
@@ -289,6 +290,8 @@ static void check_pebs_records(u64 bitmask, u64 pebs_data_cfg, bool use_adaptive
 		return;
 	}
 
+	/* Record format shrinks to bits[31:0] after timed PEBS is introduced. */
+	format_mask = pmu_has_pebs_timing_info() ? GENMASK_ULL(31, 0) : GENMASK_ULL(47, 0);
 	expected = ds->pebs_index >= ds->pebs_interrupt_threshold;
 	cur_record = (void *)pebs_buffer;
 	do {
@@ -296,8 +299,7 @@ static void check_pebs_records(u64 bitmask, u64 pebs_data_cfg, bool use_adaptive
 		pebs_record_size = pebs_rec->format_size >> RECORD_SIZE_OFFSET;
 		pebs_idx_match = pebs_rec->applicable_counters & bitmask;
 		pebs_size_match = pebs_record_size == get_pebs_record_size(pebs_data_cfg, use_adaptive);
-		data_cfg_match = (pebs_rec->format_size & GENMASK_ULL(47, 0)) ==
-				 (use_adaptive ? pebs_data_cfg : 0);
+		data_cfg_match = (pebs_rec->format_size & format_mask) == (use_adaptive ? pebs_data_cfg : 0);
 		expected = pebs_idx_match && pebs_size_match && data_cfg_match;
 		report(expected,
 		       "PEBS record (written seq %d) is verified (including size, counters and cfg).", count);
@@ -327,7 +329,7 @@ static void check_pebs_records(u64 bitmask, u64 pebs_data_cfg, bool use_adaptive
 				pebs_record_size, get_pebs_record_size(pebs_data_cfg, use_adaptive));
 		if (!data_cfg_match)
 			printf("FAIL: The pebs_data_cfg (0x%lx) doesn't match with the effective MSR_PEBS_DATA_CFG (0x%lx).\n",
-			       pebs_rec->format_size & 0xffffffffffff, use_adaptive ? pebs_data_cfg : 0);
+			       pebs_rec->format_size & format_mask, use_adaptive ? pebs_data_cfg : 0);
 		}
 	}
-- 
2.52.0.rc2.455.g230fcf2819-goog
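
For anyone who wants to sanity-check the CPUID family/model decoding that
pmu_detect_intel_overcount_errata() relies on without building the tests,
below is a minimal standalone sketch in plain C. It is not part of the
series; the raw signature value and the main() wrapper are assumed purely
for illustration, while the decode rules mirror the x86_family()/x86_model()
helpers added in the first patch.

/*
 * Illustrative only: decode a raw CPUID.1:EAX signature the same way the
 * x86_family()/x86_model() helpers in the series do, then check it against
 * the affected-model list used by pmu_detect_intel_overcount_errata().
 * The signature value below is an assumed example, not taken from the patches.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t x86_family(uint32_t sig)
{
	uint32_t x86 = (sig >> 8) & 0xf;

	if (x86 == 0xf)
		x86 += (sig >> 20) & 0xff;	/* extended family only when base family == 0xf */

	return x86;
}

static uint32_t x86_model(uint32_t sig)
{
	uint32_t model = (sig >> 4) & 0xf;

	if (x86_family(sig) >= 0x6)
		model += ((sig >> 16) & 0xf) << 4;	/* extended model for family >= 6 */

	return model;
}

int main(void)
{
	/* Assumed example: family 6, ext. model 0xA, model 0xF => model 0xAF (Sierra Forest). */
	uint32_t sig = 0x000a06f1;
	bool insns = false, branches = false;

	if (x86_family(sig) == 0x6) {
		switch (x86_model(sig)) {
		case 0xDD:					/* Clearwater Forest */
			insns = true;
			break;
		case 0xAF: case 0x4D: case 0x5F: case 0x86:	/* SRF, Avoton/Rangeley, Denverton, Jacobsville */
			insns = true;
			branches = true;
			break;
		}
	}

	printf("family 0x%x, model 0x%x: insn overcount %d, branch overcount %d\n",
	       x86_family(sig), x86_model(sig), insns, branches);
	return 0;
}

Running the sketch prints family 0x6, model 0xaf with both overcount flags
set, which matches the SRF7 erratum behavior described in the first patch.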