Enable the bpf_get_branch_snapshot() BPF helper on ARM64 by implementing the perf_snapshot_branch_stack static call for ARM's Branch Record Buffer Extension (BRBE). The BPF helper bpf_get_branch_snapshot() allows BPF programs to capture hardware branch records on-demand. This was previously only available on x86 (Intel LBR, AMD BRS) but not on ARM64 despite BRBE being available since ARMv9. This implementation: - Follows the x86 snapshot pattern (intel_pmu_snapshot_branch_stack) - Performs atomic snapshot by pausing BRBE, reading records, and restoring previous state without disrupting ongoing perf events - Reads branch records directly from BRBE registers without event-specific filtering to minimize branch pollution - Handles all BRBE record types (complete, source-only, target-only) - Complies with ARM ARM synchronization requirements (ISB barriers per rule PPBZP) - Reuses existing BRBE infrastructure (select_brbe_bank, __read_brbe_regset, brbe_set_perf_entry_type, etc.) Signed-off-by: Puranjay Mohan --- This patch is only compile tested as I don't have access to hardware with BRBE. --- drivers/perf/arm_brbe.c | 95 ++++++++++++++++++++++++++++++++++++++++ drivers/perf/arm_brbe.h | 9 ++++ drivers/perf/arm_pmuv3.c | 5 ++- 3 files changed, 108 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm_brbe.c b/drivers/perf/arm_brbe.c index ba554e0c846c..cda7bf522c06 100644 --- a/drivers/perf/arm_brbe.c +++ b/drivers/perf/arm_brbe.c @@ -803,3 +803,98 @@ void brbe_read_filtered_entries(struct perf_branch_stack *branch_stack, done: branch_stack->nr = nr_filtered; } + +/* + * ARM-specific callback invoked through perf_snapshot_branch_stack static + * call, defined in include/linux/perf_event.h. See its definition for API + * details. It's up to caller to provide enough space in *entries* to fit all + * branch records, otherwise returned result will be truncated to *cnt* entries. + * + * This is similar to brbe_read_filtered_entries but optimized for snapshot mode: + * - No filtering based on event attributes (captures everything) + * - Minimal branches to avoid polluting the branch buffer + * - Direct register reads without event-specific processing + */ +int arm_brbe_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt) +{ + unsigned long flags; + int nr_hw, nr_banks, nr_copied = 0; + u64 brbidr, brbfcr, brbcr; + + /* + * The sequence of steps to freeze BRBE should be completely inlined + * and contain no branches to minimize contamination of branch snapshot. + */ + local_irq_save(flags); + + /* Save current BRBE configuration */ + brbfcr = read_sysreg_s(SYS_BRBFCR_EL1); + brbcr = read_sysreg_s(SYS_BRBCR_EL1); + + /* Pause BRBE to freeze the buffer */ + write_sysreg_s(brbfcr | BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1); + isb(); + + /* Read BRBIDR to determine number of records */ + brbidr = read_sysreg_s(SYS_BRBIDR0_EL1); + if (!valid_brbidr(brbidr)) + goto out_restore; + + nr_hw = FIELD_GET(BRBIDR0_EL1_NUMREC_MASK, brbidr); + nr_banks = DIV_ROUND_UP(nr_hw, BRBE_BANK_MAX_ENTRIES); + + /* Read branch records from BRBE banks */ + for (int bank = 0; bank < nr_banks; bank++) { + int nr_remaining = nr_hw - (bank * BRBE_BANK_MAX_ENTRIES); + int nr_this_bank = min(nr_remaining, BRBE_BANK_MAX_ENTRIES); + + select_brbe_bank(bank); + + for (int i = 0; i < nr_this_bank; i++) { + struct brbe_regset bregs; + struct perf_branch_entry *entry; + + if (nr_copied >= cnt) + goto out_restore; + + if (!__read_brbe_regset(&bregs, i)) + goto out_restore; + + entry = &entries[nr_copied]; + perf_clear_branch_entry_bitfields(entry); + + /* Simple conversion without filtering */ + if (brbe_record_is_complete(bregs.brbinf)) { + entry->from = bregs.brbsrc; + entry->to = bregs.brbtgt; + } else if (brbe_record_is_source_only(bregs.brbinf)) { + entry->from = bregs.brbsrc; + entry->to = 0; + } else if (brbe_record_is_target_only(bregs.brbinf)) { + entry->from = 0; + entry->to = bregs.brbtgt; + } + + brbe_set_perf_entry_type(entry, bregs.brbinf); + entry->cycles = brbinf_get_cycles(bregs.brbinf); + + if (!brbe_record_is_target_only(bregs.brbinf)) { + entry->mispred = brbinf_get_mispredict(bregs.brbinf); + entry->predicted = !entry->mispred; + } + + if (!brbe_record_is_source_only(bregs.brbinf)) + entry->priv = brbinf_get_perf_priv(bregs.brbinf); + + nr_copied++; + } + } + +out_restore: + /* Restore BRBE to its previous state */ + write_sysreg_s(brbcr, SYS_BRBCR_EL1); + isb(); + write_sysreg_s(brbfcr, SYS_BRBFCR_EL1); + local_irq_restore(flags); + return nr_copied; +} diff --git a/drivers/perf/arm_brbe.h b/drivers/perf/arm_brbe.h index b7c7d8796c86..c2a1824437fb 100644 --- a/drivers/perf/arm_brbe.h +++ b/drivers/perf/arm_brbe.h @@ -10,6 +10,7 @@ struct arm_pmu; struct perf_branch_stack; struct perf_event; +struct perf_branch_entry; #ifdef CONFIG_ARM64_BRBE void brbe_probe(struct arm_pmu *arm_pmu); @@ -22,6 +23,8 @@ void brbe_disable(void); bool brbe_branch_attr_valid(struct perf_event *event); void brbe_read_filtered_entries(struct perf_branch_stack *branch_stack, const struct perf_event *event); +int arm_brbe_snapshot_branch_stack(struct perf_branch_entry *entries, + unsigned int cnt); #else static inline void brbe_probe(struct arm_pmu *arm_pmu) { } static inline unsigned int brbe_num_branch_records(const struct arm_pmu *armpmu) @@ -44,4 +47,10 @@ static void brbe_read_filtered_entries(struct perf_branch_stack *branch_stack, const struct perf_event *event) { } + +static inline int arm_brbe_snapshot_branch_stack(struct perf_branch_entry *entries, + unsigned int cnt) +{ + return 0; +} #endif diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 8014ff766cff..1a9f129a0f94 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -1449,8 +1449,11 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name, cpu_pmu->set_event_filter = armv8pmu_set_event_filter; cpu_pmu->pmu.event_idx = armv8pmu_user_event_idx; - if (brbe_num_branch_records(cpu_pmu)) + if (brbe_num_branch_records(cpu_pmu)) { cpu_pmu->pmu.sched_task = armv8pmu_sched_task; + static_call_update(perf_snapshot_branch_stack, + arm_brbe_snapshot_branch_stack); + } cpu_pmu->name = name; cpu_pmu->map_event = map_event; base-commit: c286e7e9d1f1f3d90ad11c37e896f582b02d19c4 -- 2.47.3