Add ARM64 JIT inlining for bpf_get_cpu_time_counter() and bpf_cpu_time_counter_to_ns() kfuncs. bpf_get_cpu_time_counter() is JIT-inlined as: ISB // serialize instruction stream MRS Xn, CNTVCT_EL0 // read architected timer counter The ISB before the MRS is required for ordering, matching the kernel's arch_timer_read_cntvct_el0() implementation. On CPUs that implement ECV (ARMv8.6+) it is instead JITed to: MRS Xn, CNTVCTSS_EL0 // self-synchronized (ISB not needed) bpf_cpu_time_counter_to_ns() is JIT-inlined using mult/shift constants computed at JIT time from the architected timer frequency (CNTFRQ_EL0): MOV Xtmp, #mult // load conversion multiplier MUL Xn, Xarg, Xtmp // delta_ticks * mult LSR Xn, Xn, #shift // >> shift = nanoseconds On systems with a 1GHz counter (e.g., Neoverse-V2), mult=1 and shift=0, so the conversion collapses to a single MOV (identity). Signed-off-by: Puranjay Mohan --- arch/arm64/include/asm/insn.h | 2 + arch/arm64/net/bpf_jit.h | 4 ++ arch/arm64/net/bpf_jit_comp.c | 54 +++++++++++++++++++ .../selftests/bpf/progs/verifier_cpu_cycles.c | 50 ++++++++++++++++- 4 files changed, 109 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index f463a654a2bb..bb235a39cef0 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -139,6 +139,8 @@ enum aarch64_insn_system_register { AARCH64_INSN_SYSREG_TPIDR_EL1 = 0x4684, AARCH64_INSN_SYSREG_TPIDR_EL2 = 0x6682, AARCH64_INSN_SYSREG_SP_EL0 = 0x4208, + AARCH64_INSN_SYSREG_CNTVCT_EL0 = 0x5F02, + AARCH64_INSN_SYSREG_CNTVCTSS_EL0 = 0x5F06, }; enum aarch64_insn_variant { diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index d13de4222cfb..a525387439fe 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -326,6 +326,10 @@ aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL2) #define A64_MRS_SP_EL0(Rt) \ aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_SP_EL0) +#define A64_MRS_CNTVCT_EL0(Rt) \ + aarch64_insn_gen_mrs(Rt, 
AARCH64_INSN_SYSREG_CNTVCT_EL0) +#define A64_MRS_CNTVCTSS_EL0(Rt) \ + aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_CNTVCTSS_EL0) /* Barriers */ #define A64_SB aarch64_insn_get_sb_value() diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 0816c40fc7af..7da7507ab431 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1571,10 +1573,54 @@ static int build_insn(const struct bpf_verifier_env *env, const struct bpf_insn case BPF_JMP | BPF_CALL: { const u8 r0 = bpf2a64[BPF_REG_0]; + const u8 r1 = bpf2a64[BPF_REG_1]; + const s32 imm = insn->imm; bool func_addr_fixed; u64 func_addr; u32 cpu_offset; + /* Inline kfunc bpf_get_cpu_time_counter() */ + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && + imm == BPF_CALL_IMM(bpf_get_cpu_time_counter) && + bpf_jit_inlines_kfunc_call(imm)) { + /* + * With ECV (ARMv8.6+), CNTVCTSS_EL0 is self- + * synchronizing — no ISB needed. Without ECV, + * an ISB is required before reading CNTVCT_EL0 + * to prevent speculative/out-of-order reads. + * + * Matches arch_timer_read_cntvct_el0(). 
+ */ + if (cpus_have_cap(ARM64_HAS_ECV)) { + emit(A64_MRS_CNTVCTSS_EL0(r0), ctx); + } else { + emit(A64_ISB, ctx); + emit(A64_MRS_CNTVCT_EL0(r0), ctx); + } + break; + } + + /* Inline kfunc bpf_cpu_time_counter_to_ns() */ + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && + imm == BPF_CALL_IMM(bpf_cpu_time_counter_to_ns) && + bpf_jit_inlines_kfunc_call(imm)) { + u32 freq = arch_timer_get_cntfrq(); + + if (freq == NSEC_PER_SEC) { + /* 1 GHz counter: 1 tick = 1 ns, identity */ + emit(A64_MOV(1, r0, r1), ctx); + } else { + u32 mult, shift; + + clocks_calc_mult_shift(&mult, &shift, freq, NSEC_PER_SEC, 3600); + emit_a64_mov_i(1, tmp, mult, ctx); + emit(A64_MUL(1, r0, r1, tmp), ctx); + if (shift) + emit(A64_LSR(1, r0, r0, shift), ctx); + } + break; + } + /* Implement helper call to bpf_get_smp_processor_id() inline */ if (insn->src_reg == 0 && insn->imm == BPF_FUNC_get_smp_processor_id) { cpu_offset = offsetof(struct thread_info, cpu); @@ -3127,6 +3173,14 @@ bool bpf_jit_inlines_helper_call(s32 imm) } } +bool bpf_jit_inlines_kfunc_call(s32 imm) +{ + if (imm == BPF_CALL_IMM(bpf_get_cpu_time_counter) || + imm == BPF_CALL_IMM(bpf_cpu_time_counter_to_ns)) + return true; + return false; +} + void bpf_jit_free(struct bpf_prog *prog) { if (prog->jited) { diff --git a/tools/testing/selftests/bpf/progs/verifier_cpu_cycles.c b/tools/testing/selftests/bpf/progs/verifier_cpu_cycles.c index 26c02010ccf1..ab1b20e28084 100644 --- a/tools/testing/selftests/bpf/progs/verifier_cpu_cycles.c +++ b/tools/testing/selftests/bpf/progs/verifier_cpu_cycles.c @@ -56,7 +56,7 @@ __naked int bpf_rdtsc_jit_x86_64(void) SEC("syscall") __arch_arm64 __xlated("0: r1 = 42") -__xlated("1: r0 = r1") +__xlated("1: call kernel-function") __naked int bpf_cyc2ns_arm(void) { asm volatile( @@ -111,6 +111,54 @@ __naked int bpf_cyc2ns_jit_x86(void) ); } +SEC("syscall") +__arch_arm64 +__xlated("0: call kernel-function") +__naked int bpf_cntvct(void) +{ + asm volatile( + "call %[bpf_get_cpu_time_counter];" + "exit" + : + 
: __imm(bpf_get_cpu_time_counter) + : __clobber_all + ); +} + +SEC("syscall") +__arch_arm64 +/* + * With ECV: mrs x7, CNTVCTSS_EL0 + * Without ECV: isb; mrs x7, CNTVCT_EL0 + */ +__jited(" mrs x7, CNTVCT{{(SS_EL0|_EL0)}}") +__naked int bpf_cntvct_jit_arm64(void) +{ + asm volatile( + "call %[bpf_get_cpu_time_counter];" + "exit" + : + : __imm(bpf_get_cpu_time_counter) + : __clobber_all + ); +} + +SEC("syscall") +__arch_arm64 +/* bpf_cpu_time_counter_to_ns: mov (1GHz identity) or mul+lsr */ +__jited(" {{(mov x7, x0|mul x7, x0, x10)}}") +__naked int bpf_cyc2ns_jit_arm64(void) +{ + asm volatile( + "r1=0x2a;" + "call %[bpf_cpu_time_counter_to_ns];" + "exit" + : + : __imm(bpf_cpu_time_counter_to_ns) + : __clobber_all + ); +} + void rdtsc(void) { bpf_get_cpu_time_counter(); -- 2.52.0