This patch extends the LPM trie benchmark patch [0] with some adjustments [1]. [0] https://lore.kernel.org/all/20250722150152.1158205-1-matt@readmodwrite.com/ [1] https://github.com/xdp-project/xdp-project/blob/main/areas/bench/patches/bench-lpm-trie-V3-adjusted.patch The 'noop' bench measures the overhead of the harness, i.e. the bpf_prog_test_run that calls bpf_loop with 10000 (NR_LOOPS) iterations in the lpm_producer loop. CPU: AMD EPYC 9684X sudo ./bench lpm-trie-noop --nr_entries=1 --producers=1 --affinity Setting up benchmark 'lpm-trie-noop'... Benchmark 'lpm-trie-noop' started. Iter 0 ( 42.501us): hits 74.567M/s ( 74.567M/prod) Iter 1 ( -5.155us): hits 74.630M/s ( 74.630M/prod) Iter 2 ( 0.123us): hits 74.620M/s ( 74.620M/prod) Iter 3 ( -7.127us): hits 74.611M/s ( 74.611M/prod) Iter 4 ( 7.334us): hits 74.609M/s ( 74.609M/prod) Iter 5 ( 0.163us): hits 74.620M/s ( 74.620M/prod) Iter 6 ( 0.213us): hits 74.610M/s ( 74.610M/prod) Summary: throughput 74.617 ± 0.008 M ops/s ( 74.617M ops/prod), latency 13.402 ns/op The 'baseline' bench measures the overhead of getting a random number and taking the modulo, which can be used as a baseline comparison against lpm-trie-lookup and lpm-trie-update. sudo ./bench lpm-trie-baseline --nr_entries=1 --producers=1 --affinity Setting up benchmark 'lpm-trie-baseline'... Benchmark 'lpm-trie-baseline' started. Iter 0 ( 44.996us): hits 36.308M/s ( 36.308M/prod) Iter 1 ( -1.535us): hits 36.330M/s ( 36.330M/prod) Iter 2 ( -3.919us): hits 36.310M/s ( 36.310M/prod) Iter 3 ( -1.004us): hits 36.330M/s ( 36.330M/prod) Iter 4 ( -1.476us): hits 36.320M/s ( 36.320M/prod) Iter 5 ( 0.468us): hits 36.330M/s ( 36.330M/prod) Iter 6 ( -0.304us): hits 36.330M/s ( 36.330M/prod) Summary: throughput 36.325 ± 0.008 M ops/s ( 36.325M ops/prod), latency 27.529 ns/op Thus, the overhead of bpf_get_prandom_u32() (including the modulo) is 27.529 - 13.402 = 14.1 nanoseconds per operation. 
Signed-off-by: Jesper Dangaard Brouer --- tools/testing/selftests/bpf/bench.c | 4 ++ .../selftests/bpf/benchs/bench_lpm_trie_map.c | 40 +++++++++++++++++++- tools/testing/selftests/bpf/progs/lpm_trie_bench.c | 31 ++++++++++++++-- 3 files changed, 70 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index fd15f60fd5a8..8a41aec89479 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -560,6 +560,8 @@ extern const struct bench bench_htab_mem; extern const struct bench bench_crypto_encrypt; extern const struct bench bench_crypto_decrypt; extern const struct bench bench_sockmap; +extern const struct bench bench_lpm_trie_noop; +extern const struct bench bench_lpm_trie_baseline; extern const struct bench bench_lpm_trie_lookup; extern const struct bench bench_lpm_trie_update; extern const struct bench bench_lpm_trie_delete; @@ -631,6 +633,8 @@ static const struct bench *benchs[] = { &bench_crypto_encrypt, &bench_crypto_decrypt, &bench_sockmap, + &bench_lpm_trie_noop, + &bench_lpm_trie_baseline, &bench_lpm_trie_lookup, &bench_lpm_trie_update, &bench_lpm_trie_delete, diff --git a/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c b/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c index 32a46c2402ea..4e0f12e359ba 100644 --- a/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c +++ b/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c @@ -87,7 +87,7 @@ static void __lpm_validate(void) }; } -enum { OP_LOOKUP = 1, OP_UPDATE, OP_DELETE, OP_FREE }; +enum { OP_NOOP=0, OP_BASELINE, OP_LOOKUP, OP_UPDATE, OP_DELETE, OP_FREE }; static void lpm_delete_validate(void) { @@ -175,6 +175,18 @@ static void lpm_setup(void) fill_map(fd); } +static void lpm_noop_setup(void) +{ + __lpm_setup(); + ctx.bench->bss->op = OP_NOOP; +} + +static void lpm_baseline_setup(void) +{ + __lpm_setup(); + ctx.bench->bss->op = OP_BASELINE; +} + static void lpm_lookup_setup(void) { 
lpm_setup(); @@ -208,7 +220,7 @@ static void lpm_measure(struct bench_res *res) res->duration_ns = atomic_swap(&ctx.bench->bss->duration_ns, 0); } -/* For LOOKUP, UPDATE, and DELETE */ +/* For NOOP, BASELINE, LOOKUP, UPDATE, and DELETE */ static void *lpm_producer(void *unused __always_unused) { int err; @@ -310,6 +322,30 @@ static void free_ops_report_final(struct bench_res res[], int res_cnt) latency / lat_divisor / env.producer_cnt, unit); } +/* noop bench measures harness-overhead */ +const struct bench bench_lpm_trie_noop = { + .name = "lpm-trie-noop", + .argp = &bench_lpm_trie_map_argp, + .validate = __lpm_validate, + .setup = lpm_noop_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = ops_report_progress, + .report_final = ops_report_final, +}; + +/* baseline overhead for lookup and update */ +const struct bench bench_lpm_trie_baseline = { + .name = "lpm-trie-baseline", + .argp = &bench_lpm_trie_map_argp, + .validate = __lpm_validate, + .setup = lpm_baseline_setup, + .producer_thread = lpm_producer, + .measure = lpm_measure, + .report_progress = ops_report_progress, + .report_final = ops_report_final, +}; + const struct bench bench_lpm_trie_lookup = { .name = "lpm-trie-lookup", .argp = &bench_lpm_trie_map_argp, diff --git a/tools/testing/selftests/bpf/progs/lpm_trie_bench.c b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c index 522e1cbef490..e4a5cecd6560 100644 --- a/tools/testing/selftests/bpf/progs/lpm_trie_bench.c +++ b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c @@ -6,6 +6,7 @@ #include #include #include "bpf_misc.h" +#include "bpf_atomic.h" #define BPF_OBJ_NAME_LEN 16U #define MAX_ENTRIES 100000000 @@ -84,12 +85,30 @@ int BPF_PROG(trie_free_exit, struct work_struct *work) return 0; } -static void gen_random_key(struct trie_key *key) +static __always_inline void gen_random_key(struct trie_key *key) { key->prefixlen = prefixlen; key->data = bpf_get_prandom_u32() % nr_entries; } +static int noop(__u32 index, 
__u32 *unused) +{ + return 0; +} + +static int baseline(__u32 index, __u32 *unused) +{ + struct trie_key key; + __s64 blackbox; + + gen_random_key(&key); + /* Avoid compiler optimizing out the modulo */ + barrier_var(blackbox); + blackbox = READ_ONCE(key.data); + + return 0; +} + static int lookup(__u32 index, __u32 *unused) { struct trie_key key; @@ -148,13 +167,19 @@ int BPF_PROG(run_bench) start = bpf_ktime_get_ns(); switch (op) { + case 0: + loops = bpf_loop(NR_LOOPS, noop, NULL, 0); + break; case 1: - loops = bpf_loop(NR_LOOPS, lookup, NULL, 0); + loops = bpf_loop(NR_LOOPS, baseline, NULL, 0); break; case 2: - loops = bpf_loop(NR_LOOPS, update, NULL, 0); + loops = bpf_loop(NR_LOOPS, lookup, NULL, 0); break; case 3: + loops = bpf_loop(NR_LOOPS, update, NULL, 0); + break; + case 4: loops = bpf_loop(NR_LOOPS, delete, &need_refill, 0); break; default: