We can currently optimize uprobes on top of nop5 instructions, so application can define USDT_NOP to nop5 and use USDT macro to define optimized usdt probes. This works fine on new kernels, but could have performance penalty on older kernels, that do not have the support to optimize and to emulate nop5 instruction. execution of the usdt probe on top of nop: - nop -> trigger usdt -> emulate nop -> continue execution of the usdt probe on top of nop5: - nop5 -> trigger usdt -> single step nop5 -> continue Note the 'single step nop5' as the source of performance regression. To workaround that we change the USDT macro to emit nop,nop5 for the probe (instead of default nop) and make record of that in USDT record (more on that below). This can be detected by application (libbpf) and it can place the uprobe either on nop or nop5 based on the optimization support in the kernel. We make record of using the nop,nop5 instructions in the USDT ELF note data. Current elf note format is as follows: namesz (4B) | descsz (4B) | type (4B) | name | desc And current usdt record (with "stapsdt" name) placed in the note's desc data look like: loc_addr | 8 bytes base_addr | 8 bytes sema_addr | 8 bytes provider | zero terminated string name | zero terminated string args | zero terminated string None of the tested parsers (bpftrace-bcc, libbpf) checked that the args zero terminated byte is the actual end of the 'desc' data. As Andrii suggested we could use this and place extra zero byte right there as an indication for the parser we use the nop,nop5 instructions. It's bit tricky, but the other way would be to introduce new elf note type or note name and change all existing parsers to recognize it. With the change above the existing parsers would still recognize such usdt probes. Note we do not emit this extra byte if app defined its own nop through USDT_NOP macro. Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa --- tools/testing/selftests/bpf/usdt.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/testing/selftests/bpf/usdt.h b/tools/testing/selftests/bpf/usdt.h index 549d1f774810..57fa2902136c 100644 --- a/tools/testing/selftests/bpf/usdt.h +++ b/tools/testing/selftests/bpf/usdt.h @@ -312,9 +312,16 @@ struct usdt_sema { volatile unsigned short active; }; #ifndef USDT_NOP #if defined(__ia64__) || defined(__s390__) || defined(__s390x__) #define USDT_NOP nop 0 +#elif defined(__x86_64__) +#define USDT_NOP .byte 0x90, 0x0f, 0x1f, 0x44, 0x00, 0x0 /* nop, nop5 */ #else #define USDT_NOP nop #endif +#else +/* + * User define its own nop instruction, do not emit extra note data. + */ +#define __usdt_asm_extra #endif /* USDT_NOP */ /* @@ -403,6 +410,15 @@ struct usdt_sema { volatile unsigned short active; }; __asm__ __volatile__ ("" :: "m" (sema)); #endif +#ifndef __usdt_asm_extra +#ifdef __x86_64__ +#define __usdt_asm_extra \ + __usdt_asm1( .ascii "\0") +#else +#define __usdt_asm_extra +#endif +#endif + /* main USDT definition (nop and .note.stapsdt metadata) */ #define __usdt_probe(group, name, sema_def, sema, ...) do { \ sema_def(sema) \ @@ -420,6 +436,7 @@ struct usdt_sema { volatile unsigned short active; }; __usdt_asm_strz(name) \ __usdt_asm_args(__VA_ARGS__) \ __usdt_asm1( .ascii "\0") \ + __usdt_asm_extra \ __usdt_asm1(994: .balign 4) \ __usdt_asm1( .popsection) \ __usdt_asm1(.ifndef _.stapsdt.base) \ -- 2.51.1 Adding uprobe syscall feature detection that will be used in following changes. Signed-off-by: Jiri Olsa --- tools/lib/bpf/features.c | 22 ++++++++++++++++++++++ tools/lib/bpf/libbpf_internal.h | 2 ++ 2 files changed, 24 insertions(+) diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c index b842b83e2480..587571c21d2d 100644 --- a/tools/lib/bpf/features.c +++ b/tools/lib/bpf/features.c @@ -506,6 +506,25 @@ static int probe_kern_arg_ctx_tag(int token_fd) return probe_fd(prog_fd); } +#ifdef __x86_64__ +#ifndef __NR_uprobe +#define __NR_uprobe 336 +#endif +static int probe_uprobe_syscall(int token_fd) +{ + /* + * When not executed from executed kernel provided trampoline, + * the uprobe syscall returns ENXIO error. + */ + return syscall(__NR_uprobe) == -1 && errno == ENXIO; +} +#else +static int probe_uprobe_syscall(int token_fd) +{ + return 0; +} +#endif + typedef int (*feature_probe_fn)(int /* token_fd */); static struct kern_feature_cache feature_cache; @@ -581,6 +600,9 @@ static struct kern_feature_desc { [FEAT_BTF_QMARK_DATASEC] = { "BTF DATASEC names starting from '?'", probe_kern_btf_qmark_datasec, }, + [FEAT_UPROBE_SYSCALL] = { + "Kernel supports uprobe syscall", probe_uprobe_syscall, + }, }; bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id) diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index fc59b21b51b5..69aa61c038a9 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -392,6 +392,8 @@ enum kern_feature_id { FEAT_ARG_CTX_TAG, /* Kernel supports '?' at the front of datasec names */ FEAT_BTF_QMARK_DATASEC, + /* Kernel supports uprobe syscall */ + FEAT_UPROBE_SYSCALL, __FEAT_CNT, }; -- 2.51.1 Adding support to parse extra info in usdt note record that indicates there's nop,nop5 emitted for probe. We detect this by checking extra zero byte placed in between args zero termination byte and desc data end. Please see [1] for more details. Together with uprobe syscall feature detection we can decide if we want to place the probe on top of nop or nop5. [1] https://github.com/libbpf/usdt Signed-off-by: Jiri Olsa --- tools/lib/bpf/usdt.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index c174b4086673..5730295e69d3 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -241,6 +241,7 @@ struct usdt_note { long loc_addr; long base_addr; long sema_addr; + bool nop_combo; }; struct usdt_target { @@ -262,6 +263,7 @@ struct usdt_manager { bool has_bpf_cookie; bool has_sema_refcnt; bool has_uprobe_multi; + bool has_uprobe_syscall; }; struct usdt_manager *usdt_manager_new(struct bpf_object *obj) @@ -301,6 +303,11 @@ struct usdt_manager *usdt_manager_new(struct bpf_object *obj) * usdt probes. */ man->has_uprobe_multi = kernel_supports(obj, FEAT_UPROBE_MULTI_LINK); + + /* + * Detect kernel support for uprobe syscall to be used to pick usdt attach point. + */ + man->has_uprobe_syscall = kernel_supports(obj, FEAT_UPROBE_SYSCALL); return man; } @@ -784,6 +791,15 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * target = &targets[target_cnt]; memset(target, 0, sizeof(*target)); + /* + * We have usdt with nop,nop5 instruction and we detected uprobe syscall, + * so we can place the uprobe directly on nop5 (+1) to get it optimized. + */ + if (note.nop_combo && man->has_uprobe_syscall) { + usdt_abs_ip++; + usdt_rel_ip++; + } + target->abs_ip = usdt_abs_ip; target->rel_ip = usdt_rel_ip; target->sema_off = usdt_sema_off; @@ -1144,7 +1160,7 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct static int parse_usdt_note(GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, struct usdt_note *note) { - const char *provider, *name, *args; + const char *provider, *name, *args, *end, *extra; long addrs[3]; size_t len; @@ -1182,6 +1198,15 @@ static int parse_usdt_note(GElf_Nhdr *nhdr, const char *data, size_t name_off, s if (args >= data + len) /* missing arguments spec */ return -EINVAL; + extra = memchr(args, '\0', data + len - args); + if (!extra) /* non-zero-terminated args */ + return -EINVAL; + ++extra; + end = data + len; + + /* check if we have one extra byte and if it's zero */ + note->nop_combo = (extra + 1) == end && *extra == 0; + note->provider = provider; note->name = name; if (*args == '\0' || *args == ':') -- 2.51.1 Adding test that attaches bpf program on usdt probe in 2 scenarios; - attach program on top of usdt_1 which is standard nop probe incidentally followed by nop5. The usdt probe does not have extra data in elf note record, so we expect the probe to land on the first nop without being optimized. - attach program on top of usdt_2 which is probe defined on top of nop,nop5 combo. The extra data in the elf note record and presence of upeobe syscall ensures that the probe is placed on top of nop5 and optimized. Signed-off-by: Jiri Olsa --- tools/testing/selftests/bpf/.gitignore | 2 + tools/testing/selftests/bpf/Makefile | 7 +- tools/testing/selftests/bpf/prog_tests/usdt.c | 82 +++++++++++++++++++ tools/testing/selftests/bpf/progs/test_usdt.c | 9 ++ tools/testing/selftests/bpf/usdt_1.c | 14 ++++ tools/testing/selftests/bpf/usdt_2.c | 13 +++ 6 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/usdt_1.c create mode 100644 tools/testing/selftests/bpf/usdt_2.c diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index be1ee7ba7ce0..89f480729a6b 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -45,3 +45,5 @@ xdp_synproxy xdp_hw_metadata xdp_features verification_cert.h +usdt_1 +usdt_2 diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 34ea23c63bd5..4a21657e45f7 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -733,6 +733,10 @@ $(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP) $(VERIFY_SIG_HDR): $(VERIFICATION_CERT) $(Q)xxd -i -n test_progs_verification_cert $< > $@ +ifeq ($(SRCARCH),$(filter $(SRCARCH),x86)) +USDTX := usdt_1.c usdt_2.c +endif + # Define test_progs test runner. TRUNNER_TESTS_DIR := prog_tests TRUNNER_BPF_PROGS_DIR := progs @@ -754,7 +758,8 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ json_writer.c \ $(VERIFY_SIG_HDR) \ flow_dissector_load.h \ - ip_check_defrag_frags.h + ip_check_defrag_frags.h \ + $(USDTX) TRUNNER_LIB_SOURCES := find_bit.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(OUTPUT)/liburandom_read.so \ diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index f4be5269fa90..a8ca2920c009 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -247,6 +247,86 @@ static void subtest_basic_usdt(bool optimized) #undef TRIGGER } +#ifdef __x86_64 +extern void usdt_1(void); +extern void usdt_2(void); + +/* nop, nop5 */ +static unsigned char nop15[6] = { 0x90, 0x0f, 0x1f, 0x44, 0x00, 0x00 }; + +static void *find_nop15(void *fn) +{ + int i; + + for (i = 0; i < 10; i++) { + if (!memcmp(nop15, fn + i, 5)) + return fn + i; + } + return NULL; +} + +static void subtest_optimized_attach(void) +{ + struct test_usdt *skel; + __u8 *addr_1, *addr_2; + + addr_1 = find_nop15(usdt_1); + if (!ASSERT_OK_PTR(addr_1, "find_nop15")) + return; + + addr_2 = find_nop15(usdt_2); + if (!ASSERT_OK_PTR(addr_2, "find_nop15")) + return; + + skel = test_usdt__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_usdt__open_and_load")) + return; + + /* + * Attach program on top of usdt_1 which is standard nop probe + * incidentally followed by nop5. The usdt probe does not have + * extra data in elf note record, so we expect the probe to land + * on the first nop without being optimized. + */ + skel->links.usdt_executed = bpf_program__attach_usdt(skel->progs.usdt_executed, + 0 /*self*/, "/proc/self/exe", + "optimized_attach", "usdt_1", NULL); + if (!ASSERT_OK_PTR(skel->links.usdt_executed, "bpf_program__attach_usdt")) + goto cleanup; + + usdt_1(); + usdt_1(); + + /* nop is on addr_1 address */ + ASSERT_EQ(*addr_1, 0xcc, "int3"); + ASSERT_EQ(skel->bss->executed, 2, "executed"); + + bpf_link__destroy(skel->links.usdt_executed); + + /* + * Attach program on top of usdt_2 which is probe defined on top + * of nop,nop5 combo. The extra data in the elf note record and + * presence of uprobe syscall ensures that the probe is placed + * on top of nop5 and optimized. + */ + skel->links.usdt_executed = bpf_program__attach_usdt(skel->progs.usdt_executed, + 0 /*self*/, "/proc/self/exe", + "optimized_attach", "usdt_2", NULL); + if (!ASSERT_OK_PTR(skel->links.usdt_executed, "bpf_program__attach_usdt")) + goto cleanup; + + usdt_2(); + usdt_2(); + + /* nop5 is on addr_2 + 1 address */ + ASSERT_EQ(*(addr_2 + 1), 0xe8, "call"); + ASSERT_EQ(skel->bss->executed, 4, "executed"); + +cleanup: + test_usdt__destroy(skel); +} +#endif + unsigned short test_usdt_100_semaphore SEC(".probes"); unsigned short test_usdt_300_semaphore SEC(".probes"); unsigned short test_usdt_400_semaphore SEC(".probes"); @@ -516,6 +596,8 @@ void test_usdt(void) #ifdef __x86_64__ if (test__start_subtest("basic_optimized")) subtest_basic_usdt(true); + if (test__start_subtest("optimized_attach")) + subtest_optimized_attach(); #endif if (test__start_subtest("multispec")) subtest_multispec_usdt(); diff --git a/tools/testing/selftests/bpf/progs/test_usdt.c b/tools/testing/selftests/bpf/progs/test_usdt.c index a78c87537b07..6911868cdf67 100644 --- a/tools/testing/selftests/bpf/progs/test_usdt.c +++ b/tools/testing/selftests/bpf/progs/test_usdt.c @@ -138,4 +138,13 @@ int usdt_sib(struct pt_regs *ctx) return 0; } +int executed; + +SEC("usdt") +int usdt_executed(struct pt_regs *ctx) +{ + executed++; + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/usdt_1.c b/tools/testing/selftests/bpf/usdt_1.c new file mode 100644 index 000000000000..0e00702b1701 --- /dev/null +++ b/tools/testing/selftests/bpf/usdt_1.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Include usdt.h with defined USDT_NOP macro will switch + * off the extra info in usdt probe. + */ +#define USDT_NOP .byte 0x90, 0x0f, 0x1f, 0x44, 0x00, 0x00 +#include "usdt.h" + +__attribute__((aligned(16))) +void usdt_1(void) +{ + USDT(optimized_attach, usdt_1); +} diff --git a/tools/testing/selftests/bpf/usdt_2.c b/tools/testing/selftests/bpf/usdt_2.c new file mode 100644 index 000000000000..fcb7417a1953 --- /dev/null +++ b/tools/testing/selftests/bpf/usdt_2.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Include usdt.h with the extra info in usdt probe and + * nop/nop5 combo for usdt attach point. + */ +#include "usdt.h" + +__attribute__((aligned(16))) +void usdt_2(void) +{ + USDT(optimized_attach, usdt_2); +} -- 2.51.1