Update the softlockup detection logic to detect stalls caused by BPF
programs. When a softlockup is detected, bpf_die is queued on a
workqueue on a CPU. With this implementation, the termination handler
is only triggered when CONFIG_SOFTLOCKUP_DETECTOR is enabled. Inside
bpf_die, we perform the text_poke to stub out helpers/kfuncs. The
current implementation handles termination of long-running bpf_loop
iterators in both the inlined and non-inlined cases. The limitation of
this implementation is that the termination handler needs at least one
CPU to run.

Signed-off-by: Raj Sahu
Signed-off-by: Siddharth Chintamaneni
---
 arch/x86/net/bpf_jit_comp.c | 132 ++++++++++++++++++++++++++++++++++++
 include/linux/bpf.h         |   2 +
 include/linux/filter.h      |   6 ++
 kernel/bpf/core.c           |  35 +++++++++-
 kernel/watchdog.c           |   8 +++
 5 files changed, 182 insertions(+), 1 deletion(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 107a44729675..4de9a8cdc465 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2606,6 +2606,10 @@ st:			if (is_imm8(insn->off))
 				if (arena_vm_start)
 					pop_r12(&prog);
 			}
+			/* emitting 5 byte nop for non-inline bpf_loop callback */
+			if (bpf_is_subprog(bpf_prog) && bpf_prog->aux->is_bpf_loop_cb_non_inline) {
+				emit_nops(&prog, X86_PATCH_SIZE);
+			}
 			EMIT1(0xC9);         /* leave */
 			emit_return(&prog, image + addrs[i - 1] + (prog - temp));
 			break;
@@ -3833,6 +3837,8 @@ bool bpf_jit_supports_private_stack(void)
 	return true;
 }
 
+
+
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
 {
 #if defined(CONFIG_UNWINDER_ORC)
@@ -3849,6 +3855,132 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
 #endif
 }
 
+void in_place_patch_bpf_prog(struct bpf_prog *prog)
+{
+	struct call_aux_states *call_states;
+	unsigned long new_target;
+	unsigned char *addr;
+	u8 ret_jmp_size = 1;
+	if (cpu_wants_rethunk()) {
+		ret_jmp_size = 5;
+	}
+	call_states = prog->term_states->patch_call_sites->call_states;
+	for (int i = 0; i < prog->term_states->patch_call_sites->call_sites_cnt; i++) {
+
+		new_target = (unsigned long) bpf_termination_null_func;
+		if (call_states[i].is_bpf_loop_cb_inline) {
+			new_target = (unsigned long) bpf_loop_term_callback;
+		}
+		char new_insn[5];
+
+		addr = (unsigned char *)prog->bpf_func + call_states[i].jit_call_idx;
+
+		unsigned long new_rel = (unsigned long)(new_target - (unsigned long)(addr + 5));
+		new_insn[0] = 0xE8;
+		new_insn[1] = (new_rel >> 0) & 0xFF;
+		new_insn[2] = (new_rel >> 8) & 0xFF;
+		new_insn[3] = (new_rel >> 16) & 0xFF;
+		new_insn[4] = (new_rel >> 24) & 0xFF;
+
+		smp_text_poke_batch_add(addr, new_insn, 5 /* call instruction len */, NULL);
+	}
+
+	if (prog->aux->is_bpf_loop_cb_non_inline) {
+
+		char new_insn[5] = { 0xB8, 0x01, 0x00, 0x00, 0x00 };
+		char old_insn[5] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 };
+		smp_text_poke_batch_add(prog->bpf_func + prog->jited_len -
+				(1 + ret_jmp_size) /* leave, jmp/ ret */ - 5 /* nop size */, new_insn, 5 /* mov eax, 1 */, old_insn);
+	}
+
+
+	/* flush all text poke calls */
+	smp_text_poke_batch_finish();
+}
+
+void bpf_die(struct bpf_prog *prog)
+{
+	u8 ret_jmp_size = 1;
+	if (cpu_wants_rethunk()) {
+		ret_jmp_size = 5;
+	}
+
+	/*
+	 * Replacing 5 byte nop in prologue with jmp instruction to ret
+	 */
+	unsigned long jmp_offset = prog->jited_len - (4 /* First endbr is 4 bytes */ +
+				5 /* noop is 5 bytes */ +
+				ret_jmp_size /* 5 bytes of jmp return_thunk or 1 byte ret*/);
+
+	char new_insn[5];
+	new_insn[0] = 0xE9;
+	new_insn[1] = (jmp_offset >> 0) & 0xFF;
+	new_insn[2] = (jmp_offset >> 8) & 0xFF;
+	new_insn[3] = (jmp_offset >> 16) & 0xFF;
+	new_insn[4] = (jmp_offset >> 24) & 0xFF;
+
+	smp_text_poke_batch_add(prog->bpf_func + 4, new_insn, 5, NULL);
+
+	if (prog->aux->func_cnt) {
+		for (int i = 0; i < prog->aux->func_cnt; i++) {
+			in_place_patch_bpf_prog(prog->aux->func[i]);
+		}
+	} else {
+		in_place_patch_bpf_prog(prog);
+	}
+
+}
+
+void bpf_prog_termination_deferred(struct work_struct *work)
+{
+	struct bpf_term_aux_states *term_states = container_of(work, struct bpf_term_aux_states,
+							work);
+	struct bpf_prog *prog = term_states->prog;
+
+	bpf_die(prog);
+}
+
+static struct workqueue_struct *bpf_termination_wq;
+
+void bpf_softlockup(u32 dur_s)
+{
+	unsigned long addr;
+	struct unwind_state state;
+	struct bpf_prog *prog;
+
+	for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state);
+	     unwind_next_frame(&state)) {
+		addr = unwind_get_return_address(&state);
+		if (!addr)
+			break;
+
+		if (!is_bpf_text_address(addr))
+			continue;
+
+		rcu_read_lock();
+		prog = bpf_prog_ksym_find(addr);
+		rcu_read_unlock();
+		if (bpf_is_subprog(prog))
+			continue;
+
+		if (atomic_cmpxchg(&prog->term_states->bpf_die_in_progress, 0, 1))
+			break;
+
+		bpf_termination_wq = alloc_workqueue("bpf_termination_wq", WQ_UNBOUND, 1);
+		if (!bpf_termination_wq)
+			pr_err("Failed to alloc workqueue for bpf termination.\n");
+
+		queue_work(bpf_termination_wq, &prog->term_states->work);
+
+		/* Currently nested programs are not terminated together.
+		 * Removing this break will result in BPF trampolines being
+		 * identified as is_bpf_text_address resulting in NULL ptr
+		 * deref in next step.
+		 */
+		break;
+	}
+}
+
 void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
 			       struct bpf_prog *new, struct bpf_prog *old)
 {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index caaee33744fc..03fce8f2c466 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -71,6 +71,7 @@ typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
 typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
 typedef unsigned int (*bpf_func_t)(const void *,
 				   const struct bpf_insn *);
+
 struct bpf_iter_seq_info {
 	const struct seq_operations *seq_ops;
 	bpf_iter_init_seq_priv_t init_seq_private;
@@ -1600,6 +1601,7 @@ struct bpf_term_patch_call_sites {
 struct bpf_term_aux_states {
 	struct bpf_prog *prog;
 	struct work_struct work;
+	atomic_t bpf_die_in_progress;
 	struct bpf_term_patch_call_sites *patch_call_sites;
 };
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9092d8ea95c8..4f0f8fe478bf 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1123,6 +1123,8 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);
 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
+void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx);
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 #define __bpf_call_base_args \
 	((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
@@ -1257,6 +1259,10 @@ bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);
 void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
 void bpf_prog_pack_free(void *ptr, u32 size);
 
+void bpf_softlockup(u32 dur_s);
+void bpf_prog_termination_deferred(struct work_struct *work);
+void bpf_die(struct bpf_prog *prog);
+void in_place_patch_bpf_prog(struct bpf_prog *prog);
 static inline bool
 bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
 {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 93442ab2acde..7b0552d15be3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -41,6 +41,7 @@
 #include
 #include
+#include
 #include
 
 /* Registers */
@@ -95,6 +96,37 @@ enum page_size_enum {
 	__PAGE_SIZE = PAGE_SIZE
 };
 
+void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	return NULL;
+}
+
+int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx)
+{
+	return 1;
+}
+
+
+void __weak in_place_patch_bpf_prog(struct bpf_prog *prog)
+{
+	return;
+}
+
+void __weak bpf_die(struct bpf_prog *prog)
+{
+	return;
+}
+
+void __weak bpf_prog_termination_deferred(struct work_struct *work)
+{
+	return;
+}
+
+void __weak bpf_softlockup(u32 dur_s)
+{
+	return;
+}
+
 struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
 {
 	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
@@ -134,11 +166,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 	fp->jit_requested = ebpf_jit_enabled();
 	fp->blinding_requested = bpf_jit_blinding_enabled(fp);
 	fp->term_states = term_states;
+	atomic_set(&fp->term_states->bpf_die_in_progress, 0);
 	fp->term_states->patch_call_sites = patch_call_sites;
 	fp->term_states->patch_call_sites->call_sites_cnt = 0;
 	fp->term_states->patch_call_sites->call_states = NULL;
 	fp->term_states->prog = fp;
-
+	INIT_WORK(&fp->term_states->work, bpf_prog_termination_deferred);
 #ifdef CONFIG_CGROUP_BPF
 	aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
 #endif
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 80b56c002c7f..59c91c18ca0e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -700,6 +701,13 @@ static int is_softlockup(unsigned long touch_ts,
 	if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
 		scx_softlockup(now - touch_ts);
 
+	/*
+	 * Long running BPF programs can cause CPUs to stall.
+	 * So trigger fast path termination to terminate such BPF programs.
+	 */
+	if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
+		bpf_softlockup(now - touch_ts);
+
 	/* Warn about unreasonable delays. */
 	if (time_after(now, period_ts + get_softlockup_thresh()))
 		return now - touch_ts;
-- 
2.43.0
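
Note for reviewers: below is a minimal, self-contained user-space sketch
(not kernel code) of the rel32 CALL encoding that in_place_patch_bpf_prog()
pokes over helper/kfunc call sites. The encode_call_rel32() helper and the
call_site/stub addresses are invented for the example.

/*
 * Standalone illustration only: encode a 5-byte x86-64 "CALL rel32",
 * mirroring the byte layout the patch writes over a helper call site.
 * The addresses used in main() are made up for the example.
 */
#include <stdint.h>
#include <stdio.h>

static void encode_call_rel32(uint8_t insn[5], uintptr_t call_site, uintptr_t new_target)
{
	/* The displacement is relative to the end of the 5-byte call. */
	uint32_t rel = (uint32_t)(new_target - (call_site + 5));

	insn[0] = 0xE8;			/* CALL rel32 opcode */
	insn[1] = (rel >> 0) & 0xFF;	/* least significant byte first */
	insn[2] = (rel >> 8) & 0xFF;
	insn[3] = (rel >> 16) & 0xFF;
	insn[4] = (rel >> 24) & 0xFF;
}

int main(void)
{
	uint8_t insn[5];
	uintptr_t call_site = 0x1000;	/* hypothetical jited call site */
	uintptr_t stub = 0x2000;	/* hypothetical termination stub */

	encode_call_rel32(insn, call_site, stub);
	for (int i = 0; i < 5; i++)
		printf("%02x ", insn[i]);
	printf("\n");			/* prints: e8 fb 0f 00 00 */
	return 0;
}

Compiled with a stock C compiler this prints "e8 fb 0f 00 00", the same
least-significant-byte-first layout the patch hands to
smp_text_poke_batch_add() when retargeting a call to the termination stubs.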