Introduce the definition of struct bpf_term_aux_states required to support fast-path termination of BPF programs. Add the memory allocation and free logic for the newly added term_states field in struct bpf_prog. Signed-off-by: Raj Sahu Signed-off-by: Siddharth Chintamaneni --- include/linux/bpf.h | 75 +++++++++++++++++++++++++++++---------------- kernel/bpf/core.c | 31 +++++++++++++++++++ 2 files changed, 79 insertions(+), 27 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8f6e87f0f3a8..caaee33744fc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1584,6 +1584,25 @@ struct bpf_stream_stage { int len; }; +struct call_aux_states { + int call_bpf_insn_idx; + int jit_call_idx; + u8 is_helper_kfunc; + u8 is_bpf_loop; + u8 is_bpf_loop_cb_inline; +}; + +struct bpf_term_patch_call_sites { + u32 call_sites_cnt; + struct call_aux_states *call_states; +}; + +struct bpf_term_aux_states { + struct bpf_prog *prog; + struct work_struct work; + struct bpf_term_patch_call_sites *patch_call_sites; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -1618,6 +1637,7 @@ struct bpf_prog_aux { bool tail_call_reachable; bool xdp_has_frags; bool exception_cb; + bool is_bpf_loop_cb_non_inline; bool exception_boundary; bool is_extended; /* true if extended by freplace program */ bool jits_use_priv_stack; @@ -1696,33 +1716,34 @@ struct bpf_prog_aux { }; struct bpf_prog { - u16 pages; /* Number of allocated pages */ - u16 jited:1, /* Is our filter JIT'ed? */ - jit_requested:1,/* archs need to JIT the prog */ - gpl_compatible:1, /* Is filter GPL compatible? */ - cb_access:1, /* Is control block accessed? */ - dst_needed:1, /* Do we need dst entry? */ - blinding_requested:1, /* needs constant blinding */ - blinded:1, /* Was blinded */ - is_func:1, /* program is a bpf function */ - kprobe_override:1, /* Do we override a kprobe? */ - has_callchain_buf:1, /* callchain buffer allocated? */ - enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ - call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ - call_get_func_ip:1, /* Do we call get_func_ip() */ - tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */ - sleepable:1; /* BPF program is sleepable */ - enum bpf_prog_type type; /* Type of BPF program */ - enum bpf_attach_type expected_attach_type; /* For some prog types */ - u32 len; /* Number of filter blocks */ - u32 jited_len; /* Size of jited insns in bytes */ - u8 tag[BPF_TAG_SIZE]; - struct bpf_prog_stats __percpu *stats; - int __percpu *active; - unsigned int (*bpf_func)(const void *ctx, - const struct bpf_insn *insn); - struct bpf_prog_aux *aux; /* Auxiliary fields */ - struct sock_fprog_kern *orig_prog; /* Original BPF program */ + u16 pages; /* Number of allocated pages */ + u16 jited:1, /* Is our filter JIT'ed? */ + jit_requested:1,/* archs need to JIT the prog */ + gpl_compatible:1, /* Is filter GPL compatible? */ + cb_access:1, /* Is control block accessed? */ + dst_needed:1, /* Do we need dst entry? */ + blinding_requested:1, /* needs constant blinding */ + blinded:1, /* Was blinded */ + is_func:1, /* program is a bpf function */ + kprobe_override:1, /* Do we override a kprobe? */ + has_callchain_buf:1, /* callchain buffer allocated?
*/ + enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ + call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ + call_get_func_ip:1, /* Do we call get_func_ip() */ + tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */ + sleepable:1; /* BPF program is sleepable */ + enum bpf_prog_type type; /* Type of BPF program */ + enum bpf_attach_type expected_attach_type; /* For some prog types */ + u32 len; /* Number of filter blocks */ + u32 jited_len; /* Size of jited insns in bytes */ + u8 tag[BPF_TAG_SIZE]; + struct bpf_prog_stats __percpu *stats; + int __percpu *active; + unsigned int (*bpf_func)(const void *ctx, + const struct bpf_insn *insn); + struct bpf_prog_aux *aux; /* Auxiliary fields */ + struct sock_fprog_kern *orig_prog; /* Original BPF program */ + struct bpf_term_aux_states *term_states; /* Instructions for interpreter */ union { DECLARE_FLEX_ARRAY(struct sock_filter, insns); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ef01cc644a96..740b5a3a6b55 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -100,6 +100,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog_aux *aux; struct bpf_prog *fp; + struct bpf_term_aux_states *term_states = NULL; + struct bpf_term_patch_call_sites *patch_call_sites = NULL; size = round_up(size, __PAGE_SIZE); fp = __vmalloc(size, gfp_flags); @@ -118,11 +120,24 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag return NULL; } + term_states = kzalloc(sizeof(*term_states), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); + if (!term_states) + goto free_alloc_percpu; + + patch_call_sites = kzalloc(sizeof(*patch_call_sites), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); + if (!patch_call_sites) + goto free_bpf_term_states; + fp->pages = size / PAGE_SIZE; fp->aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); + fp->term_states = term_states; + fp->term_states->patch_call_sites = patch_call_sites; + fp->term_states->patch_call_sites->call_sites_cnt = 0; + fp->term_states->prog = fp; + #ifdef CONFIG_CGROUP_BPF aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID; #endif @@ -140,6 +155,15 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag #endif return fp; + +free_bpf_term_states: + kfree(term_states); +free_alloc_percpu: + free_percpu(fp->active); + kfree(aux); + vfree(fp); + + return NULL; } struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) @@ -266,6 +290,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; + fp->term_states->prog = fp; /* We keep fp->aux from fp_old around in the new * reallocated structure. 
@@ -273,6 +298,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, fp_old->aux = NULL; fp_old->stats = NULL; fp_old->active = NULL; + fp_old->term_states = NULL; __bpf_prog_free(fp_old); } @@ -287,6 +313,11 @@ void __bpf_prog_free(struct bpf_prog *fp) kfree(fp->aux->poke_tab); kfree(fp->aux); } + if (fp->term_states) { + if (fp->term_states->patch_call_sites) + kfree(fp->term_states->patch_call_sites); + kfree(fp->term_states); + } free_percpu(fp->stats); free_percpu(fp->active); vfree(fp); -- 2.43.0 Create call-site tables and store the JIT indexes of RET_NULL calls so they can later be poked with dummy functions. In addition to the JIT indexes, metadata about helpers/kfuncs/loops is stored. Later this could be extended to the remaining potentially long-running iterator helpers/kfuncs. Signed-off-by: Raj Sahu Signed-off-by: Siddharth Chintamaneni --- arch/x86/net/bpf_jit_comp.c | 9 +++ include/linux/bpf_verifier.h | 1 + kernel/bpf/core.c | 5 +- kernel/bpf/verifier.c | 135 +++++++++++++++++++++++++++++++---- 4 files changed, 137 insertions(+), 13 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7e3fca164620..107a44729675 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3733,6 +3733,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } if (!image || !prog->is_func || extra_pass) { + + if (addrs) { + struct bpf_term_patch_call_sites *patch_call_sites = prog->term_states->patch_call_sites; + for (int i = 0; i < patch_call_sites->call_sites_cnt; i++) { + struct call_aux_states *call_states = patch_call_sites->call_states + i; + call_states->jit_call_idx = addrs[call_states->call_bpf_insn_idx]; + } + } + if (image) bpf_prog_fill_jited_linfo(prog, addrs + 1); out_addrs: diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 020de62bd09c..2c8bfde8191a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -677,6 +677,7 @@ struct bpf_subprog_info { bool is_cb: 1; bool is_async_cb: 1; bool is_exception_cb: 1; + bool is_bpf_loop_cb_non_inline: 1; bool args_cached: 1; /* true if bpf_fastcall stack region is used by functions that can't be inlined */ bool keep_fastcall_stack: 1; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 740b5a3a6b55..93442ab2acde 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->term_states = term_states; fp->term_states->patch_call_sites = patch_call_sites; fp->term_states->patch_call_sites->call_sites_cnt = 0; + fp->term_states->patch_call_sites->call_states = NULL; fp->term_states->prog = fp; #ifdef CONFIG_CGROUP_BPF @@ -314,8 +315,10 @@ void __bpf_prog_free(struct bpf_prog *fp) kfree(fp->aux); } if (fp->term_states) { - if (fp->term_states->patch_call_sites) + if (fp->term_states->patch_call_sites) { + vfree(fp->term_states->patch_call_sites->call_states); kfree(fp->term_states->patch_call_sites); + } kfree(fp->term_states); } free_percpu(fp->stats); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b9394f8fac0e..1d27208e1078 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3491,6 +3491,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) * logic. 'subprog_cnt' should not be increased.
*/ subprog[env->subprog_cnt].start = insn_cnt; + subprog[env->subprog_cnt].is_bpf_loop_cb_non_inline = false; if (env->log.level & BPF_LOG_LEVEL2) for (i = 0; i < env->subprog_cnt; i++) @@ -11319,19 +11320,30 @@ static bool loop_flag_is_zero(struct bpf_verifier_env *env) static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno) { struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state; + struct bpf_subprog_info *prev_info, *info = subprog_info(env, subprogno); if (!state->initialized) { state->initialized = 1; state->fit_for_inline = loop_flag_is_zero(env); state->callback_subprogno = subprogno; + if (!state->fit_for_inline) + info->is_bpf_loop_cb_non_inline = 1; return; } - if (!state->fit_for_inline) + if (!state->fit_for_inline) { + info->is_bpf_loop_cb_non_inline = 1; return; + } state->fit_for_inline = (loop_flag_is_zero(env) && state->callback_subprogno == subprogno); + + if (state->callback_subprogno != subprogno) { + info->is_bpf_loop_cb_non_inline = 1; + prev_info = subprog_info(env, state->callback_subprogno); + prev_info->is_bpf_loop_cb_non_inline = 1; + } } /* Returns whether or not the given map type can potentially elide @@ -21120,6 +21132,9 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, int i, patch_len, delta = 0, len = env->prog->len; struct bpf_insn *insns = env->prog->insnsi; struct bpf_prog *new_prog; + struct bpf_term_aux_states *term_states = env->prog->term_states; + u32 call_sites_cnt = term_states->patch_call_sites->call_sites_cnt; + struct call_aux_states *call_states = term_states->patch_call_sites->call_states; bool rnd_hi32; rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; @@ -21205,6 +21220,15 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, insns = new_prog->insnsi; aux = env->insn_aux_data; delta += patch_len - 1; + + /* Adust call instruction offsets + * w.r.t adj_idx + */ + for (int iter = 0; iter < call_sites_cnt; iter++) { + if (call_states[iter].call_bpf_insn_idx < adj_idx) + continue; + call_states[iter].call_bpf_insn_idx += patch_len - 1; + } } return 0; @@ -21597,6 +21621,26 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; + func[i]->aux->is_bpf_loop_cb_non_inline = env->subprog_info[i].is_bpf_loop_cb_non_inline; + + if (prog->term_states->patch_call_sites->call_sites_cnt != 0) { + int call_sites_cnt = 0; + struct call_aux_states *func_call_states; + func_call_states = vzalloc(sizeof(*func_call_states) * len); + if (!func_call_states) + goto out_free; + for (int iter = 0; iter < prog->term_states->patch_call_sites->call_sites_cnt; iter++) { + struct call_aux_states call_states = prog->term_states->patch_call_sites->call_states[iter]; + if (call_states.call_bpf_insn_idx >= subprog_start + && call_states.call_bpf_insn_idx < subprog_end) { + func_call_states[call_sites_cnt] = call_states; + func_call_states[call_sites_cnt].call_bpf_insn_idx -= subprog_start; + call_sites_cnt++; + } + } + func[i]->term_states->patch_call_sites->call_sites_cnt = call_sites_cnt; + func[i]->term_states->patch_call_sites->call_states = func_call_states; + } for (j = 0; j < prog->aux->size_poke_tab; j++) { struct bpf_jit_poke_descriptor *poke; @@ -21886,15 +21930,21 @@ static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, } static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn 
*insn, - struct bpf_insn *insn_buf, int insn_idx, int *cnt) + struct bpf_insn *insn_buf, int insn_idx, int *cnt, int *kfunc_btf_id) { const struct bpf_kfunc_desc *desc; + struct bpf_kfunc_call_arg_meta meta; + int err; if (!insn->imm) { verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); return -EINVAL; } + err = fetch_kfunc_meta(env, insn, &meta, NULL); + if (err) + return err; + *cnt = 0; /* insn->imm has the btf func_id. Replace it with an offset relative to @@ -21908,8 +21958,11 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } - if (!bpf_jit_supports_far_kfunc_call()) + if (!bpf_jit_supports_far_kfunc_call()) { + if (meta.kfunc_flags & KF_RET_NULL) + *kfunc_btf_id = insn->imm; insn->imm = BPF_CALL_IMM(desc->addr); + } if (insn->off) return 0; if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || @@ -22019,6 +22072,13 @@ static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *pat return 0; } +static bool is_bpf_loop_call(struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && + insn->imm == BPF_FUNC_loop; +} + /* Do various post-verification rewrites in a single program pass. * These rewrites simplify JIT and interpreter implementations. */ @@ -22039,6 +22099,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) struct bpf_subprog_info *subprogs = env->subprog_info; u16 stack_depth = subprogs[cur_subprog].stack_depth; u16 stack_depth_extra = 0; + u32 call_sites_cnt = 0; + struct call_aux_states *call_states; + + call_states = vzalloc(sizeof(*call_states) * prog->len); + if (!call_states) + return -ENOMEM; if (env->seen_exception && !env->exception_callback_subprog) { struct bpf_insn *patch = insn_buf; @@ -22368,11 +22434,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) goto next_insn; if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt); + int kfunc_btf_id = 0; + ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt, &kfunc_btf_id); if (ret) return ret; if (cnt == 0) - goto next_insn; + goto store_call_indices; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) @@ -22381,6 +22448,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; +store_call_indices: + if (kfunc_btf_id != 0) { + call_states[call_sites_cnt].call_bpf_insn_idx = i + delta; + call_states[call_sites_cnt].is_helper_kfunc = 1; + call_sites_cnt++; + } goto next_insn; } @@ -22859,6 +22932,15 @@ static int do_misc_fixups(struct bpf_verifier_env *env) func_id_name(insn->imm), insn->imm); return -EFAULT; } + + if ((fn->ret_type & PTR_MAYBE_NULL) || is_bpf_loop_call(insn)) { + call_states[call_sites_cnt].call_bpf_insn_idx = i + delta; + if (is_bpf_loop_call(insn)) + call_states[call_sites_cnt].is_bpf_loop = 1; + else + call_states[call_sites_cnt].is_helper_kfunc = 1; + call_sites_cnt++; + } insn->imm = fn->func - __bpf_call_base; next_insn: if (subprogs[cur_subprog + 1].start == i + delta + 1) { @@ -22879,6 +22961,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn++; } + env->prog->term_states->patch_call_sites->call_sites_cnt = call_sites_cnt; + env->prog->term_states->patch_call_sites->call_states = call_states; env->prog->aux->stack_depth = subprogs[0].stack_depth; for (i = 0; i < env->subprog_cnt; i++) { int delta = 
bpf_jit_supports_timed_may_goto() ? 2 : 1; @@ -23014,17 +23098,12 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, call_insn_offset = position + 12; callback_offset = callback_start - call_insn_offset - 1; new_prog->insnsi[call_insn_offset].imm = callback_offset; + /* Mark the offset field to identify the inlined loop callback */ + new_prog->insnsi[call_insn_offset].off = 0x1; return new_prog; } -static bool is_bpf_loop_call(struct bpf_insn *insn) -{ - return insn->code == (BPF_JMP | BPF_CALL) && - insn->src_reg == 0 && - insn->imm == BPF_FUNC_loop; -} - /* For all sub-programs in the program (including main) check * insn_aux_data to see if there are bpf_loop calls that require * inlining. If such calls are found the calls are replaced with a @@ -24584,6 +24663,35 @@ static int compute_scc(struct bpf_verifier_env *env) return err; } +static int fix_call_sites(struct bpf_verifier_env *env) +{ + int err = 0, i, subprog; + struct bpf_insn *insn; + struct bpf_prog *prog = env->prog; + struct bpf_term_aux_states *term_states = env->prog->term_states; + u32 *call_sites_cnt = &term_states->patch_call_sites->call_sites_cnt; + struct call_aux_states *call_states = term_states->patch_call_sites->call_states; + + if (!env->subprog_cnt) + return 0; + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn)) + continue; + + subprog = find_subprog(env, i + insn->imm + 1); + if (subprog < 0) + return -EFAULT; + + if (insn->off == 0x1) { + call_states[*call_sites_cnt].call_bpf_insn_idx = i; + call_states[*call_sites_cnt].is_bpf_loop_cb_inline = 1; + *call_sites_cnt = *call_sites_cnt + 1; + prog->insnsi[i].off = 0x0; /* Removing the marker */ + } + } + return err; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); @@ -24769,6 +24877,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 : false; } + if (ret == 0) + ret = fix_call_sites(env); + if (ret == 0) ret = fixup_call_args(env); -- 2.43.0 Update the softlockup detection logic to detect stalls caused by BPF programs. When a softlockup is detected, the bpf_die work is queued on a workqueue on a CPU. With this implementation the termination handler is only triggered when CONFIG_SOFTLOCKUP_DETECTOR is enabled. Inside bpf_die, we perform text_poke to stub out helpers/kfuncs. The current implementation handles termination of long running bpf_loop iterators in both the inlined and non-inlined cases. The limitation of this implementation is that the termination handler needs at least one CPU to run.
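As an illustration of the call-site patching performed by in_place_patch_bpf_prog() in this patch, below is a minimal userspace sketch of the 5-byte x86-64 call encoding that is handed to smp_text_poke_batch_add(). The function name and the addresses in the sketch are made-up examples; only the E8 opcode plus rel32 layout mirrors the kernel code.

/* Standalone userspace illustration, not kernel code. */
#include <stdint.h>
#include <stdio.h>

static void encode_call(uint8_t insn[5], unsigned long site, unsigned long target)
{
	/* rel32 is relative to the first byte after the 5-byte call */
	uint32_t rel = (uint32_t)(target - (site + 5));

	insn[0] = 0xE8;			/* call rel32 opcode */
	insn[1] = (rel >> 0) & 0xFF;
	insn[2] = (rel >> 8) & 0xFF;
	insn[3] = (rel >> 16) & 0xFF;
	insn[4] = (rel >> 24) & 0xFF;
}

int main(void)
{
	uint8_t insn[5];
	int i;

	/* hypothetical JITed call site and termination stub addresses */
	encode_call(insn, 0xffffffffa0001000UL, 0xffffffffa0002000UL);
	for (i = 0; i < 5; i++)
		printf("%02x ", insn[i]);
	printf("\n");
	return 0;
}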
Signed-off-by: Raj Sahu Signed-off-by: Siddharth Chintamaneni --- arch/x86/net/bpf_jit_comp.c | 132 ++++++++++++++++++++++++++++++++++++ include/linux/bpf.h | 2 + include/linux/filter.h | 6 ++ kernel/bpf/core.c | 35 +++++++++- kernel/watchdog.c | 8 +++ 5 files changed, 182 insertions(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 107a44729675..4de9a8cdc465 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2606,6 +2606,10 @@ st: if (is_imm8(insn->off)) if (arena_vm_start) pop_r12(&prog); } + /* emiting 5 byte nop for non-inline bpf_loop callback */ + if (bpf_is_subprog(bpf_prog) && bpf_prog->aux->is_bpf_loop_cb_non_inline) { + emit_nops(&prog, X86_PATCH_SIZE); + } EMIT1(0xC9); /* leave */ emit_return(&prog, image + addrs[i - 1] + (prog - temp)); break; @@ -3833,6 +3837,8 @@ bool bpf_jit_supports_private_stack(void) return true; } + + void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) { #if defined(CONFIG_UNWINDER_ORC) @@ -3849,6 +3855,132 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp #endif } +void in_place_patch_bpf_prog(struct bpf_prog *prog) +{ + struct call_aux_states *call_states; + unsigned long new_target; + unsigned char *addr; + u8 ret_jmp_size = 1; + if (cpu_wants_rethunk()) { + ret_jmp_size = 5; + } + call_states = prog->term_states->patch_call_sites->call_states; + for (int i = 0; i < prog->term_states->patch_call_sites->call_sites_cnt; i++) { + + new_target = (unsigned long) bpf_termination_null_func; + if (call_states[i].is_bpf_loop_cb_inline) { + new_target = (unsigned long) bpf_loop_term_callback; + } + char new_insn[5]; + + addr = (unsigned char *)prog->bpf_func + call_states->jit_call_idx; + + unsigned long new_rel = (unsigned long)(new_target - (unsigned long)(addr + 5)); + new_insn[0] = 0xE8; + new_insn[1] = (new_rel >> 0) & 0xFF; + new_insn[2] = (new_rel >> 8) & 0xFF; + new_insn[3] = (new_rel >> 16) & 0xFF; + new_insn[4] = (new_rel >> 24) & 0xFF; + + smp_text_poke_batch_add(addr, new_insn, 5 /* call instruction len */, NULL); + } + + if (prog->aux->is_bpf_loop_cb_non_inline) { + + char new_insn[5] = { 0xB8, 0x01, 0x00, 0x00, 0x00 }; + char old_insn[5] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 }; + smp_text_poke_batch_add(prog->bpf_func + prog->jited_len - + (1 + ret_jmp_size) /* leave, jmp/ ret */ - 5 /* nop size */, new_insn, 5 /* mov eax, 1 */, old_insn); + } + + + /* flush all text poke calls */ + smp_text_poke_batch_finish(); +} + +void bpf_die(struct bpf_prog *prog) +{ + u8 ret_jmp_size = 1; + if (cpu_wants_rethunk()) { + ret_jmp_size = 5; + } + + /* + * Replacing 5 byte nop in prologue with jmp instruction to ret + */ + unsigned long jmp_offset = prog->jited_len - (4 /* First endbr is 4 bytes */ + + 5 /* noop is 5 bytes */ + + ret_jmp_size /* 5 bytes of jmp return_thunk or 1 byte ret*/); + + char new_insn[5]; + new_insn[0] = 0xE9; + new_insn[1] = (jmp_offset >> 0) & 0xFF; + new_insn[2] = (jmp_offset >> 8) & 0xFF; + new_insn[3] = (jmp_offset >> 16) & 0xFF; + new_insn[4] = (jmp_offset >> 24) & 0xFF; + + smp_text_poke_batch_add(prog->bpf_func + 4, new_insn, 5, NULL); + + if (prog->aux->func_cnt) { + for (int i = 0; i < prog->aux->func_cnt; i++) { + in_place_patch_bpf_prog(prog->aux->func[i]); + } + } else { + in_place_patch_bpf_prog(prog); + } + +} + +void bpf_prog_termination_deferred(struct work_struct *work) +{ + struct bpf_term_aux_states *term_states = container_of(work, struct bpf_term_aux_states, + work); + 
struct bpf_prog *prog = term_states->prog; + + bpf_die(prog); +} + +static struct workqueue_struct *bpf_termination_wq; + +void bpf_softlockup(u32 dur_s) +{ + unsigned long addr; + struct unwind_state state; + struct bpf_prog *prog; + + for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr) + break; + + if (!is_bpf_text_address(addr)) + continue; + + rcu_read_lock(); + prog = bpf_prog_ksym_find(addr); + rcu_read_unlock(); + if (bpf_is_subprog(prog)) + continue; + + if (atomic_cmpxchg(&prog->term_states->bpf_die_in_progress, 0, 1)) + break; + + bpf_termination_wq = alloc_workqueue("bpf_termination_wq", WQ_UNBOUND, 1); + if (!bpf_termination_wq) + pr_err("Failed to alloc workqueue for bpf termination.\n"); + + queue_work(bpf_termination_wq, &prog->term_states->work); + + /* Currently nested programs are not terminated together. + * Removing this break will result in BPF trampolines being + * identified as is_bpf_text_address resulting in NULL ptr + * deref in next step. + */ + break; + } +} + void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, struct bpf_prog *new, struct bpf_prog *old) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index caaee33744fc..03fce8f2c466 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -71,6 +71,7 @@ typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); typedef unsigned int (*bpf_func_t)(const void *, const struct bpf_insn *); + struct bpf_iter_seq_info { const struct seq_operations *seq_ops; bpf_iter_init_seq_priv_t init_seq_private; @@ -1600,6 +1601,7 @@ struct bpf_term_patch_call_sites { struct bpf_term_aux_states { struct bpf_prog *prog; struct work_struct work; + atomic_t bpf_die_in_progress; struct bpf_term_patch_call_sites *patch_call_sites; }; diff --git a/include/linux/filter.h b/include/linux/filter.h index 9092d8ea95c8..4f0f8fe478bf 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1123,6 +1123,8 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len); bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); +void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #define __bpf_call_base_args \ ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ @@ -1257,6 +1259,10 @@ bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_prog_pack_free(void *ptr, u32 size); +void bpf_softlockup(u32 dur_s); +void bpf_prog_termination_deferred(struct work_struct *work); +void bpf_die(struct bpf_prog *prog); +void in_place_patch_bpf_prog(struct bpf_prog *prog); static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 93442ab2acde..7b0552d15be3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -41,6 +41,7 @@ #include #include +#include #include /* Registers */ @@ -95,6 +96,37 @@ enum page_size_enum { __PAGE_SIZE = PAGE_SIZE }; +void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return NULL; +} + +int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx) +{ + return 1; +} + + +void __weak 
in_place_patch_bpf_prog(struct bpf_prog *prog) +{ + return; +} + +void __weak bpf_die(struct bpf_prog *prog) +{ + return; +} + +void __weak bpf_prog_termination_deferred(struct work_struct *work) +{ + return; +} + +void __weak bpf_softlockup(u32 dur_s) +{ + return; +} + struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); @@ -134,11 +166,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); fp->term_states = term_states; + atomic_set(&fp->term_states->bpf_die_in_progress, 0); fp->term_states->patch_call_sites = patch_call_sites; fp->term_states->patch_call_sites->call_sites_cnt = 0; fp->term_states->patch_call_sites->call_states = NULL; fp->term_states->prog = fp; - + INIT_WORK(&fp->term_states->work, bpf_prog_termination_deferred); #ifdef CONFIG_CGROUP_BPF aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID; #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 80b56c002c7f..59c91c18ca0e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -700,6 +701,13 @@ static int is_softlockup(unsigned long touch_ts, if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4)) scx_softlockup(now - touch_ts); + /* + * Long running BPF programs can cause CPUs to stall. + * So trigger fast path termination for such BPF programs. + */ + if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4)) + bpf_softlockup(now - touch_ts); + /* Warn about unreasonable delays. */ if (time_after(now, period_ts + get_softlockup_thresh())) return now - touch_ts; -- 2.43.0 Add a selftest that checks termination of nested bpf_loop calls.
32/1 bpf_termination/bpf_termination:OK 32 bpf_termination:OK Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Raj Sahu Signed-off-by: Siddharth Chintamaneni --- .../bpf/prog_tests/bpf_termination.c | 39 +++++++++++++++ .../selftests/bpf/progs/bpf_termination.c | 47 +++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_termination.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_termination.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_termination.c b/tools/testing/selftests/bpf/prog_tests/bpf_termination.c new file mode 100644 index 000000000000..d060073db8f9 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_termination.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include "bpf_termination.skel.h" + +void test_loop_termination(void) +{ + struct bpf_termination *skel; + int err; + + skel = bpf_termination__open(); + if (!ASSERT_OK_PTR(skel, "bpf_termination__open")) + return; + + err = bpf_termination__load(skel); + if (!ASSERT_OK(err, "bpf_termination__load")) + goto out; + + skel->bss->pid = getpid(); + err = bpf_termination__attach(skel); + if (!ASSERT_OK(err, "bpf_termination__attach")) + goto out; + + /* Triggers long running BPF program */ + socket(AF_UNSPEC, SOCK_DGRAM, 0); + + /* If the program is not terminated, it doesn't reach this point */ + ASSERT_TRUE(true, "Program is terminated"); +out: + bpf_termination__destroy(skel); +} + +void test_bpf_termination(void) +{ + if (test__start_subtest("bpf_termination")) + test_loop_termination(); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_termination.c b/tools/testing/selftests/bpf/progs/bpf_termination.c new file mode 100644 index 000000000000..36e97d84750b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_termination.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +int pid; + +#define LOOPS_CNT 1 << 10 + +static int callback_fn4(void *ctx) { + return 0; +} + +static int callback_fn3(void *ctx) { + + bpf_loop(LOOPS_CNT, callback_fn4, NULL, 0); + return 0; + +} + + +static int callback_fn2(void *ctx) { + + bpf_loop(LOOPS_CNT, callback_fn3, NULL, 0); + return 0; + +} + +static int callback_fn(void *ctx) { + + bpf_loop(LOOPS_CNT, callback_fn2, NULL, 0); + return 0; + +} + +SEC("tp/syscalls/sys_enter_socket") +int bpf_loop_lr(void *ctx) { + + if ((bpf_get_current_pid_tgid() >> 32) != pid) + return 0; + + bpf_loop(LOOPS_CNT, callback_fn, NULL, 0); + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- 2.43.0
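For reference, assuming the usual tools/testing/selftests/bpf build, the new subtest should be runnable with the standard test_progs filter:

  ./test_progs -t bpf_termination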