Introduce the definition of struct bpf_term_aux_states required to support fast-path termination of BPF programs. Add the memory allocation and free logic for the newly added term_states field in struct bpf_prog. Signed-off-by: Raj Sahu Signed-off-by: Siddharth Chintamaneni --- include/linux/bpf.h | 75 +++++++++++++++++++++++++++++---------------- kernel/bpf/core.c | 31 +++++++++++++++++++ 2 files changed, 79 insertions(+), 27 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8f6e87f0f3a8..caaee33744fc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1584,6 +1584,25 @@ struct bpf_stream_stage { int len; }; +struct call_aux_states { + int call_bpf_insn_idx; + int jit_call_idx; + u8 is_helper_kfunc; + u8 is_bpf_loop; + u8 is_bpf_loop_cb_inline; +}; + +struct bpf_term_patch_call_sites { + u32 call_sites_cnt; + struct call_aux_states *call_states; +}; + +struct bpf_term_aux_states { + struct bpf_prog *prog; + struct work_struct work; + struct bpf_term_patch_call_sites *patch_call_sites; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -1618,6 +1637,7 @@ struct bpf_prog_aux { bool tail_call_reachable; bool xdp_has_frags; bool exception_cb; + bool is_bpf_loop_cb_non_inline; bool exception_boundary; bool is_extended; /* true if extended by freplace program */ bool jits_use_priv_stack; @@ -1696,33 +1716,34 @@ struct bpf_prog_aux { }; struct bpf_prog { - u16 pages; /* Number of allocated pages */ - u16 jited:1, /* Is our filter JIT'ed? */ - jit_requested:1,/* archs need to JIT the prog */ - gpl_compatible:1, /* Is filter GPL compatible? */ - cb_access:1, /* Is control block accessed? */ - dst_needed:1, /* Do we need dst entry? */ - blinding_requested:1, /* needs constant blinding */ - blinded:1, /* Was blinded */ - is_func:1, /* program is a bpf function */ - kprobe_override:1, /* Do we override a kprobe? */ - has_callchain_buf:1, /* callchain buffer allocated? */ - enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ - call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ - call_get_func_ip:1, /* Do we call get_func_ip() */ - tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */ - sleepable:1; /* BPF program is sleepable */ - enum bpf_prog_type type; /* Type of BPF program */ - enum bpf_attach_type expected_attach_type; /* For some prog types */ - u32 len; /* Number of filter blocks */ - u32 jited_len; /* Size of jited insns in bytes */ - u8 tag[BPF_TAG_SIZE]; - struct bpf_prog_stats __percpu *stats; - int __percpu *active; - unsigned int (*bpf_func)(const void *ctx, - const struct bpf_insn *insn); - struct bpf_prog_aux *aux; /* Auxiliary fields */ - struct sock_fprog_kern *orig_prog; /* Original BPF program */ + u16 pages; /* Number of allocated pages */ + u16 jited:1, /* Is our filter JIT'ed? */ + jit_requested:1,/* archs need to JIT the prog */ + gpl_compatible:1, /* Is filter GPL compatible? */ + cb_access:1, /* Is control block accessed? */ + dst_needed:1, /* Do we need dst entry? */ + blinding_requested:1, /* needs constant blinding */ + blinded:1, /* Was blinded */ + is_func:1, /* program is a bpf function */ + kprobe_override:1, /* Do we override a kprobe? */ + has_callchain_buf:1, /* callchain buffer allocated?
*/ + enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ + call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ + call_get_func_ip:1, /* Do we call get_func_ip() */ + tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */ + sleepable:1; /* BPF program is sleepable */ + enum bpf_prog_type type; /* Type of BPF program */ + enum bpf_attach_type expected_attach_type; /* For some prog types */ + u32 len; /* Number of filter blocks */ + u32 jited_len; /* Size of jited insns in bytes */ + u8 tag[BPF_TAG_SIZE]; + struct bpf_prog_stats __percpu *stats; + int __percpu *active; + unsigned int (*bpf_func)(const void *ctx, + const struct bpf_insn *insn); + struct bpf_prog_aux *aux; /* Auxiliary fields */ + struct sock_fprog_kern *orig_prog; /* Original BPF program */ + struct bpf_term_aux_states *term_states; /* Instructions for interpreter */ union { DECLARE_FLEX_ARRAY(struct sock_filter, insns); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ef01cc644a96..740b5a3a6b55 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -100,6 +100,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog_aux *aux; struct bpf_prog *fp; + struct bpf_term_aux_states *term_states = NULL; + struct bpf_term_patch_call_sites *patch_call_sites = NULL; size = round_up(size, __PAGE_SIZE); fp = __vmalloc(size, gfp_flags); @@ -118,11 +120,24 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag return NULL; } + term_states = kzalloc(sizeof(*term_states), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); + if (!term_states) + goto free_alloc_percpu; + + patch_call_sites = kzalloc(sizeof(*patch_call_sites), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); + if (!patch_call_sites) + goto free_bpf_term_states; + fp->pages = size / PAGE_SIZE; fp->aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); + fp->term_states = term_states; + fp->term_states->patch_call_sites = patch_call_sites; + fp->term_states->patch_call_sites->call_sites_cnt = 0; + fp->term_states->prog = fp; + #ifdef CONFIG_CGROUP_BPF aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID; #endif @@ -140,6 +155,15 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag #endif return fp; + +free_bpf_term_states: + kfree(term_states); +free_alloc_percpu: + free_percpu(fp->active); + kfree(aux); + vfree(fp); + + return NULL; } struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) @@ -266,6 +290,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; + fp->term_states->prog = fp; /* We keep fp->aux from fp_old around in the new * reallocated structure. 
@@ -273,6 +298,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, fp_old->aux = NULL; fp_old->stats = NULL; fp_old->active = NULL; + fp_old->term_states = NULL; __bpf_prog_free(fp_old); } @@ -287,6 +313,11 @@ void __bpf_prog_free(struct bpf_prog *fp) kfree(fp->aux->poke_tab); kfree(fp->aux); } + if (fp->term_states) { + if (fp->term_states->patch_call_sites) + kfree(fp->term_states->patch_call_sites); + kfree(fp->term_states); + } free_percpu(fp->stats); free_percpu(fp->active); vfree(fp); -- 2.43.0 Create call-site tables and store the JIT indexes of RET_NULL calls so they can later be poked with dummy functions. In addition to the JIT indexes, metadata about helpers/kfuncs/loops is stored. Later this could be extended to the remaining potentially long-running iterator helpers/kfuncs. Signed-off-by: Raj Sahu Signed-off-by: Siddharth Chintamaneni --- arch/x86/net/bpf_jit_comp.c | 9 +++ include/linux/bpf_verifier.h | 1 + kernel/bpf/core.c | 5 +- kernel/bpf/verifier.c | 135 +++++++++++++++++++++++++++++++---- 4 files changed, 137 insertions(+), 13 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7e3fca164620..107a44729675 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3733,6 +3733,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } if (!image || !prog->is_func || extra_pass) { + + if (addrs) { + struct bpf_term_patch_call_sites *patch_call_sites = prog->term_states->patch_call_sites; + for (int i = 0; i < patch_call_sites->call_sites_cnt; i++) { + struct call_aux_states *call_states = patch_call_sites->call_states + i; + call_states->jit_call_idx = addrs[call_states->call_bpf_insn_idx]; + } + } + if (image) bpf_prog_fill_jited_linfo(prog, addrs + 1); out_addrs: diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 020de62bd09c..2c8bfde8191a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -677,6 +677,7 @@ struct bpf_subprog_info { bool is_cb: 1; bool is_async_cb: 1; bool is_exception_cb: 1; + bool is_bpf_loop_cb_non_inline: 1; bool args_cached: 1; /* true if bpf_fastcall stack region is used by functions that can't be inlined */ bool keep_fastcall_stack: 1; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 740b5a3a6b55..93442ab2acde 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->term_states = term_states; fp->term_states->patch_call_sites = patch_call_sites; fp->term_states->patch_call_sites->call_sites_cnt = 0; + fp->term_states->patch_call_sites->call_states = NULL; fp->term_states->prog = fp; #ifdef CONFIG_CGROUP_BPF @@ -314,8 +315,10 @@ void __bpf_prog_free(struct bpf_prog *fp) kfree(fp->aux); } if (fp->term_states) { - if (fp->term_states->patch_call_sites) + if (fp->term_states->patch_call_sites) { + vfree(fp->term_states->patch_call_sites->call_states); kfree(fp->term_states->patch_call_sites); + } kfree(fp->term_states); } free_percpu(fp->stats); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b9394f8fac0e..1d27208e1078 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3491,6 +3491,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) * logic. 'subprog_cnt' should not be increased.
*/ subprog[env->subprog_cnt].start = insn_cnt; + subprog[env->subprog_cnt].is_bpf_loop_cb_non_inline = false; if (env->log.level & BPF_LOG_LEVEL2) for (i = 0; i < env->subprog_cnt; i++) @@ -11319,19 +11320,30 @@ static bool loop_flag_is_zero(struct bpf_verifier_env *env) static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno) { struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state; + struct bpf_subprog_info *prev_info, *info = subprog_info(env, subprogno); if (!state->initialized) { state->initialized = 1; state->fit_for_inline = loop_flag_is_zero(env); state->callback_subprogno = subprogno; + if (!state->fit_for_inline) + info->is_bpf_loop_cb_non_inline = 1; return; } - if (!state->fit_for_inline) + if (!state->fit_for_inline) { + info->is_bpf_loop_cb_non_inline = 1; return; + } state->fit_for_inline = (loop_flag_is_zero(env) && state->callback_subprogno == subprogno); + + if (state->callback_subprogno != subprogno) { + info->is_bpf_loop_cb_non_inline = 1; + prev_info = subprog_info(env, state->callback_subprogno); + prev_info->is_bpf_loop_cb_non_inline = 1; + } } /* Returns whether or not the given map type can potentially elide @@ -21120,6 +21132,9 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, int i, patch_len, delta = 0, len = env->prog->len; struct bpf_insn *insns = env->prog->insnsi; struct bpf_prog *new_prog; + struct bpf_term_aux_states *term_states = env->prog->term_states; + u32 call_sites_cnt = term_states->patch_call_sites->call_sites_cnt; + struct call_aux_states *call_states = term_states->patch_call_sites->call_states; bool rnd_hi32; rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; @@ -21205,6 +21220,15 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, insns = new_prog->insnsi; aux = env->insn_aux_data; delta += patch_len - 1; + + /* Adust call instruction offsets + * w.r.t adj_idx + */ + for (int iter = 0; iter < call_sites_cnt; iter++) { + if (call_states[iter].call_bpf_insn_idx < adj_idx) + continue; + call_states[iter].call_bpf_insn_idx += patch_len - 1; + } } return 0; @@ -21597,6 +21621,26 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; + func[i]->aux->is_bpf_loop_cb_non_inline = env->subprog_info[i].is_bpf_loop_cb_non_inline; + + if (prog->term_states->patch_call_sites->call_sites_cnt != 0) { + int call_sites_cnt = 0; + struct call_aux_states *func_call_states; + func_call_states = vzalloc(sizeof(*func_call_states) * len); + if (!func_call_states) + goto out_free; + for (int iter = 0; iter < prog->term_states->patch_call_sites->call_sites_cnt; iter++) { + struct call_aux_states call_states = prog->term_states->patch_call_sites->call_states[iter]; + if (call_states.call_bpf_insn_idx >= subprog_start + && call_states.call_bpf_insn_idx < subprog_end) { + func_call_states[call_sites_cnt] = call_states; + func_call_states[call_sites_cnt].call_bpf_insn_idx -= subprog_start; + call_sites_cnt++; + } + } + func[i]->term_states->patch_call_sites->call_sites_cnt = call_sites_cnt; + func[i]->term_states->patch_call_sites->call_states = func_call_states; + } for (j = 0; j < prog->aux->size_poke_tab; j++) { struct bpf_jit_poke_descriptor *poke; @@ -21886,15 +21930,21 @@ static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, } static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn 
*insn, - struct bpf_insn *insn_buf, int insn_idx, int *cnt) + struct bpf_insn *insn_buf, int insn_idx, int *cnt, int *kfunc_btf_id) { const struct bpf_kfunc_desc *desc; + struct bpf_kfunc_call_arg_meta meta; + int err; if (!insn->imm) { verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); return -EINVAL; } + err = fetch_kfunc_meta(env, insn, &meta, NULL); + if (err) + return err; + *cnt = 0; /* insn->imm has the btf func_id. Replace it with an offset relative to @@ -21908,8 +21958,11 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EFAULT; } - if (!bpf_jit_supports_far_kfunc_call()) + if (!bpf_jit_supports_far_kfunc_call()) { + if (meta.kfunc_flags & KF_RET_NULL) + *kfunc_btf_id = insn->imm; insn->imm = BPF_CALL_IMM(desc->addr); + } if (insn->off) return 0; if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || @@ -22019,6 +22072,13 @@ static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *pat return 0; } +static bool is_bpf_loop_call(struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && + insn->imm == BPF_FUNC_loop; +} + /* Do various post-verification rewrites in a single program pass. * These rewrites simplify JIT and interpreter implementations. */ @@ -22039,6 +22099,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) struct bpf_subprog_info *subprogs = env->subprog_info; u16 stack_depth = subprogs[cur_subprog].stack_depth; u16 stack_depth_extra = 0; + u32 call_sites_cnt = 0; + struct call_aux_states *call_states; + + call_states = vzalloc(sizeof(*call_states) * prog->len); + if (!call_states) + return -ENOMEM; if (env->seen_exception && !env->exception_callback_subprog) { struct bpf_insn *patch = insn_buf; @@ -22368,11 +22434,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) goto next_insn; if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt); + int kfunc_btf_id = 0; + ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt, &kfunc_btf_id); if (ret) return ret; if (cnt == 0) - goto next_insn; + goto store_call_indices; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) @@ -22381,6 +22448,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; +store_call_indices: + if (kfunc_btf_id != 0) { + call_states[call_sites_cnt].call_bpf_insn_idx = i + delta; + call_states[call_sites_cnt].is_helper_kfunc = 1; + call_sites_cnt++; + } goto next_insn; } @@ -22859,6 +22932,15 @@ static int do_misc_fixups(struct bpf_verifier_env *env) func_id_name(insn->imm), insn->imm); return -EFAULT; } + + if ((fn->ret_type & PTR_MAYBE_NULL) || is_bpf_loop_call(insn)) { + call_states[call_sites_cnt].call_bpf_insn_idx = i + delta; + if (is_bpf_loop_call(insn)) + call_states[call_sites_cnt].is_bpf_loop = 1; + else + call_states[call_sites_cnt].is_helper_kfunc = 1; + call_sites_cnt++; + } insn->imm = fn->func - __bpf_call_base; next_insn: if (subprogs[cur_subprog + 1].start == i + delta + 1) { @@ -22879,6 +22961,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn++; } + env->prog->term_states->patch_call_sites->call_sites_cnt = call_sites_cnt; + env->prog->term_states->patch_call_sites->call_states = call_states; env->prog->aux->stack_depth = subprogs[0].stack_depth; for (i = 0; i < env->subprog_cnt; i++) { int delta = 
bpf_jit_supports_timed_may_goto() ? 2 : 1; @@ -23014,17 +23098,12 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, call_insn_offset = position + 12; callback_offset = callback_start - call_insn_offset - 1; new_prog->insnsi[call_insn_offset].imm = callback_offset; + /* Mark the offset field to identify the inlined loop callback */ + new_prog->insnsi[call_insn_offset].off = 0x1; return new_prog; } -static bool is_bpf_loop_call(struct bpf_insn *insn) -{ - return insn->code == (BPF_JMP | BPF_CALL) && - insn->src_reg == 0 && - insn->imm == BPF_FUNC_loop; -} - /* For all sub-programs in the program (including main) check * insn_aux_data to see if there are bpf_loop calls that require * inlining. If such calls are found the calls are replaced with a @@ -24584,6 +24663,35 @@ static int compute_scc(struct bpf_verifier_env *env) return err; } +static int fix_call_sites(struct bpf_verifier_env *env) +{ + int err = 0, i, subprog; + struct bpf_insn *insn; + struct bpf_prog *prog = env->prog; + struct bpf_term_aux_states *term_states = env->prog->term_states; + u32 *call_sites_cnt = &term_states->patch_call_sites->call_sites_cnt; + struct call_aux_states *call_states = term_states->patch_call_sites->call_states; + + if (!env->subprog_cnt) + return 0; + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn)) + continue; + + subprog = find_subprog(env, i + insn->imm + 1); + if (subprog < 0) + return -EFAULT; + + if (insn->off == 0x1) { + call_states[*call_sites_cnt].call_bpf_insn_idx = i; + call_states[*call_sites_cnt].is_bpf_loop_cb_inline = 1; + *call_sites_cnt = *call_sites_cnt + 1; + prog->insnsi[i].off = 0x0; /* Removing the marker */ + } + } + return err; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); @@ -24769,6 +24877,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 : false; } + if (ret == 0) + ret = fix_call_sites(env); + if (ret == 0) ret = fixup_call_args(env); -- 2.43.0 Update the softlockup detection logic to detect stalls caused by BPF programs. When a softlockup is detected, the bpf_die work is queued on a workqueue on a CPU. With this implementation the termination handler is only triggered when CONFIG_SOFTLOCKUP_DETECTOR is enabled. Inside bpf_die, we perform text_poke to stub out helpers/kfuncs. The current implementation handles termination of long running bpf_loop iterators in both the inlined and non-inlined cases. The limitation of this implementation is that the termination handler needs at least one CPU to run.
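As an illustration of the call-site patching performed by in_place_patch_bpf_prog() in this patch, below is a minimal userspace sketch of the 5-byte x86-64 call encoding that is handed to smp_text_poke_batch_add(). The function name and the addresses in the sketch are made-up examples; only the E8 opcode plus rel32 layout mirrors the kernel code.

/* Standalone userspace illustration, not kernel code. */
#include <stdint.h>
#include <stdio.h>

static void encode_call(uint8_t insn[5], unsigned long site, unsigned long target)
{
	/* rel32 is relative to the first byte after the 5-byte call */
	uint32_t rel = (uint32_t)(target - (site + 5));

	insn[0] = 0xE8;			/* call rel32 opcode */
	insn[1] = (rel >> 0) & 0xFF;
	insn[2] = (rel >> 8) & 0xFF;
	insn[3] = (rel >> 16) & 0xFF;
	insn[4] = (rel >> 24) & 0xFF;
}

int main(void)
{
	uint8_t insn[5];
	int i;

	/* hypothetical JITed call site and termination stub addresses */
	encode_call(insn, 0xffffffffa0001000UL, 0xffffffffa0002000UL);
	for (i = 0; i < 5; i++)
		printf("%02x ", insn[i]);
	printf("\n");
	return 0;
}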
Signed-off-by: Raj Sahu Signed-off-by: Siddharth Chintamaneni --- arch/x86/net/bpf_jit_comp.c | 132 ++++++++++++++++++++++++++++++++++++ include/linux/bpf.h | 2 + include/linux/filter.h | 6 ++ kernel/bpf/core.c | 35 +++++++++- kernel/watchdog.c | 8 +++ 5 files changed, 182 insertions(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 107a44729675..4de9a8cdc465 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2606,6 +2606,10 @@ st: if (is_imm8(insn->off)) if (arena_vm_start) pop_r12(&prog); } + /* emiting 5 byte nop for non-inline bpf_loop callback */ + if (bpf_is_subprog(bpf_prog) && bpf_prog->aux->is_bpf_loop_cb_non_inline) { + emit_nops(&prog, X86_PATCH_SIZE); + } EMIT1(0xC9); /* leave */ emit_return(&prog, image + addrs[i - 1] + (prog - temp)); break; @@ -3833,6 +3837,8 @@ bool bpf_jit_supports_private_stack(void) return true; } + + void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) { #if defined(CONFIG_UNWINDER_ORC) @@ -3849,6 +3855,132 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp #endif } +void in_place_patch_bpf_prog(struct bpf_prog *prog) +{ + struct call_aux_states *call_states; + unsigned long new_target; + unsigned char *addr; + u8 ret_jmp_size = 1; + if (cpu_wants_rethunk()) { + ret_jmp_size = 5; + } + call_states = prog->term_states->patch_call_sites->call_states; + for (int i = 0; i < prog->term_states->patch_call_sites->call_sites_cnt; i++) { + + new_target = (unsigned long) bpf_termination_null_func; + if (call_states[i].is_bpf_loop_cb_inline) { + new_target = (unsigned long) bpf_loop_term_callback; + } + char new_insn[5]; + + addr = (unsigned char *)prog->bpf_func + call_states->jit_call_idx; + + unsigned long new_rel = (unsigned long)(new_target - (unsigned long)(addr + 5)); + new_insn[0] = 0xE8; + new_insn[1] = (new_rel >> 0) & 0xFF; + new_insn[2] = (new_rel >> 8) & 0xFF; + new_insn[3] = (new_rel >> 16) & 0xFF; + new_insn[4] = (new_rel >> 24) & 0xFF; + + smp_text_poke_batch_add(addr, new_insn, 5 /* call instruction len */, NULL); + } + + if (prog->aux->is_bpf_loop_cb_non_inline) { + + char new_insn[5] = { 0xB8, 0x01, 0x00, 0x00, 0x00 }; + char old_insn[5] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 }; + smp_text_poke_batch_add(prog->bpf_func + prog->jited_len - + (1 + ret_jmp_size) /* leave, jmp/ ret */ - 5 /* nop size */, new_insn, 5 /* mov eax, 1 */, old_insn); + } + + + /* flush all text poke calls */ + smp_text_poke_batch_finish(); +} + +void bpf_die(struct bpf_prog *prog) +{ + u8 ret_jmp_size = 1; + if (cpu_wants_rethunk()) { + ret_jmp_size = 5; + } + + /* + * Replacing 5 byte nop in prologue with jmp instruction to ret + */ + unsigned long jmp_offset = prog->jited_len - (4 /* First endbr is 4 bytes */ + + 5 /* noop is 5 bytes */ + + ret_jmp_size /* 5 bytes of jmp return_thunk or 1 byte ret*/); + + char new_insn[5]; + new_insn[0] = 0xE9; + new_insn[1] = (jmp_offset >> 0) & 0xFF; + new_insn[2] = (jmp_offset >> 8) & 0xFF; + new_insn[3] = (jmp_offset >> 16) & 0xFF; + new_insn[4] = (jmp_offset >> 24) & 0xFF; + + smp_text_poke_batch_add(prog->bpf_func + 4, new_insn, 5, NULL); + + if (prog->aux->func_cnt) { + for (int i = 0; i < prog->aux->func_cnt; i++) { + in_place_patch_bpf_prog(prog->aux->func[i]); + } + } else { + in_place_patch_bpf_prog(prog); + } + +} + +void bpf_prog_termination_deferred(struct work_struct *work) +{ + struct bpf_term_aux_states *term_states = container_of(work, struct bpf_term_aux_states, + work); + 
struct bpf_prog *prog = term_states->prog; + + bpf_die(prog); +} + +static struct workqueue_struct *bpf_termination_wq; + +void bpf_softlockup(u32 dur_s) +{ + unsigned long addr; + struct unwind_state state; + struct bpf_prog *prog; + + for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr) + break; + + if (!is_bpf_text_address(addr)) + continue; + + rcu_read_lock(); + prog = bpf_prog_ksym_find(addr); + rcu_read_unlock(); + if (bpf_is_subprog(prog)) + continue; + + if (atomic_cmpxchg(&prog->term_states->bpf_die_in_progress, 0, 1)) + break; + + bpf_termination_wq = alloc_workqueue("bpf_termination_wq", WQ_UNBOUND, 1); + if (!bpf_termination_wq) + pr_err("Failed to alloc workqueue for bpf termination.\n"); + + queue_work(bpf_termination_wq, &prog->term_states->work); + + /* Currently nested programs are not terminated together. + * Removing this break will result in BPF trampolines being + * identified as is_bpf_text_address resulting in NULL ptr + * deref in next step. + */ + break; + } +} + void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, struct bpf_prog *new, struct bpf_prog *old) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index caaee33744fc..03fce8f2c466 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -71,6 +71,7 @@ typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); typedef unsigned int (*bpf_func_t)(const void *, const struct bpf_insn *); + struct bpf_iter_seq_info { const struct seq_operations *seq_ops; bpf_iter_init_seq_priv_t init_seq_private; @@ -1600,6 +1601,7 @@ struct bpf_term_patch_call_sites { struct bpf_term_aux_states { struct bpf_prog *prog; struct work_struct work; + atomic_t bpf_die_in_progress; struct bpf_term_patch_call_sites *patch_call_sites; }; diff --git a/include/linux/filter.h b/include/linux/filter.h index 9092d8ea95c8..4f0f8fe478bf 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1123,6 +1123,8 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len); bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); +void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #define __bpf_call_base_args \ ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ @@ -1257,6 +1259,10 @@ bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_prog_pack_free(void *ptr, u32 size); +void bpf_softlockup(u32 dur_s); +void bpf_prog_termination_deferred(struct work_struct *work); +void bpf_die(struct bpf_prog *prog); +void in_place_patch_bpf_prog(struct bpf_prog *prog); static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 93442ab2acde..7b0552d15be3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -41,6 +41,7 @@ #include #include +#include #include /* Registers */ @@ -95,6 +96,37 @@ enum page_size_enum { __PAGE_SIZE = PAGE_SIZE }; +void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return NULL; +} + +int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx) +{ + return 1; +} + + +void __weak 
in_place_patch_bpf_prog(struct bpf_prog *prog) +{ + return; +} + +void __weak bpf_die(struct bpf_prog *prog) +{ + return; +} + +void __weak bpf_prog_termination_deferred(struct work_struct *work) +{ + return; +} + +void __weak bpf_softlockup(u32 dur_s) +{ + return; +} + struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); @@ -134,11 +166,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); fp->term_states = term_states; + atomic_set(&fp->term_states->bpf_die_in_progress, 0); fp->term_states->patch_call_sites = patch_call_sites; fp->term_states->patch_call_sites->call_sites_cnt = 0; fp->term_states->patch_call_sites->call_states = NULL; fp->term_states->prog = fp; - + INIT_WORK(&fp->term_states->work, bpf_prog_termination_deferred); #ifdef CONFIG_CGROUP_BPF aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID; #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 80b56c002c7f..59c91c18ca0e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -700,6 +701,13 @@ static int is_softlockup(unsigned long touch_ts, if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4)) scx_softlockup(now - touch_ts); + /* + * Long running BPF programs can cause CPUs to stall. + * So trigger fast path termination for such BPF programs. + */ + if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4)) + bpf_softlockup(now - touch_ts); + /* Warn about unreasonable delays. */ if (time_after(now, period_ts + get_softlockup_thresh())) return now - touch_ts; -- 2.43.0 Add a selftest that checks termination of nested bpf_loop calls.
32/1 bpf_termination/bpf_termination:OK 32 bpf_termination:OK Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Raj Sahu Signed-off-by: Siddharth Chintamaneni --- .../bpf/prog_tests/bpf_termination.c | 39 +++++++++++++++ .../selftests/bpf/progs/bpf_termination.c | 47 +++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_termination.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_termination.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_termination.c b/tools/testing/selftests/bpf/prog_tests/bpf_termination.c new file mode 100644 index 000000000000..d060073db8f9 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_termination.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include "bpf_termination.skel.h" + +void test_loop_termination(void) +{ + struct bpf_termination *skel; + int err; + + skel = bpf_termination__open(); + if (!ASSERT_OK_PTR(skel, "bpf_termination__open")) + return; + + err = bpf_termination__load(skel); + if (!ASSERT_OK(err, "bpf_termination__load")) + goto out; + + skel->bss->pid = getpid(); + err = bpf_termination__attach(skel); + if (!ASSERT_OK(err, "bpf_termination__attach")) + goto out; + + /* Triggers long running BPF program */ + socket(AF_UNSPEC, SOCK_DGRAM, 0); + + /* If the program is not terminated, it doesn't reach this point */ + ASSERT_TRUE(true, "Program is terminated"); +out: + bpf_termination__destroy(skel); +} + +void test_bpf_termination(void) +{ + if (test__start_subtest("bpf_termination")) + test_loop_termination(); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_termination.c b/tools/testing/selftests/bpf/progs/bpf_termination.c new file mode 100644 index 000000000000..36e97d84750b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_termination.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +int pid; + +#define LOOPS_CNT 1 << 10 + +static int callback_fn4(void *ctx) { + return 0; +} + +static int callback_fn3(void *ctx) { + + bpf_loop(LOOPS_CNT, callback_fn4, NULL, 0); + return 0; + +} + + +static int callback_fn2(void *ctx) { + + bpf_loop(LOOPS_CNT, callback_fn3, NULL, 0); + return 0; + +} + +static int callback_fn(void *ctx) { + + bpf_loop(LOOPS_CNT, callback_fn2, NULL, 0); + return 0; + +} + +SEC("tp/syscalls/sys_enter_socket") +int bpf_loop_lr(void *ctx) { + + if ((bpf_get_current_pid_tgid() >> 32) != pid) + return 0; + + bpf_loop(LOOPS_CNT, callback_fn, NULL, 0); + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- 2.43.0
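For reference, assuming the usual tools/testing/selftests/bpf build, the new subtest should be runnable with the standard test_progs filter:

  ./test_progs -t bpf_termination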