Compare the fixed btf_record header and each btf_field explicitly
instead of memcmp'ing the whole allocation at once.

This is necessary for the follow-on patches which extend record contents
with data outside fields but part of the record that can't be compared
meaningfully.

The comment is updated to reflect individual field comparison, and
retain useful information about zeroing behavior, while referencing
auxiliary data attached to records as a reason for the individual field
comparison.

The reference to auxiliary data attached to the record will be relevant
with the next patches.

Signed-off-by: Justin Suess <utilityemal77@gmail.com>
---
 kernel/bpf/syscall.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3b1f0ba02f61..2caafce00f24 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -760,7 +760,8 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
 {
 	bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
-	int size;
+	size_t size;
+	int i;
 
 	if (!a_has_fields && !b_has_fields)
 		return true;
@@ -768,7 +769,6 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r
 		return false;
 	if (rec_a->cnt != rec_b->cnt)
 		return false;
-	size = struct_size(rec_a, fields, rec_a->cnt);
 	/* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
 	 * members are zeroed out. So memcmp is safe to do without worrying
 	 * about padding/unused fields.
@@ -780,10 +780,24 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r
 	 *
 	 * So while by default, we don't rely on the map BTF (which the records
 	 * were parsed from) matching for both records, which is not backwards
-	 * compatible, in case list_head is part of it, we implicitly rely on
-	 * that by way of depending on memcmp succeeding for it.
+	 * compatible; in case list_head is part of a record, we implicitly
+	 * rely on that by way of depending on memcmp succeeding for each
+	 * individual field.
+	 *
+	 * Comparing the whole record may be incorrect due to auxiliary data
+	 * attached to the record.
 	 */
-	return !memcmp(rec_a, rec_b, size);
+	size = offsetof(struct btf_record, fields);
+	if (memcmp(rec_a, rec_b, size))
+		return false;
+
+	for (i = 0; i < rec_a->cnt; i++) {
+		if (memcmp(&rec_a->fields[i], &rec_b->fields[i],
+			   sizeof(rec_a->fields[i])))
+			return false;
+	}
+
+	return true;
 }
 
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
-- 
2.53.0

Queue final BTF teardown from the RCU callback onto a rcu_work, making
sure all rcu grace periods cease before proceeding with free work.

Necessary for follow on patches that need to sync an irq_work but aren't
safe in rcu context.

Work queue is per-btf object. Work is flushed during synchronous teardown.

Signed-off-by: Justin Suess <utilityemal77@gmail.com>
---
 kernel/bpf/btf.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 77af44d8a3ad..2b0511663319 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -28,6 +28,7 @@
 #include <linux/string.h>
 #include <linux/sysfs.h>
 #include <linux/overflow.h>
+#include <linux/workqueue.h>
 
 #include <net/netfilter/nf_bpf_link.h>
 
@@ -264,13 +265,13 @@ struct btf {
 	u32 data_size;
 	refcount_t refcnt;
 	u32 id;
-	struct rcu_head rcu;
+	/* Final free runs after an RCU grace period in process context. */
+	struct rcu_work free_work;
 	struct btf_kfunc_set_tab *kfunc_set_tab;
 	struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
 	struct btf_struct_metas *struct_meta_tab;
 	struct btf_struct_ops_tab *struct_ops_tab;
 	struct btf_layout *layout;
-
 	/* split BTF support */
 	struct btf *base_btf;
 	u32 start_id; /* first type ID in this BTF (0 for base BTF) */
@@ -1880,9 +1881,10 @@ static void btf_free(struct btf *btf)
 	kfree(btf);
 }
 
-static void btf_free_rcu(struct rcu_head *rcu)
+static void btf_free_work(struct work_struct *work)
 {
-	struct btf *btf = container_of(rcu, struct btf, rcu);
+	struct rcu_work *rwork = to_rcu_work(work);
+	struct btf *btf = container_of(rwork, struct btf, free_work);
 
 	btf_free(btf);
 }
@@ -1901,7 +1903,7 @@ void btf_put(struct btf *btf)
 {
 	if (btf && refcount_dec_and_test(&btf->refcnt)) {
 		btf_free_id(btf);
-		call_rcu(&btf->rcu, btf_free_rcu);
+		queue_rcu_work(system_wq, &btf->free_work);
 	}
 }
 
@@ -6013,6 +6015,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 		goto errout_free;
 
 	btf_verifier_env_free(env);
+	INIT_RCU_WORK(&btf->free_work, btf_free_work);
 	refcount_set(&btf->refcnt, 1);
 	return btf;
 
@@ -6400,6 +6403,7 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name
 		goto errout;
 
 	btf_check_sorted(btf);
+	INIT_RCU_WORK(&btf->free_work, btf_free_work);
 	refcount_set(&btf->refcnt, 1);
 
 	return btf;
@@ -6535,6 +6539,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data,
 
 	btf_verifier_env_free(env);
 	btf_check_sorted(btf);
+	INIT_RCU_WORK(&btf->free_work, btf_free_work);
 	refcount_set(&btf->refcnt, 1);
 	return btf;
 
@@ -8446,6 +8451,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
 				sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
 			purge_cand_cache(btf_mod->btf);
 			btf_put(btf_mod->btf);
+			flush_rcu_work(&btf_mod->btf->free_work);
 			kfree(btf_mod->sysfs_attr);
 			kfree(btf_mod);
 			break;
-- 
2.53.0

Defer freeing of referenced kptrs using irq_work queue.

This fixes a deadlock in BPF tracing programs running under NMI.

Each kptr is tagged with an auxiliary data field storing an llist_node
and a pointer to the object to be freed. These are assembled together
to form a queue for deletion outside NMI.

Add a field to each data structure capable of holding referenced kptrs
to store the llist_head, as well as an irq_work struct to the btf kptr
field to store the task callback.

The llist_nodes are linked in the queue safely, allowing them to be torn
down once NMI is over.

This irq_work struct is foribly synchronized on btf teardown, enabled by
the change in btf cleanup code introduced in the previous commit, adding
the rcu_work teardown.

At dtor time, if the execution is in an nmi context, enqueue the
referenced kptr nodes in the llist_head and enqueue a job to drain the
list, calling the respective dtor callback from a safe context.

If running outside nmi, use synchronous dtor path.

This touches arraymap, hashtab, and bpf local storage. It's important to
note however, that the bpf_local_storage code rejects nmi updates
already, the code changes in that case are just to accommodate the changes
to the record extending the kptr.

Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Reported-by: Justin Suess <utilityemal77@gmail.com>
Closes: https://lore.kernel.org/bpf/20260421201035.1729473-1-utilityemal77@gmail.com/
Signed-off-by: Justin Suess <utilityemal77@gmail.com>
---
 include/linux/bpf.h            |  69 ++++++++++++
 kernel/bpf/arraymap.c          |  36 ++++++-
 kernel/bpf/bpf_local_storage.c |  13 ++-
 kernel/bpf/btf.c               |   6 +-
 kernel/bpf/hashtab.c           | 181 +++++++++++++++++++++++++++----
 kernel/bpf/syscall.c           | 190 +++++++++++++++++++++++++++++++--
 6 files changed, 456 insertions(+), 39 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 715b6df9c403..037bdadbed96 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -9,6 +9,8 @@
 
 #include <crypto/sha2.h>
 #include <linux/workqueue.h>
+#include <linux/irq_work.h>
+#include <linux/llist.h>
 #include <linux/file.h>
 #include <linux/percpu.h>
 #include <linux/err.h>
@@ -234,6 +236,10 @@ struct btf_field_kptr {
 	 * program-allocated, dtor is NULL,  and __bpf_obj_drop_impl is used
 	 */
 	btf_dtor_kfunc_t dtor;
+	struct irq_work irq_work;
+	struct llist_head irq_work_items;
+	struct llist_head free_list;
+	u32 aux_off;
 	u32 btf_id;
 };
 
@@ -257,6 +263,7 @@ struct btf_field {
 struct btf_record {
 	u32 cnt;
 	u32 field_mask;
+	u32 kptr_ref_aux_size;
 	int spin_lock_off;
 	int res_spin_lock_off;
 	int timer_off;
@@ -266,6 +273,67 @@ struct btf_record {
 	struct btf_field fields[];
 };
 
+struct bpf_kptr_dtor_aux {
+	struct llist_node node;
+	void *ptr;
+};
+
+static inline struct bpf_kptr_dtor_aux *
+bpf_kptr_ref_aux(const struct btf_field *field, void *value)
+{
+	return value + field->kptr.aux_off;
+}
+
+static inline void bpf_kptr_aux_init_field(struct btf_field *field, u32 *aux_off)
+{
+	if (field->type != BPF_KPTR_REF)
+		return;
+
+	field->kptr.aux_off = *aux_off;
+	*aux_off += sizeof(struct bpf_kptr_dtor_aux);
+}
+
+static inline void bpf_kptr_aux_init_value(const struct btf_record *rec, void *value)
+{
+	int i;
+
+	if (IS_ERR_OR_NULL(rec) || !rec->kptr_ref_aux_size)
+		return;
+
+	for (i = 0; i < rec->cnt; i++) {
+		struct bpf_kptr_dtor_aux *aux;
+
+		if (rec->fields[i].type != BPF_KPTR_REF)
+			continue;
+
+		aux = bpf_kptr_ref_aux(&rec->fields[i], value);
+		init_llist_node(&aux->node);
+		aux->ptr = NULL;
+	}
+}
+
+static inline bool bpf_kptr_ref_has_deferred_dtor(const struct btf_record *rec,
+						  void *value)
+{
+	int i;
+
+	if (IS_ERR_OR_NULL(rec) || !rec->kptr_ref_aux_size)
+		return false;
+
+	for (i = 0; i < rec->cnt; i++) {
+		struct bpf_kptr_dtor_aux *aux;
+
+		if (rec->fields[i].type != BPF_KPTR_REF)
+			continue;
+
+		aux = bpf_kptr_ref_aux(&rec->fields[i], value);
+		if (READ_ONCE(aux->ptr))
+			return true;
+	}
+
+	return false;
+}
+
 /* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h */
 struct bpf_rb_node_kern {
 	struct rb_node rb_node;
@@ -2602,6 +2670,7 @@ void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj);
 void bpf_obj_free_task_work(const struct btf_record *rec, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);
+int bpf_map_attr_ref_kptr_aux_size(const union bpf_attr *attr);
 
 struct bpf_map *bpf_map_get(u32 ufd);
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 5e25e0353509..919861b553c2 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -54,6 +54,7 @@ int array_map_alloc_check(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
+	int aux_size;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -74,8 +75,12 @@ int array_map_alloc_check(union bpf_attr *attr)
 	/* avoid overflow on round_up(map->value_size) */
 	if (attr->value_size > INT_MAX)
 		return -E2BIG;
+	aux_size = bpf_map_attr_ref_kptr_aux_size(attr);
+	if (aux_size < 0)
+		return aux_size;
 	/* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
-	if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
+	if (percpu &&
+	    round_up(attr->value_size, 8) + aux_size > PCPU_MIN_UNIT_SIZE)
 		return -E2BIG;
 
 	return 0;
@@ -89,8 +94,13 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL);
 	u64 array_size, mask64;
 	struct bpf_array *array;
+	int aux_size;
 
 	elem_size = round_up(attr->value_size, 8);
+	aux_size = bpf_map_attr_ref_kptr_aux_size(attr);
+	if (aux_size < 0)
+		return ERR_PTR(aux_size);
+	elem_size += aux_size;
 
 	max_entries = attr->max_entries;
 
@@ -205,7 +215,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u64 base = (unsigned long)array->value;
-	u64 range = array->elem_size;
+	u64 range = map->value_size;
 
 	if (map->max_entries != 1)
 		return -ENOTSUPP;
@@ -553,6 +563,9 @@ static int array_map_check_btf(struct bpf_map *map,
 			       const struct btf_type *key_type,
 			       const struct btf_type *value_type)
 {
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
 	/* One exception for keyless BTF: .bss/.data/.rodata map */
 	if (btf_type_is_void(key_type)) {
 		if (map->map_type != BPF_MAP_TYPE_ARRAY ||
@@ -572,6 +585,25 @@ static int array_map_check_btf(struct bpf_map *map,
 	if (!btf_type_is_i32(key_type))
 		return -EINVAL;
 
+	if (!IS_ERR_OR_NULL(map->record) && map->record->kptr_ref_aux_size) {
+		if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+			for (i = 0; i < array->map.max_entries; i++) {
+				void __percpu *pptr =
+					array->pptrs[i & array->index_mask];
+				int cpu;
+
+				for_each_possible_cpu(cpu)
+					bpf_kptr_aux_init_value(
+						map->record,
+						per_cpu_ptr(pptr, cpu));
+			}
+		} else {
+			for (i = 0; i < array->map.max_entries; i++)
+				bpf_kptr_aux_init_value(map->record,
+							array_map_elem_ptr(array, i));
+		}
+	}
+
 	return 0;
 }
 
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 6fc6a4b672b5..8b0be9612f20 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -81,6 +81,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	if (selem) {
 		RCU_INIT_POINTER(SDATA(selem)->smap, smap);
 		atomic_set(&selem->state, 0);
+		bpf_kptr_aux_init_value(smap->map.record, SDATA(selem)->data);
 
 		if (value) {
 			/* No need to call check_and_init_map_value as memory is zero init */
@@ -800,14 +801,20 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
 		raw_res_spin_lock_init(&smap->buckets[i].lock);
 	}
 
-	smap->elem_size = offsetof(struct bpf_local_storage_elem,
-				   sdata.data[attr->value_size]);
+	smap->elem_size = offsetof(
+		struct bpf_local_storage_elem,
+		sdata.data[attr->value_size]);
+	err = bpf_map_attr_ref_kptr_aux_size(attr);
+	if (err < 0)
+		goto free_buckets;
+	smap->elem_size += err;
 
 	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
 	return &smap->map;
 
-free_smap:
+free_buckets:
 	kvfree(smap->buckets);
+free_smap:
 	bpf_map_area_free(smap);
 	return ERR_PTR(err);
 }
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2b0511663319..a82a52aa7293 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4074,7 +4074,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 				    u32 field_mask, u32 value_size)
 {
 	struct btf_field_info info_arr[BTF_FIELDS_MAX];
-	u32 next_off = 0, field_type_size;
+	u32 next_off = 0, value_data_size, aux_off, field_type_size;
 	struct btf_record *rec;
 	int ret, i, cnt;
 
@@ -4098,6 +4098,8 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 	rec->wq_off = -EINVAL;
 	rec->refcount_off = -EINVAL;
 	rec->task_work_off = -EINVAL;
+	value_data_size = round_up(value_size, 8);
+	aux_off = value_data_size;
 	for (i = 0; i < cnt; i++) {
 		field_type_size = btf_field_type_size(info_arr[i].type);
 		if (info_arr[i].off + field_type_size > value_size) {
@@ -4171,8 +4173,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			ret = -EFAULT;
 			goto end;
 		}
+		bpf_kptr_aux_init_field(&rec->fields[i], &aux_off);
 		rec->cnt++;
 	}
+	rec->kptr_ref_aux_size = aux_off - value_data_size;
 
 	if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) {
 		ret = -EINVAL;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3dd9b4924ae4..c3ad371948c3 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -86,6 +86,8 @@ struct bpf_htab {
 	struct bpf_map map;
 	struct bpf_mem_alloc ma;
 	struct bpf_mem_alloc pcpu_ma;
+	struct irq_work nmi_free_irq_work;
+	struct llist_head nmi_free_elems;
 	struct bucket *buckets;
 	void *elems;
 	union {
@@ -100,6 +102,7 @@ struct bpf_htab {
 	atomic_t count;
 	bool use_percpu_counter;
 	u32 n_buckets;	/* number of hash buckets */
+	u32 kptr_ref_aux_size;
 	u32 elem_size;	/* size of each element in bytes */
 	u32 hashrnd;
 };
@@ -130,6 +133,8 @@ struct htab_btf_record {
 	u32 key_size;
 };
 
+static void htab_nmi_free_irq_work(struct irq_work *work);
+
 static inline bool htab_is_prealloc(const struct bpf_htab *htab)
 {
 	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
@@ -328,7 +333,8 @@ static int prealloc_init(struct bpf_htab *htab)
 		goto skip_percpu_elems;
 
 	for (i = 0; i < num_entries; i++) {
-		u32 size = round_up(htab->map.value_size, 8);
+		u32 size = round_up(htab->map.value_size, 8) +
+			   htab->kptr_ref_aux_size;
 		void __percpu *pptr;
 
 		pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
@@ -419,6 +425,7 @@ static int htab_map_alloc_check(union bpf_attr *attr)
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
 	int numa_node = bpf_map_attr_numa_node(attr);
+	int aux_size;
 
 	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
 		     offsetof(struct htab_elem, hash_node.pprev));
@@ -447,8 +454,12 @@ static int htab_map_alloc_check(union bpf_attr *attr)
 	    attr->value_size == 0)
 		return -EINVAL;
 
-	if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
-	   sizeof(struct htab_elem))
+	aux_size = bpf_map_attr_ref_kptr_aux_size(attr);
+	if (aux_size < 0)
+		return aux_size;
+
+	if ((u64)attr->key_size + round_up(attr->value_size, 8) + aux_size >=
+	    KMALLOC_MAX_SIZE - sizeof(struct htab_elem))
 		/* if key_size + value_size is bigger, the user space won't be
 		 * able to access the elements via bpf syscall. This check
 		 * also makes sure that the elem_size doesn't overflow and it's
@@ -456,7 +467,8 @@ static int htab_map_alloc_check(union bpf_attr *attr)
 		 */
 		return -E2BIG;
 	/* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
-	if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
+	if (percpu &&
+	    round_up(attr->value_size, 8) + aux_size > PCPU_MIN_UNIT_SIZE)
 		return -E2BIG;
 
 	return 0;
@@ -526,6 +538,33 @@ static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf,
 			      const struct btf_type *key_type, const struct btf_type *value_type)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	u32 num_entries = htab->map.max_entries;
+	int i;
+
+	if (htab_is_prealloc(htab) && !IS_ERR_OR_NULL(map->record) &&
+	    map->record->kptr_ref_aux_size) {
+		if (htab_has_extra_elems(htab))
+			num_entries += num_possible_cpus();
+		for (i = 0; i < num_entries; i++) {
+			struct htab_elem *elem = get_htab_elem(htab, i);
+
+			if (htab_is_percpu(htab)) {
+				void __percpu *pptr = htab_elem_get_ptr(
+					elem, htab->map.key_size);
+				int cpu;
+
+				for_each_possible_cpu(cpu)
+					bpf_kptr_aux_init_value(
+						map->record,
+						per_cpu_ptr(pptr, cpu));
+			} else {
+				void *value = htab_elem_value(
+					elem, htab->map.key_size);
+
+				bpf_kptr_aux_init_value(map->record, value);
+			}
+		}
+	}
 
 	if (htab_is_prealloc(htab))
 		return 0;
@@ -551,6 +590,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 	struct bpf_htab *htab;
+	int aux_size;
 	int err;
 
 	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
@@ -558,6 +598,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(-ENOMEM);
 
 	bpf_map_init_from_attr(&htab->map, attr);
+	init_irq_work(&htab->nmi_free_irq_work, htab_nmi_free_irq_work);
+	init_llist_head(&htab->nmi_free_elems);
 
 	if (percpu_lru) {
 		/* ensure each CPU's lru list has >=1 elements.
@@ -582,10 +624,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 
 	htab->elem_size = sizeof(struct htab_elem) +
 			  round_up(htab->map.key_size, 8);
+	aux_size = bpf_map_attr_ref_kptr_aux_size(attr);
+	if (aux_size < 0) {
+		err = aux_size;
+		goto free_htab;
+	}
+	htab->kptr_ref_aux_size = aux_size;
 	if (percpu)
 		htab->elem_size += sizeof(void *);
 	else
-		htab->elem_size += round_up(htab->map.value_size, 8);
+		htab->elem_size += round_up(htab->map.value_size, 8) +
+					    aux_size;
 
 	/* check for u32 overflow */
 	if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
@@ -648,7 +697,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 			goto free_map_locked;
 		if (percpu) {
 			err = bpf_mem_alloc_init(&htab->pcpu_ma,
-						 round_up(htab->map.value_size, 8), true);
+						 round_up(htab->map.value_size, 8) + aux_size,
+						 true);
 			if (err)
 				goto free_map_locked;
 		}
@@ -834,22 +884,74 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
 	return insn - insn_buf;
 }
 
-static void check_and_free_fields(struct bpf_htab *htab,
-				  struct htab_elem *elem)
+static bool check_and_free_fields(struct bpf_htab *htab, struct htab_elem *elem)
 {
+	bool deferred = false;
+
 	if (IS_ERR_OR_NULL(htab->map.record))
-		return;
+		return false;
 
 	if (htab_is_percpu(htab)) {
 		void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
 		int cpu;
 
-		for_each_possible_cpu(cpu)
-			bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+		for_each_possible_cpu(cpu) {
+			void *value = per_cpu_ptr(pptr, cpu);
+
+			bpf_obj_free_fields(htab->map.record, value);
+			if (in_nmi() &&
+			    bpf_kptr_ref_has_deferred_dtor(htab->map.record, value))
+				deferred = true;
+		}
 	} else {
 		void *map_value = htab_elem_value(elem, htab->map.key_size);
 
 		bpf_obj_free_fields(htab->map.record, map_value);
+		if (in_nmi() &&
+		    bpf_kptr_ref_has_deferred_dtor(htab->map.record, map_value))
+			deferred = true;
+	}
+	return deferred;
+}
+
+static void htab_nmi_queue_free(struct bpf_htab *htab, struct htab_elem *elem)
+{
+	if (llist_add((struct llist_node *)&elem->fnode, &htab->nmi_free_elems))
+		irq_work_queue(&htab->nmi_free_irq_work);
+}
+
+static void htab_elem_free_nofields(struct bpf_htab *htab, struct htab_elem *l)
+{
+	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
+		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
+	bpf_mem_cache_free(&htab->ma, l);
+}
+
+static void htab_nmi_free_irq_work(struct irq_work *work)
+{
+	struct bpf_htab *htab =
+		container_of(work, struct bpf_htab, nmi_free_irq_work);
+	struct llist_node *node, *tmp, *list;
+
+	list = llist_del_all(&htab->nmi_free_elems);
+	if (!list)
+		return;
+
+	list = llist_reverse_order(list);
+	llist_for_each_safe(node, tmp, list) {
+		struct htab_elem *elem;
+
+		elem = container_of((struct pcpu_freelist_node *)node,
+				    struct htab_elem, fnode);
+		if (htab_is_prealloc(htab)) {
+			if (htab_is_lru(htab))
+				bpf_lru_push_free(&htab->lru, &elem->lru_node);
+			else
+				pcpu_freelist_push(&htab->freelist,
+						   &elem->fnode);
+		} else {
+			htab_elem_free_nofields(htab, elem);
+		}
 	}
 }
 
@@ -1002,11 +1104,16 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 
 	if (htab_is_prealloc(htab)) {
 		bpf_map_dec_elem_count(&htab->map);
-		check_and_free_fields(htab, l);
-		pcpu_freelist_push(&htab->freelist, &l->fnode);
+		if (check_and_free_fields(htab, l))
+			htab_nmi_queue_free(htab, l);
+		else
+			pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		dec_elem_count(htab);
-		htab_elem_free(htab, l);
+		if (check_and_free_fields(htab, l))
+			htab_nmi_queue_free(htab, l);
+		else
+			htab_elem_free_nofields(htab, l);
 	}
 }
 
@@ -1082,12 +1189,23 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 
 	if (prealloc) {
 		if (old_elem) {
-			/* if we're updating the existing element,
-			 * use per-cpu extra elems to avoid freelist_pop/push
-			 */
-			pl_new = this_cpu_ptr(htab->extra_elems);
-			l_new = *pl_new;
-			*pl_new = old_elem;
+			if (in_nmi() && htab->kptr_ref_aux_size) {
+				struct pcpu_freelist_node *l;
+
+				l = __pcpu_freelist_pop(&htab->freelist);
+				if (!l)
+					return ERR_PTR(-E2BIG);
+				l_new = container_of(l, struct htab_elem,
+						     fnode);
+			} else {
+				/*
+				 * If updating an existing element, use per-cpu
+				 * extra elems to avoid freelist_pop/push.
+				 */
+				pl_new = this_cpu_ptr(htab->extra_elems);
+				l_new = *pl_new;
+				*pl_new = old_elem;
+			}
 		} else {
 			struct pcpu_freelist_node *l;
 
@@ -1131,6 +1249,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			pptr = *(void __percpu **)ptr;
 		}
 
+		if (htab->kptr_ref_aux_size) {
+			int cpu;
+
+			for_each_possible_cpu(cpu)
+				bpf_kptr_aux_init_value(
+					htab->map.record,
+					per_cpu_ptr(pptr, cpu));
+		}
+
 		pcpu_init_value(htab, pptr, value, onallcpus, map_flags);
 
 		if (!prealloc)
@@ -1139,10 +1266,14 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 		size = round_up(size, 8);
 		memcpy(htab_elem_value(l_new, key_size), value, size);
 	} else if (map_flags & BPF_F_LOCK) {
+		bpf_kptr_aux_init_value(htab->map.record,
+					htab_elem_value(l_new, key_size));
 		copy_map_value_locked(&htab->map,
 				      htab_elem_value(l_new, key_size),
 				      value, false);
 	} else {
+		bpf_kptr_aux_init_value(htab->map.record,
+					htab_elem_value(l_new, key_size));
 		copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
 	}
 
@@ -1270,7 +1401,11 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 
 static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
 {
-	check_and_free_fields(htab, elem);
+	if (check_and_free_fields(htab, elem)) {
+		bpf_map_dec_elem_count(&htab->map);
+		htab_nmi_queue_free(htab, elem);
+		return;
+	}
 	bpf_map_dec_elem_count(&htab->map);
 	bpf_lru_push_free(&htab->lru, &elem->lru_node);
 }
@@ -1634,6 +1769,7 @@ static void htab_map_free(struct bpf_map *map)
 	 * underneath and is responsible for waiting for callbacks to finish
 	 * during bpf_mem_alloc_destroy().
 	 */
+	irq_work_sync(&htab->nmi_free_irq_work);
 	if (!htab_is_prealloc(htab)) {
 		delete_all_elements(htab);
 	} else {
@@ -2316,7 +2452,8 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
 static u64 htab_map_mem_usage(const struct bpf_map *map)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	u32 value_size = round_up(htab->map.value_size, 8);
+	u32 value_size = round_up(htab->map.value_size, 8) +
+			 htab->kptr_ref_aux_size;
 	bool prealloc = htab_is_prealloc(htab);
 	bool percpu = htab_is_percpu(htab);
 	bool lru = htab_is_lru(htab);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2caafce00f24..f26c8ed81690 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -661,12 +661,93 @@ struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
 	return field;
 }
 
+static void bpf_kptr_call_dtor(const struct btf_field *field, void *ptr)
+{
+	struct btf_struct_meta *pointee_struct_meta;
+
+	if (!btf_is_kernel(field->kptr.btf)) {
+		pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
+							   field->kptr.btf_id);
+		__bpf_obj_drop_impl(ptr,
+				    pointee_struct_meta ?
+					    pointee_struct_meta->record :
+					    NULL,
+				    false);
+		return;
+	}
+
+	field->kptr.dtor(ptr);
+}
+
+static void bpf_kptr_ref_process_queue(struct btf_field *field)
+{
+	struct llist_node *node, *tmp, *list;
+
+	list = llist_del_all(&field->kptr.irq_work_items);
+	if (!list)
+		return;
+
+	list = llist_reverse_order(list);
+	llist_for_each_safe(node, tmp, list) {
+		struct bpf_kptr_dtor_aux *aux;
+		void *ptr;
+
+		aux = container_of(node, struct bpf_kptr_dtor_aux, node);
+		ptr = xchg(&aux->ptr, NULL);
+		if (!ptr)
+			continue;
+		bpf_kptr_call_dtor(field, ptr);
+	}
+}
+
+static void bpf_kptr_ref_irq_work(struct irq_work *irq_work)
+{
+	struct btf_field_kptr *kptr =
+		container_of(irq_work, struct btf_field_kptr, irq_work);
+	struct btf_field *field = container_of(kptr, struct btf_field, kptr);
+
+	bpf_kptr_ref_process_queue(field);
+}
+
+static void bpf_kptr_record_init(struct btf_record *rec)
+{
+	int i;
+
+	if (IS_ERR_OR_NULL(rec))
+		return;
+
+	for (i = 0; i < rec->cnt; i++) {
+		if (rec->fields[i].type != BPF_KPTR_REF)
+			continue;
+		init_irq_work(&rec->fields[i].kptr.irq_work,
+			      bpf_kptr_ref_irq_work);
+		init_llist_head(&rec->fields[i].kptr.irq_work_items);
+		init_llist_head(&rec->fields[i].kptr.free_list);
+	}
+}
+
+static void bpf_kptr_record_flush(struct btf_record *rec)
+{
+	int i;
+
+	if (IS_ERR_OR_NULL(rec))
+		return;
+
+	for (i = 0; i < rec->cnt; i++) {
+		if (rec->fields[i].type != BPF_KPTR_REF)
+			continue;
+		irq_work_sync(&rec->fields[i].kptr.irq_work);
+		bpf_kptr_ref_process_queue(&rec->fields[i]);
+	}
+}
+
 void btf_record_free(struct btf_record *rec)
 {
 	int i;
 
 	if (IS_ERR_OR_NULL(rec))
 		return;
+	bpf_kptr_record_flush(rec);
 	for (i = 0; i < rec->cnt; i++) {
 		switch (rec->fields[i].type) {
 		case BPF_KPTR_UNREF:
@@ -751,6 +832,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
 		}
 		new_rec->cnt++;
 	}
+	bpf_kptr_record_init(new_rec);
 	return new_rec;
 free:
 	btf_record_free(new_rec);
@@ -792,14 +874,79 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r
 		return false;
 
 	for (i = 0; i < rec_a->cnt; i++) {
-		if (memcmp(&rec_a->fields[i], &rec_b->fields[i],
-			   sizeof(rec_a->fields[i])))
+		struct btf_field a = rec_a->fields[i];
+		struct btf_field b = rec_b->fields[i];
+
+		switch (a.type) {
+		case BPF_KPTR_UNREF:
+		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
+		case BPF_UPTR:
+			memset(&a.kptr.irq_work, 0, sizeof(a.kptr.irq_work));
+			memset(&a.kptr.irq_work_items, 0,
+			       sizeof(a.kptr.irq_work_items));
+			memset(&a.kptr.free_list, 0, sizeof(a.kptr.free_list));
+			memset(&b.kptr.irq_work, 0, sizeof(b.kptr.irq_work));
+			memset(&b.kptr.irq_work_items, 0,
+			       sizeof(b.kptr.irq_work_items));
+			memset(&b.kptr.free_list, 0, sizeof(b.kptr.free_list));
+			break;
+		default:
+			break;
+		}
+
+		if (memcmp(&a, &b, sizeof(a)))
 			return false;
 	}
 
 	return true;
 }
 
+int bpf_map_attr_ref_kptr_aux_size(const union bpf_attr *attr)
+{
+	const struct btf_type *value_type;
+	struct btf_record *rec;
+	struct btf *btf;
+	u32 btf_value_type_id;
+	u32 value_size;
+	int aux_size;
+
+	if (!attr->btf_value_type_id)
+		return 0;
+
+	btf = btf_get_by_fd(attr->btf_fd);
+	if (IS_ERR(btf))
+		return 0;
+
+	btf_value_type_id = attr->btf_value_type_id;
+	value_type = btf_type_id_size(btf, &btf_value_type_id, &value_size);
+	if (!value_type || value_size != attr->value_size) {
+		aux_size = 0;
+		goto out;
+	}
+
+	/*
+	 * This helper is only sizing hidden storage for valid ref-kptr fields.
+	 * Leave full BTF validation to the regular map_check_btf() path.
+	 */
+	if (!__btf_type_is_struct(value_type) &&
+	    BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) {
+		aux_size = 0;
+		goto out;
+	}
+
+	rec = btf_parse_fields(btf, value_type, BPF_KPTR_REF, attr->value_size);
+	if (IS_ERR(rec)) {
+		aux_size = 0;
+		goto out;
+	}
+	aux_size = rec ? rec->kptr_ref_aux_size : 0;
+	btf_record_free(rec);
+out:
+	btf_put(btf);
+	return aux_size;
+}
+
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
 {
 	if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
@@ -830,8 +977,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 		return;
 	fields = rec->fields;
 	for (i = 0; i < rec->cnt; i++) {
-		struct btf_struct_meta *pointee_struct_meta;
-		const struct btf_field *field = &fields[i];
+		struct btf_field *field = (struct btf_field *)&fields[i];
 		void *field_ptr = obj + field->offset;
 		void *xchgd_field;
 
@@ -857,14 +1003,35 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			if (!xchgd_field)
 				break;
 
-			if (!btf_is_kernel(field->kptr.btf)) {
-				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
-									   field->kptr.btf_id);
-				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
-								 pointee_struct_meta->record : NULL,
-								 fields[i].type == BPF_KPTR_PERCPU);
+			if (field->type == BPF_KPTR_REF && in_nmi()) {
+				struct bpf_kptr_dtor_aux *aux;
+
+				aux = bpf_kptr_ref_aux(field, obj);
+				WARN_ON_ONCE(READ_ONCE(aux->ptr));
+				WRITE_ONCE(aux->ptr, xchgd_field);
+				if (llist_add(&aux->node,
+					      &field->kptr.irq_work_items))
+					irq_work_queue(&field->kptr.irq_work);
+				break;
+			}
+
+			if (field->type == BPF_KPTR_PERCPU) {
+				struct btf_struct_meta *pointee_struct_meta;
+
+				pointee_struct_meta = NULL;
+				if (!btf_is_kernel(field->kptr.btf))
+					pointee_struct_meta =
+						btf_find_struct_meta(
+							field->kptr.btf,
+							field->kptr.btf_id);
+				__bpf_obj_drop_impl(
+					xchgd_field,
+					pointee_struct_meta ?
+						pointee_struct_meta->record :
+						NULL,
+					true);
 			} else {
-				field->kptr.dtor(xchgd_field);
+				bpf_kptr_call_dtor(field, xchgd_field);
 			}
 			break;
 		case BPF_UPTR:
@@ -1276,6 +1443,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 				       BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR |
 				       BPF_TASK_WORK,
 				       map->value_size);
+	bpf_kptr_record_init(map->record);
 	if (!IS_ERR_OR_NULL(map->record)) {
 		int i;
 
-- 
2.53.0

Add a deadlock reproducer for task struct kptrs.

Test artificially triggers NMI events while freeing the last
reference to a task.

Verify that program completes successfully without deadlocking.

Test for array and hash map types.

Note that this test intentionally does racy operations between
userspace and BPF. Some error codes were left unused in case this
test is to be extended to other map types with different ordering
semantics.

Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Assisted-by: GPT-4.5-Medium github-copilot-cli
Signed-off-by: Justin Suess <utilityemal77@gmail.com>
---
 .../prog_tests/task_kptr_nmi_deadlock_repro.c | 305 ++++++++++++++++++
 .../bpf/progs/task_kptr_nmi_deadlock_repro.c  | 217 +++++++++++++
 2 files changed, 522 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/task_kptr_nmi_deadlock_repro.c
 create mode 100644 tools/testing/selftests/bpf/progs/task_kptr_nmi_deadlock_repro.c

diff --git a/tools/testing/selftests/bpf/prog_tests/task_kptr_nmi_deadlock_repro.c b/tools/testing/selftests/bpf/prog_tests/task_kptr_nmi_deadlock_repro.c
new file mode 100644
index 000000000000..9f99e9a0a138
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/task_kptr_nmi_deadlock_repro.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/perf_event.h>
+#include <sched.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <bpf/btf.h>
+#include <test_progs.h>
+
+#include "task_kptr_nmi_deadlock_repro.skel.h"
+
+#define STASHED_TASKS 4
+#define DELETE_TIMEOUT_NS (5ULL * 1000 * 1000 * 1000)
+#define REPRO_ROUNDS 256
+
+enum task_kptr_nmi_map_type {
+	TASK_KPTR_NMI_MAP_HASH = 1,
+	TASK_KPTR_NMI_MAP_ARRAY,
+};
+
+enum task_kptr_nmi_err {
+	TASK_KPTR_NMI_ACQUIRE_ERR = 1,
+	TASK_KPTR_NMI_CREATE_ERR,
+	TASK_KPTR_NMI_LOOKUP_ERR,
+	TASK_KPTR_NMI_MAP_ERR,
+};
+
+struct task_kptr_nmi_repro_case {
+	const char *name;
+	__u32 map_type;
+};
+
+static int find_test_cpu(void)
+{
+	cpu_set_t cpuset;
+	int cpu, err;
+
+	err = sched_getaffinity(0, sizeof(cpuset), &cpuset);
+	if (!ASSERT_OK(err, "sched_getaffinity"))
+		return -1;
+
+	for (cpu = 1; cpu < CPU_SETSIZE; cpu++) {
+		if (CPU_ISSET(cpu, &cpuset))
+			return cpu;
+	}
+
+	ASSERT_TRUE(false, "cpu_available");
+	return -1;
+}
+
+static int open_nmi_pmu_event_on_cpu(int cpu)
+{
+	struct perf_event_attr attr = {
+		.size = sizeof(attr),
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+		.freq = 1,
+		.sample_freq = 1000,
+	};
+	int pmu_fd;
+
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, cpu,
+			 -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
+	if (pmu_fd == -1) {
+		if (errno == ENOENT || errno == EOPNOTSUPP) {
+			printf("SKIP:no PERF_COUNT_HW_CPU_CYCLES\n");
+			test__skip();
+		}
+		return -1;
+	}
+
+	return pmu_fd;
+}
+
+static bool has_nmi_handler_btf(void)
+{
+	struct btf *btf;
+	int id;
+
+	btf = btf__load_vmlinux_btf();
+	if (libbpf_get_error(btf)) {
+		printf("SKIP:no vmlinux BTF\n");
+		test__skip();
+		return false;
+	}
+
+	id = btf__find_by_name_kind(btf, "nmi_handler", BTF_KIND_FUNC);
+	btf__free(btf);
+	if (id <= 0) {
+		printf("SKIP:no BTF FUNC nmi_handler\n");
+		test__skip();
+		return false;
+	}
+
+	return true;
+}
+
+static bool pin_to_cpu(int cpu, cpu_set_t *old_cpuset)
+{
+	cpu_set_t cpuset;
+	int err;
+
+	err = sched_getaffinity(0, sizeof(*old_cpuset), old_cpuset);
+	if (!ASSERT_OK(err, "sched_getaffinity"))
+		return false;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+	err = sched_setaffinity(0, sizeof(cpuset), &cpuset);
+	if (!ASSERT_OK(err, "sched_setaffinity"))
+		return false;
+
+	return true;
+}
+
+static void restore_affinity(const cpu_set_t *old_cpuset)
+{
+	ASSERT_OK(sched_setaffinity(0, sizeof(*old_cpuset), old_cpuset),
+		  "restore_affinity");
+}
+
+static bool stash_exited_tasks(struct task_kptr_nmi_deadlock_repro *skel)
+{
+	int i, status;
+
+	for (i = 0; i < STASHED_TASKS; i++) {
+		int pipefd[2];
+		char sync;
+		pid_t child_pid;
+
+		if (!ASSERT_OK(pipe2(pipefd, O_CLOEXEC), "pipe2"))
+			return false;
+
+		child_pid = fork();
+		if (!ASSERT_GT(child_pid, -1, "fork")) {
+			close(pipefd[0]);
+			close(pipefd[1]);
+			return false;
+		}
+
+		if (child_pid == 0) {
+			char sync;
+			int fd;
+
+			close(pipefd[1]);
+			if (read(pipefd[0], &sync, 1) != 1)
+				_exit(127);
+			close(pipefd[0]);
+			fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
+			if (fd < 0)
+				_exit(127);
+			close(fd);
+			_exit(0);
+		}
+
+		close(pipefd[0]);
+		skel->bss->task_kptr_nmi_pids[i] = child_pid;
+
+		sync = 1;
+		if (!ASSERT_EQ(write(pipefd[1], &sync, 1), 1, "start_child")) {
+			close(pipefd[1]);
+			waitpid(child_pid, &status, 0);
+			return false;
+		}
+		close(pipefd[1]);
+
+		if (!ASSERT_EQ(waitpid(child_pid, &status, 0), child_pid,
+			       "waitpid"))
+			return false;
+		if (!ASSERT_TRUE(WIFEXITED(status), "child_exited"))
+			return false;
+		if (!ASSERT_EQ(WEXITSTATUS(status), 0, "child_status"))
+			return false;
+	}
+
+	return true;
+}
+
+static bool
+wait_for_task_nmi_delete_batch(struct task_kptr_nmi_deadlock_repro *skel,
+			       int expected_deleted)
+{
+	u64 now_ns, timeout_time_ns;
+	unsigned long burn = 0;
+	int i;
+
+	now_ns = get_time_ns();
+	timeout_time_ns = now_ns + DELETE_TIMEOUT_NS;
+	for (i = 0; skel->bss->task_kptr_nmi_deleted < expected_deleted; i++) {
+		int j;
+
+		if (skel->bss->task_kptr_nmi_delete_err)
+			break;
+		for (j = 0; j < 1000000; j++)
+			burn += j + i;
+		now_ns = get_time_ns();
+		if (now_ns >= timeout_time_ns)
+			break;
+	}
+
+	if (!ASSERT_EQ(skel->bss->task_kptr_nmi_delete_err, 0,
+		       "task_kptr_nmi_delete_err"))
+		return false;
+	if (!ASSERT_GE(skel->bss->task_kptr_nmi_deleted, expected_deleted,
+		       "task_kptr_nmi_deleted"))
+		return false;
+	if (!ASSERT_LT(now_ns, timeout_time_ns, "task_kptr_nmi_delete_timeout"))
+		return false;
+
+	return true;
+}
+
+static void run_task_kptr_nmi_deadlock_repro_case(const struct task_kptr_nmi_repro_case *test)
+{
+	struct task_kptr_nmi_deadlock_repro *skel;
+	cpu_set_t old_cpuset;
+	bool pinned = false;
+	__u32 expected_deleted = 0;
+	int cpu = -1;
+	int pmu_fd = -1;
+	int err, round;
+
+	if (!has_nmi_handler_btf())
+		return;
+
+	cpu = find_test_cpu();
+	if (cpu < 0)
+		return;
+
+	skel = task_kptr_nmi_deadlock_repro__open();
+	if (!ASSERT_OK_PTR(skel, "task_kptr_nmi_deadlock_repro__open"))
+		return;
+
+	skel->bss->task_kptr_nmi_map_type = test->map_type;
+	bpf_program__set_autoload(skel->progs.clear_task_kptrs_from_nmi, true);
+
+	err = task_kptr_nmi_deadlock_repro__load(skel);
+	if (!ASSERT_OK(err, "task_kptr_nmi_deadlock_repro__load"))
+		goto cleanup;
+
+	if (bpf_program__fd(skel->progs.clear_task_kptrs_from_nmi) < 0) {
+		test__skip();
+		goto cleanup;
+	}
+
+	err = task_kptr_nmi_deadlock_repro__attach(skel);
+	if (!ASSERT_OK(err, "task_kptr_nmi_deadlock_repro__attach"))
+		goto cleanup;
+
+	pinned = pin_to_cpu(cpu, &old_cpuset);
+	if (!pinned)
+		goto cleanup;
+
+	pmu_fd = open_nmi_pmu_event_on_cpu(cpu);
+	if (pmu_fd < 0)
+		goto cleanup;
+
+	for (round = 0; round < REPRO_ROUNDS; round++) {
+		if (!stash_exited_tasks(skel))
+			goto cleanup;
+
+		/*
+		 * Hash map inserts create an empty element before looking it up
+		 * to stash the task kptr. NMI cleanup can delete that fresh
+		 * element in between, so LOOKUP_ERR here is a benign test race
+		 * and not a kernel failure.
+		 */
+		if (test->map_type == TASK_KPTR_NMI_MAP_HASH &&
+		    skel->bss->task_kptr_nmi_err == TASK_KPTR_NMI_LOOKUP_ERR)
+			skel->bss->task_kptr_nmi_err = 0;
+
+		if (!ASSERT_EQ(skel->bss->task_kptr_nmi_err, 0, "task_kptr_nmi_err"))
+			goto cleanup;
+		expected_deleted = skel->bss->task_kptr_nmi_inserted;
+		if (!wait_for_task_nmi_delete_batch(skel, expected_deleted))
+			goto cleanup;
+	}
+
+cleanup:
+	close(pmu_fd);
+	if (pinned)
+		restore_affinity(&old_cpuset);
+	task_kptr_nmi_deadlock_repro__destroy(skel);
+}
+
+void serial_test_task_kptr_nmi_deadlock_repro(void)
+{
+	static const struct task_kptr_nmi_repro_case tests[] = {
+		{ "hash", TASK_KPTR_NMI_MAP_HASH },
+		{ "array", TASK_KPTR_NMI_MAP_ARRAY },
+	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		if (!test__start_subtest(tests[i].name))
+			continue;
+		run_task_kptr_nmi_deadlock_repro_case(&tests[i]);
+	}
+}
diff --git a/tools/testing/selftests/bpf/progs/task_kptr_nmi_deadlock_repro.c b/tools/testing/selftests/bpf/progs/task_kptr_nmi_deadlock_repro.c
new file mode 100644
index 000000000000..1ba27d9c3044
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/task_kptr_nmi_deadlock_repro.c
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <linux/errno.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define MAX_TARGET_PIDS 4
+
+enum {
+	TASK_KPTR_NMI_MAP_HASH = 1,
+	TASK_KPTR_NMI_MAP_ARRAY,
+};
+
+enum {
+	TASK_KPTR_NMI_ACQUIRE_ERR = 1,
+	TASK_KPTR_NMI_CREATE_ERR,
+	TASK_KPTR_NMI_LOOKUP_ERR,
+	TASK_KPTR_NMI_MAP_ERR,
+};
+
+struct task_map_value {
+	struct task_struct __kptr * task;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, __u32);
+	__type(value, struct task_map_value);
+	__uint(max_entries, MAX_TARGET_PIDS);
+} stashed_tasks_hash SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, __u32);
+	__type(value, struct task_map_value);
+	__uint(max_entries, MAX_TARGET_PIDS);
+} stashed_tasks_array SEC(".maps");
+
+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
+void bpf_task_release(struct task_struct *p) __ksym;
+
+__u32 task_kptr_nmi_pids[MAX_TARGET_PIDS];
+__u8 task_kptr_nmi_live[MAX_TARGET_PIDS];
+__u32 task_kptr_nmi_map_type;
+__u32 task_kptr_nmi_inserted;
+__u32 task_kptr_nmi_deleted;
+__u32 task_kptr_nmi_err;
+int task_kptr_nmi_delete_err;
+
+static __always_inline int find_target_slot(__u32 pid)
+{
+	int i;
+
+	for (i = 0; i < MAX_TARGET_PIDS; i++) {
+		if (task_kptr_nmi_pids[i] == pid)
+			return i;
+	}
+
+	return -1;
+}
+
+static __always_inline void stash_task(int i, struct task_map_value *slot,
+				       struct task_struct *acquired)
+{
+	struct task_struct *old;
+
+	old = bpf_kptr_xchg(&slot->task, acquired);
+	if (old)
+		bpf_task_release(old);
+	else {
+		task_kptr_nmi_live[i] = 1;
+		task_kptr_nmi_inserted++;
+	}
+}
+
+static __always_inline void set_delete_err(int err)
+{
+	if (!task_kptr_nmi_delete_err)
+		task_kptr_nmi_delete_err = err;
+}
+
+SEC("lsm.s/file_open")
+int insert_task_kptr_from_lsm(struct file *ctx_file)
+{
+	struct task_map_value init = {};
+	struct task_map_value *slot;
+	struct task_struct *task, *acquired;
+	__u32 pid;
+	int i, ret;
+
+	(void)ctx_file;
+
+	pid = bpf_get_current_pid_tgid() >> 32;
+	i = find_target_slot(pid);
+	if (i < 0)
+		return 0;
+
+	task = bpf_get_current_task_btf();
+	acquired = bpf_task_acquire(task);
+	if (!acquired) {
+		task_kptr_nmi_err = TASK_KPTR_NMI_ACQUIRE_ERR;
+		return 0;
+	}
+
+	/*
+	 * Race is OK for these specific map types. Userspace may
+	 * have modified the array, causing inconsistency. This
+	 * error TASK_KPTR_NMI_CREATE_ERR is non-fatal for test
+	 * purposes. But may be important if this test is
+	 * extended for other map types.
+	 */
+	switch (task_kptr_nmi_map_type) {
+	case TASK_KPTR_NMI_MAP_HASH:
+		pid = i;
+		ret = bpf_map_update_elem(&stashed_tasks_hash, &pid, &init,
+					  BPF_NOEXIST);
+		if (ret && ret != -EEXIST) {
+			task_kptr_nmi_err = TASK_KPTR_NMI_CREATE_ERR;
+			goto release_task;
+		}
+		slot = bpf_map_lookup_elem(&stashed_tasks_hash, &pid);
+		if (!slot) {
+			task_kptr_nmi_err = TASK_KPTR_NMI_LOOKUP_ERR;
+			goto release_task;
+		}
+		break;
+	case TASK_KPTR_NMI_MAP_ARRAY:
+		pid = i;
+		slot = bpf_map_lookup_elem(&stashed_tasks_array, &pid);
+		if (!slot) {
+			task_kptr_nmi_err = TASK_KPTR_NMI_LOOKUP_ERR;
+			goto release_task;
+		}
+		break;
+	default:
+		task_kptr_nmi_err = TASK_KPTR_NMI_MAP_ERR;
+		goto release_task;
+	}
+
+	stash_task(i, slot, acquired);
+	return 0;
+
+release_task:
+	bpf_task_release(acquired);
+	return 0;
+}
+
+static __always_inline void clear_hash_tasks(void)
+{
+	int i;
+
+	for (i = 0; i < MAX_TARGET_PIDS; i++) {
+		__u32 slot = i;
+
+		if (!task_kptr_nmi_pids[i])
+			continue;
+		if (!bpf_map_delete_elem(&stashed_tasks_hash, &slot)) {
+			task_kptr_nmi_live[i] = 0;
+			task_kptr_nmi_deleted++;
+		} else if (bpf_map_lookup_elem(&stashed_tasks_hash, &slot)) {
+			set_delete_err(-EIO);
+		}
+	}
+}
+
+static __always_inline void clear_array_tasks(void)
+{
+	struct task_map_value init = {};
+	int i;
+
+	for (i = 0; i < MAX_TARGET_PIDS; i++) {
+		__u32 slot = i;
+
+		if (!task_kptr_nmi_pids[i])
+			continue;
+		if (bpf_map_update_elem(&stashed_tasks_array, &slot, &init,
+					BPF_EXIST)) {
+			set_delete_err(-EIO);
+			continue;
+		}
+		if (task_kptr_nmi_live[i]) {
+			task_kptr_nmi_live[i] = 0;
+			task_kptr_nmi_deleted++;
+		}
+	}
+}
+
+SEC("?tp_btf/nmi_handler")
+int BPF_PROG(clear_task_kptrs_from_nmi, void *handler, void *regs, s64 delta_ns,
+	     int handled)
+{
+	(void)handler;
+	(void)regs;
+	(void)delta_ns;
+	(void)handled;
+
+	if (task_kptr_nmi_deleted >= task_kptr_nmi_inserted)
+		return 0;
+
+	switch (task_kptr_nmi_map_type) {
+	case TASK_KPTR_NMI_MAP_HASH:
+		clear_hash_tasks();
+		break;
+	case TASK_KPTR_NMI_MAP_ARRAY:
+		clear_array_tasks();
+		break;
+	default:
+		task_kptr_nmi_err = TASK_KPTR_NMI_MAP_ERR;
+		break;
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
2.53.0