Currently, raw_spinlock_t is used to protect the add, delete and update
paths of the BPF LRU map, which can lead to a deadlock when these
operations happen in NMI context, as described in [1].

Fix this by converting the raw_spinlock_t in bpf_lru_list and
bpf_lru_locallist to rqspinlock_t.

Link: https://lore.kernel.org/bpf/CAEf4BzbTJCUx0D=zjx6+5m5iiGhwLzaP94hnw36ZMDHAf4-U_w@mail.gmail.com/ [1]
Signed-off-by: Menglong Dong
---
 kernel/bpf/bpf_lru_list.c | 47 +++++++++++++++++++++++----------------
 kernel/bpf/bpf_lru_list.h |  5 +++--
 2 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e7a2fc60523f..38fddcb1e28c 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -307,9 +307,10 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l,
 	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
 		return;
 
-	raw_spin_lock_irqsave(&l->lock, flags);
+	if (raw_res_spin_lock_irqsave(&l->lock, flags))
+		return;
 	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
-	raw_spin_unlock_irqrestore(&l->lock, flags);
+	raw_res_spin_unlock_irqrestore(&l->lock, flags);
 }
 
 static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
@@ -319,7 +320,8 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
 	struct bpf_lru_node *node, *tmp_node;
 	unsigned int nfree = 0;
 
-	raw_spin_lock(&l->lock);
+	if (raw_res_spin_lock(&l->lock))
+		return;
 
 	__local_list_flush(l, loc_l);
 
@@ -338,7 +340,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
 				      local_free_list(loc_l),
 				      BPF_LRU_LOCAL_LIST_T_FREE);
 
-	raw_spin_unlock(&l->lock);
+	raw_res_spin_unlock(&l->lock);
 }
 
 static void __local_list_add_pending(struct bpf_lru *lru,
@@ -404,7 +406,8 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
 
 	l = per_cpu_ptr(lru->percpu_lru, cpu);
 
-	raw_spin_lock_irqsave(&l->lock, flags);
+	if (raw_res_spin_lock_irqsave(&l->lock, flags))
+		return NULL;
 
 	__bpf_lru_list_rotate(lru, l);
 
@@ -420,7 +423,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
 		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
 	}
 
-	raw_spin_unlock_irqrestore(&l->lock, flags);
+	raw_res_spin_unlock_irqrestore(&l->lock, flags);
 
 	return node;
 }
@@ -437,7 +440,8 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 
 	loc_l = per_cpu_ptr(clru->local_list, cpu);
 
-	raw_spin_lock_irqsave(&loc_l->lock, flags);
+	if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
+		return NULL;
 
 	node = __local_list_pop_free(loc_l);
 	if (!node) {
@@ -448,7 +452,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 	if (node)
 		__local_list_add_pending(lru, loc_l, cpu, node, hash);
 
-	raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+	raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
 
 	if (node)
 		return node;
@@ -466,23 +470,26 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 	do {
 		steal_loc_l = per_cpu_ptr(clru->local_list, steal);
 
-		raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
+		if (raw_res_spin_lock_irqsave(&steal_loc_l->lock, flags))
+			goto out_next;
 
 		node = __local_list_pop_free(steal_loc_l);
 		if (!node)
 			node = __local_list_pop_pending(lru, steal_loc_l);
 
-		raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+		raw_res_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
 
+out_next:
 		steal = cpumask_next_wrap(steal, cpu_possible_mask);
 	} while (!node && steal != first_steal);
 
 	loc_l->next_steal = steal;
 
 	if (node) {
-		raw_spin_lock_irqsave(&loc_l->lock, flags);
+		if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
+			return NULL;
 		__local_list_add_pending(lru, loc_l, cpu, node, hash);
-		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+		raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
 	}
 
 	return node;
@@ -511,10 +518,11 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
 
 		loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
 
-		raw_spin_lock_irqsave(&loc_l->lock, flags);
+		if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
+			return;
 
 		if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
-			raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+			raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
 			goto check_lru_list;
 		}
 
@@ -522,7 +530,7 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
 		bpf_lru_node_clear_ref(node);
 		list_move(&node->list, local_free_list(loc_l));
 
-		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+		raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
 		return;
 	}
 
@@ -538,11 +546,12 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
 
 	l = per_cpu_ptr(lru->percpu_lru, node->cpu);
 
-	raw_spin_lock_irqsave(&l->lock, flags);
+	if (raw_res_spin_lock_irqsave(&l->lock, flags))
+		return;
 
 	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
 
-	raw_spin_unlock_irqrestore(&l->lock, flags);
+	raw_res_spin_unlock_irqrestore(&l->lock, flags);
 }
 
 void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
@@ -625,7 +634,7 @@ static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
 
 	loc_l->next_steal = cpu;
 
-	raw_spin_lock_init(&loc_l->lock);
+	raw_res_spin_lock_init(&loc_l->lock);
 }
 
 static void bpf_lru_list_init(struct bpf_lru_list *l)
@@ -640,7 +649,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
 
 	l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
 
-	raw_spin_lock_init(&l->lock);
+	raw_res_spin_lock_init(&l->lock);
 }
 
 int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index fe2661a58ea9..61fc7d7f9de1 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -7,6 +7,7 @@
 #include <linux/cache.h>
 #include <linux/list.h>
 #include <linux/spinlock_types.h>
+#include <asm/rqspinlock.h>
 
 #define NR_BPF_LRU_LIST_T	(3)
 #define NR_BPF_LRU_LIST_COUNT	(2)
@@ -34,13 +35,13 @@
 	/* The next inactive list rotation starts from here */
 	struct list_head *next_inactive_rotation;
 
-	raw_spinlock_t lock ____cacheline_aligned_in_smp;
+	rqspinlock_t lock ____cacheline_aligned_in_smp;
 };
 
 struct bpf_lru_locallist {
 	struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
 	u16 next_steal;
-	raw_spinlock_t lock;
+	rqspinlock_t lock;
 };
 
 struct bpf_common_lru {
-- 
2.51.2
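
As background for the conversion above: unlike raw_spin_lock_irqsave(), a
resilient-spinlock acquisition can fail (for example when a deadlock or a
timeout is detected after the lock is re-entered from NMI context), so
every caller needs an error path. The snippet below is only a minimal
illustration of that calling convention; demo_list and demo_push are
hypothetical names, not part of this patch or of the LRU code, and the
-EBUSY return is the demo's own choice of error to propagate.

#include <linux/errno.h>
#include <linux/list.h>
#include <asm/rqspinlock.h>

/* Hypothetical structure, for illustration only. */
struct demo_list {
	struct list_head head;
	rqspinlock_t lock;
};

static int demo_push(struct demo_list *dl, struct list_head *entry)
{
	unsigned long flags;

	/*
	 * raw_res_spin_lock_irqsave() returns 0 on success and a nonzero
	 * error when the acquisition is aborted, so the caller must bail
	 * out instead of assuming it holds the lock.
	 */
	if (raw_res_spin_lock_irqsave(&dl->lock, flags))
		return -EBUSY;

	list_add(entry, &dl->head);
	raw_res_spin_unlock_irqrestore(&dl->lock, flags);
	return 0;
}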
In this test, map update and delete operations happen in both NMI
context and user context, which is used to detect possible deadlocks.
For now, only the LRU map is added to the test; more map types can be
added in the future.

Signed-off-by: Menglong Dong
---
 .../selftests/bpf/prog_tests/map_deadlock.c   | 136 ++++++++++++++++++
 .../selftests/bpf/progs/map_deadlock.c        |  52 +++++++
 2 files changed, 188 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/map_deadlock.c
 create mode 100644 tools/testing/selftests/bpf/progs/map_deadlock.c

diff --git a/tools/testing/selftests/bpf/prog_tests/map_deadlock.c b/tools/testing/selftests/bpf/prog_tests/map_deadlock.c
new file mode 100644
index 000000000000..17fcf1f5efa6
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/map_deadlock.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <linux/perf_event.h>
+#include "map_deadlock.skel.h"
+
+
+static int perf_open_all_cpus(struct perf_event_attr *attr, int fds[], int max_cpus)
+{
+	int n = 0;
+
+	for (int cpu = 0; cpu < max_cpus; cpu++) {
+		int fd = syscall(__NR_perf_event_open, attr, -1 /* pid: all */, cpu,
+				 -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
+		if (fd < 0)
+			continue;
+		fds[cpu] = fd;
+		n++;
+	}
+	return n;
+}
+
+struct thread_arg {
+	int map_fd;
+	bool *stop;
+};
+
+static void *user_update_thread(void *argp)
+{
+	struct thread_arg *arg = argp;
+	u32 key = 0;
+	u64 val = 1;
+
+	while (!*arg->stop) {
+		key++;
+		val++;
+		bpf_map_update_elem(arg->map_fd, &key, &val, BPF_ANY);
+		if ((key & 0x7) == 0)
+			bpf_map_delete_elem(arg->map_fd, &key);
+	}
+	return NULL;
+}
+
+static void test_map(const char *map_name, int map_index)
+{
+	struct perf_event_attr attr = {
+		.type = PERF_TYPE_HARDWARE,
+		.size = sizeof(struct perf_event_attr),
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+		.sample_period = 1000000,
+		.freq = 0,
+		.disabled = 0,
+		.wakeup_events = 1,
+	};
+	int map_fd, nfd = 0, max_cpus, err;
+	struct bpf_link **links = NULL;
+	struct map_deadlock *skel;
+	struct bpf_program *prog;
+	struct thread_arg targ;
+	bool stop = false;
+	int *fds = NULL;
+	pthread_t thr;
+
+	skel = map_deadlock__open();
+	if (!ASSERT_OK_PTR(skel, "map_deadlock__open"))
+		return;
+	skel->rodata->map_index = map_index;
+	err = map_deadlock__load(skel);
+	if (!ASSERT_OK(err, "map_deadlock__load"))
+		goto out;
+
+	prog = skel->progs.on_perf;
+	map_fd = bpf_object__find_map_fd_by_name(skel->obj, map_name);
+	if (!ASSERT_GE(map_fd, 0, map_name))
+		goto out;
+
+	max_cpus = libbpf_num_possible_cpus();
+	if (!ASSERT_GT(max_cpus, 0, "num cpus"))
+		goto out;
+
+	links = calloc(max_cpus, sizeof(*links));
+	ASSERT_OK_PTR(links, "alloc links");
+	fds = calloc(max_cpus, sizeof(*fds));
+	ASSERT_OK_PTR(fds, "alloc fds");
+	for (int i = 0; i < max_cpus; i++)
+		fds[i] = -1;
+
+	nfd = perf_open_all_cpus(&attr, fds, max_cpus);
+	if (!ASSERT_GT(nfd, 0, "perf fds"))
+		goto out;
+
+	for (int cpu = 0; cpu < max_cpus; cpu++) {
+		if (fds[cpu] < 0)
+			continue;
+		links[cpu] = bpf_program__attach_perf_event(prog, fds[cpu]);
+		if (!ASSERT_OK_PTR(links[cpu], "attach perf"))
+			goto out;
+	}
+
+	targ.map_fd = map_fd;
+	targ.stop = &stop;
+	err = pthread_create(&thr, NULL, user_update_thread, &targ);
+	if (!ASSERT_OK(err, "create thr"))
+		goto out;
+
+	/* 1 second should be enough to trigger the deadlock */
+	sleep(1);
+	stop = true;
+	(void)pthread_join(thr, NULL);
+	/* TODO: read dmesg to check the deadlock? */
+out:
+	if (links) {
+		for (int cpu = 0; cpu < max_cpus; cpu++) {
+			if (links[cpu])
+				bpf_link__destroy(links[cpu]);
+		}
+	}
+	if (fds) {
+		for (int cpu = 0; cpu < max_cpus; cpu++) {
+			if (fds[cpu] >= 0)
+				close(fds[cpu]);
+		}
+	}
+	free(links);
+	free(fds);
+	map_deadlock__destroy(skel);
+}
+
+void test_map_deadlock(void)
+{
+	if (test__start_subtest("lru"))
+		test_map("lru_map", 0);
+}
diff --git a/tools/testing/selftests/bpf/progs/map_deadlock.c b/tools/testing/selftests/bpf/progs/map_deadlock.c
new file mode 100644
index 000000000000..6966224955fc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/map_deadlock.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char LICENSE[] SEC("license") = "GPL";
+
+struct lru_map {
+	__uint(type, BPF_MAP_TYPE_LRU_HASH);
+	__uint(max_entries, 1024);
+	__type(key, u32);
+	__type(value, u64);
+} lru_map SEC(".maps");
+
+struct map_list {
+	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+	__uint(max_entries, 1);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(int));
+	__array(values, struct lru_map);
+} map_list SEC(".maps") = {
+	.values = { [0] = &lru_map },
+};
+
+const volatile int map_index;
+
+static __always_inline void do_update_delete(void *map)
+{
+	u64 ts = bpf_ktime_get_ns();
+	u32 key = (u32)(ts >> 12);
+	u64 val = ts;
+
+	if ((ts & 1) == 0)
+		bpf_map_update_elem(map, &key, &val, BPF_ANY);
+	else
+		bpf_map_delete_elem(map, &key);
+}
+
+SEC("perf_event")
+int on_perf(struct bpf_perf_event_data *ctx)
+{
+	int key = map_index;
+	void *target_map;
+
+	target_map = bpf_map_lookup_elem(&map_list, &key);
+	if (!target_map)
+		return 0;
+
+	for (int i = 0; i < 4; i++)
+		do_update_delete(target_map);
+	return 0;
+}
-- 
2.51.2
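
The commit message notes that more map types can be added in the future.
On the prog_tests side that would only mean registering another subtest
against a different inner map; the sketch below is hypothetical (htab_map
and the "htab" subtest do not exist in this patch, and since an
ARRAY_OF_MAPS requires same-typed inner maps, exposing a different map
type would likely need a second map-in-map array or another selection
scheme on the BPF side).

void test_map_deadlock(void)
{
	if (test__start_subtest("lru"))
		test_map("lru_map", 0);
	/* Hypothetical: requires an htab_map wired into map_deadlock.c. */
	if (test__start_subtest("htab"))
		test_map("htab_map", 1);
}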