From: Mykyta Yatsenko

Introduce mpmc_cell, a lock-free cell primitive designed to support concurrent writes to a struct in NMI context (only one writer advances), allowing readers to consume a consistent snapshot.

Implementation details:

Double buffering allows writers to run concurrently with readers (read from one cell, write to the other).

The implementation uses a sequence-number-based protocol to enable exclusive writes:
 * Bit 0 of seq indicates an active writer
 * Bits 1+ form a generation counter
 * (seq & 2) >> 1 selects the read cell; the write cell is the opposite one
 * Writers atomically set bit 0, write to the inactive cell, then increment seq to publish
 * Readers snapshot seq, read from the active cell, then validate that seq hasn't changed

mpmc_cell expects users to pre-allocate the double buffers.

Key properties:
 * Writers never block (they fail if they lose the race to another writer)
 * Readers never block writers (double buffering), but may need to retry if a write updates the snapshot concurrently.

This will be used by BPF timer and workqueue helpers to defer NMI-unsafe operations (like hrtimer_start()) to irq_work, effectively allowing BPF programs to initiate timers and workqueues from NMI context.
Signed-off-by: Mykyta Yatsenko --- kernel/bpf/Makefile | 2 +- kernel/bpf/mpmc_cell.c | 62 +++++++++++++++++++++++++++ kernel/bpf/mpmc_cell.h | 112 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 175 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 79cf22860a99ba31a9daf08a29de0f3a162ba89f..753fa63e0c24dc0a332d86c2c424894300f2d611 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o mpmc_cell.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o diff --git a/kernel/bpf/mpmc_cell.c b/kernel/bpf/mpmc_cell.c new file mode 100644 index 0000000000000000000000000000000000000000..ca91b4308c8b552bc81cfefa2d975290a64b596d --- /dev/null +++ b/kernel/bpf/mpmc_cell.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include "mpmc_cell.h" + +static u32 read_cell_idx(struct bpf_mpmc_cell_ctl *ctl, u32 seq) +{ + return (seq & 2) >> 1; +} + +void bpf_mpmc_cell_init(struct bpf_mpmc_cell_ctl *ctl, void *cell1, void *cell2) +{ + atomic_set(&ctl->seq, 0); + ctl->cell[0] = cell1; + ctl->cell[1] = cell2; +} + +void *bpf_mpmc_cell_read_begin(struct bpf_mpmc_cell_ctl *ctl, u32 *seq) +{ + *seq = atomic_read_acquire(&ctl->seq); + /* Mask out acive writer bit */ + *seq &= ~1; + + return ctl->cell[read_cell_idx(ctl, *seq)]; +} + +int bpf_mpmc_cell_read_end(struct bpf_mpmc_cell_ctl *ctl, u32 seq) +{ + u32 new_seq; + + /* Ensure cell reads complete before checking seq */ + smp_rmb(); + + new_seq = atomic_read_acquire(&ctl->seq); + new_seq &= ~1; /* Ignore active write bit */ + /* Check if seq changed between begin and end, if it did, new snapshot is available */ + if (new_seq != seq) + return -EAGAIN; + + return 0; +} + +void *bpf_mpmc_cell_write_begin(struct bpf_mpmc_cell_ctl *ctl) +{ + u32 seq; + + /* + * Try to set the lowest bit, on success, writer owns cell exclusively, + * other writers fail + */ + seq = atomic_fetch_or_acquire(1, &ctl->seq); + if (seq & 1) /* Check if another writer is active */ + return NULL; + + /* Write to opposite to read buffer */ + return ctl->cell[read_cell_idx(ctl, seq) ^ 1]; +} + +void bpf_mpmc_cell_write_commit(struct bpf_mpmc_cell_ctl *ctl) +{ + atomic_fetch_add_release(1, &ctl->seq); +} diff --git a/kernel/bpf/mpmc_cell.h b/kernel/bpf/mpmc_cell.h new file mode 100644 index 0000000000000000000000000000000000000000..8b57226927a6c51460fae3113b94d8631173da63 --- /dev/null +++ b/kernel/bpf/mpmc_cell.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#ifndef __BPF_MPMC_CELL_H__ +#define __BPF_MPMC_CELL_H__ +#include + +/** + * DOC: BPF MPMC Cell + * + * Multi-producer, multi-consumer lock-free double buffer. 
+ * Designed for writers producing data in NMI context where locking is not possible.
+ *
+ * Writers never block or wait, but may fail (return NULL) if another writer is active
+ * (a failed writer should assume its update is overridden by the active one).
+ * Readers never block writers. Readers may need to retry if a write
+ * completes during the read window (return -EAGAIN).
+ *
+ * User should provide two allocated cells.
+ *
+ * Typical usage:
+ *
+ *   // Writer (from NMI or any context):
+ *   cell = bpf_mpmc_cell_write_begin(ctl);
+ *   if (cell) {
+ *       memcpy(cell, data, size);
+ *       bpf_mpmc_cell_write_commit(ctl);
+ *   }
+ *
+ *   // Reader (from irq_work or similar):
+ *   cell = bpf_mpmc_cell_read_begin(ctl, &seq);
+ *   memcpy(local, cell, size);
+ *   ret = bpf_mpmc_cell_read_end(ctl, seq);
+ *   if (ret == 0)
+ *       process(local); // success, local holds a consistent snapshot
+ *   else if (ret == -EAGAIN)
+ *       retry; // a write was committed during the read
+ */
+
+/**
+ * struct bpf_mpmc_cell_ctl - control structure for mpmc cell
+ * @seq: sequence number (odd = write active, seq/2 = generation)
+ * @cell: pointers to two allocated cells to support double buffering
+ *
+ */
+struct bpf_mpmc_cell_ctl {
+	atomic_t seq;
+	void *cell[2];
+};
+
+/**
+ * bpf_mpmc_cell_init() - initialize mpmc cell control structure
+ * @ctl: pointer to control structure to initialize
+ * @cell1: pointer to an allocated cell
+ * @cell2: pointer to another cell of the same size
+ *
+ * Must be called before any read/write operations.
+ * Caller must allocate two same-sized cells (buffers, structs) and pass
+ * them to this function; those two cells are used for double buffering,
+ * supporting concurrent reads/writes: readers use one cell, writers the other.
+ *
+ * Context: Any context.
+ * Return: void.
+ */
+void bpf_mpmc_cell_init(struct bpf_mpmc_cell_ctl *ctl, void *cell1, void *cell2);
+
+/**
+ * bpf_mpmc_cell_read_begin() - begin a read operation
+ * @ctl: pointer to control structure
+ * @seq: output parameter, sequence number for this read
+ *
+ * Return: pointer to the current read cell. Caller must copy data
+ * out and then call bpf_mpmc_cell_read_end() to validate.
+ */
+void *bpf_mpmc_cell_read_begin(struct bpf_mpmc_cell_ctl *ctl, u32 *seq);
+
+/**
+ * bpf_mpmc_cell_read_end() - validate read operation.
+ * @ctl: pointer to control structure
+ * @seq: sequence number from matching bpf_mpmc_cell_read_begin()
+ *
+ * Validates that the snapshot read between bpf_mpmc_cell_read_begin()
+ * and bpf_mpmc_cell_read_end() is consistent.
+ *
+ * Return:
+ * 0 - success, snapshot is consistent
+ * -EAGAIN - snapshot invalidated (a writer committed during the read)
+ */
+int bpf_mpmc_cell_read_end(struct bpf_mpmc_cell_ctl *ctl, u32 seq);
+
+/**
+ * bpf_mpmc_cell_write_begin() - begin a write operation
+ * @ctl: pointer to control structure
+ *
+ * Attempts to acquire exclusive writer access. Only one writer can be
+ * active at a time. On success, caller must write data and call
+ * bpf_mpmc_cell_write_commit(). There is no write abort mechanism.
+ *
+ * Return: Pointer to the write cell, or NULL if another writer is
+ * active.
+ */
+void *bpf_mpmc_cell_write_begin(struct bpf_mpmc_cell_ctl *ctl);
+
+/**
+ * bpf_mpmc_cell_write_commit() - complete a write operation
+ * @ctl: pointer to control structure
+ *
+ * Publishes the written data, making it visible to readers.
+ * Must be called after a successful bpf_mpmc_cell_write_begin().
+ */
+void bpf_mpmc_cell_write_commit(struct bpf_mpmc_cell_ctl *ctl);
+
+#endif
-- 
2.52.0