Add a struct_ops callback, loop, which will be called from the CQ waiting
loop every time there is an event that might be interesting to the
program. The program is passed the io_uring ctx and a loop state, which
it can use to set the number of events it wants to wait for as well as
the timeout value.

Signed-off-by: Pavel Begunkov
---
 io_uring/bpf.c      | 64 +++++++++++++++++++++++++++++++++++++++++++++
 io_uring/bpf.h      | 26 ++++++++++++++++++
 io_uring/io_uring.c | 15 ++++++++++++++-
 3 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/io_uring/bpf.c b/io_uring/bpf.c
index 4cb5d25c9247..24dd2fe9134f 100644
--- a/io_uring/bpf.c
+++ b/io_uring/bpf.c
@@ -1,9 +1,18 @@
 #include 
+#include 
 
 #include "bpf.h"
 #include "register.h"
 
+static const struct btf_type *loop_state_type;
+
+static int io_bpf_ops__loop(struct io_ring_ctx *ctx, struct iou_loop_state *ls)
+{
+	return IOU_RES_STOP;
+}
+
 static struct io_uring_ops io_bpf_ops_stubs = {
+	.loop = io_bpf_ops__loop,
 };
 
 static bool bpf_io_is_valid_access(int off, int size,
@@ -25,6 +34,17 @@ static int bpf_io_btf_struct_access(struct bpf_verifier_log *log,
 				    const struct bpf_reg_state *reg,
 				    int off, int size)
 {
+	const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
+
+	if (t == loop_state_type) {
+		if (off >= offsetof(struct iou_loop_state, cq_tail) &&
+		    off + size <= offsetofend(struct iou_loop_state, cq_tail))
+			return SCALAR_VALUE;
+		if (off >= offsetof(struct iou_loop_state, timeout) &&
+		    off + size <= offsetofend(struct iou_loop_state, timeout))
+			return SCALAR_VALUE;
+	}
+
 	return -EACCES;
 }
 
@@ -34,8 +54,25 @@ static const struct bpf_verifier_ops bpf_io_verifier_ops = {
 	.btf_struct_access	= bpf_io_btf_struct_access,
 };
 
+static const struct btf_type *
+io_lookup_struct_type(struct btf *btf, const char *name)
+{
+	s32 type_id;
+
+	type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
+	if (type_id < 0)
+		return NULL;
+	return btf_type_by_id(btf, type_id);
+}
+
 static int bpf_io_init(struct btf *btf)
 {
+	loop_state_type = io_lookup_struct_type(btf, "iou_loop_state");
+	if (!loop_state_type) {
+		pr_err("io_uring: Failed to locate iou_loop_state\n");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -91,3 +128,30 @@ static int __init io_uring_bpf_init(void)
 	return 0;
 }
 __initcall(io_uring_bpf_init);
+
+int io_run_cqwait_ops(struct io_ring_ctx *ctx, struct iou_loop_state *ls)
+{
+	int ret;
+
+	io_run_task_work();
+
+	guard(mutex)(&ctx->uring_lock);
+	if (unlikely(!ctx->bpf_ops))
+		return 1;
+
+	if (unlikely(task_sigpending(current)))
+		return -EINTR;
+
+	ret = ctx->bpf_ops->loop(ctx, ls);
+	if (ret == IOU_RES_STOP)
+		return 0;
+
+
+	if (io_local_work_pending(ctx)) {
+		unsigned nr_wait = ls->cq_tail - READ_ONCE(ctx->rings->cq.tail);
+		struct io_tw_state ts = {};
+
+		__io_run_local_work(ctx, ts, nr_wait, nr_wait);
+	}
+	return 1;
+}
diff --git a/io_uring/bpf.h b/io_uring/bpf.h
index 34a51a57103d..0b7246c4f05b 100644
--- a/io_uring/bpf.h
+++ b/io_uring/bpf.h
@@ -7,15 +7,41 @@
 
 #include "io_uring.h"
 
+enum {
+	IOU_RES_WAIT,
+	IOU_RES_STOP,
+};
+
 struct io_uring_ops {
+	int (*loop)(struct io_ring_ctx *ctx, struct iou_loop_state *ls);
+
+	__u32 ring_fd;
+	void *priv;
 };
 
+static inline bool io_bpf_attached(struct io_ring_ctx *ctx)
+{
+	return IS_ENABLED(CONFIG_IO_URING_BPF) && ctx->bpf_ops != NULL;
+}
+
+static inline bool io_has_cqwait_ops(struct io_ring_ctx *ctx)
+{
+	return io_bpf_attached(ctx);
+}
+
+
 #ifdef CONFIG_IO_URING_BPF
 void io_unregister_bpf(struct io_ring_ctx *ctx);
+int io_run_cqwait_ops(struct io_ring_ctx *ctx, struct iou_loop_state *ls);
 #else
 static inline void io_unregister_bpf(struct io_ring_ctx *ctx)
 {
 }
+static inline int io_run_cqwait_ops(struct io_ring_ctx *ctx,
+				    struct iou_loop_state *ls)
+{
+	return IOU_RES_STOP;
+}
 #endif
 
 #endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 5b80987ebb2c..1d5e3dd6c608 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2633,6 +2633,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	ktime_t start_time;
 	int ret;
 
+
 	min_events = min_t(int, min_events, ctx->cq_entries);
 
 	if (!io_allowed_run_tw(ctx))
@@ -2644,8 +2645,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 	if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
 		io_cqring_do_overflow_flush(ctx);
-	if (__io_cqring_events_user(ctx) >= min_events)
+
+	if (io_has_cqwait_ops(ctx)) {
+		if (ext_arg->min_time)
+			return -EINVAL;
+	} else if (__io_cqring_events_user(ctx) >= min_events) {
 		return 0;
+	}
 
 	init_waitqueue_func_entry(&iowq.wqe, io_wake_function);
 	iowq.wqe.private = current;
@@ -2706,6 +2712,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 		__set_current_state(TASK_RUNNING);
 		atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
 
+		if (io_has_cqwait_ops(ctx)) {
+			ret = io_run_cqwait_ops(ctx, &iowq.ls);
+			if (ret <= 0)
+				break;
+			continue;
+		}
+
 		/*
 		 * Run task_work after scheduling and before io_should_wake().
 		 * If we got woken because of task_work being processed, run it
-- 
2.49.0
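
For illustration only, not part of the patch: a minimal sketch of what the
BPF side might look like. It assumes vmlinux.h exposes struct io_uring_ops
and struct iou_loop_state from this series, that cq_tail and timeout are
the only writable loop-state fields (as enforced by
bpf_io_btf_struct_access() above), and that the timeout is expressed in
nanoseconds. The names cqwait_loop/cqwait_ops are made up for the example,
and the IOU_RES_* values are mirrored as macros in case the anonymous enum
is not visible through BTF.

/* SPDX-License-Identifier: GPL-2.0 */
/* cqwait_loop.bpf.c: hypothetical example, not from this series */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Mirrored from io_uring/bpf.h; values match the enum order there. */
#define IOU_RES_WAIT	0
#define IOU_RES_STOP	1

char LICENSE[] SEC("license") = "GPL";

/* Toy state: give up after this many trips around the waiting loop. */
static __u32 iterations;

SEC("struct_ops/loop")
int BPF_PROG(cqwait_loop, struct io_ring_ctx *ctx, struct iou_loop_state *ls)
{
	if (iterations++ >= 8)
		return IOU_RES_STOP;

	/*
	 * cq_tail is the CQ tail the waiter is waiting to reach; ask for
	 * one more CQE and bound the sleep (units assumed to be ns here).
	 */
	ls->cq_tail += 1;
	ls->timeout = 1000000;
	return IOU_RES_WAIT;
}

SEC(".struct_ops")
struct io_uring_ops cqwait_ops = {
	.loop = (void *)cqwait_loop,
};

On the kernel side, io_run_cqwait_ops() re-invokes ->loop each time the
waiter wakes up: a return of IOU_RES_STOP breaks out of io_cqring_wait(),
anything else runs pending local task work up to ls->cq_tail and goes
around the loop again. How the struct_ops map gets registered against a
particular ring (the ring_fd/priv fields hint at that) is left to the
other patches in the series.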