Add a struct_ops callback, loop, which will be called from the CQ waiting
loop every time there is an event that might be interesting to the
program. The program is passed the io_uring ctx and a loop state, which
it can use to set the number of events it wants to wait for as well as
the timeout value.

Signed-off-by: Pavel Begunkov
---
 io_uring/bpf.c      | 64 +++++++++++++++++++++++++++++++++++++++++++++
 io_uring/bpf.h      | 26 ++++++++++++++++++
 io_uring/io_uring.c | 15 ++++++++++++++-
 3 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/io_uring/bpf.c b/io_uring/bpf.c
index 4cb5d25c9247..24dd2fe9134f 100644
--- a/io_uring/bpf.c
+++ b/io_uring/bpf.c
@@ -1,9 +1,18 @@
 #include 
+#include 
 
 #include "bpf.h"
 #include "register.h"
 
+static const struct btf_type *loop_state_type;
+
+static int io_bpf_ops__loop(struct io_ring_ctx *ctx, struct iou_loop_state *ls)
+{
+	return IOU_RES_STOP;
+}
+
 static struct io_uring_ops io_bpf_ops_stubs = {
+	.loop = io_bpf_ops__loop,
 };
 
 static bool bpf_io_is_valid_access(int off, int size,
@@ -25,6 +34,17 @@ static int bpf_io_btf_struct_access(struct bpf_verifier_log *log,
 				    const struct bpf_reg_state *reg,
 				    int off, int size)
 {
+	const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
+
+	if (t == loop_state_type) {
+		if (off >= offsetof(struct iou_loop_state, cq_tail) &&
+		    off + size <= offsetofend(struct iou_loop_state, cq_tail))
+			return SCALAR_VALUE;
+		if (off >= offsetof(struct iou_loop_state, timeout) &&
+		    off + size <= offsetofend(struct iou_loop_state, timeout))
+			return SCALAR_VALUE;
+	}
+
 	return -EACCES;
 }
 
@@ -34,8 +54,25 @@ static const struct bpf_verifier_ops bpf_io_verifier_ops = {
 	.btf_struct_access	= bpf_io_btf_struct_access,
 };
 
+static const struct btf_type *
+io_lookup_struct_type(struct btf *btf, const char *name)
+{
+	s32 type_id;
+
+	type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
+	if (type_id < 0)
+		return NULL;
+	return btf_type_by_id(btf, type_id);
+}
+
 static int bpf_io_init(struct btf *btf)
 {
+	loop_state_type = io_lookup_struct_type(btf, "iou_loop_state");
+	if (!loop_state_type) {
+		pr_err("io_uring: Failed to locate iou_loop_state\n");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -91,3 +128,30 @@ static int __init io_uring_bpf_init(void)
 	return 0;
 }
 __initcall(io_uring_bpf_init);
+
+int io_run_cqwait_ops(struct io_ring_ctx *ctx, struct iou_loop_state *ls)
+{
+	int ret;
+
+	io_run_task_work();
+
+	guard(mutex)(&ctx->uring_lock);
+	if (unlikely(!ctx->bpf_ops))
+		return 1;
+
+	if (unlikely(task_sigpending(current)))
+		return -EINTR;
+
+	ret = ctx->bpf_ops->loop(ctx, ls);
+	if (ret == IOU_RES_STOP)
+		return 0;
+
+
+	if (io_local_work_pending(ctx)) {
+		unsigned nr_wait = ls->cq_tail - READ_ONCE(ctx->rings->cq.tail);
+		struct io_tw_state ts = {};
+
+		__io_run_local_work(ctx, ts, nr_wait, nr_wait);
+	}
+	return 1;
+}
diff --git a/io_uring/bpf.h b/io_uring/bpf.h
index 34a51a57103d..0b7246c4f05b 100644
--- a/io_uring/bpf.h
+++ b/io_uring/bpf.h
@@ -7,15 +7,41 @@
 
 #include "io_uring.h"
 
+enum {
+	IOU_RES_WAIT,
+	IOU_RES_STOP,
+};
+
 struct io_uring_ops {
+	int (*loop)(struct io_ring_ctx *ctx, struct iou_loop_state *ls);
+
+	__u32 ring_fd;
+	void *priv;
 };
 
+static inline bool io_bpf_attached(struct io_ring_ctx *ctx)
+{
+	return IS_ENABLED(CONFIG_IO_URING_BPF) && ctx->bpf_ops != NULL;
+}
+
+static inline bool io_has_cqwait_ops(struct io_ring_ctx *ctx)
+{
+	return io_bpf_attached(ctx);
+}
+
+
 #ifdef CONFIG_IO_URING_BPF
 void io_unregister_bpf(struct io_ring_ctx *ctx);
+int io_run_cqwait_ops(struct io_ring_ctx *ctx, struct iou_loop_state *ls);
 #else
 static inline void io_unregister_bpf(struct io_ring_ctx *ctx)
 {
 }
+static inline int io_run_cqwait_ops(struct io_ring_ctx *ctx,
+				    struct iou_loop_state *ls)
+{
+	return IOU_RES_STOP;
+}
 #endif
 
 #endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 5b80987ebb2c..1d5e3dd6c608 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2633,6 +2633,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	ktime_t start_time;
 	int ret;
 
+
 	min_events = min_t(int, min_events, ctx->cq_entries);
 
 	if (!io_allowed_run_tw(ctx))
@@ -2644,8 +2645,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 	if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
 		io_cqring_do_overflow_flush(ctx);
-	if (__io_cqring_events_user(ctx) >= min_events)
+
+	if (io_has_cqwait_ops(ctx)) {
+		if (ext_arg->min_time)
+			return -EINVAL;
+	} else if (__io_cqring_events_user(ctx) >= min_events) {
 		return 0;
+	}
 
 	init_waitqueue_func_entry(&iowq.wqe, io_wake_function);
 	iowq.wqe.private = current;
@@ -2706,6 +2712,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 		__set_current_state(TASK_RUNNING);
 		atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
 
+		if (io_has_cqwait_ops(ctx)) {
+			ret = io_run_cqwait_ops(ctx, &iowq.ls);
+			if (ret <= 0)
+				break;
+			continue;
+		}
+
 		/*
 		 * Run task_work after scheduling and before io_should_wake().
 		 * If we got woken because of task_work being processed, run it
-- 
2.49.0
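
For illustration only, not part of the patch: a minimal sketch of what the
BPF side might look like. It assumes vmlinux.h exposes struct io_uring_ops
and struct iou_loop_state from this series, that cq_tail and timeout are
the only writable loop-state fields (as enforced by
bpf_io_btf_struct_access() above), and that the timeout is expressed in
nanoseconds. The names cqwait_loop/cqwait_ops are made up for the example,
and the IOU_RES_* values are mirrored as macros in case the anonymous enum
is not visible through BTF.

/* SPDX-License-Identifier: GPL-2.0 */
/* cqwait_loop.bpf.c: hypothetical example, not from this series */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Mirrored from io_uring/bpf.h; values match the enum order there. */
#define IOU_RES_WAIT	0
#define IOU_RES_STOP	1

char LICENSE[] SEC("license") = "GPL";

/* Toy state: give up after this many trips around the waiting loop. */
static __u32 iterations;

SEC("struct_ops/loop")
int BPF_PROG(cqwait_loop, struct io_ring_ctx *ctx, struct iou_loop_state *ls)
{
	if (iterations++ >= 8)
		return IOU_RES_STOP;

	/*
	 * cq_tail is the CQ tail the waiter is waiting to reach; ask for
	 * one more CQE and bound the sleep (units assumed to be ns here).
	 */
	ls->cq_tail += 1;
	ls->timeout = 1000000;
	return IOU_RES_WAIT;
}

SEC(".struct_ops")
struct io_uring_ops cqwait_ops = {
	.loop = (void *)cqwait_loop,
};

On the kernel side, io_run_cqwait_ops() re-invokes ->loop each time the
waiter wakes up: a return of IOU_RES_STOP breaks out of io_cqring_wait(),
anything else runs pending local task work up to ls->cq_tail and goes
around the loop again. How the struct_ops map gets registered against a
particular ring (the ring_fd/priv fields hint at that) is left to the
other patches in the series.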