Outside of SQPOLL, SQ entries are normally consumed by the time the
submission syscall returns. In that case we don't need a circular buffer
with head/tail tracking; instead, the kernel can assume that entries
always start from the beginning of the SQ at index 0. This patch
introduces a setup flag doing exactly that.

The scheme is simpler in general: it needs fewer operations and doesn't
require looking up heads and tails. The main goal, however, is to keep
caches hot. Userspace might overprovision the SQ, and with normal
indexing we'd eventually touch all of its cache lines, whereas with this
feature the first entries get reused and stay hot. The simplicity will
also come in handy for bpf-io_uring.

To use the feature, the user should set the IORING_SETUP_SQ_REWIND flag
and have a compatible liburing/userspace. The flag is restricted to
IORING_SETUP_NO_SQARRAY rings and is not compatible with
IORING_SETUP_SQPOLL. Note that the ring synchronisation is relaxed, as
the userspace task doing the syscall is naturally in sync with the
kernel; setups that share an SQ have to roll their own
intra-process/thread synchronisation.
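
For illustration, below is a minimal sketch of the expected userspace
pattern, using raw syscalls rather than liburing. The
IORING_SETUP_SQ_REWIND value mirrors the uapi addition in this patch in
case installed headers don't have it yet; error handling and the CQ side
are omitted.

#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Mirrors the uapi addition below; drop once headers are updated */
#ifndef IORING_SETUP_SQ_REWIND
#define IORING_SETUP_SQ_REWIND	(1U << 19)
#endif

int main(void)
{
	struct io_uring_params p = {};
	struct io_uring_sqe *sqes;
	int fd;

	/* SQ_REWIND requires NO_SQARRAY and excludes SQPOLL */
	p.flags = IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQ_REWIND;
	fd = syscall(__NR_io_uring_setup, 8, &p);

	/* Map the SQE array; no SQ head/tail bookkeeping is needed */
	sqes = mmap(NULL, p.sq_entries * sizeof(*sqes),
		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		    fd, IORING_OFF_SQES);

	/* Every submission batch is written starting from index 0 */
	memset(&sqes[0], 0, sizeof(sqes[0]));
	sqes[0].opcode = IORING_OP_NOP;

	/* to_submit alone tells the kernel how many SQEs to consume */
	syscall(__NR_io_uring_enter, fd, 1, 0, 0, NULL, 0);
	return 0;
}

Since the submitting task and the kernel are serialised by the syscall
itself, no store-release on the SQ head is needed, which is what the
relaxed synchronisation above refers to.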

Signed-off-by: Pavel Begunkov
---
 include/uapi/linux/io_uring.h |  6 ++++++
 io_uring/io_uring.c           | 29 ++++++++++++++++++++++-------
 io_uring/io_uring.h           |  3 ++-
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a0cc1cc0dd01..d1c654a7fa9a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -231,6 +231,12 @@ enum io_uring_sqe_flags_bit {
  */
 #define IORING_SETUP_CQE_MIXED	(1U << 18)
 
+/*
+ * SQEs always start at index 0 in the submission ring instead of using a
+ * wrap around indexing.
+ */
+#define IORING_SETUP_SQ_REWIND	(1U << 19)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ee04ab9bf968..e8af963d3233 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2367,12 +2367,16 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
 {
 	struct io_rings *rings = ctx->rings;
 
-	/*
-	 * Ensure any loads from the SQEs are done at this point,
-	 * since once we write the new head, the application could
-	 * write new data to them.
-	 */
-	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
+	if (ctx->flags & IORING_SETUP_SQ_REWIND) {
+		ctx->cached_sq_head = 0;
+	} else {
+		/*
+		 * Ensure any loads from the SQEs are done at this point,
+		 * since once we write the new head, the application could
+		 * write new data to them.
+		 */
+		smp_store_release(&rings->sq.head, ctx->cached_sq_head);
+	}
 }
 
 /*
@@ -2418,10 +2422,15 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	__must_hold(&ctx->uring_lock)
 {
-	unsigned int entries = io_sqring_entries(ctx);
+	unsigned int entries;
 	unsigned int left;
 	int ret;
 
+	if (ctx->flags & IORING_SETUP_SQ_REWIND)
+		entries = ctx->sq_entries;
+	else
+		entries = io_sqring_entries(ctx);
+
 	entries = min(nr, entries);
 	if (unlikely(!entries))
 		return 0;
@@ -3678,6 +3687,12 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
 {
 	unsigned flags = p->flags;
 
+	if (flags & IORING_SETUP_SQ_REWIND) {
+		if ((flags & IORING_SETUP_SQPOLL) ||
+		    !(flags & IORING_SETUP_NO_SQARRAY))
+			return -EINVAL;
+	}
+
 	/* There is no way to mmap rings without a real fd */
 	if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) &&
 	    !(flags & IORING_SETUP_NO_MMAP))
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 46d9141d772a..b998ed57dd93 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -54,7 +54,8 @@
 			IORING_SETUP_REGISTERED_FD_ONLY |\
 			IORING_SETUP_NO_SQARRAY |\
 			IORING_SETUP_HYBRID_IOPOLL |\
-			IORING_SETUP_CQE_MIXED)
+			IORING_SETUP_CQE_MIXED |\
+			IORING_SETUP_SQ_REWIND)
 
 #define IORING_ENTER_FLAGS	(IORING_ENTER_GETEVENTS |\
 				 IORING_ENTER_SQ_WAKEUP |\
-- 
2.49.0