io_ring_ctx's mutex uring_lock can be quite expensive in high-IOPS
workloads. Even when only one thread pinned to a single CPU is accessing
the io_ring_ctx, the atomic CASes required to lock and unlock the mutex
are very hot instructions. The mutex's primary purpose is to prevent
concurrent io_uring system calls on the same io_ring_ctx. However, there
is already a flag IORING_SETUP_SINGLE_ISSUER that promises only one task
will make io_uring_enter() and io_uring_register() system calls on the
io_ring_ctx once it is enabled.

So if the io_ring_ctx is set up with IORING_SETUP_SINGLE_ISSUER, skip the
uring_lock mutex_lock() and mutex_unlock() on the submitter_task. When
another task needs to acquire the ctx uring lock, use a task work item to
suspend the submitter_task for the duration of the critical section.

In io_uring_register(), continue to always acquire the uring_lock mutex.
io_uring_register() can be called on a disabled io_ring_ctx (indeed, it's
required to enable it), when submitter_task isn't set yet. After
submitter_task is set, io_uring_register() is only permitted on
submitter_task, so the uring_lock suffices to exclude all other users.

Signed-off-by: Caleb Sander Mateos
---
 io_uring/io_uring.c |  11 +++++
 io_uring/io_uring.h | 101 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 109 insertions(+), 3 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e05e56a840f9..64e4e57e2c11 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -363,10 +363,21 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	xa_destroy(&ctx->io_bl_xa);
 	kfree(ctx);
 	return NULL;
 }
 
+void io_ring_suspend_work(struct callback_head *cb_head)
+{
+	struct io_ring_suspend_work *suspend_work =
+		container_of(cb_head, struct io_ring_suspend_work, cb_head);
+	DECLARE_COMPLETION_ONSTACK(suspend_end);
+
+	suspend_work->lock_state->suspend_end = &suspend_end;
+	complete(&suspend_work->suspend_start);
+	wait_for_completion(&suspend_end);
+}
+
 static void io_clean_op(struct io_kiocb *req)
 {
 	if (unlikely(req->flags & REQ_F_BUFFER_SELECTED))
 		io_kbuf_drop_legacy(req);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 23dae0af530b..262971224cc6 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -1,8 +1,9 @@
 #ifndef IOU_CORE_H
 #define IOU_CORE_H
 
+#include
 #include
 #include
 #include
 #include
 #include
@@ -195,36 +196,130 @@ void io_queue_next(struct io_kiocb *req);
 void io_task_refs_refill(struct io_uring_task *tctx);
 bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 void io_activate_pollwq(struct io_ring_ctx *ctx);
 
+/*
+ * The ctx uring lock protects most of the mutable struct io_ring_ctx state
+ * accessed in the struct io_kiocb issue path. In the I/O path, it is typically
+ * acquired in the io_uring_enter() syscall and io_handle_tw_list(). For
+ * IORING_SETUP_SQPOLL, it's acquired by io_sq_thread() instead. io_kiocb's
+ * issued with IO_URING_F_UNLOCKED in issue_flags (e.g. by io_wq_submit_work())
+ * acquire and release the ctx uring lock whenever they must touch io_ring_ctx
+ * state. io_uring_register() also acquires the ctx uring lock because most
+ * opcodes mutate io_ring_ctx state accessed in the issue path.
+ *
+ * For !IORING_SETUP_SINGLE_ISSUER io_ring_ctx's, acquiring the ctx uring lock
+ * is always done via mutex_(try)lock(&ctx->uring_lock).
+ *
+ * However, for IORING_SETUP_SINGLE_ISSUER, we can avoid the mutex_lock() +
+ * mutex_unlock() overhead on submitter_task because a single thread can't race
+ * with itself.
+ * In the uncommon case where the ctx uring lock is needed on another
+ * thread, it must suspend submitter_task by scheduling a task work item
+ * on it. io_ring_ctx_lock() returns once the task work item has started.
+ * submitter_task is unblocked once io_ring_ctx_unlock() is called.
+ *
+ * io_uring_register() requires special treatment for
+ * IORING_SETUP_SINGLE_ISSUER since it's allowed on an
+ * IORING_SETUP_R_DISABLED io_ring_ctx, where submitter_task isn't set yet.
+ * Hence the io_ring_register_ctx_*() family of helpers. They unconditionally
+ * acquire the uring_lock mutex, which always works to exclude other ctx
+ * uring lock users:
+ * - For !IORING_SETUP_SINGLE_ISSUER, all users acquire the ctx uring lock via
+ *   the uring_lock mutex
+ * - For IORING_SETUP_SINGLE_ISSUER and IORING_SETUP_R_DISABLED, only
+ *   io_uring_register() is allowed before the io_ring_ctx is enabled.
+ *   So again, all ctx uring lock users acquire the uring_lock mutex.
+ * - For IORING_SETUP_SINGLE_ISSUER and !IORING_SETUP_R_DISABLED,
+ *   io_uring_register() is only permitted on submitter_task, which is always
+ *   granted the ctx uring lock unless suspended.
+ *   Acquiring the uring_lock mutex is unnecessary but still correct.
+ */
+
 struct io_ring_ctx_lock_state {
+	struct completion *suspend_end;
 };
 
+struct io_ring_suspend_work {
+	struct callback_head cb_head;
+	struct completion suspend_start;
+	struct io_ring_ctx_lock_state *lock_state;
+};
+
+void io_ring_suspend_work(struct callback_head *cb_head);
+
 /* Acquire the ctx uring lock */
 static inline void io_ring_ctx_lock(struct io_ring_ctx *ctx,
				     struct io_ring_ctx_lock_state *state)
 {
-	mutex_lock(&ctx->uring_lock);
+	struct io_ring_suspend_work suspend_work;
+	struct task_struct *submitter_task;
+
+	if (!(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
+		mutex_lock(&ctx->uring_lock);
+		return;
+	}
+
+	submitter_task = ctx->submitter_task;
+	/*
+	 * Not suitable for use while IORING_SETUP_R_DISABLED.
+	 * Must use io_ring_register_ctx_lock() in that case.
+	 */
+	WARN_ON_ONCE(!submitter_task);
+	if (likely(current == submitter_task))
+		return;
+
+	/* Use task work to suspend submitter_task */
+	init_task_work(&suspend_work.cb_head, io_ring_suspend_work);
+	init_completion(&suspend_work.suspend_start);
+	suspend_work.lock_state = state;
+	/* If task_work_add() fails, task is exiting, so no need to suspend */
+	if (unlikely(task_work_add(submitter_task, &suspend_work.cb_head,
+				   TWA_SIGNAL))) {
+		state->suspend_end = NULL;
+		return;
+	}
+
+	wait_for_completion(&suspend_work.suspend_start);
 }
 
 /* Attempt to acquire the ctx uring lock without blocking */
 static inline bool io_ring_ctx_trylock(struct io_ring_ctx *ctx)
 {
-	return mutex_trylock(&ctx->uring_lock);
+	if (!(ctx->flags & IORING_SETUP_SINGLE_ISSUER))
+		return mutex_trylock(&ctx->uring_lock);
+
+	/* Not suitable for use while IORING_SETUP_R_DISABLED */
+	WARN_ON_ONCE(!ctx->submitter_task);
+	return current == ctx->submitter_task;
 }
 
 /* Release the ctx uring lock */
 static inline void io_ring_ctx_unlock(struct io_ring_ctx *ctx,
				       struct io_ring_ctx_lock_state *state)
 {
-	mutex_unlock(&ctx->uring_lock);
+	if (!(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
+		mutex_unlock(&ctx->uring_lock);
+		return;
+	}
+
+	if (likely(current == ctx->submitter_task))
+		return;
+
+	if (likely(state->suspend_end))
+		complete(state->suspend_end);
 }
 
 /* Assert (if CONFIG_LOCKDEP) that the ctx uring lock is held */
 static inline void io_ring_ctx_assert_locked(const struct io_ring_ctx *ctx)
 {
+	/*
+	 * No straightforward way to check that submitter_task is suspended
+	 * without access to struct io_ring_ctx_lock_state
+	 */
+	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER)
+		return;
+
 	lockdep_assert_held(&ctx->uring_lock);
 }
 
 /* Acquire the ctx uring lock during the io_uring_register() syscall */
 static inline void io_ring_register_ctx_lock(struct io_ring_ctx *ctx)
--
2.45.2
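
Not part of the patch, just for context: the mutex-free fast path above only
applies when userspace actually creates the ring with
IORING_SETUP_SINGLE_ISSUER and keeps all io_uring_enter()/io_uring_register()
calls on that one task. A minimal userspace sketch of that setup, assuming a
liburing and kernel headers recent enough to define IORING_SETUP_SINGLE_ISSUER
(illustrative only):

/* Single-issuer ring: only this task ever submits or registers. */
#include <stdio.h>
#include <string.h>
#include <liburing.h>

int main(void)
{
	struct io_uring_params p;
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	memset(&p, 0, sizeof(p));
	/* Promise the kernel that only this task will issue on the ring */
	p.flags = IORING_SETUP_SINGLE_ISSUER;

	ret = io_uring_queue_init_params(8, &ring, &p);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}

	/* All submissions come from this (single issuer) task */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	ret = io_uring_submit(&ring);
	if (ret < 0)
		goto out;

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret == 0)
		io_uring_cqe_seen(&ring, cqe);
out:
	io_uring_queue_exit(&ring);
	return ret < 0;
}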