Hardware queue runs (blk_mq_run_hw_queue()) may happen concurrently. This
may lead to request reordering as follows:

Context 1		Context 2
------------------	------------------
dispatch request 1
			dispatch request 2
			queue request 2
queue request 1

Preserve the write order by serializing request dispatching if zoned
write pipelining is enabled.

Cc: Damien Le Moal
Cc: Christoph Hellwig
Signed-off-by: Bart Van Assche
---
 block/blk-mq.c         | 13 ++++++++-----
 block/blk-mq.h         | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h |  6 ++++++
 3 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4a1443ce9bc2..19857d2047cb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2398,7 +2398,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 		return;
 	}
 
-	blk_mq_run_dispatch_ops(hctx->queue,
+	blk_mq_run_dispatch_ops_serialized(hctx,
 			blk_mq_sched_dispatch_requests(hctx));
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queue);
@@ -2574,7 +2574,7 @@ static void blk_mq_run_work_fn(struct work_struct *work)
 	struct blk_mq_hw_ctx *hctx =
 		container_of(work, struct blk_mq_hw_ctx, run_work.work);
 
-	blk_mq_run_dispatch_ops(hctx->queue,
+	blk_mq_run_dispatch_ops_serialized(hctx,
 			blk_mq_sched_dispatch_requests(hctx));
 }
 
@@ -2800,11 +2800,12 @@ static bool blk_mq_get_budget_and_tag(struct request *rq)
 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		struct request *rq)
 {
+	bool async = blk_pipeline_zwr(rq->q);
 	blk_status_t ret;
 
 	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
 		blk_mq_insert_request(rq, 0);
-		blk_mq_run_hw_queue(hctx, false);
+		blk_mq_run_hw_queue(hctx, async);
 		return;
 	}
 
@@ -2821,7 +2822,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	case BLK_STS_RESOURCE:
 	case BLK_STS_DEV_RESOURCE:
 		blk_mq_request_bypass_insert(rq, 0);
-		blk_mq_run_hw_queue(hctx, false);
+		blk_mq_run_hw_queue(hctx, async);
 		break;
 	default:
 		blk_mq_end_request(rq, ret);
@@ -2853,6 +2854,7 @@ static void blk_mq_issue_direct(struct rq_list *rqs)
 
 	while ((rq = rq_list_pop(rqs))) {
 		bool last = rq_list_empty(rqs);
+		bool async = blk_pipeline_zwr(rq->q);
 
 		if (hctx != rq->mq_hctx) {
 			if (hctx) {
@@ -2870,7 +2872,7 @@ static void blk_mq_issue_direct(struct rq_list *rqs)
 		case BLK_STS_RESOURCE:
 		case BLK_STS_DEV_RESOURCE:
 			blk_mq_request_bypass_insert(rq, 0);
-			blk_mq_run_hw_queue(hctx, false);
+			blk_mq_run_hw_queue(hctx, async);
 			goto out;
 		default:
 			blk_mq_end_request(rq, ret);
@@ -4075,6 +4077,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	INIT_LIST_HEAD(&hctx->dispatch);
 	INIT_HLIST_NODE(&hctx->cpuhp_dead);
 	INIT_HLIST_NODE(&hctx->cpuhp_online);
+	mutex_init(&hctx->zwp_mutex);
 
 	hctx->queue = q;
 	hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 625eaf459a55..4e8522a91477 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -458,6 +458,42 @@ do {								\
 #define blk_mq_run_dispatch_ops(q, dispatch_ops)		\
 	__blk_mq_run_dispatch_ops(q, true, dispatch_ops)	\
 
+static inline struct mutex *blk_mq_zwp_mutex(struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
+
+	/*
+	 * If pipelining zoned writes is disabled, do not serialize dispatch
+	 * operations.
+	 */
+	if (!blk_pipeline_zwr(q))
+		return NULL;
+
+	/*
+	 * If no I/O scheduler is active or if the selected I/O scheduler
+	 * uses multiple queues internally, serialize per hardware queue.
+	 */
+	if (!blk_queue_sq_sched(q))
+		return &hctx->zwp_mutex;
+
+	/* For single queue I/O schedulers, serialize per request queue. */
+	return &blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, 0)->zwp_mutex;
+}
+
+#define blk_mq_run_dispatch_ops_serialized(hctx, dispatch_ops)	\
+do {								\
+	struct request_queue *q = hctx->queue;			\
+	struct mutex *m = blk_mq_zwp_mutex(hctx);		\
+								\
+	if (m) {						\
+		mutex_lock(m);					\
+		blk_mq_run_dispatch_ops(q, dispatch_ops);	\
+		mutex_unlock(m);				\
+	} else {						\
+		blk_mq_run_dispatch_ops(q, dispatch_ops);	\
+	}							\
+} while (0)
+
 static inline bool blk_mq_can_poll(struct request_queue *q)
 {
 	return (q->limits.features & BLK_FEAT_POLL) &&
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d8867c2084b8..5aac7bd94e97 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -430,6 +430,12 @@ struct blk_mq_hw_ctx {
 	/** @queue_num: Index of this hardware queue. */
 	unsigned int queue_num;
 
+	/**
+	 * @zwp_mutex: Mutex used for serializing dispatching of zoned writes
+	 * if zoned write pipelining is enabled.
+	 */
+	struct mutex zwp_mutex;
+
 	/**
 	 * @nr_active: Number of active requests. Only used when a tag set is
 	 * shared across request queues.
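
For illustration only, below is a minimal userspace model of the race in
the commit message and of why holding a mutex across the entire dispatch
operation preserves the write order: whichever context dispatches first
also queues first. This is a sketch, not kernel code; pthread_mutex_t
stands in for the zwp_mutex above, and run_hw_queue() is a hypothetical
stand-in for one blk_mq_run_hw_queue() call.

#include <pthread.h>
#include <stdio.h>

static int submission[] = { 1, 2 };	/* requests in submission (write) order */
static int next_req;			/* next index to dispatch from submission[] */
static int device_queue[2];		/* order in which the device sees the writes */
static int queued;			/* next free slot in device_queue[] */
static pthread_mutex_t zwp_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Model of one hardware queue run: dispatch one request, then queue it. */
static void *run_hw_queue(void *arg)
{
	int serialize = *(int *)arg;
	int rq;

	if (serialize)
		pthread_mutex_lock(&zwp_mutex);
	/* "dispatch": take the next request in write order. */
	rq = submission[__atomic_fetch_add(&next_req, 1, __ATOMIC_SEQ_CST)];
	/*
	 * Without the mutex, preemption here lets the other context queue
	 * its request first, which is the reordering shown above.
	 */
	device_queue[__atomic_fetch_add(&queued, 1, __ATOMIC_SEQ_CST)] = rq;
	if (serialize)
		pthread_mutex_unlock(&zwp_mutex);
	return NULL;
}

int main(void)
{
	int serialize = 1;	/* 0 reopens the reordering window */
	pthread_t t1, t2;

	pthread_create(&t1, NULL, run_hw_queue, &serialize);
	pthread_create(&t2, NULL, run_hw_queue, &serialize);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	/* With serialize == 1 this always prints "device saw: 1 2". */
	printf("device saw: %d %d\n", device_queue[0], device_queue[1]);
	return 0;
}

Build with "cc -pthread". Setting serialize to 0 allows the interleaving
from the commit message (dispatch 1, dispatch 2, queue 2, queue 1), in
which case the device may see "2 1".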