One of the optimizations in the block layer is that the software queues are
bypassed if it is expected that the block driver will accept a request. This
can cause request reordering even for requests submitted from the same CPU
core. Preserve the order of sequential zoned writes submitted from a given CPU
core by always inserting these requests into the appropriate software queue.

Cc: Damien Le Moal
Cc: Christoph Hellwig
Signed-off-by: Bart Van Assche
---
 block/blk-mq.c         | 35 +++++++++++++++++++++++++++++++++--
 block/blk-zoned.c      | 21 +++++++++++++++++++++
 block/elevator.h       |  1 +
 include/linux/blk-mq.h | 11 +++++++++++
 include/linux/blkdev.h |  7 +++++++
 5 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 09f579414161..0457aa6eef47 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1546,6 +1546,35 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
 
+/*
+ * Whether the block layer should preserve the order of @rq relative to other
+ * requests submitted to the same software queue.
+ */
+static bool blk_mq_preserve_order(struct request *rq)
+{
+	return blk_pipeline_zwr(rq->q) && blk_rq_is_seq_zoned_write(rq);
+}
+
+/*
+ * Whether the order should be preserved for any request in @list. Returns %true
+ * if and only if zoned write pipelining is enabled and if there are any
+ * sequential zoned writes in @list.
+ */
+static bool blk_mq_preserve_order_for_list(struct request_queue *q,
+					   struct list_head *list)
+{
+	struct request *rq;
+
+	if (!blk_pipeline_zwr(q))
+		return false;
+
+	list_for_each_entry(rq, list, queuelist)
+		if (blk_rq_is_seq_zoned_write(rq))
+			return true;
+
+	return false;
+}
+
 static void blk_mq_requeue_work(struct work_struct *work)
 {
 	struct request_queue *q =
@@ -2575,7 +2604,8 @@ static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
 	 * Try to issue requests directly if the hw queue isn't busy to save an
 	 * extra enqueue & dequeue to the sw queue.
 	 */
-	if (!hctx->dispatch_busy && !run_queue_async) {
+	if (!hctx->dispatch_busy && !run_queue_async &&
+	    !blk_mq_preserve_order_for_list(hctx->queue, list)) {
 		blk_mq_run_dispatch_ops(hctx->queue,
 				blk_mq_try_issue_list_directly(hctx, list));
 		if (list_empty(list))
@@ -3225,7 +3255,8 @@ void blk_mq_submit_bio(struct bio *bio)
 
 	hctx = rq->mq_hctx;
 	if ((rq->rq_flags & RQF_USE_SCHED) ||
-	    (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) {
+	    (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync)) ||
+	    blk_mq_preserve_order(rq)) {
 		blk_mq_insert_request(rq, 0);
 		blk_mq_run_hw_queue(hctx, true);
 	} else {
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 5e2a5788dc3b..f6bb4331eea6 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -22,6 +22,7 @@
 #include "blk.h"
 #include "blk-mq-sched.h"
 #include "blk-mq-debugfs.h"
+#include "elevator.h"
 
 #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
 static const char *const zone_cond_name[] = {
@@ -377,6 +378,26 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
 	return ret;
 }
 
+/*
+ * blk_pipeline_zwr() - Whether or not sequential zoned writes will be
+ *	pipelined per zone.
+ * @q: request queue pointer.
+ *
+ * Return: %true if and only if zoned writes will be pipelined per zone. Since
+ * running different hardware queues simultaneously on different CPU cores may
+ * lead to I/O reordering if an I/O scheduler maintains a single dispatch queue,
+ * write pipelining is only enabled if no I/O scheduler is active or if the
+ * ELEVATOR_FLAG_SUPPORTS_ZONED_WRITE_PIPELINING flag has been set.
+ */
+bool blk_pipeline_zwr(struct request_queue *q)
+{
+	return q->limits.features & BLK_FEAT_ORDERED_HWQ &&
+		(!q->elevator ||
+		 test_bit(ELEVATOR_FLAG_SUPPORTS_ZONED_WRITE_PIPELINING,
+			  &q->elevator->flags));
+}
+EXPORT_SYMBOL(blk_pipeline_zwr);
+
 static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
 {
 	return zone->start + zone->len >= get_capacity(disk);
diff --git a/block/elevator.h b/block/elevator.h
index c4d20155065e..41f28909a31c 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -133,6 +133,7 @@ struct elevator_queue
 #define ELEVATOR_FLAG_REGISTERED	0
 #define ELEVATOR_FLAG_DYING		1
 #define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT	2
+#define ELEVATOR_FLAG_SUPPORTS_ZONED_WRITE_PIPELINING	3
 
 /*
  * block elevator interface
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b25d12545f46..2c08a86b4ac3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1195,4 +1195,15 @@ static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist)
 }
 void blk_dump_rq_flags(struct request *, char *);
 
+static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
+{
+	switch (req_op(rq)) {
+	case REQ_OP_WRITE:
+	case REQ_OP_WRITE_ZEROES:
+		return bdev_zone_is_seq(rq->q->disk->part0, blk_rq_pos(rq));
+	default:
+		return false;
+	}
+}
+
 #endif /* BLK_MQ_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9af9d97e31af..85fca05bd5eb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -855,6 +855,8 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk)
 	return disk->nr_zones;
 }
 
+bool blk_pipeline_zwr(struct request_queue *q);
+
 /**
  * bio_needs_zone_write_plugging - Check if a BIO needs to be handled with zone
  *	write plugging
@@ -933,6 +935,11 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk)
 	return 0;
 }
 
+static inline bool blk_pipeline_zwr(struct request_queue *q)
+{
+	return false;
+}
+
 static inline bool bio_needs_zone_write_plugging(struct bio *bio)
 {
 	return false;
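
As an illustrative sketch (not part of the patch itself): the hunks above only
introduce the ELEVATOR_FLAG_SUPPORTS_ZONED_WRITE_PIPELINING opt-in and do not
modify any I/O scheduler. A scheduler that preserves the per-CPU submission
order of sequential zoned writes could advertise support roughly as follows.
The example_init_sched() name and prototype are assumptions for illustration;
only the set_bit() call on the elevator_queue flags uses the interface added
above.

	/*
	 * Hypothetical scheduler init callback; the exact ->init_sched()
	 * prototype differs between kernel versions and is assumed here.
	 */
	static int example_init_sched(struct request_queue *q,
				      struct elevator_queue *eq)
	{
		/*
		 * Declare that this scheduler keeps sequential zoned writes in
		 * per-CPU submission order so that blk_pipeline_zwr() can
		 * return true while it is active.
		 */
		set_bit(ELEVATOR_FLAG_SUPPORTS_ZONED_WRITE_PIPELINING,
			&eq->flags);
		return 0;
	}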