One of the optimizations in the block layer is that the software queues
are bypassed if the block driver is expected to accept a request. This
can cause request reordering, even for requests submitted from the same
CPU core. Preserve the order of sequential zoned writes submitted from a
given CPU core by always inserting these requests into the appropriate
software queue.

Cc: Damien Le Moal
Cc: Christoph Hellwig
Signed-off-by: Bart Van Assche
---
 block/blk-mq.c         | 30 ++++++++++++++++++++++++++++--
 block/blk-zoned.c      | 17 +++++++++++++++++
 block/elevator.h       |  1 +
 include/linux/blk-mq.h | 11 +++++++++++
 include/linux/blkdev.h |  7 +++++++
 5 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ba3a4b77f578..2af9c59ff2ad 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1537,6 +1537,30 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
 
+/*
+ * Whether the block layer should preserve the order of @rq relative to other
+ * requests submitted to the same software queue.
+ */
+static bool blk_mq_preserve_order(struct request *rq)
+{
+	return blk_pipeline_zwr(rq->q) && blk_rq_is_seq_zoned_write(rq);
+}
+
+static bool blk_mq_preserve_order_for_list(struct request_queue *q,
+					   struct list_head *list)
+{
+	struct request *rq;
+
+	if (!blk_pipeline_zwr(q))
+		return false;
+
+	list_for_each_entry(rq, list, queuelist)
+		if (blk_rq_is_seq_zoned_write(rq))
+			return true;
+
+	return false;
+}
+
 static void blk_mq_requeue_work(struct work_struct *work)
 {
 	struct request_queue *q =
@@ -2566,7 +2590,8 @@ static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
 	 * Try to issue requests directly if the hw queue isn't busy to save an
 	 * extra enqueue & dequeue to the sw queue.
 	 */
-	if (!hctx->dispatch_busy && !run_queue_async) {
+	if (!hctx->dispatch_busy && !run_queue_async &&
+	    !blk_mq_preserve_order_for_list(hctx->queue, list)) {
 		blk_mq_run_dispatch_ops(hctx->queue,
 				blk_mq_try_issue_list_directly(hctx, list));
 		if (list_empty(list))
@@ -3215,7 +3240,8 @@ void blk_mq_submit_bio(struct bio *bio)
 
 	hctx = rq->mq_hctx;
 	if ((rq->rq_flags & RQF_USE_SCHED) ||
-	    (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) {
+	    (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync)) ||
+	    blk_mq_preserve_order(rq)) {
 		blk_mq_insert_request(rq, 0);
 		blk_mq_run_hw_queue(hctx, true);
 	} else {
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 5e2a5788dc3b..1b5923c1a149 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -22,6 +22,7 @@
 #include "blk.h"
 #include "blk-mq-sched.h"
 #include "blk-mq-debugfs.h"
+#include "elevator.h"
 
 #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
 static const char *const zone_cond_name[] = {
@@ -377,6 +378,22 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
 	return ret;
 }
 
+/*
+ * blk_pipeline_zwr() - Whether or not sequential zoned writes will be
+ *	pipelined per zone.
+ * @q: request queue pointer.
+ *
+ * Return: %true if and only if zoned writes will be pipelined per zone.
+ */
+bool blk_pipeline_zwr(struct request_queue *q)
+{
+	return q->limits.features & BLK_FEAT_ORDERED_HWQ &&
+		(!q->elevator ||
+		 test_bit(ELEVATOR_FLAG_SUPPORTS_ZONED_WRITE_PIPELINING,
+			  &q->elevator->flags));
+}
+EXPORT_SYMBOL(blk_pipeline_zwr);
+
 static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
 {
 	return zone->start + zone->len >= get_capacity(disk);
diff --git a/block/elevator.h b/block/elevator.h
index adc5c157e17e..51311027fdb7 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -133,6 +133,7 @@ struct elevator_queue
 #define ELEVATOR_FLAG_REGISTERED	0
 #define ELEVATOR_FLAG_DYING		1
 #define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT	2
+#define ELEVATOR_FLAG_SUPPORTS_ZONED_WRITE_PIPELINING	3
 
 /*
  * block elevator interface
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2a5a828f19a0..30d7cd1b0484 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1191,4 +1191,15 @@ static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist)
 }
 void blk_dump_rq_flags(struct request *, char *);
 
+static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
+{
+	switch (req_op(rq)) {
+	case REQ_OP_WRITE:
+	case REQ_OP_WRITE_ZEROES:
+		return bdev_zone_is_seq(rq->q->disk->part0, blk_rq_pos(rq));
+	default:
+		return false;
+	}
+}
+
 #endif /* BLK_MQ_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 23bb2a407368..2c2579d4b7ed 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -854,6 +854,8 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk)
 	return disk->nr_zones;
 }
 
+bool blk_pipeline_zwr(struct request_queue *q);
+
 /**
  * bio_needs_zone_write_plugging - Check if a BIO needs to be handled with zone
  *				   write plugging
@@ -932,6 +934,11 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk)
 	return 0;
 }
 
+static inline bool blk_pipeline_zwr(struct request_queue *q)
+{
+	return false;
+}
+
 static inline bool bio_needs_zone_write_plugging(struct bio *bio)
 {
 	return false;
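
As an illustration (not part of this patch), below is a minimal, hypothetical
sketch of how a zoned block driver might opt in to zoned write pipelining so
that blk_pipeline_zwr() returns true. It assumes the BLK_FEAT_ORDERED_HWQ flag
defined elsewhere in this series; the driver name, structure and fields other
than queue_limits.features are made up for the example.

#include <linux/blk-mq.h>
#include <linux/blkdev.h>

/* Hypothetical driver state; only the queue_limits usage matters here. */
struct my_zoned_dev {
	struct blk_mq_tag_set	tag_set;
	struct gendisk		*disk;
	unsigned int		zone_size_sectors;
};

static int my_zoned_dev_add_disk(struct my_zoned_dev *dev)
{
	/*
	 * BLK_FEAT_ORDERED_HWQ (from this series) tells the block layer that
	 * the driver preserves the order of writes submitted to a hardware
	 * queue; blk_pipeline_zwr() checks this feature flag.
	 */
	struct queue_limits lim = {
		.features	= BLK_FEAT_ZONED | BLK_FEAT_ORDERED_HWQ,
		.chunk_sectors	= dev->zone_size_sectors,
	};

	dev->disk = blk_mq_alloc_disk(&dev->tag_set, &lim, dev);
	if (IS_ERR(dev->disk))
		return PTR_ERR(dev->disk);

	return add_disk(dev->disk);
}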