From: Keith Busch A bio requesting sectors unaligned to the logical format is invalid rather than an IO error. Fix up the return code because there are some device mappers that care about distinguishing these kinds of errors. Signed-off-by: Keith Busch --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d0c37daf568f2..881183345a85c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3182,7 +3182,8 @@ void blk_mq_submit_bio(struct bio *bio) * have to be done with queue usage counter held. */ if (unlikely(bio_unaligned(bio, q))) { - bio_io_error(bio); + bio->bi_status = BLK_STS_INVAL; + bio_endio(bio); goto queue_exit; } -- 2.53.0-Meta From: Keith Busch A bio submitted to a zone block device that breaks the limits is invalid rather than an IO error. Fix up the return code to report it that way, as the previously used IOERR is considered a path failure. Signed-off-by: Keith Busch --- block/blk-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 22af5dec112b5..92a802dc8042c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -604,7 +604,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, /* The bio sector must point to the start of a sequential zone */ if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector)) - return BLK_STS_IOERR; + return BLK_STS_INVAL; /* * Not allowed to cross zone boundaries. Otherwise, the BIO will be @@ -612,11 +612,11 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, * different zones. */ if (nr_sectors > q->limits.chunk_sectors) - return BLK_STS_IOERR; + return BLK_STS_INVAL; /* Make sure the BIO is small enough and will not get split */ if (nr_sectors > q->limits.max_zone_append_sectors) - return BLK_STS_IOERR; + return BLK_STS_INVAL; bio->bi_opf |= REQ_NOMERGE; -- 2.53.0-Meta From: Keith Busch bio_check_eod() in submit_bio_noacct() validates that a bio does not extend beyond the partition's available sectors. This check runs before bio_queue_enter(), so it is not serialized against queue limit updates. A driver that freezes the queue, updates limits, changes the capacity, and unfreezes can race with a bio that passed the early check under the old capacity. Remove bio_check_eod() and replace it with a bounds check in __bio_split_to_limits(), which runs after the queue usage reference has been acquired. The check uses partition-aware arithmetic to validate both partition bounds and disk capacity in a single comparison that works correctly on the post-remap sector values. Signed-off-by: Keith Busch --- block/blk-core.c | 26 -------------------------- block/blk.h | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 92a802dc8042c..c200d0fc44fe7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -547,30 +547,6 @@ int should_fail_bio(struct bio *bio) } ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); -/* - * Check whether this bio extends beyond the end of the device or partition. - * This may well happen - the kernel calls bread() without checking the size of - * the device, e.g., when mounting a file system. - */ -static inline int bio_check_eod(struct bio *bio) -{ - sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); - unsigned int nr_sectors = bio_sectors(bio); - - if (nr_sectors && - (nr_sectors > maxsector || - bio->bi_iter.bi_sector > maxsector - nr_sectors)) { - if (!maxsector) - return -EIO; - pr_info_ratelimited("%s: attempt to access beyond end of device\n" - "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", - current->comm, bio->bi_bdev, bio->bi_opf, - bio->bi_iter.bi_sector, nr_sectors, maxsector); - return -EIO; - } - return 0; -} - /* * Remap block n of partition p to block n+start(p) of the disk. */ @@ -802,8 +778,6 @@ void submit_bio_noacct(struct bio *bio) goto end_io; bio_check_ro(bio); if (!bio_flagged(bio, BIO_REMAPPED)) { - if (unlikely(bio_check_eod(bio))) - goto end_io; if (bdev_is_partition(bdev) && unlikely(blk_partition_remap(bio))) goto end_io; diff --git a/block/blk.h b/block/blk.h index bf1a80493ff1c..e70acb2d358e3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -423,6 +423,17 @@ static inline bool bio_may_need_split(struct bio *bio, static inline struct bio *__bio_split_to_limits(struct bio *bio, const struct queue_limits *lim, unsigned int *nr_segs) { + if (unlikely(bio_end_sector(bio) > bdev_nr_sectors(bio->bi_bdev) + + bio->bi_bdev->bd_start_sect)) { + pr_info_ratelimited("%s: attempt to access beyond end of device\n" + "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", + current->comm, bio->bi_bdev, bio->bi_opf, + bio->bi_iter.bi_sector, bio_sectors(bio), + bdev_nr_sectors(bio->bi_bdev) + + bio->bi_bdev->bd_start_sect); + goto ioerr; + } + switch (bio_op(bio)) { case REQ_OP_READ: case REQ_OP_WRITE: @@ -442,6 +453,9 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio, *nr_segs = 0; return bio; } +ioerr: + bio_io_error(bio); + return NULL; } /** -- 2.53.0-Meta From: Keith Busch The bio checks in submit_bio_noacct() compares queue limits to determine whether operations like discard, write zeroes, zone append, and atomic writes are supported and valid. These checks run before bio_queue_enter(), so they race against any driver that updates queue limits inside a freeze window. Move all limit-dependent operation validation from submit_bio_noacct() into __bio_split_to_limits(), which runs after the queue usage reference has been acquired. This ensures that all checks are properly serialized against limit updates. The non-limit checks (crypto, fault injection, partition remap, and flush flag handling) remain in submit_bio_noacct() as they do not depend on queue limits. Signed-off-by: Keith Busch --- block/blk-core.c | 118 ----------------------------------------------- block/blk.h | 75 ++++++++++++++++++++++++++++-- 2 files changed, 72 insertions(+), 121 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index c200d0fc44fe7..8360c2b5efee5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -519,25 +519,6 @@ static int __init fail_make_request_debugfs(void) late_initcall(fail_make_request_debugfs); #endif /* CONFIG_FAIL_MAKE_REQUEST */ -static inline void bio_check_ro(struct bio *bio) -{ - if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { - if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) - return; - - if (bdev_test_flag(bio->bi_bdev, BD_RO_WARNED)) - return; - - bdev_set_flag(bio->bi_bdev, BD_RO_WARNED); - - /* - * Use ioctl to set underlying disk of raid/dm to read-only - * will trigger this. - */ - pr_warn("Trying to write to read-only block-device %pg\n", - bio->bi_bdev); - } -} int should_fail_bio(struct bio *bio) { @@ -566,39 +547,6 @@ static int blk_partition_remap(struct bio *bio) return 0; } -/* - * Check write append to a zoned block device. - */ -static inline blk_status_t blk_check_zone_append(struct request_queue *q, - struct bio *bio) -{ - int nr_sectors = bio_sectors(bio); - - /* Only applicable to zoned block devices */ - if (!bdev_is_zoned(bio->bi_bdev)) - return BLK_STS_NOTSUPP; - - /* The bio sector must point to the start of a sequential zone */ - if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector)) - return BLK_STS_INVAL; - - /* - * Not allowed to cross zone boundaries. Otherwise, the BIO will be - * split and could result in non-contiguous sectors being written in - * different zones. - */ - if (nr_sectors > q->limits.chunk_sectors) - return BLK_STS_INVAL; - - /* Make sure the BIO is small enough and will not get split */ - if (nr_sectors > q->limits.max_zone_append_sectors) - return BLK_STS_INVAL; - - bio->bi_opf |= REQ_NOMERGE; - - return BLK_STS_OK; -} - static void __submit_bio(struct bio *bio) { /* If plug is not used, add new plug here to cache nsecs time. */ @@ -731,18 +679,6 @@ void submit_bio_noacct_nocheck(struct bio *bio, bool split) } } -static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, - struct bio *bio) -{ - if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) - return BLK_STS_INVAL; - - if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) - return BLK_STS_INVAL; - - return BLK_STS_OK; -} - /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. @@ -755,7 +691,6 @@ static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, void submit_bio_noacct(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; - struct request_queue *q = bdev_get_queue(bdev); blk_status_t status = BLK_STS_IOERR; might_sleep(); @@ -776,7 +711,6 @@ void submit_bio_noacct(struct bio *bio) if (should_fail_bio(bio)) goto end_io; - bio_check_ro(bio); if (!bio_flagged(bio, BIO_REMAPPED)) { if (bdev_is_partition(bdev) && unlikely(blk_partition_remap(bio))) @@ -800,58 +734,6 @@ void submit_bio_noacct(struct bio *bio) } } - switch (bio_op(bio)) { - case REQ_OP_READ: - break; - case REQ_OP_WRITE: - if (bio->bi_opf & REQ_ATOMIC) { - status = blk_validate_atomic_write_op_size(q, bio); - if (status != BLK_STS_OK) - goto end_io; - } - break; - case REQ_OP_FLUSH: - /* - * REQ_OP_FLUSH can't be submitted through bios, it is only - * synthetized in struct request by the flush state machine. - */ - goto not_supported; - case REQ_OP_DISCARD: - if (!bdev_max_discard_sectors(bdev)) - goto not_supported; - break; - case REQ_OP_SECURE_ERASE: - if (!bdev_max_secure_erase_sectors(bdev)) - goto not_supported; - break; - case REQ_OP_ZONE_APPEND: - status = blk_check_zone_append(q, bio); - if (status != BLK_STS_OK) - goto end_io; - break; - case REQ_OP_WRITE_ZEROES: - if (!q->limits.max_write_zeroes_sectors) - goto not_supported; - break; - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_OPEN: - case REQ_OP_ZONE_CLOSE: - case REQ_OP_ZONE_FINISH: - case REQ_OP_ZONE_RESET_ALL: - if (!bdev_is_zoned(bio->bi_bdev)) - goto not_supported; - break; - case REQ_OP_DRV_IN: - case REQ_OP_DRV_OUT: - /* - * Driver private operations are only used with passthrough - * requests. - */ - fallthrough; - default: - goto not_supported; - } - if (blk_throtl_bio(bio)) return; submit_bio_noacct_nocheck(bio, false); diff --git a/block/blk.h b/block/blk.h index e70acb2d358e3..d3b897e9b5ee9 100644 --- a/block/blk.h +++ b/block/blk.h @@ -407,6 +407,22 @@ static inline bool bio_may_need_split(struct bio *bio, return bv->bv_len + bv->bv_offset > lim->max_fast_segment_size; } +static inline void bio_check_ro(struct bio *bio) +{ + if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + return; + + if (bdev_test_flag(bio->bi_bdev, BD_RO_WARNED)) + return; + + bdev_set_flag(bio->bi_bdev, BD_RO_WARNED); + + pr_warn("Trying to write to read-only block-device %pg\n", + bio->bi_bdev); + } +} + /** * __bio_split_to_limits - split a bio to fit the queue limits * @bio: bio to be split @@ -423,6 +439,8 @@ static inline bool bio_may_need_split(struct bio *bio, static inline struct bio *__bio_split_to_limits(struct bio *bio, const struct queue_limits *lim, unsigned int *nr_segs) { + bio_check_ro(bio); + if (unlikely(bio_end_sector(bio) > bdev_nr_sectors(bio->bi_bdev) + bio->bi_bdev->bd_start_sect)) { pr_info_ratelimited("%s: attempt to access beyond end of device\n" @@ -435,24 +453,75 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio, } switch (bio_op(bio)) { - case REQ_OP_READ: case REQ_OP_WRITE: + if (bio->bi_opf & REQ_ATOMIC) { + if (bio->bi_iter.bi_size > lim->atomic_write_unit_max || + bio->bi_iter.bi_size % lim->atomic_write_unit_min) + goto invalid; + } + fallthrough; + case REQ_OP_READ: if (bio_may_need_split(bio, lim)) return bio_split_rw(bio, lim, nr_segs); *nr_segs = 1; return bio; case REQ_OP_ZONE_APPEND: + /* Only applicable to zoned block devices */ + if (!(lim->features & BLK_FEAT_ZONED)) + goto not_supported; + + /* The bio sector must point to the start of a sequential zone */ + if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector)) + goto invalid; + + /* + * Not allowed to cross zone boundaries. Otherwise, the BIO + * will be split and could result in non-contiguous sectors + * being written in different zones. + */ + if (bio_sectors(bio) > lim->chunk_sectors) + goto invalid; + + /* Make sure the BIO is small enough and will not get split */ + if (bio_sectors(bio) > lim->max_zone_append_sectors) + goto invalid; + + bio->bi_opf |= REQ_NOMERGE; return bio_split_zone_append(bio, lim, nr_segs); case REQ_OP_DISCARD: + if (!lim->max_discard_sectors) + goto not_supported; + return bio_split_discard(bio, lim, nr_segs); case REQ_OP_SECURE_ERASE: + if (!lim->max_secure_erase_sectors) + goto not_supported; return bio_split_discard(bio, lim, nr_segs); case REQ_OP_WRITE_ZEROES: + if (!lim->max_write_zeroes_sectors) + goto not_supported; return bio_split_write_zeroes(bio, lim, nr_segs); - default: - /* other operations can't be split */ + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: + case REQ_OP_ZONE_RESET_ALL: + if (!(lim->features & BLK_FEAT_ZONED)) + goto not_supported; *nr_segs = 0; return bio; + default: + WARN_ON_ONCE(1); + goto not_supported; } + +invalid: + bio->bi_status = BLK_STS_INVAL; + bio_endio(bio); + return NULL; +not_supported: + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + return NULL; ioerr: bio_io_error(bio); return NULL; -- 2.53.0-Meta From: Keith Busch The nvme driver has long utilized a zero capacity to indicate the path isn't reachable, which creates a race condition with IO dispatch when paths are being detached on a live system: when the block layer rejects a bio early due to a capacity check failure, drivers with multipath support using the original bio have no interception point to redirect the bio to another path. We don't want to have to clone the bio just for this condition, so add a failed_bio callback to block_device_operations, called from bio_io_error. If the callback returns true, the driver has taken ownership of the bio and the error completion is skipped. Implement the callback for NVMe multipath. nvme_failed_bio redirects failing bios back to the multipath head device's requeue list for path re-selection, but only when all three conditions are met: - The bio came through the multipath head (REQ_NVME_MPATH) - The error is a path-related error (blk_path_error) - The path is no longer ready (!NVME_NS_READY) Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/multipath.c | 26 ++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 2 ++ include/linux/bio.h | 6 ------ include/linux/blkdev.h | 16 ++++++++++++++++ 5 files changed, 45 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c3032d6ad6b1e..ac33b4dd19127 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2650,6 +2650,7 @@ const struct block_device_operations nvme_bdev_ops = { .get_unique_id = nvme_get_unique_id, .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, + .failed_bio = nvme_failed_bio, }; static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val, diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 263161cb8ac06..250d0719d32cf 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -134,6 +134,32 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) blk_freeze_queue_start(h->disk->queue); } +bool nvme_failed_bio(struct bio *bio) +{ + unsigned long flags; + struct nvme_ns *ns; + + if (!(bio->bi_opf & REQ_NVME_MPATH)) + return false; + if (!blk_path_error(bio->bi_status)) + return false; + + ns = bio->bi_bdev->bd_disk->queue->queuedata; + if (test_bit(NVME_NS_READY, &ns->flags)) + return false; + nvme_mpath_clear_current_path(ns); + + bio->bi_status = BLK_STS_OK; + bio_set_dev(bio, ns->head->disk->part0); + + spin_lock_irqsave(&ns->head->requeue_lock, flags); + bio_list_add(&ns->head->requeue_list, bio); + spin_unlock_irqrestore(&ns->head->requeue_lock, flags); + + kblockd_schedule_work(&ns->head->requeue_work); + return true; +} + void nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ccd5e05dac98f..37d4f037b9a8a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1028,6 +1028,7 @@ void nvme_mpath_unfreeze(struct nvme_subsystem *subsys); void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys); +bool nvme_failed_bio(struct bio *bio); void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); @@ -1079,6 +1080,7 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) { return false; } +#define nvme_failed_bio NULL static inline void nvme_failover_req(struct request *req) { } diff --git a/include/linux/bio.h b/include/linux/bio.h index 1a83a6753d70d..4f01033c32ced 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -371,12 +371,6 @@ void submit_bio(struct bio *bio); extern void bio_endio(struct bio *); -static inline void bio_io_error(struct bio *bio) -{ - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); -} - static inline void bio_wouldblock_error(struct bio *bio) { bio_set_flag(bio, BIO_QUIET); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 17270a28c66d5..75cbf496e0efa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1686,8 +1686,24 @@ struct block_device_operations { * driver. */ int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector); + bool (*failed_bio)(struct bio *bio); }; +static inline void bio_io_error(struct bio *bio) +{ + bio->bi_status = BLK_STS_IOERR; + + if (bio->bi_bdev) { + const struct block_device_operations *ops = + bio->bi_bdev->bd_disk->fops; + + if (ops->failed_bio && ops->failed_bio(bio)) + return; + } + + bio_endio(bio); +} + #ifdef CONFIG_COMPAT extern int blkdev_compat_ptr_ioctl(struct block_device *, blk_mode_t, unsigned int, unsigned long); -- 2.53.0-Meta