From: Yu Kuai Now that bio->bi_issue is only used by io-latency to get the bio issue time, replace it with a u64 timestamp directly and remove struct bio_issue to make the code cleaner. Signed-off-by: Yu Kuai --- block/bio.c | 2 +- block/blk-cgroup.h | 2 +- block/blk-iolatency.c | 14 +++---------- block/blk.h | 42 --------------------------------------- include/linux/blk_types.h | 7 ++----- 5 files changed, 7 insertions(+), 60 deletions(-) diff --git a/block/bio.c b/block/bio.c index 44c43b970387..c8fce0d6e332 100644 --- a/block/bio.c +++ b/block/bio.c @@ -261,7 +261,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_private = NULL; #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; - bio->bi_issue.value = 0; + bio->issue_time_ns = 0; if (bdev) bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 81868ad86330..d73204d27d72 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -372,7 +372,7 @@ static inline void blkg_put(struct blkcg_gq *blkg) static inline void blkcg_bio_issue_init(struct bio *bio) { - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); + bio->issue_time_ns = blk_time_get_ns(); } static inline void blkcg_use_delay(struct blkcg_gq *blkg) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 2f8fdecdd7a9..554b191a6892 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -485,19 +485,11 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) mod_timer(&blkiolat->timer, jiffies + HZ); } -static void iolatency_record_time(struct iolatency_grp *iolat, - struct bio_issue *issue, u64 now, - bool issue_as_root) +static void iolatency_record_time(struct iolatency_grp *iolat, u64 start, + u64 now, bool issue_as_root) { - u64 start = bio_issue_time(issue); u64 req_time; - /* - * Have to do this so we are truncated to the correct time that our - * issue is truncated to. - */ - now = __bio_issue_time(now); - if (now <= start) return; @@ -625,7 +617,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) * submitted, so do not account for it.
*/ if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) { - iolatency_record_time(iolat, &bio->bi_issue, now, + iolatency_record_time(iolat, bio->issue_time_ns, now, issue_as_root); window_start = atomic64_read(&iolat->window_start); if (now > window_start && diff --git a/block/blk.h b/block/blk.h index 46f566f9b126..0268deb22268 100644 --- a/block/blk.h +++ b/block/blk.h @@ -680,48 +680,6 @@ static inline ktime_t blk_time_get(void) return ns_to_ktime(blk_time_get_ns()); } -/* - * From most significant bit: - * 1 bit: reserved for other usage, see below - * 12 bits: original size of bio - * 51 bits: issue time of bio - */ -#define BIO_ISSUE_RES_BITS 1 -#define BIO_ISSUE_SIZE_BITS 12 -#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) -#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) -#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) -#define BIO_ISSUE_SIZE_MASK \ - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) -#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) - -/* Reserved bit for blk-throtl */ -#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) - -static inline u64 __bio_issue_time(u64 time) -{ - return time & BIO_ISSUE_TIME_MASK; -} - -static inline u64 bio_issue_time(struct bio_issue *issue) -{ - return __bio_issue_time(issue->value); -} - -static inline sector_t bio_issue_size(struct bio_issue *issue) -{ - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); -} - -static inline void bio_issue_init(struct bio_issue *issue, - sector_t size) -{ - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); -} - void bdev_release(struct file *bdev_file); int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 930daff207df..b8be751e16fc 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -198,10 +198,6 @@ static inline bool blk_path_error(blk_status_t error) return true; } -struct bio_issue { - u64 value; -}; - typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; @@ -242,7 +238,8 @@ struct bio { * on release of the bio. */ struct blkcg_gq *bi_blkg; - struct bio_issue bi_issue; + /* Time that this bio was issued. */ + u64 issue_time_ns; #ifdef CONFIG_BLK_CGROUP_IOCOST u64 bi_iocost_cost; #endif -- 2.39.2 From: Yu Kuai blkcg_bio_issue_init() is called for every bio, while the initialized issue time is only used by io-latency. Add a new queue_flag and only set this flag when io-latency is initialized, so that the extra blk_time_get_ns() call in blkcg_bio_issue_init() can be avoided for disks where io-latency is not enabled.
Signed-off-by: Yu Kuai --- block/blk-cgroup.h | 5 ++++- block/blk-iolatency.c | 1 + block/blk-mq-debugfs.c | 1 + include/linux/blkdev.h | 1 + 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index d73204d27d72..93e8a9fa76fe 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -372,7 +372,10 @@ static inline void blkg_put(struct blkcg_gq *blkg) static inline void blkcg_bio_issue_init(struct bio *bio) { - bio->issue_time_ns = blk_time_get_ns(); + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + + if (test_bit(QUEUE_FLAG_BIO_ISSUE, &q->queue_flags)) + bio->issue_time_ns = blk_time_get_ns(); } static inline void blkcg_use_delay(struct blkcg_gq *blkg) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 554b191a6892..c9b3bd12c87c 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -767,6 +767,7 @@ static int blk_iolatency_init(struct gendisk *disk) if (ret) goto err_qos_del; + blk_queue_flag_set(QUEUE_FLAG_BIO_ISSUE, disk->queue); timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 32c65efdda46..b192647456e1 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -96,6 +96,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(DISABLE_WBT_DEF), QUEUE_FLAG_NAME(NO_ELV_SWITCH), QUEUE_FLAG_NAME(QOS_ENABLED), + QUEUE_FLAG_NAME(BIO_ISSUE), }; #undef QUEUE_FLAG_NAME diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fe1797bbec42..ca1dcf59cb32 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -657,6 +657,7 @@ enum { QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */ QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ + QUEUE_FLAG_BIO_ISSUE, /* track bio issue time */ QUEUE_FLAG_MAX }; -- 2.39.2 From: Yu Kuai If a bio is split by the internal chunk size or by badblocks, the corresponding trace_block_split() is missing, so blktrace can't catch the split events, which makes it harder to analyze IO behavior.
Fixes: 4b1faf931650 ("block: Kill bio_pair_split()") Signed-off-by: Yu Kuai --- drivers/md/md-linear.c | 1 + drivers/md/raid0.c | 4 ++++ drivers/md/raid1.c | 4 ++++ drivers/md/raid10.c | 8 ++++++++ drivers/md/raid5.c | 2 ++ 5 files changed, 19 insertions(+) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 5d9b08115375..59d7963c7843 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -266,6 +266,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) } bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); submit_bio_noacct(bio); bio = split; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f1d8811a542a..1ba7d0c090f7 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -472,7 +472,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); submit_bio_noacct(bio); bio = split; end = zone->zone_end; @@ -620,7 +622,9 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) bio_endio(bio); return true; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); raid0_map_submit_bio(mddev, bio); bio = split; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 408c26398321..29edb7b548f3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1383,7 +1383,9 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, error = PTR_ERR(split); goto err_handle; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); submit_bio_noacct(bio); bio = split; r1_bio->master_bio = bio; @@ -1591,7 +1593,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, error = PTR_ERR(split); goto err_handle; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); submit_bio_noacct(bio); bio = split; r1_bio->master_bio = bio; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b60c30bfb6c7..859c40a5ecf4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1209,7 +1209,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, error = PTR_ERR(split); goto err_handle; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); submit_bio_noacct(bio); wait_barrier(conf, false); @@ -1495,7 +1497,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, error = PTR_ERR(split); goto err_handle; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); submit_bio_noacct(bio); wait_barrier(conf, false); @@ -1679,7 +1683,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return 0; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); /* Resend the fist split part */ submit_bio_noacct(split); @@ -1694,7 +1700,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return 0; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); /* Resend the second split part */ submit_bio_noacct(bio); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 023649fe2476..0fb838879844 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5475,8 +5475,10 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) if (sectors < bio_sectors(raid_bio)) { struct r5conf *conf = mddev->private; + 
split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); bio_chain(split, raid_bio); + trace_block_split(split, raid_bio->bi_iter.bi_sector); submit_bio_noacct(raid_bio); raid_bio = split; } -- 2.39.2 From: Yu Kuai In blk-crypto-fallback: 1) trace_block_split() is missing, so blktrace can't catch split events; 2) blkcg_bio_issue_init() is missing, so io-latency will not work correctly for split bios. Fixes: 488f6682c832 ("block: blk-crypto-fallback for Inline Encryption") Signed-off-by: Yu Kuai --- block/blk-crypto-fallback.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 005c9157ffb3..cae11c2f96c5 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -231,7 +231,10 @@ static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) bio->bi_status = BLK_STS_RESOURCE; return false; } + + blkcg_bio_issue_init(split_bio); bio_chain(split_bio, bio); + trace_block_split(split_bio, bio->bi_iter.bi_sector); submit_bio_noacct(bio); *bio_ptr = split_bio; } -- 2.39.2 From: Yu Kuai No functional changes are intended. Some drivers like mdraid split bios via internal processing; prepare to unify the bio split code. Signed-off-by: Yu Kuai --- block/blk-merge.c | 63 ++++++++++++++++++++++++++++-------------- include/linux/blkdev.h | 2 ++ 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 70d704615be5..e1afb07040c0 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -104,34 +104,55 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; } +/** + * bio_submit_split_bioset - Submit a bio, splitting it at a designated sector + * @bio: the original bio to be submitted and split + * @split_sectors: the sector count at which to split + * @bs: the bio set used for allocating the new split bio + * + * The original bio is modified to contain the remaining sectors and submitted. + * The caller is responsible for submitting the returned bio. + * + * On success, the newly allocated bio representing the initial part will be + * returned; on failure, NULL will be returned and the original bio will be failed.
+ */ +struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, + struct bio_set *bs) +{ + struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs); + + if (IS_ERR(split)) { + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); + bio_endio(bio); + return NULL; + } + + blkcg_bio_issue_init(split); + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); + WARN_ON_ONCE(bio_zone_write_plugging(bio)); + submit_bio_noacct(bio); + + return split; +} +EXPORT_SYMBOL_GPL(bio_submit_split_bioset); + static struct bio *bio_submit_split(struct bio *bio, int split_sectors) { - if (unlikely(split_sectors < 0)) - goto error; + if (unlikely(split_sectors < 0)) { + bio->bi_status = errno_to_blk_status(split_sectors); + bio_endio(bio); + return NULL; + } if (split_sectors) { - struct bio *split; - - split = bio_split(bio, split_sectors, GFP_NOIO, - &bio->bi_bdev->bd_disk->bio_split); - if (IS_ERR(split)) { - split_sectors = PTR_ERR(split); - goto error; - } - split->bi_opf |= REQ_NOMERGE; - blkcg_bio_issue_init(split); - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - WARN_ON_ONCE(bio_zone_write_plugging(bio)); - submit_bio_noacct(bio); - return split; + bio = bio_submit_split_bioset(bio, split_sectors, + &bio->bi_bdev->bd_disk->bio_split); + if (bio) + bio->bi_opf |= REQ_NOMERGE; } return bio; -error: - bio->bi_status = errno_to_blk_status(split_sectors); - bio_endio(bio); - return NULL; } struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ca1dcf59cb32..eddcd15e7727 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1000,6 +1000,8 @@ extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); void submit_bio_noacct(struct bio *bio); struct bio *bio_split_to_limits(struct bio *bio); +struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, + struct bio_set *bs); extern int blk_lld_busy(struct request_queue *q); extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); -- 2.39.2 From: Yu Kuai Unify bio split code, and prepare to fix disordered split IO. Note that commit 319ff40a5427 ("md/raid0: Fix performance regression for large sequential writes") already fixed disordered split IO by remapping the bio to the underlying disks before submit_bio_noacct(), given that md_submit_bio() has already split by sectors and raid0_make_request() will split at most once for unaligned IO. This is a bit hacky, and we'll convert it to the general solution later.
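For reference, the callers converted in this and the following patches all follow roughly the same pattern. A minimal caller-side sketch (example_make_request() and example_split_set are hypothetical names, and per-driver error handling varies):

static bool example_make_request(struct bio *bio, sector_t max_sectors,
				 struct bio_set *example_split_set)
{
	if (max_sectors < bio_sectors(bio)) {
		/*
		 * The helper chains and resubmits the remainder itself;
		 * on split failure the original bio has already been
		 * failed, so the caller just bails out.
		 */
		bio = bio_submit_split_bioset(bio, max_sectors,
					      example_split_set);
		if (!bio)
			return true;
	}

	/* continue handling the front part left in @bio */
	return true;
}

The key point of the contract is that after a successful call @bio points at the front part, while the remainder has already been chained and resubmitted.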
Signed-off-by: Yu Kuai --- drivers/md/raid0.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 1ba7d0c090f7..99f7839b4e96 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -463,23 +463,16 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) zone = find_zone(conf, &start); if (bio_end_sector(bio) > zone->zone_end) { - struct bio *split = bio_split(bio, - zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, - &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + bio = bio_submit_split_bioset( + bio, zone->zone_end - bio->bi_iter.bi_sector, + &mddev->bio_set); + if (!bio) return; - } - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - submit_bio_noacct(bio); - bio = split; end = zone->zone_end; - } else + } else { end = bio_end_sector(bio); + } orig_end = end; if (zone != conf->strip_zone) -- 2.39.2 From: Yu Kuai Unify bio split code, and prepare to fix disordered split IO. Signed-off-by: Yu Kuai --- drivers/md/raid1.c | 38 +++++++++++--------------------------- drivers/md/raid1.h | 4 +++- 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 29edb7b548f3..f8434049f9b1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1317,7 +1317,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct raid1_info *mirror; struct bio *read_bio; int max_sectors; - int rdisk, error; + int rdisk; bool r1bio_existed = !!r1_bio; /* @@ -1376,18 +1376,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - gfp, &conf->bio_split); - - if (IS_ERR(split)) { - error = PTR_ERR(split); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + if (!bio) { + set_bit(R1BIO_Returned, &r1_bio->state); goto err_handle; } - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - submit_bio_noacct(bio); - bio = split; r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; } @@ -1415,8 +1410,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, err_handle: atomic_dec(&mirror->rdev->nr_pending); - bio->bi_status = errno_to_blk_status(error); - set_bit(R1BIO_Uptodate, &r1_bio->state); raid_end_bio_io(r1_bio); } @@ -1459,7 +1452,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; - int i, disks, k, error; + int i, disks, k; unsigned long flags; int first_clone; int max_sectors; @@ -1563,10 +1556,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * complexity of supporting that is not worth * the benefit. 
*/ - if (bio->bi_opf & REQ_ATOMIC) { - error = -EIO; + if (bio->bi_opf & REQ_ATOMIC) goto err_handle; - } good_sectors = first_bad - r1_bio->sector; if (good_sectors < max_sectors) @@ -1586,18 +1577,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, max_sectors = min_t(int, max_sectors, BIO_MAX_VECS * (PAGE_SIZE >> 9)); if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - GFP_NOIO, &conf->bio_split); - - if (IS_ERR(split)) { - error = PTR_ERR(split); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + if (!bio) { + set_bit(R1BIO_Returned, &r1_bio->state); goto err_handle; } - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - submit_bio_noacct(bio); - bio = split; r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; } @@ -1687,8 +1673,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, } } - bio->bi_status = errno_to_blk_status(error); - set_bit(R1BIO_Uptodate, &r1_bio->state); raid_end_bio_io(r1_bio); } diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index d236ef179cfb..2ebe35aaa534 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -178,7 +178,9 @@ enum r1bio_state { * any write was successful. Otherwise we call when * any write-behind write succeeds, otherwise we call * with failure when last write completes (and all failed). - * Record that bi_end_io was called with this flag... + * + * And for bio_split errors, record that bi_end_io was called + * with this flag... */ R1BIO_Returned, /* If a write for this request means we can clear some -- 2.39.2 From: Yu Kuai Prepare to unfiy the bio split code, the helper bio_submit_split_bioset() can failed the orginal bio on split errors. The flag name is refer to the r1bio flag name. Signed-off-by: Yu Kuai --- drivers/md/raid10.c | 8 +++++--- drivers/md/raid10.h | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 859c40a5ecf4..a775a1317635 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -322,10 +322,12 @@ static void raid_end_bio_io(struct r10bio *r10_bio) struct bio *bio = r10_bio->master_bio; struct r10conf *conf = r10_bio->mddev->private; - if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - bio->bi_status = BLK_STS_IOERR; + if (!test_and_set_bit(R10BIO_Returned, &r10_bio->state)) { + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + } - bio_endio(bio); /* * Wake up any possible resync thread that waits for the device * to go idle. diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 3f16ad6904a9..da00a55f7a55 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -165,6 +165,8 @@ enum r10bio_state { * so that raid10d knows what to do with them. */ R10BIO_ReadError, +/* For bio_split errors, record that bi_end_io was called. */ + R10BIO_Returned, /* If a write for this request means we can clear some * known-bad-block records, we set this flag. */ -- 2.39.2 From: Yu Kuai Unify bio split code, prepare to fix disordered split IO. Noted discard is not handled, because discard is only split for unaligned head and tail, and this can be considered slow path, the disorder here does not matter much. 
Signed-off-by: Yu Kuai --- drivers/md/raid10.c | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a775a1317635..69477be91b26 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1156,7 +1156,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, int slot = r10_bio->read_slot; struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; - int error; if (slot >= 0 && r10_bio->devs[slot].rdev) { /* @@ -1205,19 +1204,15 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, rdev->bdev, (unsigned long long)r10_bio->sector); if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - gfp, &conf->bio_split); - if (IS_ERR(split)) { - error = PTR_ERR(split); + allow_barrier(conf); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + wait_barrier(conf, false); + if (!bio) { + set_bit(R10BIO_Returned, &r10_bio->state); goto err_handle; } - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - allow_barrier(conf); - submit_bio_noacct(bio); - wait_barrier(conf, false); - bio = split; r10_bio->master_bio = bio; r10_bio->sectors = max_sectors; } @@ -1245,8 +1240,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, return; err_handle: atomic_dec(&rdev->nr_pending); - bio->bi_status = errno_to_blk_status(error); - set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); } @@ -1355,7 +1348,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int i, k; sector_t sectors; int max_sectors; - int error; if ((mddev_is_clustered(mddev) && mddev->cluster_ops->area_resyncing(mddev, WRITE, @@ -1469,10 +1461,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, * complexity of supporting that is not worth * the benefit. */ - if (bio->bi_opf & REQ_ATOMIC) { - error = -EIO; + if (bio->bi_opf & REQ_ATOMIC) goto err_handle; - } good_sectors = first_bad - dev_sector; if (good_sectors < max_sectors) @@ -1493,19 +1483,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->sectors = max_sectors; if (r10_bio->sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, r10_bio->sectors, - GFP_NOIO, &conf->bio_split); - if (IS_ERR(split)) { - error = PTR_ERR(split); + allow_barrier(conf); + bio = bio_submit_split_bioset(bio, r10_bio->sectors, + &conf->bio_split); + wait_barrier(conf, false); + if (!bio) { + set_bit(R10BIO_Returned, &r10_bio->state); goto err_handle; } - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - allow_barrier(conf); - submit_bio_noacct(bio); - wait_barrier(conf, false); - bio = split; r10_bio->master_bio = bio; } @@ -1537,8 +1523,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, } } - bio->bi_status = errno_to_blk_status(error); - set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); } -- 2.39.2 From: Yu Kuai Unify bio split code, prepare to fix disordered split IO. 
Signed-off-by: Yu Kuai --- drivers/md/raid5.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0fb838879844..3c9825ad3f07 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5468,7 +5468,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) { - struct bio *split; sector_t sector = raid_bio->bi_iter.bi_sector; unsigned chunk_sects = mddev->chunk_sectors; unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); @@ -5476,11 +5475,10 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) if (sectors < bio_sectors(raid_bio)) { struct r5conf *conf = mddev->private; - split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); - bio_chain(split, raid_bio); - trace_block_split(split, raid_bio->bi_iter.bi_sector); - submit_bio_noacct(raid_bio); - raid_bio = split; + raid_bio = bio_submit_split_bioset(raid_bio, sectors, + &conf->bio_split); + if (!raid_bio) + return NULL; } if (!raid5_read_one_chunk(mddev, raid_bio)) -- 2.39.2 From: Yu Kuai Unify bio split code, prepare to fix disordered split IO. Signed-off-by: Yu Kuai --- drivers/md/md-linear.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 59d7963c7843..701e3aac0a21 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -256,19 +256,11 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) if (unlikely(bio_end_sector(bio) > end_sector)) { /* This bio crosses a device boundary, so we have to split it */ - struct bio *split = bio_split(bio, end_sector - bio_sector, + bio = bio_submit_split_bioset(bio, end_sector - bio_sector, GFP_NOIO, &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + if (!bio) { return true; } - - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - submit_bio_noacct(bio); - bio = split; } md_account_bio(mddev, &bio); -- 2.39.2 From: Yu Kuai Unify bio split code, prepare to fix disordered split IO. Signed-off-by: Yu Kuai --- block/blk-crypto-fallback.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index cae11c2f96c5..e6ed50d9b00f 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -223,20 +223,12 @@ static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) break; } if (num_sectors < bio_sectors(bio)) { - struct bio *split_bio; - - split_bio = bio_split(bio, num_sectors, GFP_NOIO, - &crypto_bio_split); - if (IS_ERR(split_bio)) { - bio->bi_status = BLK_STS_RESOURCE; + bio = bio_submit_split_bioset(bio, num_sectors, + &crypto_bio_split); + if (!bio) return false; - } - blkcg_bio_issue_init(split_bio); - bio_chain(split_bio, bio); - trace_block_split(split_bio, bio->bi_iter.bi_sector); - submit_bio_noacct(bio); - *bio_ptr = split_bio; + *bio_ptr = bio; } return true; -- 2.39.2 From: Yu Kuai Lots of checks are already done when this bio is submitted for the first time, and there is no need to check them again when the bio is resubmitted after a split. Hence factor out a helper submit_split_bio_noacct() for resubmitting a bio after splitting, where only should_fail_bio() and blk_throtl_bio() are kept.
Signed-off-by: Yu Kuai --- block/blk-core.c | 16 ++++++++++++++++ block/blk-merge.c | 2 +- block/blk.h | 1 + 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 4201504158a1..ea194a1a5b2c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -765,6 +765,22 @@ static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, return BLK_STS_OK; } +/* resubmit a bio after split, see bio_submit_split_bioset() */ +void submit_split_bio_noacct(struct bio *bio) +{ + might_sleep(); + + if (should_fail_bio(bio)) { + bio_io_error(bio); + return; + } + + if (blk_throtl_bio(bio)) + return; + + submit_bio_noacct_nocheck(bio); +} + /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. diff --git a/block/blk-merge.c b/block/blk-merge.c index e1afb07040c0..4feeaab0d3db 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -131,7 +131,7 @@ struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, bio_chain(split, bio); trace_block_split(split, bio->bi_iter.bi_sector); WARN_ON_ONCE(bio_zone_write_plugging(bio)); - submit_bio_noacct(bio); + submit_split_bio_noacct(bio); return split; } diff --git a/block/blk.h b/block/blk.h index 0268deb22268..68bf637ab7ca 100644 --- a/block/blk.h +++ b/block/blk.h @@ -54,6 +54,7 @@ bool blk_queue_start_drain(struct request_queue *q); bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner); int __bio_queue_enter(struct request_queue *q, struct bio *bio); +void submit_split_bio_noacct(struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio); void bio_await_chain(struct bio *bio); -- 2.39.2 From: Yu Kuai Currently, the split bio will be chained to the original bio, and the original bio will be resubmitted to the tail of current->bio_list, waiting for the split bio to be issued. However, if the split bio gets split again, the IO order will be messed up. For example, in raid456 IO will first be split by max_sectors from md_submit_bio(), and later be split again by chunksize for internal handling. Assume max_sectors is 1M and chunksize is 512k: 1) issue a 2M IO: bio issuing: 0+2M current->bio_list: NULL 2) md_submit_bio() split by max_sectors: bio issuing: 0+1M current->bio_list: 1M+1M 3) chunk_aligned_read() split by chunksize: bio issuing: 0+512k current->bio_list: 1M+1M -> 512k+512k 4) after the first bio is issued, __submit_bio_noacct() will continue issuing the next bio: bio issuing: 1M+1M current->bio_list: 512k+512k bio issued: 0+512k 5) chunk_aligned_read() split by chunksize: bio issuing: 1M+512k current->bio_list: 512k+512k -> 1536k+512k bio issued: 0+512k 6) no split afterwards, so the final issue order is: 0+512k -> 1M+512k -> 512k+512k -> 1536k+512k This behaviour causes a large read IO on raid456 to end up as small discontinuous IOs on the underlying disks. Fix this problem by placing the split bio at the head of current->bio_list.
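The reordering can be demonstrated with a small user-space model of the current->bio_list loop (a standalone sketch, not kernel code: sizes are in KiB, the 1M/512k limits mirror the example above, and the zoned-device special case is ignored):

#include <stdio.h>

struct bio { unsigned start, len; struct bio *next; };

static struct bio pool[8];
static int npool;
static struct bio *head, *tail;
static int patched; /* 0: requeue remainder at tail, 1: at head */

static struct bio *bio_alloc(unsigned start, unsigned len)
{
	struct bio *b = &pool[npool++];
	b->start = start; b->len = len; b->next = NULL;
	return b;
}

static void list_add(struct bio *b)
{
	if (!head) { head = tail = b; return; }
	if (patched) { b->next = head; head = b; }
	else { tail->next = b; tail = b; }
}

static struct bio *list_pop(void)
{
	struct bio *b = head;
	if (b && !(head = b->next)) tail = NULL;
	return b;
}

/* like bio_split() + bio_chain(): the parent becomes the remainder and
 * is requeued on the list, the new front half is returned */
static struct bio *split(struct bio *b, unsigned len)
{
	struct bio *front = bio_alloc(b->start, len);
	b->start += len; b->len -= len;
	list_add(b);
	return front;
}

static void driver_submit(struct bio *b)
{
	if (b->len > 1024)		/* md_submit_bio(), max_sectors */
		b = split(b, 1024);
	if (b->len > 512)		/* chunk_aligned_read(), chunksize */
		b = split(b, 512);
	printf(" %uk+%uk", b->start, b->len);
}

static void run(int mode)
{
	struct bio *b;

	npool = 0; head = tail = NULL; patched = mode;
	list_add(bio_alloc(0, 2048));	/* issue a 2M IO */
	while ((b = list_pop()))
		driver_submit(b);
	printf("\n");
}

int main(void)
{
	printf("tail (before):"); run(0);	/* 0k 1024k 512k 1536k */
	printf("head (after): "); run(1);	/* 0k 512k 1024k 1536k */
	return 0;
}

With tail insertion the model prints the disordered 0k -> 1024k -> 512k -> 1536k sequence from step 6) above; with head insertion the four 512k pieces are issued sequentially.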
Test script: test on 8 disk raid5 with 64k chunksize dd if=/dev/md0 of=/dev/null bs=4480k iflag=direct Test results: Before this patch 1) iostat results: Device r/s rMB/s rrqm/s %rrqm r_await rareq-sz aqu-sz %util md0 52430.00 3276.87 0.00 0.00 0.62 64.00 32.60 80.10 sd* 4487.00 409.00 2054.00 31.40 0.82 93.34 3.68 71.20 2) blktrace G stage: 8,0 0 486445 11.357392936 843 G R 14071424 + 128 [dd] 8,0 0 486451 11.357466360 843 G R 14071168 + 128 [dd] 8,0 0 486454 11.357515868 843 G R 14071296 + 128 [dd] 8,0 0 486468 11.357968099 843 G R 14072192 + 128 [dd] 8,0 0 486474 11.358031320 843 G R 14071936 + 128 [dd] 8,0 0 486480 11.358096298 843 G R 14071552 + 128 [dd] 8,0 0 486490 11.358303858 843 G R 14071808 + 128 [dd] 3) io seek for sdx: Note that io seek is derived from the blktrace D stage, as the statistic of: ABS((offset of next IO) - (offset + len of previous IO)) Read|Write seek cnt 55175, zero cnt 25079 >=(KB) .. <(KB) : count ratio |distribution | 0 .. 1 : 25079 45.5% |########################################| 1 .. 2 : 0 0.0% | | 2 .. 4 : 0 0.0% | | 4 .. 8 : 0 0.0% | | 8 .. 16 : 0 0.0% | | 16 .. 32 : 0 0.0% | | 32 .. 64 : 12540 22.7% |##################### | 64 .. 128 : 2508 4.5% |##### | 128 .. 256 : 0 0.0% | | 256 .. 512 : 10032 18.2% |################# | 512 .. 1024 : 5016 9.1% |######### | After this patch: 1) iostat results: Device r/s rMB/s rrqm/s %rrqm r_await rareq-sz aqu-sz %util md0 87965.00 5271.88 0.00 0.00 0.16 61.37 14.03 90.60 sd* 6020.00 658.44 5117.00 45.95 0.44 112.00 2.68 86.50 2) blktrace G stage: 8,0 0 206296 5.354894072 664 G R 7156992 + 128 [dd] 8,0 0 206305 5.355018179 664 G R 7157248 + 128 [dd] 8,0 0 206316 5.355204438 664 G R 7157504 + 128 [dd] 8,0 0 206319 5.355241048 664 G R 7157760 + 128 [dd] 8,0 0 206333 5.355500923 664 G R 7158016 + 128 [dd] 8,0 0 206344 5.355837806 664 G R 7158272 + 128 [dd] 8,0 0 206353 5.355960395 664 G R 7158528 + 128 [dd] 8,0 0 206357 5.356020772 664 G R 7158784 + 128 [dd] 3) io seek for sdx Read|Write seek cnt 28644, zero cnt 21483 >=(KB) .. <(KB) : count ratio |distribution | 0 .. 1 : 21483 75.0% |########################################| 1 .. 2 : 0 0.0% | | 2 .. 4 : 0 0.0% | | 4 .. 8 : 0 0.0% | | 8 .. 16 : 0 0.0% | | 16 .. 32 : 0 0.0% | | 32 .. 64 : 7161 25.0% |############## | BTW, this looks like a long-standing problem from day one, and large sequential reads are a pretty common case, e.g. video playback. And even with this patch, IO in this test case is merged to at most 128k due to the block layer plug limit BLK_PLUG_FLUSH_SIZE; increasing that limit can yield even better performance. However, we'll figure out how to do this properly later. Fixes: d89d87965dcb ("When stacked block devices are in-use (e.g.
md or dm), the recursive calls") Reported-by: Tie Ren Closes: https://lore.kernel.org/all/7dro5o7u5t64d6bgiansesjavxcuvkq5p2pok7dtwkav7b7ape@3isfr44b6352/ Signed-off-by: Yu Kuai --- block/blk-core.c | 21 ++++++++++++++------- block/blk-throttle.c | 2 +- block/blk.h | 2 +- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index ea194a1a5b2c..6ca3c45f421c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -725,7 +725,7 @@ static void __submit_bio_noacct_mq(struct bio *bio) current->bio_list = NULL; } -void submit_bio_noacct_nocheck(struct bio *bio) +void submit_bio_noacct_nocheck(struct bio *bio, bool split) { blk_cgroup_bio_start(bio); blkcg_bio_issue_init(bio); @@ -745,12 +745,16 @@ void submit_bio_noacct_nocheck(struct bio *bio) * to collect a list of requests submited by a ->submit_bio method while * it is active, and then process them after it returned. */ - if (current->bio_list) - bio_list_add(&current->bio_list[0], bio); - else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) + if (current->bio_list) { + if (split && !bdev_is_zoned(bio->bi_bdev)) + bio_list_add_head(&current->bio_list[0], bio); + else + bio_list_add(&current->bio_list[0], bio); + } else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { __submit_bio_noacct_mq(bio); - else + } else { __submit_bio_noacct(bio); + } } static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, @@ -770,6 +774,9 @@ void submit_split_bio_noacct(struct bio *bio) { might_sleep(); + /* This helper should only be called from submit_bio context */ + WARN_ON_ONCE(!current->bio_list); + if (should_fail_bio(bio)) { bio_io_error(bio); return; @@ -778,7 +785,7 @@ void submit_split_bio_noacct(struct bio *bio) if (blk_throtl_bio(bio)) return; - submit_bio_noacct_nocheck(bio); + submit_bio_noacct_nocheck(bio, true); } /** @@ -887,7 +894,7 @@ void submit_bio_noacct(struct bio *bio) if (blk_throtl_bio(bio)) return; - submit_bio_noacct_nocheck(bio); + submit_bio_noacct_nocheck(bio, false); return; not_supported: diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 397b6a410f9e..ead7b0eb4846 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1224,7 +1224,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) - submit_bio_noacct_nocheck(bio); + submit_bio_noacct_nocheck(bio, false); blk_finish_plug(&plug); } } diff --git a/block/blk.h b/block/blk.h index 68bf637ab7ca..a7207eea7a84 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,7 +55,7 @@ bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner); int __bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_split_bio_noacct(struct bio *bio); -void submit_bio_noacct_nocheck(struct bio *bio); +void submit_bio_noacct_nocheck(struct bio *bio, bool split); void bio_await_chain(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) -- 2.39.2 From: Yu Kuai Currently, raid0_make_request() will remap the original bio to the underlying disks to prevent disordered IO. Now that bio_submit_split_bioset() puts the original bio at the head of current->bio_list, it's safe to convert to this helper, and bios will still be ordered.
CC: Jan Kara Signed-off-by: Yu Kuai --- drivers/md/raid0.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 99f7839b4e96..857adc487962 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -607,19 +607,10 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) : sector_div(sector, chunk_sects)); if (sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, sectors, GFP_NOIO, + bio = bio_submit_split_bioset(bio, sectors, &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + if (!bio) return true; - } - - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - raid0_map_submit_bio(mddev, bio); - bio = split; } raid0_map_submit_bio(mddev, bio); -- 2.39.2