Move implementation details inside blk-wbt.c, preparing to fix, in the next patch, a possible deadlock where wbt_init() is called while the queue is frozen.

Signed-off-by: Yu Kuai
---
 block/blk-sysfs.c | 39 ++----------------------------------
 block/blk-wbt.c   | 50 ++++++++++++++++++++++++++++++++++++++++++++---
 block/blk-wbt.h   |  8 ++------
 3 files changed, 51 insertions(+), 46 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8684c57498cc..96f10e1bcbd1 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -636,11 +636,8 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
 static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
 				  size_t count)
 {
-	struct request_queue *q = disk->queue;
-	struct rq_qos *rqos;
 	ssize_t ret;
 	s64 val;
-	unsigned int memflags;
 
 	ret = queue_var_store64(&val, page);
 	if (ret < 0)
@@ -648,40 +645,8 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
 	if (val < -1)
 		return -EINVAL;
 
-	/*
-	 * Ensure that the queue is idled, in case the latency update
-	 * ends up either enabling or disabling wbt completely. We can't
-	 * have IO inflight if that happens.
-	 */
-	memflags = blk_mq_freeze_queue(q);
-
-	rqos = wbt_rq_qos(q);
-	if (!rqos) {
-		ret = wbt_init(disk);
-		if (ret)
-			goto out;
-	}
-
-	ret = count;
-	if (val == -1)
-		val = wbt_default_latency_nsec(q);
-	else if (val >= 0)
-		val *= 1000ULL;
-
-	if (wbt_get_min_lat(q) == val)
-		goto out;
-
-	blk_mq_quiesce_queue(q);
-
-	mutex_lock(&disk->rqos_state_mutex);
-	wbt_set_min_lat(q, val);
-	mutex_unlock(&disk->rqos_state_mutex);
-
-	blk_mq_unquiesce_queue(q);
-out:
-	blk_mq_unfreeze_queue(q, memflags);
-
-	return ret;
+	ret = wbt_set_lat(disk, val);
+	return ret ? ret : count;
 }
 
 QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index eb8037bae0bd..ba807649b29a 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -93,6 +93,8 @@ struct rq_wb {
 	struct rq_depth rq_depth;
 };
 
+static int wbt_init(struct gendisk *disk);
+
 static inline struct rq_wb *RQWB(struct rq_qos *rqos)
 {
 	return container_of(rqos, struct rq_wb, rqos);
@@ -506,7 +508,7 @@ u64 wbt_get_min_lat(struct request_queue *q)
 	return RQWB(rqos)->min_lat_nsec;
 }
 
-void wbt_set_min_lat(struct request_queue *q, u64 val)
+static void wbt_set_min_lat(struct request_queue *q, u64 val)
 {
 	struct rq_qos *rqos = wbt_rq_qos(q);
 	if (!rqos)
@@ -729,7 +731,7 @@ void wbt_enable_default(struct gendisk *disk)
 }
 EXPORT_SYMBOL_GPL(wbt_enable_default);
 
-u64 wbt_default_latency_nsec(struct request_queue *q)
+static u64 wbt_default_latency_nsec(struct request_queue *q)
 {
 	/*
 	 * We default to 2msec for non-rotational storage, and 75msec
@@ -890,7 +892,7 @@ static const struct rq_qos_ops wbt_rqos_ops = {
 #endif
 };
 
-int wbt_init(struct gendisk *disk)
+static int wbt_init(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 	struct rq_wb *rwb;
@@ -937,3 +939,45 @@ int wbt_init(struct gendisk *disk)
 
 	return ret;
 }
+
+int wbt_set_lat(struct gendisk *disk, s64 val)
+{
+	struct request_queue *q = disk->queue;
+	unsigned int memflags;
+	struct rq_qos *rqos;
+	int ret = 0;
+
+	/*
+	 * Ensure that the queue is idled, in case the latency update
+	 * ends up either enabling or disabling wbt completely. We can't
+	 * have IO inflight if that happens.
+	 */
+	memflags = blk_mq_freeze_queue(q);
+
+	rqos = wbt_rq_qos(q);
+	if (!rqos) {
+		ret = wbt_init(disk);
+		if (ret)
+			goto out;
+	}
+
+	if (val == -1)
+		val = wbt_default_latency_nsec(q);
+	else if (val >= 0)
+		val *= 1000ULL;
+
+	if (wbt_get_min_lat(q) == val)
+		goto out;
+
+	blk_mq_quiesce_queue(q);
+
+	mutex_lock(&disk->rqos_state_mutex);
+	wbt_set_min_lat(q, val);
+	mutex_unlock(&disk->rqos_state_mutex);
+
+	blk_mq_unquiesce_queue(q);
+out:
+	blk_mq_unfreeze_queue(q, memflags);
+
+	return ret;
+}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index e5fc653b9b76..414cf1c34212 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -4,15 +4,11 @@
 
 #ifdef CONFIG_BLK_WBT
 
-int wbt_init(struct gendisk *disk);
 void wbt_disable_default(struct gendisk *disk);
 void wbt_enable_default(struct gendisk *disk);
-
-u64 wbt_get_min_lat(struct request_queue *q);
-void wbt_set_min_lat(struct request_queue *q, u64 val);
 bool wbt_disabled(struct request_queue *);
-
-u64 wbt_default_latency_nsec(struct request_queue *);
+u64 wbt_get_min_lat(struct request_queue *q);
+int wbt_set_lat(struct gendisk *disk, s64 val);
 
 #else
-- 
2.51.0

If wbt is disabled by default and the user then configures wbt via sysfs, the queue will be frozen first and then pcpu_alloc_mutex will be held in blk_stat_alloc_callback(). Fix this problem by allocating the memory before the queue is frozen.

Signed-off-by: Yu Kuai
---
 block/blk-wbt.c | 106 ++++++++++++++++++++++++++++--------------------
 1 file changed, 61 insertions(+), 45 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index ba807649b29a..696baa681717 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -93,7 +93,7 @@ struct rq_wb {
 	struct rq_depth rq_depth;
 };
 
-static int wbt_init(struct gendisk *disk);
+static int wbt_init(struct gendisk *disk, struct rq_wb *rwb);
 
 static inline struct rq_wb *RQWB(struct rq_qos *rqos)
 {
@@ -698,6 +698,41 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
 	}
 }
 
+static int wbt_data_dir(const struct request *rq)
+{
+	const enum req_op op = req_op(rq);
+
+	if (op == REQ_OP_READ)
+		return READ;
+	else if (op_is_write(op))
+		return WRITE;
+
+	/* don't account */
+	return -1;
+}
+
+static struct rq_wb *wbt_alloc(void)
+{
+	struct rq_wb *rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+
+	if (!rwb)
+		return NULL;
+
+	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
+	if (!rwb->cb) {
+		kfree(rwb);
+		return NULL;
+	}
+
+	return rwb;
+}
+
+static void wbt_free(struct rq_wb *rwb)
+{
+	blk_stat_free_callback(rwb->cb);
+	kfree(rwb);
+}
+
 /*
  * Enable wbt if defaults are configured that way
  */
@@ -726,8 +761,15 @@ void wbt_enable_default(struct gendisk *disk)
 	if (!blk_queue_registered(q))
 		return;
 
-	if (queue_is_mq(q) && enable)
-		wbt_init(disk);
+	if (queue_is_mq(q) && enable) {
+		struct rq_wb *rwb = wbt_alloc();
+
+		if (WARN_ON_ONCE(!rwb))
+			return;
+
+		if (WARN_ON_ONCE(wbt_init(disk, rwb)))
+			wbt_free(rwb);
+	}
 }
 EXPORT_SYMBOL_GPL(wbt_enable_default);
 
@@ -743,19 +785,6 @@ static u64 wbt_default_latency_nsec(struct request_queue *q)
 	return 75000000ULL;
 }
 
-static int wbt_data_dir(const struct request *rq)
-{
-	const enum req_op op = req_op(rq);
-
-	if (op == REQ_OP_READ)
-		return READ;
-	else if (op_is_write(op))
-		return WRITE;
-
-	/* don't account */
-	return -1;
-}
-
 static void wbt_queue_depth_changed(struct rq_qos *rqos)
 {
 	RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue);
@@ -767,8 +796,7 @@ static void wbt_exit(struct rq_qos *rqos)
 	struct rq_wb *rwb = RQWB(rqos);
 
 	blk_stat_remove_callback(rqos->disk->queue, rwb->cb);
-	blk_stat_free_callback(rwb->cb);
-	kfree(rwb);
+	wbt_free(rwb);
 }
 
 /*
@@ -892,22 +920,11 @@ static const struct rq_qos_ops wbt_rqos_ops = {
 #endif
 };
 
-static int wbt_init(struct gendisk *disk)
+static int wbt_init(struct gendisk *disk, struct rq_wb *rwb)
 {
 	struct request_queue *q = disk->queue;
-	struct rq_wb *rwb;
-	int i;
 	int ret;
-
-	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
-	if (!rwb)
-		return -ENOMEM;
-
-	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
-	if (!rwb->cb) {
-		kfree(rwb);
-		return -ENOMEM;
-	}
+	int i;
 
 	for (i = 0; i < WBT_NUM_RWQ; i++)
 		rq_wait_init(&rwb->rq_wait[i]);
@@ -927,38 +944,38 @@ static int wbt_init(struct gendisk *disk)
 	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
 	mutex_unlock(&q->rq_qos_mutex);
 	if (ret)
-		goto err_free;
+		return ret;
 
 	blk_stat_add_callback(q, rwb->cb);
-
 	return 0;
-
-err_free:
-	blk_stat_free_callback(rwb->cb);
-	kfree(rwb);
-
-	return ret;
-
 }
 
 int wbt_set_lat(struct gendisk *disk, s64 val)
 {
 	struct request_queue *q = disk->queue;
+	struct rq_qos *rqos = wbt_rq_qos(q);
+	struct rq_wb *rwb = NULL;
 	unsigned int memflags;
-	struct rq_qos *rqos;
 	int ret = 0;
 
+	if (!rqos) {
+		rwb = wbt_alloc();
+		if (!rwb)
+			return -ENOMEM;
+	}
+
 	/*
 	 * Ensure that the queue is idled, in case the latency update
 	 * ends up either enabling or disabling wbt completely. We can't
 	 * have IO inflight if that happens.
 	 */
 	memflags = blk_mq_freeze_queue(q);
-
-	rqos = wbt_rq_qos(q);
 	if (!rqos) {
-		ret = wbt_init(disk);
-		if (ret)
+		ret = wbt_init(disk, rwb);
+		if (ret) {
+			wbt_free(rwb);
 			goto out;
+		}
 	}
 
 	if (val == -1)
@@ -978,6 +995,5 @@ int wbt_set_lat(struct gendisk *disk, s64 val)
 	blk_mq_unquiesce_queue(q);
 out:
 	blk_mq_unfreeze_queue(q, memflags);
-
 	return ret;
 }
-- 
2.51.0

There is already a helper, blk_mq_debugfs_register_rqos(), to register one rqos; however, this helper is called synchronously when the rqos is created while the queue is frozen. Prepare to fix a possible deadlock caused by creating blk-mq debugfs entries while the queue is still frozen.
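
To illustrate the calling convention this enables (a sketch based on the following patches, not part of this patch itself): the rqos is attached while the queue is frozen, and the debugfs entries for the whole q->rq_qos list are created only after the queue is unfrozen:

	memflags = blk_mq_freeze_queue(q);
	/* ... attach the rqos to q->rq_qos while the queue is frozen ... */
	blk_mq_unfreeze_queue(q, memflags);

	mutex_lock(&q->debugfs_mutex);
	blk_mq_debugfs_register_rq_qos(q);	/* walks the whole q->rq_qos list */
	mutex_unlock(&q->debugfs_mutex);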
Signed-off-by: Yu Kuai
---
 block/blk-mq-debugfs.c | 23 +++++++++++++++--------
 block/blk-mq-debugfs.h |  5 +++++
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4896525b1c05..128d2aa6a20d 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -619,6 +619,20 @@ static void debugfs_create_files(struct dentry *parent, void *data,
 			    (void *)attr, data, &blk_mq_debugfs_fops);
 }
 
+void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
+{
+	lockdep_assert_held(&q->debugfs_mutex);
+
+	if (q->rq_qos) {
+		struct rq_qos *rqos = q->rq_qos;
+
+		while (rqos) {
+			blk_mq_debugfs_register_rqos(rqos);
+			rqos = rqos->next;
+		}
+	}
+}
+
 void blk_mq_debugfs_register(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
@@ -631,14 +645,7 @@ void blk_mq_debugfs_register(struct request_queue *q)
 			blk_mq_debugfs_register_hctx(q, hctx);
 	}
 
-	if (q->rq_qos) {
-		struct rq_qos *rqos = q->rq_qos;
-
-		while (rqos) {
-			blk_mq_debugfs_register_rqos(rqos);
-			rqos = rqos->next;
-		}
-	}
+	blk_mq_debugfs_register_rq_qos(q);
 }
 
 static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index c80e453e3014..54948a266889 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -33,6 +33,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 					struct blk_mq_hw_ctx *hctx);
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
 
+void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
 void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
 void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
 #else
@@ -78,6 +79,10 @@ static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 {
 }
 
+static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
+{
+}
+
 static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
 {
 }
-- 
2.51.0

Currently rq-qos debugfs entries are created from rq_qos_add(), and rq_qos_add() can be called while the queue is still frozen. This can deadlock, because creating new entries can trigger fs reclaim. Fix this problem by delaying creation of the rq-qos debugfs entries until after the queue is unfrozen (a sketch of the removed ordering follows the list below).

- For wbt: 1) it can be initialized by default; fix this by calling the new helper after wbt_init() from wbt_enable_default(); 2) it can be initialized from sysfs; fix this by calling the new helper after the queue is unfrozen from queue_wb_lat_store().

- For iocost and iolatency: they can only be initialized by blkcg configuration; however, they don't have debugfs entries for now, hence they are not handled yet.
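
For reference, the problematic ordering being removed looks roughly like this (simplified, illustrative sketch of the wbt sysfs path):

	/* e.g. queue_wb_lat_store() */
	memflags = blk_mq_freeze_queue(q);	/* new IO now blocks */
	wbt_init(disk, rwb);
		/* -> rq_qos_add() -> blk_mq_debugfs_register_rqos():
		 * creating the entries allocates memory and may enter fs
		 * reclaim; reclaim may wait on IO that cannot make
		 * progress on the frozen queue -> deadlock
		 */
	blk_mq_unfreeze_queue(q, memflags);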
Signed-off-by: Yu Kuai
---
 block/blk-rq-qos.c |  7 -------
 block/blk-wbt.c    | 12 +++++++++++-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 654478dfbc20..d7ce99ce2e80 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -347,13 +347,6 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
 	blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
 
 	blk_mq_unfreeze_queue(q, memflags);
-
-	if (rqos->ops->debugfs_attrs) {
-		mutex_lock(&q->debugfs_mutex);
-		blk_mq_debugfs_register_rqos(rqos);
-		mutex_unlock(&q->debugfs_mutex);
-	}
-
 	return 0;
 ebusy:
 	blk_mq_unfreeze_queue(q, memflags);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 696baa681717..4bf6f42bef2e 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -767,8 +767,14 @@ void wbt_enable_default(struct gendisk *disk)
 		if (WARN_ON_ONCE(!rwb))
 			return;
 
-		if (WARN_ON_ONCE(wbt_init(disk, rwb)))
+		if (WARN_ON_ONCE(wbt_init(disk, rwb))) {
 			wbt_free(rwb);
+			return;
+		}
+
+		mutex_lock(&q->debugfs_mutex);
+		blk_mq_debugfs_register_rq_qos(q);
+		mutex_unlock(&q->debugfs_mutex);
 	}
 }
 EXPORT_SYMBOL_GPL(wbt_enable_default);
@@ -995,5 +1001,9 @@ int wbt_set_lat(struct gendisk *disk, s64 val)
 	blk_mq_unquiesce_queue(q);
 out:
 	blk_mq_unfreeze_queue(q, memflags);
+	mutex_lock(&q->debugfs_mutex);
+	blk_mq_debugfs_register_rq_qos(q);
+	mutex_unlock(&q->debugfs_mutex);
+
 	return ret;
 }
-- 
2.51.0

Because it's only used inside blk-mq-debugfs.c now.

Signed-off-by: Yu Kuai
---
 block/blk-mq-debugfs.c | 4 +++-
 block/blk-mq-debugfs.h | 5 -----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 128d2aa6a20d..99466595c0a4 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -14,6 +14,8 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
+static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
+
 static int queue_poll_stat_show(void *data, struct seq_file *m)
 {
 	return 0;
@@ -758,7 +760,7 @@ void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
 	rqos->debugfs_dir = NULL;
 }
 
-void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
+static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 {
 	struct request_queue *q = rqos->disk->queue;
 	const char *dir_name = rq_qos_id_to_name(rqos->id);
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 54948a266889..d94daa66556b 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -34,7 +34,6 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
-void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
 void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
 #else
 static inline void blk_mq_debugfs_register(struct request_queue *q)
@@ -75,10 +74,6 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc
 {
 }
 
-static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
-{
-}
-
 static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
 {
 }
-- 
2.51.0

Creating new debugfs entries can trigger fs reclaim, hence we can't do this with the queue frozen; likewise, other locks that can be held while the queue is frozen must not be held while creating entries either.
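
With the checks added below, a caller that still violates this rule is flagged at runtime. For example, this hypothetical misuse (for illustration only, not code from this series) would trip the freeze check:

	memflags = blk_mq_freeze_queue(q);
	mutex_lock(&q->debugfs_mutex);
	blk_mq_debugfs_register(q);	/* debugfs_create_files() fires
					 * WARN_ON_ONCE(q->mq_freeze_depth != 0)
					 */
	mutex_unlock(&q->debugfs_mutex);
	blk_mq_unfreeze_queue(q, memflags);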
Signed-off-by: Yu Kuai
---
 block/blk-mq-debugfs.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 99466595c0a4..d54f8c29d2f4 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -610,9 +610,22 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
 	{},
 };
 
-static void debugfs_create_files(struct dentry *parent, void *data,
+static void debugfs_create_files(struct request_queue *q, struct dentry *parent,
+				 void *data,
 				 const struct blk_mq_debugfs_attr *attr)
 {
+	/*
+	 * Creating new debugfs entries with the queue frozen has the risk
+	 * of deadlock.
+	 */
+	WARN_ON_ONCE(q->mq_freeze_depth != 0);
+	/*
+	 * debugfs_mutex should not be nested under other locks that can be
+	 * grabbed while queue is frozen.
+	 */
+	lockdep_assert_not_held(&q->elevator_lock);
+	lockdep_assert_not_held(&q->rq_qos_mutex);
+
 	if (IS_ERR_OR_NULL(parent))
 		return;
 
@@ -640,7 +653,7 @@ void blk_mq_debugfs_register(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
+	debugfs_create_files(q, q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (!hctx->debugfs_dir)
@@ -659,7 +672,8 @@ static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
 	snprintf(name, sizeof(name), "cpu%u", ctx->cpu);
 	ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir);
 
-	debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs);
+	debugfs_create_files(hctx->queue, ctx_dir, ctx,
+			     blk_mq_debugfs_ctx_attrs);
 }
 
 void blk_mq_debugfs_register_hctx(struct request_queue *q,
@@ -675,7 +689,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
 	snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
 	hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir);
 
-	debugfs_create_files(hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs);
+	debugfs_create_files(q, hctx->debugfs_dir, hctx,
+			     blk_mq_debugfs_hctx_attrs);
 
 	hctx_for_each_ctx(hctx, ctx, i)
 		blk_mq_debugfs_register_ctx(hctx, ctx);
@@ -726,7 +741,7 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
 
 	q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir);
 
-	debugfs_create_files(q->sched_debugfs_dir, q, e->queue_debugfs_attrs);
+	debugfs_create_files(q, q->sched_debugfs_dir, q, e->queue_debugfs_attrs);
 }
 
 void blk_mq_debugfs_unregister_sched(struct request_queue *q)
@@ -775,7 +790,8 @@ static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 					 q->debugfs_dir);
 
 	rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir);
-	debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
+	debugfs_create_files(q, rqos->debugfs_dir, rqos,
+			     rqos->ops->debugfs_attrs);
 }
 
 void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
@@ -798,7 +814,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 	hctx->sched_debugfs_dir = debugfs_create_dir("sched",
 						     hctx->debugfs_dir);
 
-	debugfs_create_files(hctx->sched_debugfs_dir, hctx,
+	debugfs_create_files(q, hctx->sched_debugfs_dir, hctx,
 			     e->hctx_debugfs_attrs);
 }
 
-- 
2.51.0

blk_throtl_init() can be called with rq_qos_mutex held from the blkcg configuration path, hence triggering fs reclaim there can deadlock, because rq_qos_mutex can be held while the queue is frozen.
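
The chain being avoided is roughly the following (an illustrative sketch, not code from this patch):

	/* blkcg configuration path, with rq_qos_mutex held: */
	mutex_lock(&q->rq_qos_mutex);
	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	/*
	 * GFP_KERNEL may enter fs reclaim and submit IO. Since
	 * rq_qos_mutex can be held elsewhere with the queue frozen, that
	 * IO may never complete, so reclaiming here can deadlock.
	 * GFP_NOIO forbids reclaim from submitting IO.
	 */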
Signed-off-by: Yu Kuai
---
 block/blk-throttle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 97188a795848..5530a9bb0620 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1313,7 +1313,7 @@ static int blk_throtl_init(struct gendisk *disk)
 	unsigned int memflags;
 	int ret;
 
-	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
+	td = kzalloc_node(sizeof(*td), GFP_NOIO, q->node);
 	if (!td)
 		return -ENOMEM;
 
-- 
2.51.0

The queue should not be frozen under rq_qos_mutex, see the example from commit 9730763f4756 ("block: correct locking order for protecting blk-wbt parameters"), which means the current implementation of rq_qos_add() is problematic. Add a new helper and prepare to fix this problem in the following patches.

Signed-off-by: Yu Kuai
---
 block/blk-rq-qos.c | 21 +++++++++++++++++++++
 block/blk-rq-qos.h |  2 ++
 2 files changed, 23 insertions(+)

diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index d7ce99ce2e80..c1a94c2a9742 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -322,6 +322,27 @@ void rq_qos_exit(struct request_queue *q)
 	mutex_unlock(&q->rq_qos_mutex);
 }
 
+int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
+		      enum rq_qos_id id, const struct rq_qos_ops *ops)
+{
+	struct request_queue *q = disk->queue;
+
+	WARN_ON_ONCE(q->mq_freeze_depth == 0);
+	lockdep_assert_held(&q->rq_qos_mutex);
+
+	if (rq_qos_id(q, id))
+		return -EBUSY;
+
+	rqos->disk = disk;
+	rqos->id = id;
+	rqos->ops = ops;
+	rqos->next = q->rq_qos;
+	q->rq_qos = rqos;
+	blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
+
+	return 0;
+}
+
 int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
 	       const struct rq_qos_ops *ops)
 {
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index b538f2c0febc..8d9fb10ae526 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -87,6 +87,8 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
 
 int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
 	       const struct rq_qos_ops *ops);
+int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
+		      enum rq_qos_id id, const struct rq_qos_ops *ops);
 void rq_qos_del(struct rq_qos *rqos);
 
 typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
-- 
2.51.0

wbt_init() can be called from the sysfs attribute and from wbt_enable_default(); however, the lock orders are inverted:

- queue_wb_lat_store() freezes the queue first, and then wbt_init() holds rq_qos_mutex. In this case the queue will be frozen again inside rq_qos_add(); however, freezing the queue recursively like this is inoperative;

- wbt_enable_default() from an elevator switch holds rq_qos_mutex first, and then rq_qos_add() freezes the queue.

Fix this problem by converting wbt_init() to use the new helper rq_qos_add_frozen(), and, for wbt_enable_default(), by freezing the queue before calling wbt_init().
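
For reference, the two pre-patch orderings look roughly like this (sketch):

	/* queue_wb_lat_store(): freeze, then rq_qos_mutex */
	memflags = blk_mq_freeze_queue(q);
	mutex_lock(&q->rq_qos_mutex);		/* inside wbt_init() */

	/* wbt_enable_default(): rq_qos_mutex, then freeze -- inverted */
	mutex_lock(&q->rq_qos_mutex);
	memflags = blk_mq_freeze_queue(q);	/* inside rq_qos_add() */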
Fixes: a13bd91be223 ("block/rq_qos: protect rq_qos apis with a new lock")
Signed-off-by: Yu Kuai
---
 block/blk-wbt.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 4bf6f42bef2e..d852317c9cb1 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -19,6 +19,7 @@
  * Copyright (C) 2016 Jens Axboe
  *
  */
+#include <linux/blk-mq.h>
 #include <linux/kernel.h>
 #include <linux/blk_types.h>
 #include <linux/slab.h>
@@ -763,14 +764,18 @@ void wbt_enable_default(struct gendisk *disk)
 
 	if (queue_is_mq(q) && enable) {
 		struct rq_wb *rwb = wbt_alloc();
+		unsigned int memflags;
 
 		if (WARN_ON_ONCE(!rwb))
 			return;
 
+		memflags = blk_mq_freeze_queue(q);
 		if (WARN_ON_ONCE(wbt_init(disk, rwb))) {
+			blk_mq_unfreeze_queue(q, memflags);
 			wbt_free(rwb);
 			return;
 		}
+		blk_mq_unfreeze_queue(q, memflags);
 
 		mutex_lock(&q->debugfs_mutex);
 		blk_mq_debugfs_register_rq_qos(q);
@@ -947,7 +952,7 @@ static int wbt_init(struct gendisk *disk, struct rq_wb *rwb)
 	 * Assign rwb and add the stats callback.
 	 */
 	mutex_lock(&q->rq_qos_mutex);
-	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
+	ret = rq_qos_add_frozen(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
 	mutex_unlock(&q->rq_qos_mutex);
 	if (ret)
 		return ret;
-- 
2.51.0

Like wbt, rq_qos_add() can be called from two paths, and the lock orders are inverted:

- From ioc_qos_write(), the queue is already frozen before rq_qos_add();

- From ioc_cost_model_write(), rq_qos_add() is called directly.

Fix this problem by converting ioc_cost_model_write() to use blkg_conf_open_bdev_frozen(); then, since all rq_qos_add() callers already freeze the queue, convert to rq_qos_add_frozen().

Signed-off-by: Yu Kuai
---
 block/blk-iocost.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index a0416927d33d..929fc1421d7e 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2925,7 +2925,7 @@ static int blk_iocost_init(struct gendisk *disk)
 	 * called before policy activation completion, can't assume that the
 	 * target bio has an iocg associated and need to test for NULL iocg.
	 */
-	ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
+	ret = rq_qos_add_frozen(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
 	if (ret)
 		goto err_free_ioc;
 
@@ -3408,7 +3408,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 {
 	struct blkg_conf_ctx ctx;
 	struct request_queue *q;
-	unsigned int memflags;
+	unsigned long memflags;
 	struct ioc *ioc;
 	u64 u[NR_I_LCOEFS];
 	bool user;
@@ -3417,9 +3417,11 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 
 	blkg_conf_init(&ctx, input);
 
-	ret = blkg_conf_open_bdev(&ctx);
-	if (ret)
+	memflags = blkg_conf_open_bdev_frozen(&ctx);
+	if (IS_ERR_VALUE(memflags)) {
+		ret = memflags;
 		goto err;
+	}
 
 	body = ctx.body;
 	q = bdev_get_queue(ctx.bdev);
@@ -3436,7 +3438,6 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 		ioc = q_to_ioc(q);
 	}
 
-	memflags = blk_mq_freeze_queue(q);
 	blk_mq_quiesce_queue(q);
 
 	spin_lock_irq(&ioc->lock);
@@ -3488,20 +3489,18 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 	spin_unlock_irq(&ioc->lock);
 
 	blk_mq_unquiesce_queue(q);
-	blk_mq_unfreeze_queue(q, memflags);
 
-	blkg_conf_exit(&ctx);
+	blkg_conf_exit_frozen(&ctx, memflags);
 	return nbytes;
 
 einval:
 	spin_unlock_irq(&ioc->lock);
 
 	blk_mq_unquiesce_queue(q);
-	blk_mq_unfreeze_queue(q, memflags);
 
 	ret = -EINVAL;
 err:
-	blkg_conf_exit(&ctx);
+	blkg_conf_exit_frozen(&ctx, memflags);
 	return ret;
 }
-- 
2.51.0

Currently blk-iolatency holds rq_qos_mutex first and then calls rq_qos_add(), which freezes the queue. Fix this problem by converting iolatency_set_limit() to use blkg_conf_open_bdev_frozen(), and convert to rq_qos_add_frozen().

Signed-off-by: Yu Kuai
---
 block/blk-iolatency.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 45bd18f68541..1558afbf517b 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -764,8 +764,8 @@ static int blk_iolatency_init(struct gendisk *disk)
 	if (!blkiolat)
 		return -ENOMEM;
 
-	ret = rq_qos_add(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
-			 &blkcg_iolatency_ops);
+	ret = rq_qos_add_frozen(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
+				&blkcg_iolatency_ops);
 	if (ret)
 		goto err_free;
 	ret = blkcg_activate_policy(disk, &blkcg_policy_iolatency);
@@ -831,16 +831,19 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	struct blkcg_gq *blkg;
 	struct blkg_conf_ctx ctx;
 	struct iolatency_grp *iolat;
+	unsigned long memflags;
 	char *p, *tok;
 	u64 lat_val = 0;
 	u64 oldval;
-	int ret;
+	int ret = 0;
 
 	blkg_conf_init(&ctx, buf);
 
-	ret = blkg_conf_open_bdev(&ctx);
-	if (ret)
+	memflags = blkg_conf_open_bdev_frozen(&ctx);
+	if (IS_ERR_VALUE(memflags)) {
+		ret = memflags;
 		goto out;
+	}
 
 	/*
 	 * blk_iolatency_init() may fail after rq_qos_add() succeeds which can
@@ -890,7 +893,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 		iolatency_clear_scaling(blkg);
 	ret = 0;
 out:
-	blkg_conf_exit(&ctx);
+	blkg_conf_exit_frozen(&ctx, memflags);
 	return ret ?: nbytes;
 }
-- 
2.51.0

blk-throttle still holds rq_qos_mutex before freezing the queue from blk_throtl_init(). However, blk_throtl_bio() can be called before q_usage_counter is grabbed, hence freezing the queue doesn't really stop new IO from being issued to blk-throttle, so remove the freeze. Also use READ_ONCE() and WRITE_ONCE() for q->td, because blk_throtl_init() can run concurrently with IO issue.
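
The resulting publish/observe pairing, condensed from the change below: WRITE_ONCE() publishes the pointer without store tearing, and READ_ONCE() lets a concurrent submitter observe it without holding any lock shared with blk_throtl_init():

	/* writer, blk_throtl_init(): */
	WRITE_ONCE(q->td, td);

	/* concurrent reader, blk_throtl_activated(): */
	return READ_ONCE(q->td) &&
	       blkcg_policy_enabled(q, &blkcg_policy_throtl);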
Signed-off-by: Yu Kuai
---
 block/blk-throttle.c | 11 ++---------
 block/blk-throttle.h |  3 ++-
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5530a9bb0620..516481722a98 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1310,7 +1310,6 @@ static int blk_throtl_init(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 	struct throtl_data *td;
-	unsigned int memflags;
 	int ret;
 
 	td = kzalloc_node(sizeof(*td), GFP_NOIO, q->node);
@@ -1320,22 +1319,16 @@ static int blk_throtl_init(struct gendisk *disk)
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
 
-	memflags = blk_mq_freeze_queue(disk->queue);
-	blk_mq_quiesce_queue(disk->queue);
-
-	q->td = td;
+	WRITE_ONCE(q->td, td);
 	td->queue = q;
 
 	/* activate policy, blk_throtl_activated() will return true */
 	ret = blkcg_activate_policy(disk, &blkcg_policy_throtl);
 	if (ret) {
-		q->td = NULL;
+		WRITE_ONCE(q->td, NULL);
 		kfree(td);
 	}
 
-	blk_mq_unquiesce_queue(disk->queue);
-	blk_mq_unfreeze_queue(disk->queue, memflags);
-
 	return ret;
 }
 
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 9d7a42c039a1..3d177b20f9e1 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -162,7 +162,8 @@ static inline bool blk_throtl_activated(struct request_queue *q)
 	 * blkcg_policy_enabled() guarantees that the policy is activated
 	 * in the request_queue.
 	 */
-	return q->td != NULL && blkcg_policy_enabled(q, &blkcg_policy_throtl);
+	return READ_ONCE(q->td) &&
+	       blkcg_policy_enabled(q, &blkcg_policy_throtl);
 }
 
 static inline bool blk_should_throtl(struct bio *bio)
-- 
2.51.0

Now that there is no caller of rq_qos_add(), remove it, and also rename rq_qos_add_frozen() back to rq_qos_add().

Signed-off-by: Yu Kuai
---
 block/blk-iocost.c    |  2 +-
 block/blk-iolatency.c |  4 ++--
 block/blk-rq-qos.c    | 35 ++---------------------------------
 block/blk-rq-qos.h    |  2 --
 block/blk-wbt.c       |  2 +-
 5 files changed, 6 insertions(+), 39 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 929fc1421d7e..0359a5b65202 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2925,7 +2925,7 @@ static int blk_iocost_init(struct gendisk *disk)
 	 * called before policy activation completion, can't assume that the
 	 * target bio has an iocg associated and need to test for NULL iocg.
	 */
-	ret = rq_qos_add_frozen(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
+	ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
 	if (ret)
 		goto err_free_ioc;
 
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 1558afbf517b..5b18125e21c9 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -764,8 +764,8 @@ static int blk_iolatency_init(struct gendisk *disk)
 	if (!blkiolat)
 		return -ENOMEM;
 
-	ret = rq_qos_add_frozen(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
-				&blkcg_iolatency_ops);
+	ret = rq_qos_add(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
+			 &blkcg_iolatency_ops);
 	if (ret)
 		goto err_free;
 	ret = blkcg_activate_policy(disk, &blkcg_policy_iolatency);
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index c1a94c2a9742..8de7dae3273e 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -322,8 +322,8 @@ void rq_qos_exit(struct request_queue *q)
 	mutex_unlock(&q->rq_qos_mutex);
 }
 
-int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
-		      enum rq_qos_id id, const struct rq_qos_ops *ops)
+int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
+	       const struct rq_qos_ops *ops)
 {
 	struct request_queue *q = disk->queue;
 
@@ -343,37 +343,6 @@ int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
 	return 0;
 }
 
-int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
-	       const struct rq_qos_ops *ops)
-{
-	struct request_queue *q = disk->queue;
-	unsigned int memflags;
-
-	lockdep_assert_held(&q->rq_qos_mutex);
-
-	rqos->disk = disk;
-	rqos->id = id;
-	rqos->ops = ops;
-
-	/*
-	 * No IO can be in-flight when adding rqos, so freeze queue, which
-	 * is fine since we only support rq_qos for blk-mq queue.
-	 */
-	memflags = blk_mq_freeze_queue(q);
-
-	if (rq_qos_id(q, rqos->id))
-		goto ebusy;
-	rqos->next = q->rq_qos;
-	q->rq_qos = rqos;
-	blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
-
-	blk_mq_unfreeze_queue(q, memflags);
-	return 0;
-ebusy:
-	blk_mq_unfreeze_queue(q, memflags);
-	return -EBUSY;
-}
-
 void rq_qos_del(struct rq_qos *rqos)
 {
 	struct request_queue *q = rqos->disk->queue;
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 8d9fb10ae526..b538f2c0febc 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -87,8 +87,6 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
 
 int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
 	       const struct rq_qos_ops *ops);
-int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
-		      enum rq_qos_id id, const struct rq_qos_ops *ops);
 void rq_qos_del(struct rq_qos *rqos);
 
 typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index d852317c9cb1..4645e8fbca64 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -952,7 +952,7 @@ static int wbt_init(struct gendisk *disk, struct rq_wb *rwb)
 	 * Assign rwb and add the stats callback.
 	 */
 	mutex_lock(&q->rq_qos_mutex);
-	ret = rq_qos_add_frozen(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
+	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
 	mutex_unlock(&q->rq_qos_mutex);
 	if (ret)
 		return ret;
-- 
2.51.0