From: Yu Kuai It's safe to iterate blkgs with cgroup lock or rcu lock held, prevent nested queue_lock under rcu lock, and prepare to convert protecting blkcg with blkcg_mutex instead of queuelock. Signed-off-by: Yu Kuai --- block/blk-cgroup-rwstat.c | 2 -- block/blk-cgroup.c | 16 +++++----------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c index a55fb0c53558..d41ade993312 100644 --- a/block/blk-cgroup-rwstat.c +++ b/block/blk-cgroup-rwstat.c @@ -101,8 +101,6 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, struct cgroup_subsys_state *pos_css; unsigned int i; - lockdep_assert_held(&blkg->q->queue_lock); - memset(sum, 0, sizeof(*sum)); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f93de34fe87d..2d767ae61d2f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -712,12 +712,9 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 total = 0; rcu_read_lock(); - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - spin_lock_irq(&blkg->q->queue_lock); - if (blkcg_policy_enabled(blkg->q, pol)) + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) + if (blkcg_policy_enabled(blkg->q, pol) && blkg->online) total += prfill(sf, blkg->pd[pol->plid], data); - spin_unlock_irq(&blkg->q->queue_lock); - } rcu_read_unlock(); if (show_total) @@ -1242,13 +1239,10 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) else css_rstat_flush(&blkcg->css); - rcu_read_lock(); - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - spin_lock_irq(&blkg->q->queue_lock); + spin_lock_irq(&blkcg->lock); + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) blkcg_print_one_stat(blkg, sf); - spin_unlock_irq(&blkg->q->queue_lock); - } - rcu_read_unlock(); + spin_unlock_irq(&blkcg->lock); return 0; } -- 2.39.2 From: Yu Kuai Changes to two step: 1) hold rcu lock and do blkg_lookup() from fast path; 2) hold queue_lock directly from slow path, and don't nest it under rcu lock; Prepare to convert protecting blkcg with blkcg_mutex instead of queue_lock; Signed-off-by: Yu Kuai --- block/blk-cgroup.c | 57 +++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2d767ae61d2f..9ffc3195f7e4 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -467,22 +467,17 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; - unsigned long flags; - - WARN_ON_ONCE(!rcu_read_lock_held()); - blkg = blkg_lookup(blkcg, q); - if (blkg) - return blkg; - - spin_lock_irqsave(&q->queue_lock, flags); + rcu_read_lock(); blkg = blkg_lookup(blkcg, q); if (blkg) { if (blkcg != &blkcg_root && blkg != rcu_dereference(blkcg->blkg_hint)) rcu_assign_pointer(blkcg->blkg_hint, blkg); - goto found; + rcu_read_unlock(); + return blkg; } + rcu_read_unlock(); /* * Create blkgs walking down from blkcg_root to @blkcg, so that all @@ -514,8 +509,6 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, break; } -found: - spin_unlock_irqrestore(&q->queue_lock, flags); return blkg; } @@ -2077,6 +2070,18 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) atomic64_add(delta, &blkg->delay_nsec); } +static inline struct blkcg_gq *blkg_lookup_tryget(struct blkcg_gq *blkg) +{ +retry: + if (blkg_tryget(blkg)) + return 
blkg; + + blkg = blkg->parent; + if (blkg) + goto retry; + + return NULL; +} /** * blkg_tryget_closest - try and get a blkg ref on the closet blkg * @bio: target bio @@ -2089,20 +2094,30 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct cgroup_subsys_state *css) { - struct blkcg_gq *blkg, *ret_blkg = NULL; + struct request_queue *q = bio->bi_bdev->bd_queue; + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; rcu_read_lock(); - blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk); - while (blkg) { - if (blkg_tryget(blkg)) { - ret_blkg = blkg; - break; - } - blkg = blkg->parent; - } + blkg = blkg_lookup(blkcg, q); + if (likely(blkg)) + blkg = blkg_lookup_tryget(blkg); rcu_read_unlock(); - return ret_blkg; + if (blkg) + return blkg; + + /* + * Fast path failed, we're probably issuing IO in this cgroup the first + * time, hold lock to create new blkg. + */ + spin_lock_irq(&q->queue_lock); + blkg = blkg_lookup_create(blkcg, bio->bi_bdev->bd_disk); + if (blkg) + blkg = blkg_lookup_tryget(blkg); + spin_unlock_irq(&q->queue_lock); + + return blkg; } /** -- 2.39.2 From: Yu Kuai If bio is already associated with blkg, blkcg is already pinned until bio is done, no need for rcu protection; Otherwise protect blkcg_css() with rcu independently. Prepare to convert protecting blkcg with blkcg_mutex instead of queue_lock. Signed-off-by: Yu Kuai --- block/blk-cgroup.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9ffc3195f7e4..53a64bfe4a24 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -2165,16 +2165,20 @@ void bio_associate_blkg(struct bio *bio) if (blk_op_is_passthrough(bio->bi_opf)) return; - rcu_read_lock(); - - if (bio->bi_blkg) + if (bio->bi_blkg) { css = bio_blkcg_css(bio); - else + bio_associate_blkg_from_css(bio, css); + } else { + rcu_read_lock(); css = blkcg_css(); + if (!css_tryget_online(css)) + css = NULL; + rcu_read_unlock(); - bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); + bio_associate_blkg_from_css(bio, css); + if (css) + css_put(css); + } } EXPORT_SYMBOL_GPL(bio_associate_blkg); -- 2.39.2 From: Yu Kuai The correct lock order is q->queue_lock before blkcg->lock, and in order to prevent deadlock from blkcg_destroy_blkgs(), trylock is used for q->queue_lock while blkcg->lock is already held, this is hacky. Hence refactor blkcg_destroy_blkgs(), by holding blkcg->lock to get the first blkg and release it, then hold q->queue_lock and blkcg->lock in the correct order to destroy blkg. This is super cold path, it's fine to grab and release locks. Also prepare to convert protecting blkcg with blkcg_mutex instead of queue_lock. Signed-off-by: Yu Kuai --- block/blk-cgroup.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 53a64bfe4a24..795efb5ccb5e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1283,6 +1283,21 @@ struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) * This finally frees the blkcg. 
*/ +static struct blkcg_gq *blkcg_get_first_blkg(struct blkcg *blkcg) +{ + struct blkcg_gq *blkg = NULL; + + spin_lock_irq(&blkcg->lock); + if (!hlist_empty(&blkcg->blkg_list)) { + blkg = hlist_entry(blkcg->blkg_list.first, struct blkcg_gq, + blkcg_node); + blkg_get(blkg); + } + spin_unlock_irq(&blkcg->lock); + + return blkg; +} + /** * blkcg_destroy_blkgs - responsible for shooting down blkgs * @blkcg: blkcg of interest @@ -1296,32 +1311,24 @@ struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) */ static void blkcg_destroy_blkgs(struct blkcg *blkcg) { - might_sleep(); + struct blkcg_gq *blkg; - spin_lock_irq(&blkcg->lock); + might_sleep(); - while (!hlist_empty(&blkcg->blkg_list)) { - struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, - struct blkcg_gq, blkcg_node); + while ((blkg = blkcg_get_first_blkg(blkcg))) { struct request_queue *q = blkg->q; - if (need_resched() || !spin_trylock(&q->queue_lock)) { - /* - * Given that the system can accumulate a huge number - * of blkgs in pathological cases, check to see if we - * need to rescheduling to avoid softlockup. - */ - spin_unlock_irq(&blkcg->lock); - cond_resched(); - spin_lock_irq(&blkcg->lock); - continue; - } + spin_lock_irq(&q->queue_lock); + spin_lock(&blkcg->lock); blkg_destroy(blkg); - spin_unlock(&q->queue_lock); - } - spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkcg->lock); + spin_unlock_irq(&q->queue_lock); + + blkg_put(blkg); + cond_resched(); + } } /** -- 2.39.2 From: Yu Kuai Prepare to convert protecting blkcg with blkcg_mutex instead of queue_lock. Signed-off-by: Yu Kuai --- mm/page_io.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/page_io.c b/mm/page_io.c index a2056a5ecb13..4f4cc9370573 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -313,8 +313,13 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio) rcu_read_lock(); css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); - bio_associate_blkg_from_css(bio, css); + if (!css || !css_tryget_online(css)) + css = NULL; rcu_read_unlock(); + + bio_associate_blkg_from_css(bio, css); + if (css) + css_put(css); } #else #define bio_associate_blkg_from_page(bio, folio) do { } while (0) -- 2.39.2 From: Yu Kuai Request_queue is freezed and quiesced during elevator init_sched() method, there is no point to hold queue_lock for protection. Signed-off-by: Yu Kuai --- block/bfq-iosched.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4a8d3d96bfe4..a77b98187370 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7213,10 +7213,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) return -ENOMEM; eq->elevator_data = bfqd; - - spin_lock_irq(&q->queue_lock); q->elevator = eq; - spin_unlock_irq(&q->queue_lock); /* * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. @@ -7249,7 +7246,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) * If the disk supports multiple actuators, copy independent * access ranges from the request queue structure. 
*/ - spin_lock_irq(&q->queue_lock); if (ia_ranges) { /* * Check if the disk ia_ranges size exceeds the current bfq @@ -7275,7 +7271,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) bfqd->sector[0] = 0; bfqd->nr_sectors[0] = get_capacity(q->disk); } - spin_unlock_irq(&q->queue_lock); INIT_LIST_HEAD(&bfqd->dispatch); -- 2.39.2 From: Yu Kuai With previous modifications, queue_lock no longer grabbed under other spinlocks and rcu for protecting blkgs, it's ok convert to blkcg_mutex directly. Signed-off-by: Yu Kuai --- block/bfq-cgroup.c | 6 +-- block/bfq-iosched.c | 8 ++-- block/blk-cgroup.c | 104 ++++++++++++++------------------------------ block/blk-cgroup.h | 6 +-- 4 files changed, 42 insertions(+), 82 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 9fb9f3533150..8af471d565d9 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -405,7 +405,7 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) parent = bfqg_parent(bfqg); - lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock); + lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->blkcg_mutex); if (unlikely(!parent)) return; @@ -866,7 +866,7 @@ static void bfq_reparent_active_queues(struct bfq_data *bfqd, * and reparent its children entities. * @pd: descriptor of the policy going offline. * - * blkio already grabs the queue_lock for us, so no need to use + * blkio already grabs the blkcg_mtuex for us, so no need to use * RCU-based magic */ static void bfq_pd_offline(struct blkg_policy_data *pd) @@ -1139,7 +1139,7 @@ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, struct cgroup_subsys_state *pos_css; u64 sum = 0; - lockdep_assert_held(&blkg->q->queue_lock); + lockdep_assert_held(&blkg->q->blkcg_mutex); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a77b98187370..4ffbe4383dd2 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5266,7 +5266,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q, * In addition, the following queue lock guarantees that * bfqq_group(bfqq) exists as well. */ - spin_lock_irq(&q->queue_lock); + mutex_lock(&q->blkcg_mutex); if (idle_timer_disabled) /* * Since the idle timer has been disabled, @@ -5285,7 +5285,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q, bfqg_stats_set_start_empty_time(bfqg); bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); } - spin_unlock_irq(&q->queue_lock); + mutex_unlock(&q->blkcg_mutex); } #else static inline void bfq_update_dispatch_stats(struct request_queue *q, @@ -6218,11 +6218,11 @@ static void bfq_update_insert_stats(struct request_queue *q, * In addition, the following queue lock guarantees that * bfqq_group(bfqq) exists as well. 
*/ - spin_lock_irq(&q->queue_lock); + mutex_lock(&q->blkcg_mutex); bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); if (idle_timer_disabled) bfqg_stats_update_idle_time(bfqq_group(bfqq)); - spin_unlock_irq(&q->queue_lock); + mutex_unlock(&q->blkcg_mutex); } #else static inline void bfq_update_insert_stats(struct request_queue *q, diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 795efb5ccb5e..280f29a713b6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -130,9 +130,7 @@ static void blkg_free_workfn(struct work_struct *work) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); if (blkg->parent) blkg_put(blkg->parent); - spin_lock_irq(&q->queue_lock); list_del_init(&blkg->q_node); - spin_unlock_irq(&q->queue_lock); mutex_unlock(&q->blkcg_mutex); blk_put_queue(q); @@ -372,7 +370,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, struct blkcg_gq *blkg; int i, ret; - lockdep_assert_held(&disk->queue->queue_lock); + lockdep_assert_held(&disk->queue->blkcg_mutex); /* request_queue is dying, do not create/recreate a blkg */ if (blk_queue_dying(disk->queue)) { @@ -457,7 +455,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function - * should be called under RCU read lock and takes @disk->queue->queue_lock. + * should be called under RCU read lock and takes @disk->queue->blkcg_mutex. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. @@ -517,7 +515,7 @@ static void blkg_destroy(struct blkcg_gq *blkg) struct blkcg *blkcg = blkg->blkcg; int i; - lockdep_assert_held(&blkg->q->queue_lock); + lockdep_assert_held(&blkg->q->blkcg_mutex); lockdep_assert_held(&blkcg->lock); /* @@ -546,8 +544,8 @@ static void blkg_destroy(struct blkcg_gq *blkg) /* * Both setting lookup hint to and clearing it from @blkg are done - * under queue_lock. If it's not pointing to @blkg now, it never - * will. Hint assignment itself can race safely. + * under q->blkcg_mutex and blkcg->lock. If it's not pointing to @blkg + * now, it never will. Hint assignment itself can race safely. */ if (rcu_access_pointer(blkcg->blkg_hint) == blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); @@ -567,25 +565,20 @@ static void blkg_destroy_all(struct gendisk *disk) int i; restart: - spin_lock_irq(&q->queue_lock); + mutex_lock(&q->blkcg_mutex); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; if (hlist_unhashed(&blkg->blkcg_node)) continue; - spin_lock(&blkcg->lock); + spin_lock_irq(&blkcg->lock); blkg_destroy(blkg); - spin_unlock(&blkcg->lock); + spin_unlock_irq(&blkcg->lock); - /* - * in order to avoid holding the spin lock for too long, release - * it when a batch of blkgs are destroyed. 
- */ if (!(--count)) { count = BLKG_DESTROY_BATCH_SIZE; - spin_unlock_irq(&q->queue_lock); - cond_resched(); + mutex_unlock(&q->blkcg_mutex); goto restart; } } @@ -603,7 +596,7 @@ static void blkg_destroy_all(struct gendisk *disk) } q->root_blkg = NULL; - spin_unlock_irq(&q->queue_lock); + mutex_unlock(&q->blkcg_mutex); } static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) @@ -853,7 +846,7 @@ unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx) */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx) - __acquires(&bdev->bd_queue->queue_lock) + __acquires(&bdev->bd_queue->blkcg_mutex) { struct gendisk *disk; struct request_queue *q; @@ -869,7 +862,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, /* Prevent concurrent with blkcg_deactivate_policy() */ mutex_lock(&q->blkcg_mutex); - spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { ret = -EOPNOTSUPP; @@ -895,23 +887,18 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, parent = blkcg_parent(parent); } - /* Drop locks to do new blkg allocation with GFP_KERNEL. */ - spin_unlock_irq(&q->queue_lock); - new_blkg = blkg_alloc(pos, disk, GFP_NOIO); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto fail_exit; + goto fail_unlock; } if (radix_tree_preload(GFP_KERNEL)) { blkg_free(new_blkg); ret = -ENOMEM; - goto fail_exit; + goto fail_unlock; } - spin_lock_irq(&q->queue_lock); - if (!blkcg_policy_enabled(q, pol)) { blkg_free(new_blkg); ret = -EOPNOTSUPP; @@ -935,15 +922,12 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto success; } success: - mutex_unlock(&q->blkcg_mutex); ctx->blkg = blkg; return 0; fail_preloaded: radix_tree_preload_end(); fail_unlock: - spin_unlock_irq(&q->queue_lock); -fail_exit: mutex_unlock(&q->blkcg_mutex); /* * If queue was bypassing, we should retry. Do so after a @@ -967,11 +951,11 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); * blkg_conf_ctx's initialized with blkg_conf_init(). */ void blkg_conf_exit(struct blkg_conf_ctx *ctx) - __releases(&ctx->bdev->bd_queue->queue_lock) + __releases(&ctx->bdev->bd_queue->blkcg_mutex) __releases(&ctx->bdev->bd_queue->rq_qos_mutex) { if (ctx->blkg) { - spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); + mutex_unlock(&bdev_get_queue(ctx->bdev)->blkcg_mutex); ctx->blkg = NULL; } @@ -1318,13 +1302,13 @@ static void blkcg_destroy_blkgs(struct blkcg *blkcg) while ((blkg = blkcg_get_first_blkg(blkcg))) { struct request_queue *q = blkg->q; - spin_lock_irq(&q->queue_lock); - spin_lock(&blkcg->lock); + mutex_lock(&q->blkcg_mutex); + spin_lock_irq(&blkcg->lock); blkg_destroy(blkg); - spin_unlock(&blkcg->lock); - spin_unlock_irq(&q->queue_lock); + spin_unlock_irq(&blkcg->lock); + mutex_unlock(&q->blkcg_mutex); blkg_put(blkg); cond_resched(); @@ -1501,24 +1485,23 @@ int blkcg_init_disk(struct gendisk *disk) if (!new_blkg) return -ENOMEM; - preloaded = !radix_tree_preload(GFP_KERNEL); + mutex_lock(&q->blkcg_mutex); + preloaded = !radix_tree_preload(GFP_NOIO); /* Make sure the root blkg exists. */ - /* spin_lock_irq can serve as RCU read-side critical section. 
*/ - spin_lock_irq(&q->queue_lock); blkg = blkg_create(&blkcg_root, disk, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; - spin_unlock_irq(&q->queue_lock); if (preloaded) radix_tree_preload_end(); + mutex_unlock(&q->blkcg_mutex); return 0; err_unlock: - spin_unlock_irq(&q->queue_lock); + mutex_unlock(&q->blkcg_mutex); if (preloaded) radix_tree_preload_end(); return PTR_ERR(blkg); @@ -1595,8 +1578,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) if (queue_is_mq(q)) memflags = blk_mq_freeze_queue(q); -retry: - spin_lock_irq(&q->queue_lock); + mutex_lock(&q->blkcg_mutex); /* blkg_list is pushed at the head, reverse walk to initialize parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { @@ -1605,36 +1587,17 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) if (blkg->pd[pol->plid]) continue; - /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ + /* If prealloc matches, use it */ if (blkg == pinned_blkg) { pd = pd_prealloc; pd_prealloc = NULL; } else { pd = pol->pd_alloc_fn(disk, blkg->blkcg, - GFP_NOWAIT); + GFP_NOIO); } - if (!pd) { - /* - * GFP_NOWAIT failed. Free the existing one and - * prealloc for @blkg w/ GFP_KERNEL. - */ - if (pinned_blkg) - blkg_put(pinned_blkg); - blkg_get(blkg); - pinned_blkg = blkg; - - spin_unlock_irq(&q->queue_lock); - - if (pd_prealloc) - pol->pd_free_fn(pd_prealloc); - pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg, - GFP_KERNEL); - if (pd_prealloc) - goto retry; - else - goto enomem; - } + if (!pd) + goto enomem; spin_lock(&blkg->blkcg->lock); @@ -1655,8 +1618,8 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) __set_bit(pol->plid, q->blkcg_pols); ret = 0; - spin_unlock_irq(&q->queue_lock); out: + mutex_unlock(&q->blkcg_mutex); if (queue_is_mq(q)) blk_mq_unfreeze_queue(q, memflags); if (pinned_blkg) @@ -1667,7 +1630,6 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) enomem: /* alloc failed, take down everything */ - spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; struct blkg_policy_data *pd; @@ -1683,7 +1645,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) } spin_unlock(&blkcg->lock); } - spin_unlock_irq(&q->queue_lock); + ret = -ENOMEM; goto out; } @@ -1711,7 +1673,6 @@ void blkcg_deactivate_policy(struct gendisk *disk, memflags = blk_mq_freeze_queue(q); mutex_lock(&q->blkcg_mutex); - spin_lock_irq(&q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); @@ -1728,7 +1689,6 @@ void blkcg_deactivate_policy(struct gendisk *disk, spin_unlock(&blkcg->lock); } - spin_unlock_irq(&q->queue_lock); mutex_unlock(&q->blkcg_mutex); if (queue_is_mq(q)) @@ -2118,11 +2078,11 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, * Fast path failed, we're probably issuing IO in this cgroup the first * time, hold lock to create new blkg. 
*/ - spin_lock_irq(&q->queue_lock); + mutex_lock(&q->blkcg_mutex); blkg = blkg_lookup_create(blkcg, bio->bi_bdev->bd_disk); if (blkg) blkg = blkg_lookup_tryget(blkg); - spin_unlock_irq(&q->queue_lock); + mutex_unlock(&q->blkcg_mutex); return blkg; } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 1cce3294634d..60c1da02f437 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -261,7 +261,7 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, return q->root_blkg; blkg = rcu_dereference_check(blkcg->blkg_hint, - lockdep_is_held(&q->queue_lock)); + lockdep_is_held(&q->blkcg_mutex)); if (blkg && blkg->q == q) return blkg; @@ -345,8 +345,8 @@ static inline void blkg_put(struct blkcg_gq *blkg) * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU - * read locked. If called under either blkcg or queue lock, the iteration - * is guaranteed to include all and only online blkgs. The caller may + * read locked. If called under either blkcg->lock or q->blkcg_mutex, the + * iteration is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. * @p_blkg is included in the iteration and the first node to be visited. */ -- 2.39.2 From: Yu Kuai Now that blkcg_mutex is used to protect blkgs, memory allocation no longer need to be non-blocking, this is not needed. Signed-off-by: Yu Kuai --- block/blk-cgroup.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 280f29a713b6..bfc74cfebd3e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -893,16 +893,10 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto fail_unlock; } - if (radix_tree_preload(GFP_KERNEL)) { - blkg_free(new_blkg); - ret = -ENOMEM; - goto fail_unlock; - } - if (!blkcg_policy_enabled(q, pol)) { blkg_free(new_blkg); ret = -EOPNOTSUPP; - goto fail_preloaded; + goto fail_unlock; } blkg = blkg_lookup(pos, q); @@ -912,12 +906,10 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, blkg = blkg_create(pos, disk, new_blkg); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); - goto fail_preloaded; + goto fail_unlock; } } - radix_tree_preload_end(); - if (pos == blkcg) goto success; } @@ -925,8 +917,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ctx->blkg = blkg; return 0; -fail_preloaded: - radix_tree_preload_end(); fail_unlock: mutex_unlock(&q->blkcg_mutex); /* @@ -1479,14 +1469,12 @@ int blkcg_init_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *new_blkg, *blkg; - bool preloaded; new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; mutex_lock(&q->blkcg_mutex); - preloaded = !radix_tree_preload(GFP_NOIO); /* Make sure the root blkg exists. */ blkg = blkg_create(&blkcg_root, disk, new_blkg); @@ -1494,16 +1482,12 @@ int blkcg_init_disk(struct gendisk *disk) goto err_unlock; q->root_blkg = blkg; - if (preloaded) - radix_tree_preload_end(); mutex_unlock(&q->blkcg_mutex); return 0; err_unlock: mutex_unlock(&q->blkcg_mutex); - if (preloaded) - radix_tree_preload_end(); return PTR_ERR(blkg); } -- 2.39.2 From: Yu Kuai Now that blkg_create is protected with blkcg_mutex, there is no need to preallocate blkg, remove related code. 
Signed-off-by: Yu Kuai --- block/blk-cgroup.c | 91 +++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 58 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bfc74cfebd3e..60b8c742f876 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -364,10 +364,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ -static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, - struct blkcg_gq *new_blkg) +static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk) { - struct blkcg_gq *blkg; + struct blkcg_gq *blkg = NULL; int i, ret; lockdep_assert_held(&disk->queue->blkcg_mutex); @@ -384,15 +383,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, goto err_free_blkg; } - /* allocate */ - if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT); - if (unlikely(!new_blkg)) { - ret = -ENOMEM; - goto err_put_css; - } + blkg = blkg_alloc(blkcg, disk, GFP_NOIO); + if (unlikely(!blkg)) { + ret = -ENOMEM; + goto err_put_css; } - blkg = new_blkg; /* link parent */ if (blkcg_parent(blkcg)) { @@ -415,35 +410,34 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, /* insert */ spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg); - if (likely(!ret)) { - hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); - list_add(&blkg->q_node, &disk->queue->blkg_list); + if (unlikely(ret)) { + spin_unlock(&blkcg->lock); + blkg_put(blkg); + return ERR_PTR(ret); + } - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + list_add(&blkg->q_node, &disk->queue->blkg_list); - if (blkg->pd[i]) { - if (pol->pd_online_fn) - pol->pd_online_fn(blkg->pd[i]); - blkg->pd[i]->online = true; - } + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i]) { + if (pol->pd_online_fn) + pol->pd_online_fn(blkg->pd[i]); + blkg->pd[i]->online = true; } } + blkg->online = true; spin_unlock(&blkcg->lock); - - if (!ret) - return blkg; - - /* @blkg failed fully initialized, use the usual release path */ - blkg_put(blkg); - return ERR_PTR(ret); + return blkg; err_put_css: css_put(&blkcg->css); err_free_blkg: - if (new_blkg) - blkg_free(new_blkg); + if (blkg) + blkg_free(blkg); return ERR_PTR(ret); } @@ -498,7 +492,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, parent = blkcg_parent(parent); } - blkg = blkg_create(pos, disk, NULL); + blkg = blkg_create(pos, disk); if (IS_ERR(blkg)) { blkg = ret_blkg; break; @@ -879,7 +873,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, while (true) { struct blkcg *pos = blkcg; struct blkcg *parent; - struct blkcg_gq *new_blkg; parent = blkcg_parent(blkcg); while (parent && !blkg_lookup(parent, q)) { @@ -887,23 +880,14 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, parent = blkcg_parent(parent); } - new_blkg = blkg_alloc(pos, disk, GFP_NOIO); - if (unlikely(!new_blkg)) { - ret = -ENOMEM; - goto fail_unlock; - } - if (!blkcg_policy_enabled(q, pol)) { - blkg_free(new_blkg); ret = -EOPNOTSUPP; goto fail_unlock; } blkg = blkg_lookup(pos, q); - if (blkg) { - blkg_free(new_blkg); - } else { - blkg = blkg_create(pos, disk, new_blkg); + if 
(!blkg) { + blkg = blkg_create(pos, disk); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto fail_unlock; @@ -1468,27 +1452,18 @@ void blkg_init_queue(struct request_queue *q) int blkcg_init_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; - struct blkcg_gq *new_blkg, *blkg; - - new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); - if (!new_blkg) - return -ENOMEM; + struct blkcg_gq *blkg; + /* Make sure the root blkg exists. */ mutex_lock(&q->blkcg_mutex); + blkg = blkg_create(&blkcg_root, disk); + mutex_unlock(&q->blkcg_mutex); - /* Make sure the root blkg exists. */ - blkg = blkg_create(&blkcg_root, disk, new_blkg); if (IS_ERR(blkg)) - goto err_unlock; - q->root_blkg = blkg; - - mutex_unlock(&q->blkcg_mutex); + return PTR_ERR(blkg); + q->root_blkg = blkg; return 0; - -err_unlock: - mutex_unlock(&q->blkcg_mutex); - return PTR_ERR(blkg); } void blkcg_exit_disk(struct gendisk *disk) -- 2.39.2 From: Yu Kuai Abusing queue_lock to protect blk-throttle can cause deadlock: 1) throtl_pending_timer_fn() will hold the lock, while throtl_pd_free() will flush the timer, this is fixed by protecting blkgs with blkcg_mutex instead of queue_lock by previous patches. 2) queue_lock can be held from hardirq context, hence if throtl_pending_timer_fn() is interrupted by hardirq, deadlock can be triggered as well. Stop abusing queue_lock to protect blk-throttle, and intorduce a new internal lock td->lock for protection. And now that the new lock won't be grabbed from hardirq context, it's safe to use spin_lock_bh() from thread context and spin_lock() directly from softirq context. Fixes: 6e1a5704cbbd ("blk-throttle: dispatch from throtl_pending_timer_fn()") Signed-off-by: Yu Kuai --- block/blk-throttle.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 2c5b64b1a724..a2fa440559c9 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -33,6 +33,7 @@ static struct workqueue_struct *kthrotld_workqueue; struct throtl_data { + spinlock_t lock; /* service tree for active throtl groups */ struct throtl_service_queue service_queue; @@ -1140,7 +1141,7 @@ static void throtl_pending_timer_fn(struct timer_list *t) else q = td->queue; - spin_lock_irq(&q->queue_lock); + spin_lock(&td->lock); if (!q->root_blkg) goto out_unlock; @@ -1166,9 +1167,9 @@ static void throtl_pending_timer_fn(struct timer_list *t) break; /* this dispatch windows is still open, relax and repeat */ - spin_unlock_irq(&q->queue_lock); + spin_unlock(&td->lock); cpu_relax(); - spin_lock_irq(&q->queue_lock); + spin_lock(&td->lock); } if (!dispatched) @@ -1191,7 +1192,7 @@ static void throtl_pending_timer_fn(struct timer_list *t) queue_work(kthrotld_workqueue, &td->dispatch_work); } out_unlock: - spin_unlock_irq(&q->queue_lock); + spin_unlock(&td->lock); } /** @@ -1207,7 +1208,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) struct throtl_data *td = container_of(work, struct throtl_data, dispatch_work); struct throtl_service_queue *td_sq = &td->service_queue; - struct request_queue *q = td->queue; struct bio_list bio_list_on_stack; struct bio *bio; struct blk_plug plug; @@ -1215,11 +1215,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) bio_list_init(&bio_list_on_stack); - spin_lock_irq(&q->queue_lock); + spin_lock_bh(&td->lock); for (rw = READ; rw <= WRITE; rw++) while ((bio = throtl_pop_queued(td_sq, NULL, rw))) bio_list_add(&bio_list_on_stack, bio); - spin_unlock_irq(&q->queue_lock); + 
spin_unlock_bh(&td->lock); if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); @@ -1297,7 +1297,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) rcu_read_unlock(); /* - * We're already holding queue_lock and know @tg is valid. Let's + * We're already holding td->lock and know @tg is valid. Let's * apply the new config directly. * * Restart the slices for both READ and WRITES. It might happen @@ -1324,6 +1324,7 @@ static int blk_throtl_init(struct gendisk *disk) if (!td) return -ENOMEM; + spin_lock_init(&td->lock); INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); throtl_service_queue_init(&td->service_queue); @@ -1694,12 +1695,7 @@ void blk_throtl_cancel_bios(struct gendisk *disk) if (!blk_throtl_activated(q)) return; - spin_lock_irq(&q->queue_lock); - /* - * queue_lock is held, rcu lock is not needed here technically. - * However, rcu lock is still held to emphasize that following - * path need RCU protection and to prevent warning from lockdep. - */ + spin_lock_bh(&q->td->lock); rcu_read_lock(); blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { /* @@ -1713,7 +1709,7 @@ void blk_throtl_cancel_bios(struct gendisk *disk) tg_flush_bios(blkg_to_tg(blkg)); } rcu_read_unlock(); - spin_unlock_irq(&q->queue_lock); + spin_unlock_bh(&q->td->lock); } static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) @@ -1746,7 +1742,6 @@ static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) bool __blk_throtl_bio(struct bio *bio) { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; struct throtl_grp *tg = blkg_to_tg(blkg); @@ -1756,7 +1751,7 @@ bool __blk_throtl_bio(struct bio *bio) struct throtl_data *td = tg->td; rcu_read_lock(); - spin_lock_irq(&q->queue_lock); + spin_lock_bh(&td->lock); sq = &tg->service_queue; while (true) { @@ -1832,7 +1827,7 @@ bool __blk_throtl_bio(struct bio *bio) } out_unlock: - spin_unlock_irq(&q->queue_lock); + spin_unlock_bh(&td->lock); rcu_read_unlock(); return throttled; -- 2.39.2
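
For reference, the locking convention the final patch establishes for td->lock can be summarized with a minimal, self-contained sketch. This is not code from the patches; the names example_data, example_timer_fn, example_update and example_init are made up for illustration only.

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/spinlock.h>
#include <linux/timer.h>

struct example_data {
	spinlock_t lock;		/* plays the role of td->lock */
	struct timer_list timer;	/* plays the role of the pending timer */
	unsigned long nr_pending;
};

/* Timer callbacks run in softirq context, like throtl_pending_timer_fn(). */
static void example_timer_fn(struct timer_list *t)
{
	struct example_data *d = container_of(t, struct example_data, timer);

	/*
	 * Softirqs do not nest on a CPU, and this lock is never taken from
	 * hardirq context, so a plain spin_lock() is sufficient here.
	 */
	spin_lock(&d->lock);
	d->nr_pending = 0;
	spin_unlock(&d->lock);
}

/* Process-context update path, analogous to __blk_throtl_bio(). */
static void example_update(struct example_data *d)
{
	/*
	 * The _bh variant disables softirqs on this CPU, so the timer
	 * callback cannot interrupt us while the lock is held and spin
	 * against it; disabling hardirqs (the _irq variants the old
	 * queue_lock usage required) is unnecessary.
	 */
	spin_lock_bh(&d->lock);
	d->nr_pending++;
	mod_timer(&d->timer, jiffies + HZ);
	spin_unlock_bh(&d->lock);
}

static void example_init(struct example_data *d)
{
	spin_lock_init(&d->lock);
	timer_setup(&d->timer, example_timer_fn, 0);
	d->nr_pending = 0;
}

This mirrors the td->lock usage introduced above: spin_lock_bh() from thread context, plain spin_lock() from the softirq timer, and no _irq/_irqsave variants because the lock is never acquired from hardirq context.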