From: Yu Kuai <yukuai3@huawei.com>

blkcg_print_one_stat() will be called for each blkg:
- access blkg->iostat, which is freed from rcu callback
  blkg_free_workfn();
- access policy data from pd_stat_fn(), which is frred from
  pd_free_fn(), while pd_free_fn() can be called by removing blkcg or
  deactivating policy;

The blkcg->lock can make sure iterated blkgs are still online, and both
blkg->iostat and policy data for activated policy won't be freed.

Prepare to convert protecting blkgs from request_queue with mutex.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-cgroup.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index f93de34fe87d..0f6039d468a6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1242,13 +1242,10 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 	else
 		css_rstat_flush(&blkcg->css);
 
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
-		spin_lock_irq(&blkg->q->queue_lock);
+	guard(spinlock)(&blkcg->lock);
+	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node)
 		blkcg_print_one_stat(blkg, sf);
-		spin_unlock_irq(&blkg->q->queue_lock);
-	}
-	rcu_read_unlock();
+
 	return 0;
 }
 
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

Currently blkcg_print_blkgs() must be protected by rcu to iterate blkgs
from blkcg, and then prfill() must be protected by queue_lock to prevent
policy_data to be freed by deactivating policy. For consequence,
queue_lock have to be nested under rcu from blkcg_print_blkgs().

This patch delay freeing policy_data after rcu grace period, so that it's
possible to protect prfill() just with rcu lock held.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/bfq-cgroup.c    | 10 ++++++++--
 block/blk-cgroup.h    |  2 ++
 block/blk-iocost.c    | 14 ++++++++++++--
 block/blk-iolatency.c | 10 +++++++++-
 block/blk-throttle.c  | 13 +++++++++++--
 5 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 9fb9f3533150..a7e705d98751 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -548,14 +548,20 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
 	bfqg->rq_pos_tree = RB_ROOT;
 }
 
-static void bfq_pd_free(struct blkg_policy_data *pd)
+static void bfqg_release(struct rcu_head *rcu)
 {
+	struct blkg_policy_data *pd =
+		container_of(rcu, struct blkg_policy_data, rcu_head);
 	struct bfq_group *bfqg = pd_to_bfqg(pd);
 
-	bfqg_stats_exit(&bfqg->stats);
 	bfqg_put(bfqg);
 }
 
+static void bfq_pd_free(struct blkg_policy_data *pd)
+{
+	call_rcu(&pd->rcu_head, bfqg_release);
+}
+
 static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
 {
 	struct bfq_group *bfqg = pd_to_bfqg(pd);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 1cce3294634d..fd206d1fa3c9 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -140,6 +140,8 @@ struct blkg_policy_data {
 	struct blkcg_gq			*blkg;
 	int				plid;
 	bool				online;
+
+	struct rcu_head			rcu_head;
 };
 
 /*
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 5bfd70311359..3593547930cc 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -3017,6 +3017,16 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
 	spin_unlock_irqrestore(&ioc->lock, flags);
 }
 
+static void iocg_release(struct rcu_head *rcu)
+{
+	struct blkg_policy_data *pd =
+		container_of(rcu, struct blkg_policy_data, rcu_head);
+	struct ioc_gq *iocg = pd_to_iocg(pd);
+
+	free_percpu(iocg->pcpu_stat);
+	kfree(iocg);
+}
+
 static void ioc_pd_free(struct blkg_policy_data *pd)
 {
 	struct ioc_gq *iocg = pd_to_iocg(pd);
@@ -3041,8 +3051,8 @@ static void ioc_pd_free(struct blkg_policy_data *pd)
 
 		hrtimer_cancel(&iocg->waitq_timer);
 	}
-	free_percpu(iocg->pcpu_stat);
-	kfree(iocg);
+
+	call_rcu(&pd->rcu_head, iocg_release);
 }
 
 static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 45bd18f68541..ce25fbb8aaf6 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -1031,13 +1031,21 @@ static void iolatency_pd_offline(struct blkg_policy_data *pd)
 	iolatency_clear_scaling(blkg);
 }
 
-static void iolatency_pd_free(struct blkg_policy_data *pd)
+static void iolat_release(struct rcu_head *rcu)
 {
+	struct blkg_policy_data *pd =
+		container_of(rcu, struct blkg_policy_data, rcu_head);
 	struct iolatency_grp *iolat = pd_to_lat(pd);
+
 	free_percpu(iolat->stats);
 	kfree(iolat);
 }
 
+static void iolatency_pd_free(struct blkg_policy_data *pd)
+{
+	call_rcu(&pd->rcu_head, iolat_release);
+}
+
 static struct cftype iolatency_files[] = {
 	{
 		.name = "latency",
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 2c5b64b1a724..cb3bfdb4684a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -360,16 +360,25 @@ static void throtl_pd_online(struct blkg_policy_data *pd)
 	tg_update_has_rules(tg);
 }
 
-static void throtl_pd_free(struct blkg_policy_data *pd)
+static void tg_release(struct rcu_head *rcu)
 {
+	struct blkg_policy_data *pd =
+		container_of(rcu, struct blkg_policy_data, rcu_head);
 	struct throtl_grp *tg = pd_to_tg(pd);
 
-	timer_delete_sync(&tg->service_queue.pending_timer);
 	blkg_rwstat_exit(&tg->stat_bytes);
 	blkg_rwstat_exit(&tg->stat_ios);
 	kfree(tg);
 }
 
+static void throtl_pd_free(struct blkg_policy_data *pd)
+{
+	struct throtl_grp *tg = pd_to_tg(pd);
+
+	timer_delete_sync(&tg->service_queue.pending_timer);
+	call_rcu(&pd->rcu_head, tg_release);
+}
+
 static struct throtl_grp *
 throtl_rb_first(struct throtl_service_queue *parent_sq)
 {
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

With previous modification to delay freeing policy data after rcu grace
period, now it's safe to protect prfill() with rcu directly because
it's guaranteed policy_data won't be freed by concurrent deactivating
policy.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-cgroup-rwstat.c | 4 +---
 block/blk-cgroup.c        | 2 --
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c
index a55fb0c53558..b8ab8c0063a3 100644
--- a/block/blk-cgroup-rwstat.c
+++ b/block/blk-cgroup-rwstat.c
@@ -101,10 +101,9 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
 	struct cgroup_subsys_state *pos_css;
 	unsigned int i;
 
-	lockdep_assert_held(&blkg->q->queue_lock);
+	WARN_ON_ONCE(!rcu_read_lock_held());
 
 	memset(sum, 0, sizeof(*sum));
-	rcu_read_lock();
 	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
 		struct blkg_rwstat *rwstat;
 
@@ -119,6 +118,5 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
 		for (i = 0; i < BLKG_RWSTAT_NR; i++)
 			sum->cnt[i] += blkg_rwstat_read_counter(rwstat, i);
 	}
-	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0f6039d468a6..fb40262971c9 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -713,10 +713,8 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
-		spin_lock_irq(&blkg->q->queue_lock);
 		if (blkcg_policy_enabled(blkg->q, pol))
 			total += prfill(sf, blkg->pd[pol->plid], data);
-		spin_unlock_irq(&blkg->q->queue_lock);
 	}
 	rcu_read_unlock();
 
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

Changes to two step:

1) hold rcu lock and do blkg_lookup() from fast path;
2) hold queue_lock directly from slow path, and don't nest it under rcu
   lock;

Prepare to convert protecting blkcg with blkcg_mutex instead of
queue_lock;

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-cgroup.c | 57 +++++++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 21 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index fb40262971c9..3363d2476fed 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -467,22 +467,17 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 {
 	struct request_queue *q = disk->queue;
 	struct blkcg_gq *blkg;
-	unsigned long flags;
-
-	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	blkg = blkg_lookup(blkcg, q);
-	if (blkg)
-		return blkg;
-
-	spin_lock_irqsave(&q->queue_lock, flags);
+	rcu_read_lock();
 	blkg = blkg_lookup(blkcg, q);
 	if (blkg) {
 		if (blkcg != &blkcg_root &&
 		    blkg != rcu_dereference(blkcg->blkg_hint))
 			rcu_assign_pointer(blkcg->blkg_hint, blkg);
-		goto found;
+		rcu_read_unlock();
+		return blkg;
 	}
+	rcu_read_unlock();
 
 	/*
 	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
@@ -514,8 +509,6 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 			break;
 	}
 
-found:
-	spin_unlock_irqrestore(&q->queue_lock, flags);
 	return blkg;
 }
 
@@ -2078,6 +2071,18 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
 	atomic64_add(delta, &blkg->delay_nsec);
 }
 
+static inline struct blkcg_gq *blkg_lookup_tryget(struct blkcg_gq *blkg)
+{
+retry:
+	if (blkg_tryget(blkg))
+		return blkg;
+
+	blkg = blkg->parent;
+	if (blkg)
+		goto retry;
+
+	return NULL;
+}
 /**
  * blkg_tryget_closest - try and get a blkg ref on the closet blkg
  * @bio: target bio
@@ -2090,20 +2095,30 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
 static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
 		struct cgroup_subsys_state *css)
 {
-	struct blkcg_gq *blkg, *ret_blkg = NULL;
+	struct request_queue *q = bio->bi_bdev->bd_queue;
+	struct blkcg *blkcg = css_to_blkcg(css);
+	struct blkcg_gq *blkg;
 
 	rcu_read_lock();
-	blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk);
-	while (blkg) {
-		if (blkg_tryget(blkg)) {
-			ret_blkg = blkg;
-			break;
-		}
-		blkg = blkg->parent;
-	}
+	blkg = blkg_lookup(blkcg, q);
+	if (likely(blkg))
+		blkg = blkg_lookup_tryget(blkg);
 	rcu_read_unlock();
 
-	return ret_blkg;
+	if (blkg)
+		return blkg;
+
+	/*
+	 * Fast path failed, we're probably issuing IO in this cgroup the first
+	 * time, hold lock to create new blkg.
+	 */
+	spin_lock_irq(&q->queue_lock);
+	blkg = blkg_lookup_create(blkcg, bio->bi_bdev->bd_disk);
+	if (blkg)
+		blkg = blkg_lookup_tryget(blkg);
+	spin_unlock_irq(&q->queue_lock);
+
+	return blkg;
 }
 
 /**
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

If bio is already associated with blkg, blkcg is already pinned until
bio is done, no need for rcu protection; Otherwise protect blkcg_css()
with rcu independently. Prepare to convert protecting blkcg with
blkcg_mutex instead of queue_lock.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-cgroup.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 3363d2476fed..2234ff2b2b8b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -2166,16 +2166,20 @@ void bio_associate_blkg(struct bio *bio)
 	if (blk_op_is_passthrough(bio->bi_opf))
 		return;
 
-	rcu_read_lock();
-
-	if (bio->bi_blkg)
+	if (bio->bi_blkg) {
 		css = bio_blkcg_css(bio);
-	else
+		bio_associate_blkg_from_css(bio, css);
+	} else {
+		rcu_read_lock();
 		css = blkcg_css();
+		if (!css_tryget_online(css))
+			css = NULL;
+		rcu_read_unlock();
 
-	bio_associate_blkg_from_css(bio, css);
-
-	rcu_read_unlock();
+		bio_associate_blkg_from_css(bio, css);
+		if (css)
+			css_put(css);
+	}
 }
 EXPORT_SYMBOL_GPL(bio_associate_blkg);
 
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

The correct lock order is q->queue_lock before blkcg->lock, and in order
to prevent deadlock from blkcg_destroy_blkgs(), trylock is used for
q->queue_lock while blkcg->lock is already held, this is hacky.

Hence refactor blkcg_destroy_blkgs(), by holding blkcg->lock to get the
first blkg and release it, then hold q->queue_lock and blkcg->lock in
the correct order to destroy blkg. This is super cold path, it's fine to
grab and release locks.

Also prepare to convert protecting blkcg with blkcg_mutex instead of
queue_lock.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-cgroup.c | 45 ++++++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2234ff2b2b8b..99edf15ce525 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1284,6 +1284,21 @@ struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css)
  *    This finally frees the blkcg.
  */
 
+static struct blkcg_gq *blkcg_get_first_blkg(struct blkcg *blkcg)
+{
+	struct blkcg_gq *blkg = NULL;
+
+	spin_lock_irq(&blkcg->lock);
+	if (!hlist_empty(&blkcg->blkg_list)) {
+		blkg = hlist_entry(blkcg->blkg_list.first, struct blkcg_gq,
+				   blkcg_node);
+		blkg_get(blkg);
+	}
+	spin_unlock_irq(&blkcg->lock);
+
+	return blkg;
+}
+
 /**
  * blkcg_destroy_blkgs - responsible for shooting down blkgs
  * @blkcg: blkcg of interest
@@ -1297,32 +1312,24 @@ struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css)
  */
 static void blkcg_destroy_blkgs(struct blkcg *blkcg)
 {
-	might_sleep();
+	struct blkcg_gq *blkg;
 
-	spin_lock_irq(&blkcg->lock);
+	might_sleep();
 
-	while (!hlist_empty(&blkcg->blkg_list)) {
-		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
-						struct blkcg_gq, blkcg_node);
+	while ((blkg = blkcg_get_first_blkg(blkcg))) {
 		struct request_queue *q = blkg->q;
 
-		if (need_resched() || !spin_trylock(&q->queue_lock)) {
-			/*
-			 * Given that the system can accumulate a huge number
-			 * of blkgs in pathological cases, check to see if we
-			 * need to rescheduling to avoid softlockup.
-			 */
-			spin_unlock_irq(&blkcg->lock);
-			cond_resched();
-			spin_lock_irq(&blkcg->lock);
-			continue;
-		}
+		spin_lock_irq(&q->queue_lock);
+		spin_lock(&blkcg->lock);
 
 		blkg_destroy(blkg);
-		spin_unlock(&q->queue_lock);
-	}
 
-	spin_unlock_irq(&blkcg->lock);
+		spin_unlock(&blkcg->lock);
+		spin_unlock_irq(&q->queue_lock);
+
+		blkg_put(blkg);
+		cond_resched();
+	}
 }
 
 /**
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

Prepare to convert protecting blkcg with blkcg_mutex instead of
queue_lock.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 mm/page_io.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mm/page_io.c b/mm/page_io.c
index a2056a5ecb13..4f4cc9370573 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -313,8 +313,13 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
 
 	rcu_read_lock();
 	css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
-	bio_associate_blkg_from_css(bio, css);
+	if (!css || !css_tryget_online(css))
+		css = NULL;
 	rcu_read_unlock();
+
+	bio_associate_blkg_from_css(bio, css);
+	if (css)
+		css_put(css);
 }
 #else
 #define bio_associate_blkg_from_page(bio, folio)		do { } while (0)
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

Request_queue is freezed and quiesced during elevator init_sched()
method, there is no point to hold queue_lock for protection.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/bfq-iosched.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 9e0eee9aba5c..86309828e235 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -7203,10 +7203,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
 		return -ENOMEM;
 
 	eq->elevator_data = bfqd;
-
-	spin_lock_irq(&q->queue_lock);
 	q->elevator = eq;
-	spin_unlock_irq(&q->queue_lock);
 
 	/*
 	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
@@ -7239,7 +7236,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
 	 * If the disk supports multiple actuators, copy independent
 	 * access ranges from the request queue structure.
 	 */
-	spin_lock_irq(&q->queue_lock);
 	if (ia_ranges) {
 		/*
 		 * Check if the disk ia_ranges size exceeds the current bfq
@@ -7265,7 +7261,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
 		bfqd->sector[0] = 0;
 		bfqd->nr_sectors[0] = get_capacity(q->disk);
 	}
-	spin_unlock_irq(&q->queue_lock);
 
 	INIT_LIST_HEAD(&bfqd->dispatch);
 
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

With previous modifications, queue_lock no longer grabbed under other
spinlocks and rcu for protecting blkgs, it's ok convert to blkcg_mutex
directly.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/bfq-cgroup.c  |   6 +--
 block/bfq-iosched.c |   8 ++--
 block/blk-cgroup.c  | 104 ++++++++++++++------------------------------
 block/blk-cgroup.h  |   6 +--
 4 files changed, 42 insertions(+), 82 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index a7e705d98751..43790ae91b57 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -405,7 +405,7 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
 
 	parent = bfqg_parent(bfqg);
 
-	lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock);
+	lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->blkcg_mutex);
 
 	if (unlikely(!parent))
 		return;
@@ -872,7 +872,7 @@ static void bfq_reparent_active_queues(struct bfq_data *bfqd,
  *		    and reparent its children entities.
  * @pd: descriptor of the policy going offline.
  *
- * blkio already grabs the queue_lock for us, so no need to use
+ * blkio already grabs the blkcg_mtuex for us, so no need to use
  * RCU-based magic
  */
 static void bfq_pd_offline(struct blkg_policy_data *pd)
@@ -1145,7 +1145,7 @@ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
 	struct cgroup_subsys_state *pos_css;
 	u64 sum = 0;
 
-	lockdep_assert_held(&blkg->q->queue_lock);
+	lockdep_assert_held(&blkg->q->blkcg_mutex);
 
 	rcu_read_lock();
 	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 86309828e235..3350c9b22eb4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5266,7 +5266,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q,
 	 * In addition, the following queue lock guarantees that
 	 * bfqq_group(bfqq) exists as well.
 	 */
-	spin_lock_irq(&q->queue_lock);
+	mutex_lock(&q->blkcg_mutex);
 	if (idle_timer_disabled)
 		/*
 		 * Since the idle timer has been disabled,
@@ -5285,7 +5285,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q,
 		bfqg_stats_set_start_empty_time(bfqg);
 		bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
 	}
-	spin_unlock_irq(&q->queue_lock);
+	mutex_unlock(&q->blkcg_mutex);
 }
 #else
 static inline void bfq_update_dispatch_stats(struct request_queue *q,
@@ -6218,11 +6218,11 @@ static void bfq_update_insert_stats(struct request_queue *q,
 	 * In addition, the following queue lock guarantees that
 	 * bfqq_group(bfqq) exists as well.
 	 */
-	spin_lock_irq(&q->queue_lock);
+	mutex_lock(&q->blkcg_mutex);
 	bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
 	if (idle_timer_disabled)
 		bfqg_stats_update_idle_time(bfqq_group(bfqq));
-	spin_unlock_irq(&q->queue_lock);
+	mutex_unlock(&q->blkcg_mutex);
 }
 #else
 static inline void bfq_update_insert_stats(struct request_queue *q,
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 99edf15ce525..b8bb2f3506aa 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -130,9 +130,7 @@ static void blkg_free_workfn(struct work_struct *work)
 			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
 	if (blkg->parent)
 		blkg_put(blkg->parent);
-	spin_lock_irq(&q->queue_lock);
 	list_del_init(&blkg->q_node);
-	spin_unlock_irq(&q->queue_lock);
 	mutex_unlock(&q->blkcg_mutex);
 
 	blk_put_queue(q);
@@ -372,7 +370,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
 	struct blkcg_gq *blkg;
 	int i, ret;
 
-	lockdep_assert_held(&disk->queue->queue_lock);
+	lockdep_assert_held(&disk->queue->blkcg_mutex);
 
 	/* request_queue is dying, do not create/recreate a blkg */
 	if (blk_queue_dying(disk->queue)) {
@@ -457,7 +455,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
  * Lookup blkg for the @blkcg - @disk pair.  If it doesn't exist, try to
  * create one.  blkg creation is performed recursively from blkcg_root such
  * that all non-root blkg's have access to the parent blkg.  This function
- * should be called under RCU read lock and takes @disk->queue->queue_lock.
+ * should be called under RCU read lock and takes @disk->queue->blkcg_mutex.
  *
  * Returns the blkg or the closest blkg if blkg_create() fails as it walks
  * down from root.
@@ -517,7 +515,7 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 	struct blkcg *blkcg = blkg->blkcg;
 	int i;
 
-	lockdep_assert_held(&blkg->q->queue_lock);
+	lockdep_assert_held(&blkg->q->blkcg_mutex);
 	lockdep_assert_held(&blkcg->lock);
 
 	/*
@@ -546,8 +544,8 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 
 	/*
 	 * Both setting lookup hint to and clearing it from @blkg are done
-	 * under queue_lock.  If it's not pointing to @blkg now, it never
-	 * will.  Hint assignment itself can race safely.
+	 * under q->blkcg_mutex and blkcg->lock.  If it's not pointing to @blkg
+	 * now, it never will. Hint assignment itself can race safely.
 	 */
 	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
 		rcu_assign_pointer(blkcg->blkg_hint, NULL);
@@ -567,25 +565,20 @@ static void blkg_destroy_all(struct gendisk *disk)
 	int i;
 
 restart:
-	spin_lock_irq(&q->queue_lock);
+	mutex_lock(&q->blkcg_mutex);
 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
 		struct blkcg *blkcg = blkg->blkcg;
 
 		if (hlist_unhashed(&blkg->blkcg_node))
 			continue;
 
-		spin_lock(&blkcg->lock);
+		spin_lock_irq(&blkcg->lock);
 		blkg_destroy(blkg);
-		spin_unlock(&blkcg->lock);
+		spin_unlock_irq(&blkcg->lock);
 
-		/*
-		 * in order to avoid holding the spin lock for too long, release
-		 * it when a batch of blkgs are destroyed.
-		 */
 		if (!(--count)) {
 			count = BLKG_DESTROY_BATCH_SIZE;
-			spin_unlock_irq(&q->queue_lock);
-			cond_resched();
+			mutex_unlock(&q->blkcg_mutex);
 			goto restart;
 		}
 	}
@@ -603,7 +596,7 @@ static void blkg_destroy_all(struct gendisk *disk)
 	}
 
 	q->root_blkg = NULL;
-	spin_unlock_irq(&q->queue_lock);
+	mutex_unlock(&q->blkcg_mutex);
 }
 
 static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
@@ -854,7 +847,7 @@ unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
  */
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		   struct blkg_conf_ctx *ctx)
-	__acquires(&bdev->bd_queue->queue_lock)
+	__acquires(&bdev->bd_queue->blkcg_mutex)
 {
 	struct gendisk *disk;
 	struct request_queue *q;
@@ -870,7 +863,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 
 	/* Prevent concurrent with blkcg_deactivate_policy() */
 	mutex_lock(&q->blkcg_mutex);
-	spin_lock_irq(&q->queue_lock);
 
 	if (!blkcg_policy_enabled(q, pol)) {
 		ret = -EOPNOTSUPP;
@@ -896,23 +888,18 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 			parent = blkcg_parent(parent);
 		}
 
-		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
-		spin_unlock_irq(&q->queue_lock);
-
 		new_blkg = blkg_alloc(pos, disk, GFP_NOIO);
 		if (unlikely(!new_blkg)) {
 			ret = -ENOMEM;
-			goto fail_exit;
+			goto fail_unlock;
 		}
 
 		if (radix_tree_preload(GFP_KERNEL)) {
 			blkg_free(new_blkg);
 			ret = -ENOMEM;
-			goto fail_exit;
+			goto fail_unlock;
 		}
 
-		spin_lock_irq(&q->queue_lock);
-
 		if (!blkcg_policy_enabled(q, pol)) {
 			blkg_free(new_blkg);
 			ret = -EOPNOTSUPP;
@@ -936,15 +923,12 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 			goto success;
 	}
 success:
-	mutex_unlock(&q->blkcg_mutex);
 	ctx->blkg = blkg;
 	return 0;
 
 fail_preloaded:
 	radix_tree_preload_end();
 fail_unlock:
-	spin_unlock_irq(&q->queue_lock);
-fail_exit:
 	mutex_unlock(&q->blkcg_mutex);
 	/*
 	 * If queue was bypassing, we should retry.  Do so after a
@@ -968,11 +952,11 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
  * blkg_conf_ctx's initialized with blkg_conf_init().
  */
 void blkg_conf_exit(struct blkg_conf_ctx *ctx)
-	__releases(&ctx->bdev->bd_queue->queue_lock)
+	__releases(&ctx->bdev->bd_queue->blkcg_mutex)
 	__releases(&ctx->bdev->bd_queue->rq_qos_mutex)
 {
 	if (ctx->blkg) {
-		spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
+		mutex_unlock(&bdev_get_queue(ctx->bdev)->blkcg_mutex);
 		ctx->blkg = NULL;
 	}
 
@@ -1319,13 +1303,13 @@ static void blkcg_destroy_blkgs(struct blkcg *blkcg)
 	while ((blkg = blkcg_get_first_blkg(blkcg))) {
 		struct request_queue *q = blkg->q;
 
-		spin_lock_irq(&q->queue_lock);
-		spin_lock(&blkcg->lock);
+		mutex_lock(&q->blkcg_mutex);
+		spin_lock_irq(&blkcg->lock);
 
 		blkg_destroy(blkg);
 
-		spin_unlock(&blkcg->lock);
-		spin_unlock_irq(&q->queue_lock);
+		spin_unlock_irq(&blkcg->lock);
+		mutex_unlock(&q->blkcg_mutex);
 
 		blkg_put(blkg);
 		cond_resched();
@@ -1502,24 +1486,23 @@ int blkcg_init_disk(struct gendisk *disk)
 	if (!new_blkg)
 		return -ENOMEM;
 
-	preloaded = !radix_tree_preload(GFP_KERNEL);
+	mutex_lock(&q->blkcg_mutex);
+	preloaded = !radix_tree_preload(GFP_NOIO);
 
 	/* Make sure the root blkg exists. */
-	/* spin_lock_irq can serve as RCU read-side critical section. */
-	spin_lock_irq(&q->queue_lock);
 	blkg = blkg_create(&blkcg_root, disk, new_blkg);
 	if (IS_ERR(blkg))
 		goto err_unlock;
 	q->root_blkg = blkg;
-	spin_unlock_irq(&q->queue_lock);
 
 	if (preloaded)
 		radix_tree_preload_end();
+	mutex_unlock(&q->blkcg_mutex);
 
 	return 0;
 
 err_unlock:
-	spin_unlock_irq(&q->queue_lock);
+	mutex_unlock(&q->blkcg_mutex);
 	if (preloaded)
 		radix_tree_preload_end();
 	return PTR_ERR(blkg);
@@ -1596,8 +1579,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 
 	if (queue_is_mq(q))
 		memflags = blk_mq_freeze_queue(q);
-retry:
-	spin_lock_irq(&q->queue_lock);
+	mutex_lock(&q->blkcg_mutex);
 
 	/* blkg_list is pushed at the head, reverse walk to initialize parents first */
 	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
@@ -1606,36 +1588,17 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 		if (blkg->pd[pol->plid])
 			continue;
 
-		/* If prealloc matches, use it; otherwise try GFP_NOWAIT */
+		/* If prealloc matches, use it */
 		if (blkg == pinned_blkg) {
 			pd = pd_prealloc;
 			pd_prealloc = NULL;
 		} else {
 			pd = pol->pd_alloc_fn(disk, blkg->blkcg,
-					      GFP_NOWAIT);
+					      GFP_NOIO);
 		}
 
-		if (!pd) {
-			/*
-			 * GFP_NOWAIT failed.  Free the existing one and
-			 * prealloc for @blkg w/ GFP_KERNEL.
-			 */
-			if (pinned_blkg)
-				blkg_put(pinned_blkg);
-			blkg_get(blkg);
-			pinned_blkg = blkg;
-
-			spin_unlock_irq(&q->queue_lock);
-
-			if (pd_prealloc)
-				pol->pd_free_fn(pd_prealloc);
-			pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg,
-						       GFP_KERNEL);
-			if (pd_prealloc)
-				goto retry;
-			else
-				goto enomem;
-		}
+		if (!pd)
+			goto enomem;
 
 		spin_lock(&blkg->blkcg->lock);
 
@@ -1656,8 +1619,8 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	__set_bit(pol->plid, q->blkcg_pols);
 	ret = 0;
 
-	spin_unlock_irq(&q->queue_lock);
 out:
+	mutex_unlock(&q->blkcg_mutex);
 	if (queue_is_mq(q))
 		blk_mq_unfreeze_queue(q, memflags);
 	if (pinned_blkg)
@@ -1668,7 +1631,6 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 
 enomem:
 	/* alloc failed, take down everything */
-	spin_lock_irq(&q->queue_lock);
 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
 		struct blkcg *blkcg = blkg->blkcg;
 		struct blkg_policy_data *pd;
@@ -1684,7 +1646,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 		}
 		spin_unlock(&blkcg->lock);
 	}
-	spin_unlock_irq(&q->queue_lock);
+
 	ret = -ENOMEM;
 	goto out;
 }
@@ -1712,7 +1674,6 @@ void blkcg_deactivate_policy(struct gendisk *disk,
 		memflags = blk_mq_freeze_queue(q);
 
 	mutex_lock(&q->blkcg_mutex);
-	spin_lock_irq(&q->queue_lock);
 
 	__clear_bit(pol->plid, q->blkcg_pols);
 
@@ -1729,7 +1690,6 @@ void blkcg_deactivate_policy(struct gendisk *disk,
 		spin_unlock(&blkcg->lock);
 	}
 
-	spin_unlock_irq(&q->queue_lock);
 	mutex_unlock(&q->blkcg_mutex);
 
 	if (queue_is_mq(q))
@@ -2119,11 +2079,11 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
 	 * Fast path failed, we're probably issuing IO in this cgroup the first
 	 * time, hold lock to create new blkg.
 	 */
-	spin_lock_irq(&q->queue_lock);
+	mutex_lock(&q->blkcg_mutex);
 	blkg = blkg_lookup_create(blkcg, bio->bi_bdev->bd_disk);
 	if (blkg)
 		blkg = blkg_lookup_tryget(blkg);
-	spin_unlock_irq(&q->queue_lock);
+	mutex_unlock(&q->blkcg_mutex);
 
 	return blkg;
 }
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index fd206d1fa3c9..540be30aebcd 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -263,7 +263,7 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
 		return q->root_blkg;
 
 	blkg = rcu_dereference_check(blkcg->blkg_hint,
-			lockdep_is_held(&q->queue_lock));
+			lockdep_is_held(&q->blkcg_mutex));
 	if (blkg && blkg->q == q)
 		return blkg;
 
@@ -347,8 +347,8 @@ static inline void blkg_put(struct blkcg_gq *blkg)
  * @p_blkg: target blkg to walk descendants of
  *
  * Walk @c_blkg through the descendants of @p_blkg.  Must be used with RCU
- * read locked.  If called under either blkcg or queue lock, the iteration
- * is guaranteed to include all and only online blkgs.  The caller may
+ * read locked.  If called under either blkcg->lock or q->blkcg_mutex, the
+ * iteration is guaranteed to include all and only online blkgs.  The caller may
  * update @pos_css by calling css_rightmost_descendant() to skip subtree.
  * @p_blkg is included in the iteration and the first node to be visited.
  */
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

Now that blkcg_mutex is used to protect blkgs, memory allocation no
longer need to be non-blocking, this is not needed.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-cgroup.c | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b8bb2f3506aa..030499d70543 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -894,16 +894,10 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 			goto fail_unlock;
 		}
 
-		if (radix_tree_preload(GFP_KERNEL)) {
-			blkg_free(new_blkg);
-			ret = -ENOMEM;
-			goto fail_unlock;
-		}
-
 		if (!blkcg_policy_enabled(q, pol)) {
 			blkg_free(new_blkg);
 			ret = -EOPNOTSUPP;
-			goto fail_preloaded;
+			goto fail_unlock;
 		}
 
 		blkg = blkg_lookup(pos, q);
@@ -913,12 +907,10 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 			blkg = blkg_create(pos, disk, new_blkg);
 			if (IS_ERR(blkg)) {
 				ret = PTR_ERR(blkg);
-				goto fail_preloaded;
+				goto fail_unlock;
 			}
 		}
 
-		radix_tree_preload_end();
-
 		if (pos == blkcg)
 			goto success;
 	}
@@ -926,8 +918,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	ctx->blkg = blkg;
 	return 0;
 
-fail_preloaded:
-	radix_tree_preload_end();
 fail_unlock:
 	mutex_unlock(&q->blkcg_mutex);
 	/*
@@ -1480,14 +1470,12 @@ int blkcg_init_disk(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 	struct blkcg_gq *new_blkg, *blkg;
-	bool preloaded;
 
 	new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
 	if (!new_blkg)
 		return -ENOMEM;
 
 	mutex_lock(&q->blkcg_mutex);
-	preloaded = !radix_tree_preload(GFP_NOIO);
 
 	/* Make sure the root blkg exists. */
 	blkg = blkg_create(&blkcg_root, disk, new_blkg);
@@ -1495,16 +1483,12 @@ int blkcg_init_disk(struct gendisk *disk)
 		goto err_unlock;
 	q->root_blkg = blkg;
 
-	if (preloaded)
-		radix_tree_preload_end();
 	mutex_unlock(&q->blkcg_mutex);
 
 	return 0;
 
 err_unlock:
 	mutex_unlock(&q->blkcg_mutex);
-	if (preloaded)
-		radix_tree_preload_end();
 	return PTR_ERR(blkg);
 }
 
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

Now that blkg_create is protected with blkcg_mutex, there is no need to
preallocate blkg, remove related code.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-cgroup.c | 91 +++++++++++++++++-----------------------------
 1 file changed, 33 insertions(+), 58 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 030499d70543..3c23d2d1e237 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -364,10 +364,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
  * If @new_blkg is %NULL, this function tries to allocate a new one as
  * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
  */
-static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
-				    struct blkcg_gq *new_blkg)
+static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk)
 {
-	struct blkcg_gq *blkg;
+	struct blkcg_gq *blkg = NULL;
 	int i, ret;
 
 	lockdep_assert_held(&disk->queue->blkcg_mutex);
@@ -384,15 +383,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
 		goto err_free_blkg;
 	}
 
-	/* allocate */
-	if (!new_blkg) {
-		new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT);
-		if (unlikely(!new_blkg)) {
-			ret = -ENOMEM;
-			goto err_put_css;
-		}
+	blkg = blkg_alloc(blkcg, disk, GFP_NOIO);
+	if (unlikely(!blkg)) {
+		ret = -ENOMEM;
+		goto err_put_css;
 	}
-	blkg = new_blkg;
 
 	/* link parent */
 	if (blkcg_parent(blkcg)) {
@@ -415,35 +410,34 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
 	/* insert */
 	spin_lock(&blkcg->lock);
 	ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg);
-	if (likely(!ret)) {
-		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
-		list_add(&blkg->q_node, &disk->queue->blkg_list);
+	if (unlikely(ret)) {
+		spin_unlock(&blkcg->lock);
+		blkg_put(blkg);
+		return ERR_PTR(ret);
+	}
 
-		for (i = 0; i < BLKCG_MAX_POLS; i++) {
-			struct blkcg_policy *pol = blkcg_policy[i];
+	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
+	list_add(&blkg->q_node, &disk->queue->blkg_list);
 
-			if (blkg->pd[i]) {
-				if (pol->pd_online_fn)
-					pol->pd_online_fn(blkg->pd[i]);
-				blkg->pd[i]->online = true;
-			}
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+
+		if (blkg->pd[i]) {
+			if (pol->pd_online_fn)
+				pol->pd_online_fn(blkg->pd[i]);
+			blkg->pd[i]->online = true;
 		}
 	}
+
 	blkg->online = true;
 	spin_unlock(&blkcg->lock);
-
-	if (!ret)
-		return blkg;
-
-	/* @blkg failed fully initialized, use the usual release path */
-	blkg_put(blkg);
-	return ERR_PTR(ret);
+	return blkg;
 
 err_put_css:
 	css_put(&blkcg->css);
 err_free_blkg:
-	if (new_blkg)
-		blkg_free(new_blkg);
+	if (blkg)
+		blkg_free(blkg);
 	return ERR_PTR(ret);
 }
 
@@ -498,7 +492,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 			parent = blkcg_parent(parent);
 		}
 
-		blkg = blkg_create(pos, disk, NULL);
+		blkg = blkg_create(pos, disk);
 		if (IS_ERR(blkg)) {
 			blkg = ret_blkg;
 			break;
@@ -880,7 +874,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	while (true) {
 		struct blkcg *pos = blkcg;
 		struct blkcg *parent;
-		struct blkcg_gq *new_blkg;
 
 		parent = blkcg_parent(blkcg);
 		while (parent && !blkg_lookup(parent, q)) {
@@ -888,23 +881,14 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 			parent = blkcg_parent(parent);
 		}
 
-		new_blkg = blkg_alloc(pos, disk, GFP_NOIO);
-		if (unlikely(!new_blkg)) {
-			ret = -ENOMEM;
-			goto fail_unlock;
-		}
-
 		if (!blkcg_policy_enabled(q, pol)) {
-			blkg_free(new_blkg);
 			ret = -EOPNOTSUPP;
 			goto fail_unlock;
 		}
 
 		blkg = blkg_lookup(pos, q);
-		if (blkg) {
-			blkg_free(new_blkg);
-		} else {
-			blkg = blkg_create(pos, disk, new_blkg);
+		if (!blkg) {
+			blkg = blkg_create(pos, disk);
 			if (IS_ERR(blkg)) {
 				ret = PTR_ERR(blkg);
 				goto fail_unlock;
@@ -1469,27 +1453,18 @@ void blkg_init_queue(struct request_queue *q)
 int blkcg_init_disk(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
-	struct blkcg_gq *new_blkg, *blkg;
-
-	new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
-	if (!new_blkg)
-		return -ENOMEM;
+	struct blkcg_gq *blkg;
 
+	/* Make sure the root blkg exists. */
 	mutex_lock(&q->blkcg_mutex);
+	blkg = blkg_create(&blkcg_root, disk);
+	mutex_unlock(&q->blkcg_mutex);
 
-	/* Make sure the root blkg exists. */
-	blkg = blkg_create(&blkcg_root, disk, new_blkg);
 	if (IS_ERR(blkg))
-		goto err_unlock;
-	q->root_blkg = blkg;
-
-	mutex_unlock(&q->blkcg_mutex);
+		return PTR_ERR(blkg);
 
+	q->root_blkg = blkg;
 	return 0;
-
-err_unlock:
-	mutex_unlock(&q->blkcg_mutex);
-	return PTR_ERR(blkg);
 }
 
 void blkcg_exit_disk(struct gendisk *disk)
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

Abusing queue_lock to protect blk-throttle can cause deadlock:

1) throtl_pending_timer_fn() will hold the lock, while throtl_pd_free()
   will flush the timer, this is fixed by protecting blkgs with
   blkcg_mutex instead of queue_lock by previous patches.
2) queue_lock can be held from hardirq context, hence if
   throtl_pending_timer_fn() is interrupted by hardirq, deadlock can be
   triggered as well.

Stop abusing queue_lock to protect blk-throttle, and intorduce a new
internal lock td->lock for protection. And now that the new lock won't
be grabbed from hardirq context, it's safe to use spin_lock_bh() from
thread context and spin_lock() directly from softirq context.

Fixes: 6e1a5704cbbd ("blk-throttle: dispatch from throtl_pending_timer_fn()")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-throttle.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index cb3bfdb4684a..7feaa2ef0a6b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -33,6 +33,7 @@ static struct workqueue_struct *kthrotld_workqueue;
 
 struct throtl_data
 {
+	spinlock_t lock;
 	/* service tree for active throtl groups */
 	struct throtl_service_queue service_queue;
 
@@ -1149,7 +1150,7 @@ static void throtl_pending_timer_fn(struct timer_list *t)
 	else
 		q = td->queue;
 
-	spin_lock_irq(&q->queue_lock);
+	spin_lock(&td->lock);
 
 	if (!q->root_blkg)
 		goto out_unlock;
@@ -1175,9 +1176,9 @@ static void throtl_pending_timer_fn(struct timer_list *t)
 			break;
 
 		/* this dispatch windows is still open, relax and repeat */
-		spin_unlock_irq(&q->queue_lock);
+		spin_unlock(&td->lock);
 		cpu_relax();
-		spin_lock_irq(&q->queue_lock);
+		spin_lock(&td->lock);
 	}
 
 	if (!dispatched)
@@ -1200,7 +1201,7 @@ static void throtl_pending_timer_fn(struct timer_list *t)
 		queue_work(kthrotld_workqueue, &td->dispatch_work);
 	}
 out_unlock:
-	spin_unlock_irq(&q->queue_lock);
+	spin_unlock(&td->lock);
 }
 
 /**
@@ -1216,7 +1217,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 	struct throtl_data *td = container_of(work, struct throtl_data,
 					      dispatch_work);
 	struct throtl_service_queue *td_sq = &td->service_queue;
-	struct request_queue *q = td->queue;
 	struct bio_list bio_list_on_stack;
 	struct bio *bio;
 	struct blk_plug plug;
@@ -1224,11 +1224,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 
 	bio_list_init(&bio_list_on_stack);
 
-	spin_lock_irq(&q->queue_lock);
+	spin_lock_bh(&td->lock);
 	for (rw = READ; rw <= WRITE; rw++)
 		while ((bio = throtl_pop_queued(td_sq, NULL, rw)))
 			bio_list_add(&bio_list_on_stack, bio);
-	spin_unlock_irq(&q->queue_lock);
+	spin_unlock_bh(&td->lock);
 
 	if (!bio_list_empty(&bio_list_on_stack)) {
 		blk_start_plug(&plug);
@@ -1306,7 +1306,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
 	rcu_read_unlock();
 
 	/*
-	 * We're already holding queue_lock and know @tg is valid.  Let's
+	 * We're already holding td->lock and know @tg is valid.  Let's
 	 * apply the new config directly.
 	 *
 	 * Restart the slices for both READ and WRITES. It might happen
@@ -1333,6 +1333,7 @@ static int blk_throtl_init(struct gendisk *disk)
 	if (!td)
 		return -ENOMEM;
 
+	spin_lock_init(&td->lock);
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
 
@@ -1703,12 +1704,7 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
 	if (!blk_throtl_activated(q))
 		return;
 
-	spin_lock_irq(&q->queue_lock);
-	/*
-	 * queue_lock is held, rcu lock is not needed here technically.
-	 * However, rcu lock is still held to emphasize that following
-	 * path need RCU protection and to prevent warning from lockdep.
-	 */
+	spin_lock_bh(&q->td->lock);
 	rcu_read_lock();
 	blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
 		/*
@@ -1722,7 +1718,7 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
 		tg_flush_bios(blkg_to_tg(blkg));
 	}
 	rcu_read_unlock();
-	spin_unlock_irq(&q->queue_lock);
+	spin_unlock_bh(&q->td->lock);
 }
 
 static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
@@ -1755,7 +1751,6 @@ static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
 
 bool __blk_throtl_bio(struct bio *bio)
 {
-	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	struct blkcg_gq *blkg = bio->bi_blkg;
 	struct throtl_qnode *qn = NULL;
 	struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1765,7 +1760,7 @@ bool __blk_throtl_bio(struct bio *bio)
 	struct throtl_data *td = tg->td;
 
 	rcu_read_lock();
-	spin_lock_irq(&q->queue_lock);
+	spin_lock_bh(&td->lock);
 	sq = &tg->service_queue;
 
 	while (true) {
@@ -1841,7 +1836,7 @@ bool __blk_throtl_bio(struct bio *bio)
 	}
 
 out_unlock:
-	spin_unlock_irq(&q->queue_lock);
+	spin_unlock_bh(&td->lock);
 
 	rcu_read_unlock();
 	return throttled;
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

Currently there are many helpers to be used in different cases:

- blkg_conf_open_bdev()
- blkg_conf_open_bdev_frozen()
- blkg_conf_prep()
- blkg_conf_exit()
- blkg_conf_exit_frozen()

This patch introduce two new helpers:

- blkg_conf_start()
- blkg_conf_end()

And following patches will convert all blkcg policy to use this two
helpers.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-cgroup.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-cgroup.h |  3 +++
 2 files changed, 64 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 3c23d2d1e237..63089ae269cb 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -824,6 +824,67 @@ unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
 	return memflags;
 }
 
+void blkg_conf_end(struct blkg_conf_ctx *ctx)
+{
+	struct request_queue *q = bdev_get_queue(ctx->bdev);
+
+	mutex_unlock(&q->blkcg_mutex);
+	mutex_unlock(&q->rq_qos_mutex);
+	mutex_unlock(&q->elevator_lock);
+	blk_mq_unfreeze_queue(q, ctx->memflags);
+	blkdev_put_no_open(ctx->bdev);
+}
+EXPORT_SYMBOL_GPL(blkg_conf_end);
+
+int blkg_conf_start(struct blkcg *blkcg, struct blkg_conf_ctx *ctx)
+{
+	char *input = ctx->input;
+	unsigned int major, minor;
+	struct block_device *bdev;
+	struct request_queue *q;
+	int key_len;
+
+	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
+		return -EINVAL;
+
+	input += key_len;
+	if (!isspace(*input))
+		return -EINVAL;
+
+	input = skip_spaces(input);
+	bdev = blkdev_get_no_open(MKDEV(major, minor), false);
+	if (!bdev)
+		return -ENODEV;
+
+	if (bdev_is_partition(bdev)) {
+		blkdev_put_no_open(bdev);
+		return -ENODEV;
+	}
+
+	if (!disk_live(bdev->bd_disk)) {
+		blkdev_put_no_open(bdev);
+		return -ENODEV;
+	}
+
+	ctx->body = input;
+	ctx->bdev = bdev;
+	ctx->memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue);
+	
+	q = bdev->bd_queue;
+	mutex_lock(&q->elevator_lock);
+	mutex_lock(&q->rq_qos_mutex);
+	mutex_lock(&q->blkcg_mutex);
+
+	ctx->blkg = blkg_lookup_create(blkcg, bdev->bd_disk);
+	if (!ctx->blkg) {
+		blkg_conf_end(ctx);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_conf_start);
+
 /**
  * blkg_conf_prep - parse and prepare for per-blkg config update
  * @blkcg: target block cgroup
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 540be30aebcd..e7868989befb 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -217,6 +217,7 @@ struct blkg_conf_ctx {
 	char				*body;
 	struct block_device		*bdev;
 	struct blkcg_gq			*blkg;
+	unsigned long			memflags;
 };
 
 void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
@@ -226,6 +227,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		   struct blkg_conf_ctx *ctx);
 void blkg_conf_exit(struct blkg_conf_ctx *ctx);
 void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags);
+void blkg_conf_end(struct blkg_conf_ctx *ctx);
+int blkg_conf_start(struct blkcg *blkcg, struct blkg_conf_ctx *ctx);
 
 /**
  * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

Currently bfq policy is activated by initializing elevator, while others
are activated by cgroupfs configuration. factor out a helper that
blkcg_mutex is alread held to prepare use new helpers blkg_conf{start,
end} for policys other than bfq.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-cgroup.c | 31 +++----------------------------
 block/blk-cgroup.h | 34 +++++++++++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 63089ae269cb..4b7324c1d0d5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1562,32 +1562,14 @@ struct cgroup_subsys io_cgrp_subsys = {
 };
 EXPORT_SYMBOL_GPL(io_cgrp_subsys);
 
-/**
- * blkcg_activate_policy - activate a blkcg policy on a gendisk
- * @disk: gendisk of interest
- * @pol: blkcg policy to activate
- *
- * Activate @pol on @disk.  Requires %GFP_KERNEL context.  @disk goes through
- * bypass mode to populate its blkgs with policy_data for @pol.
- *
- * Activation happens with @disk bypassed, so nobody would be accessing blkgs
- * from IO path.  Update of each blkg is protected by both queue and blkcg
- * locks so that holding either lock and testing blkcg_policy_enabled() is
- * always enough for dereferencing policy data.
- *
- * The caller is responsible for synchronizing [de]activations and policy
- * [un]registerations.  Returns 0 on success, -errno on failure.
- */
-int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
+int __blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 {
 	struct request_queue *q = disk->queue;
 	struct blkg_policy_data *pd_prealloc = NULL;
 	struct blkcg_gq *blkg, *pinned_blkg = NULL;
-	unsigned int memflags;
 	int ret;
 
-	if (blkcg_policy_enabled(q, pol))
-		return 0;
+	lockdep_assert_held(&q->blkcg_mutex);
 
 	/*
 	 * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn,
@@ -1597,10 +1579,6 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
 		return -EINVAL;
 
-	if (queue_is_mq(q))
-		memflags = blk_mq_freeze_queue(q);
-	mutex_lock(&q->blkcg_mutex);
-
 	/* blkg_list is pushed at the head, reverse walk to initialize parents first */
 	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
 		struct blkg_policy_data *pd;
@@ -1640,9 +1618,6 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	ret = 0;
 
 out:
-	mutex_unlock(&q->blkcg_mutex);
-	if (queue_is_mq(q))
-		blk_mq_unfreeze_queue(q, memflags);
 	if (pinned_blkg)
 		blkg_put(pinned_blkg);
 	if (pd_prealloc)
@@ -1670,7 +1645,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	ret = -ENOMEM;
 	goto out;
 }
-EXPORT_SYMBOL_GPL(blkcg_activate_policy);
+EXPORT_SYMBOL_GPL(__blkcg_activate_policy);
 
 /**
  * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index e7868989befb..c3d16d52c275 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -200,7 +200,7 @@ void blkcg_exit_disk(struct gendisk *disk);
 /* Blkio controller policy registration */
 int blkcg_policy_register(struct blkcg_policy *pol);
 void blkcg_policy_unregister(struct blkcg_policy *pol);
-int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol);
+int __blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol);
 void blkcg_deactivate_policy(struct gendisk *disk,
 			     const struct blkcg_policy *pol);
 
@@ -465,6 +465,38 @@ static inline bool blkcg_policy_enabled(struct request_queue *q,
 	return pol && test_bit(pol->plid, q->blkcg_pols);
 }
 
+/**
+ * blkcg_activate_policy - activate a blkcg policy on a gendisk
+ * @disk: gendisk of interest
+ * @pol: blkcg policy to activate
+ *
+ * Activate @pol on @disk.  Requires %GFP_KERNEL context.  @disk goes through
+ * bypass mode to populate its blkgs with policy_data for @pol.
+ *
+ * Activation happens with @disk bypassed, so nobody would be accessing blkgs
+ * from IO path.  Update of each blkg is protected by both queue and blkcg
+ * locks so that holding either lock and testing blkcg_policy_enabled() is
+ * always enough for dereferencing policy data.
+ *
+ * The caller is responsible for synchronizing [de]activations and policy
+ * [un]registerations.  Returns 0 on success, -errno on failure.
+ */
+static inline int blkcg_activate_policy(struct gendisk *disk,
+					const struct blkcg_policy *pol)
+{
+	struct request_queue *q = disk->queue;
+	int ret;
+
+	if (blkcg_policy_enabled(q, pol))
+		return 0;
+
+	mutex_lock(&q->blkcg_mutex);
+	ret = __blkcg_activate_policy(disk, pol);
+	mutex_unlock(&q->blkcg_mutex);
+
+	return ret;
+}
+
 void blk_cgroup_bio_start(struct bio *bio);
 void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
 #else	/* CONFIG_BLK_CGROUP */
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

No functional changes are intended, make code cleaner.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-throttle.c | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 7feaa2ef0a6b..761499feed5e 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1326,7 +1326,6 @@ static int blk_throtl_init(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 	struct throtl_data *td;
-	unsigned int memflags;
 	int ret;
 
 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
@@ -1337,14 +1336,13 @@ static int blk_throtl_init(struct gendisk *disk)
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
 
-	memflags = blk_mq_freeze_queue(disk->queue);
 	blk_mq_quiesce_queue(disk->queue);
 
 	q->td = td;
 	td->queue = q;
 
 	/* activate policy, blk_throtl_activated() will return true */
-	ret = blkcg_activate_policy(disk, &blkcg_policy_throtl);
+	ret = __blkcg_activate_policy(disk, &blkcg_policy_throtl);
 	if (ret) {
 		q->td = NULL;
 		kfree(td);
@@ -1361,7 +1359,6 @@ static int blk_throtl_init(struct gendisk *disk)
 
 out:
 	blk_mq_unquiesce_queue(disk->queue);
-	blk_mq_unfreeze_queue(disk->queue, memflags);
 
 	return ret;
 }
@@ -1377,10 +1374,9 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 	u64 v;
 
 	blkg_conf_init(&ctx, buf);
-
-	ret = blkg_conf_open_bdev(&ctx);
+	ret = blkg_conf_start(blkcg, &ctx);
 	if (ret)
-		goto out_finish;
+		return ret;
 
 	if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
 		ret = blk_throtl_init(ctx.bdev->bd_disk);
@@ -1388,10 +1384,6 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 			goto out_finish;
 	}
 
-	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
-	if (ret)
-		goto out_finish;
-
 	ret = -EINVAL;
 	if (sscanf(ctx.body, "%llu", &v) != 1)
 		goto out_finish;
@@ -1408,8 +1400,9 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 
 	tg_conf_updated(tg, false);
 	ret = 0;
+
 out_finish:
-	blkg_conf_exit(&ctx);
+	blkg_conf_end(&ctx);
 	return ret ?: nbytes;
 }
 
@@ -1561,10 +1554,9 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	int ret;
 
 	blkg_conf_init(&ctx, buf);
-
-	ret = blkg_conf_open_bdev(&ctx);
+	ret = blkg_conf_start(blkcg, &ctx);
 	if (ret)
-		goto out_finish;
+		return ret;
 
 	if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
 		ret = blk_throtl_init(ctx.bdev->bd_disk);
@@ -1572,10 +1564,6 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 			goto out_finish;
 	}
 
-	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
-	if (ret)
-		goto out_finish;
-
 	tg = blkg_to_tg(ctx.blkg);
 	tg_update_carryover(tg);
 
@@ -1626,8 +1614,9 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 
 	tg_conf_updated(tg, false);
 	ret = 0;
+
 out_finish:
-	blkg_conf_exit(&ctx);
+	blkg_conf_end(&ctx);
 	return ret ?: nbytes;
 }
 
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

Make code cleaner, we'll create new blkg and then return error
if bfq is not enabled for the device, this is fine because this
is super cold path.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/bfq-cgroup.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 43790ae91b57..d39c7a5db35d 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -1056,10 +1056,9 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
 	u64 v;
 
 	blkg_conf_init(&ctx, buf);
-
-	ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx);
+	ret = blkg_conf_start(blkcg, &ctx);
 	if (ret)
-		goto out;
+		return ret;
 
 	if (sscanf(ctx.body, "%llu", &v) == 1) {
 		/* require "default" on dfl */
@@ -1074,6 +1073,10 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
 	}
 
 	bfqg = blkg_to_bfqg(ctx.blkg);
+	if (!bfqg) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
 
 	ret = -ERANGE;
 	if (!v || (v >= BFQ_MIN_WEIGHT && v <= BFQ_MAX_WEIGHT)) {
@@ -1081,7 +1084,7 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
 		ret = 0;
 	}
 out:
-	blkg_conf_exit(&ctx);
+	blkg_conf_end(&ctx);
 	return ret ?: nbytes;
 }
 
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

No functional changes are intended, make code cleaner.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-iolatency.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index ce25fbb8aaf6..1199447a2a33 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -768,7 +768,7 @@ static int blk_iolatency_init(struct gendisk *disk)
 			 &blkcg_iolatency_ops);
 	if (ret)
 		goto err_free;
-	ret = blkcg_activate_policy(disk, &blkcg_policy_iolatency);
+	ret = __blkcg_activate_policy(disk, &blkcg_policy_iolatency);
 	if (ret)
 		goto err_qos_del;
 
@@ -837,10 +837,9 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	int ret;
 
 	blkg_conf_init(&ctx, buf);
-
-	ret = blkg_conf_open_bdev(&ctx);
+	ret = blkg_conf_start(blkcg, &ctx);
 	if (ret)
-		goto out;
+		return ret;
 
 	/*
 	 * blk_iolatency_init() may fail after rq_qos_add() succeeds which can
@@ -852,10 +851,6 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	if (ret)
 		goto out;
 
-	ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx);
-	if (ret)
-		goto out;
-
 	iolat = blkg_to_lat(ctx.blkg);
 	p = ctx.body;
 
@@ -890,7 +885,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 		iolatency_clear_scaling(blkg);
 	ret = 0;
 out:
-	blkg_conf_exit(&ctx);
+	blkg_conf_end(&ctx);
 	return ret ?: nbytes;
 }
 
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

No functional changes are intended, make code cleaner.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-iocost.c | 47 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 26 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 3593547930cc..de3862acb297 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2931,7 +2931,7 @@ static int blk_iocost_init(struct gendisk *disk)
 	if (ret)
 		goto err_free_ioc;
 
-	ret = blkcg_activate_policy(disk, &blkcg_policy_iocost);
+	ret = __blkcg_activate_policy(disk, &blkcg_policy_iocost);
 	if (ret)
 		goto err_del_qos;
 	return 0;
@@ -3140,12 +3140,15 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
 	}
 
 	blkg_conf_init(&ctx, buf);
-
-	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
+	ret = blkg_conf_start(blkcg, &ctx);
 	if (ret)
-		goto err;
+		return ret;
 
 	iocg = blkg_to_iocg(ctx.blkg);
+	if (!iocg) {
+		ret = -EOPNOTSUPP;
+		goto err;
+	}
 
 	if (!strncmp(ctx.body, "default", 7)) {
 		v = 0;
@@ -3162,13 +3165,13 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
 	weight_updated(iocg, &now);
 	spin_unlock(&iocg->ioc->lock);
 
-	blkg_conf_exit(&ctx);
+	blkg_conf_end(&ctx);
 	return nbytes;
 
 einval:
 	ret = -EINVAL;
 err:
-	blkg_conf_exit(&ctx);
+	blkg_conf_end(&ctx);
 	return ret;
 }
 
@@ -3226,22 +3229,19 @@ static const match_table_t qos_tokens = {
 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 			     size_t nbytes, loff_t off)
 {
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
 	struct blkg_conf_ctx ctx;
 	struct gendisk *disk;
 	struct ioc *ioc;
 	u32 qos[NR_QOS_PARAMS];
 	bool enable, user;
 	char *body, *p;
-	unsigned long memflags;
 	int ret;
 
 	blkg_conf_init(&ctx, input);
-
-	memflags = blkg_conf_open_bdev_frozen(&ctx);
-	if (IS_ERR_VALUE(memflags)) {
-		ret = memflags;
-		goto err;
-	}
+	ret = blkg_conf_start(blkcg, &ctx);
+	if (ret)
+		return ret;
 
 	body = ctx.body;
 	disk = ctx.bdev->bd_disk;
@@ -3358,14 +3358,14 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 
 	blk_mq_unquiesce_queue(disk->queue);
 
-	blkg_conf_exit_frozen(&ctx, memflags);
+	blkg_conf_end(&ctx);
 	return nbytes;
 einval:
 	spin_unlock_irq(&ioc->lock);
 	blk_mq_unquiesce_queue(disk->queue);
 	ret = -EINVAL;
 err:
-	blkg_conf_exit_frozen(&ctx, memflags);
+	blkg_conf_end(&ctx);
 	return ret;
 }
 
@@ -3418,9 +3418,9 @@ static const match_table_t i_lcoef_tokens = {
 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 				    size_t nbytes, loff_t off)
 {
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
 	struct blkg_conf_ctx ctx;
 	struct request_queue *q;
-	unsigned int memflags;
 	struct ioc *ioc;
 	u64 u[NR_I_LCOEFS];
 	bool user;
@@ -3428,10 +3428,9 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 	int ret;
 
 	blkg_conf_init(&ctx, input);
-
-	ret = blkg_conf_open_bdev(&ctx);
+	ret = blkg_conf_start(blkcg, &ctx);
 	if (ret)
-		goto err;
+		return ret;
 
 	body = ctx.body;
 	q = bdev_get_queue(ctx.bdev);
@@ -3448,10 +3447,9 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 		ioc = q_to_ioc(q);
 	}
 
-	memflags = blk_mq_freeze_queue(q);
 	blk_mq_quiesce_queue(q);
-
 	spin_lock_irq(&ioc->lock);
+
 	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
 	user = ioc->user_cost_model;
 
@@ -3500,20 +3498,17 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 	spin_unlock_irq(&ioc->lock);
 
 	blk_mq_unquiesce_queue(q);
-	blk_mq_unfreeze_queue(q, memflags);
 
-	blkg_conf_exit(&ctx);
+	blkg_conf_end(&ctx);
 	return nbytes;
 
 einval:
 	spin_unlock_irq(&ioc->lock);
-
 	blk_mq_unquiesce_queue(q);
-	blk_mq_unfreeze_queue(q, memflags);
 
 	ret = -EINVAL;
 err:
-	blkg_conf_exit(&ctx);
+	blkg_conf_end(&ctx);
 	return ret;
 }
 
-- 
2.51.0

From: Yu Kuai <yukuai3@huawei.com>

Remove followings helpers that is now unused:

- blkg_conf_open_bdev()
- blkg_conf_open_bdev_frozen()
- blkg_conf_prep()
- blkg_conf_exit()
- blkg_conf_exit_frozen()

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-cgroup.c | 224 +--------------------------------------------
 block/blk-cgroup.h |   6 --
 2 files changed, 1 insertion(+), 229 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4b7324c1d0d5..d93654334854 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -729,8 +729,7 @@ EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
  * @input: input string
  *
  * Initialize @ctx which can be used to parse blkg config input string @input.
- * Once initialized, @ctx can be used with blkg_conf_open_bdev() and
- * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit().
+ * Once initialized, @ctx can be used with blkg_conf_start().
  */
 void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input)
 {
@@ -738,92 +737,6 @@ void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input)
 }
 EXPORT_SYMBOL_GPL(blkg_conf_init);
 
-/**
- * blkg_conf_open_bdev - parse and open bdev for per-blkg config update
- * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
- *
- * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from
- * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is
- * set to point past the device node prefix.
- *
- * This function may be called multiple times on @ctx and the extra calls become
- * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function
- * explicitly if bdev access is needed without resolving the blkcg / policy part
- * of @ctx->input. Returns -errno on error.
- */
-int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
-{
-	char *input = ctx->input;
-	unsigned int major, minor;
-	struct block_device *bdev;
-	int key_len;
-
-	if (ctx->bdev)
-		return 0;
-
-	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
-		return -EINVAL;
-
-	input += key_len;
-	if (!isspace(*input))
-		return -EINVAL;
-	input = skip_spaces(input);
-
-	bdev = blkdev_get_no_open(MKDEV(major, minor), false);
-	if (!bdev)
-		return -ENODEV;
-	if (bdev_is_partition(bdev)) {
-		blkdev_put_no_open(bdev);
-		return -ENODEV;
-	}
-
-	mutex_lock(&bdev->bd_queue->rq_qos_mutex);
-	if (!disk_live(bdev->bd_disk)) {
-		blkdev_put_no_open(bdev);
-		mutex_unlock(&bdev->bd_queue->rq_qos_mutex);
-		return -ENODEV;
-	}
-
-	ctx->body = input;
-	ctx->bdev = bdev;
-	return 0;
-}
-/*
- * Similar to blkg_conf_open_bdev, but additionally freezes the queue,
- * acquires q->elevator_lock, and ensures the correct locking order
- * between q->elevator_lock and q->rq_qos_mutex.
- *
- * This function returns negative error on failure. On success it returns
- * memflags which must be saved and later passed to blkg_conf_exit_frozen
- * for restoring the memalloc scope.
- */
-unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
-{
-	int ret;
-	unsigned long memflags;
-
-	if (ctx->bdev)
-		return -EINVAL;
-
-	ret = blkg_conf_open_bdev(ctx);
-	if (ret < 0)
-		return ret;
-	/*
-	 * At this point, we haven’t started protecting anything related to QoS,
-	 * so we release q->rq_qos_mutex here, which was first acquired in blkg_
-	 * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing
-	 * the queue and acquiring q->elevator_lock to maintain the correct
-	 * locking order.
-	 */
-	mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
-
-	memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue);
-	mutex_lock(&ctx->bdev->bd_queue->elevator_lock);
-	mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex);
-
-	return memflags;
-}
-
 void blkg_conf_end(struct blkg_conf_ctx *ctx)
 {
 	struct request_queue *q = bdev_get_queue(ctx->bdev);
@@ -885,141 +798,6 @@ int blkg_conf_start(struct blkcg *blkcg, struct blkg_conf_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(blkg_conf_start);
 
-/**
- * blkg_conf_prep - parse and prepare for per-blkg config update
- * @blkcg: target block cgroup
- * @pol: target policy
- * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
- *
- * Parse per-blkg config update from @ctx->input and initialize @ctx
- * accordingly. On success, @ctx->body points to the part of @ctx->input
- * following MAJ:MIN, @ctx->bdev points to the target block device and
- * @ctx->blkg to the blkg being configured.
- *
- * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this
- * function returns with queue lock held and must be followed by
- * blkg_conf_exit().
- */
-int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-		   struct blkg_conf_ctx *ctx)
-	__acquires(&bdev->bd_queue->blkcg_mutex)
-{
-	struct gendisk *disk;
-	struct request_queue *q;
-	struct blkcg_gq *blkg;
-	int ret;
-
-	ret = blkg_conf_open_bdev(ctx);
-	if (ret)
-		return ret;
-
-	disk = ctx->bdev->bd_disk;
-	q = disk->queue;
-
-	/* Prevent concurrent with blkcg_deactivate_policy() */
-	mutex_lock(&q->blkcg_mutex);
-
-	if (!blkcg_policy_enabled(q, pol)) {
-		ret = -EOPNOTSUPP;
-		goto fail_unlock;
-	}
-
-	blkg = blkg_lookup(blkcg, q);
-	if (blkg)
-		goto success;
-
-	/*
-	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
-	 * non-root blkgs have access to their parents.
-	 */
-	while (true) {
-		struct blkcg *pos = blkcg;
-		struct blkcg *parent;
-
-		parent = blkcg_parent(blkcg);
-		while (parent && !blkg_lookup(parent, q)) {
-			pos = parent;
-			parent = blkcg_parent(parent);
-		}
-
-		if (!blkcg_policy_enabled(q, pol)) {
-			ret = -EOPNOTSUPP;
-			goto fail_unlock;
-		}
-
-		blkg = blkg_lookup(pos, q);
-		if (!blkg) {
-			blkg = blkg_create(pos, disk);
-			if (IS_ERR(blkg)) {
-				ret = PTR_ERR(blkg);
-				goto fail_unlock;
-			}
-		}
-
-		if (pos == blkcg)
-			goto success;
-	}
-success:
-	ctx->blkg = blkg;
-	return 0;
-
-fail_unlock:
-	mutex_unlock(&q->blkcg_mutex);
-	/*
-	 * If queue was bypassing, we should retry.  Do so after a
-	 * short msleep().  It isn't strictly necessary but queue
-	 * can be bypassing for some time and it's always nice to
-	 * avoid busy looping.
-	 */
-	if (ret == -EBUSY) {
-		msleep(10);
-		ret = restart_syscall();
-	}
-	return ret;
-}
-EXPORT_SYMBOL_GPL(blkg_conf_prep);
-
-/**
- * blkg_conf_exit - clean up per-blkg config update
- * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
- *
- * Clean up after per-blkg config update. This function must be called on all
- * blkg_conf_ctx's initialized with blkg_conf_init().
- */
-void blkg_conf_exit(struct blkg_conf_ctx *ctx)
-	__releases(&ctx->bdev->bd_queue->blkcg_mutex)
-	__releases(&ctx->bdev->bd_queue->rq_qos_mutex)
-{
-	if (ctx->blkg) {
-		mutex_unlock(&bdev_get_queue(ctx->bdev)->blkcg_mutex);
-		ctx->blkg = NULL;
-	}
-
-	if (ctx->bdev) {
-		mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
-		blkdev_put_no_open(ctx->bdev);
-		ctx->body = NULL;
-		ctx->bdev = NULL;
-	}
-}
-EXPORT_SYMBOL_GPL(blkg_conf_exit);
-
-/*
- * Similar to blkg_conf_exit, but also unfreezes the queue and releases
- * q->elevator_lock. Should be used when blkg_conf_open_bdev_frozen
- * is used to open the bdev.
- */
-void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags)
-{
-	if (ctx->bdev) {
-		struct request_queue *q = ctx->bdev->bd_queue;
-
-		blkg_conf_exit(ctx);
-		mutex_unlock(&q->elevator_lock);
-		blk_mq_unfreeze_queue(q, memflags);
-	}
-}
-
 static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
 {
 	int i;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index c3d16d52c275..aec801255821 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -221,12 +221,6 @@ struct blkg_conf_ctx {
 };
 
 void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
-int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
-unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx);
-int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-		   struct blkg_conf_ctx *ctx);
-void blkg_conf_exit(struct blkg_conf_ctx *ctx);
-void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags);
 void blkg_conf_end(struct blkg_conf_ctx *ctx);
 int blkg_conf_start(struct blkcg *blkcg, struct blkg_conf_ctx *ctx);
 
-- 
2.51.0