From: Zizhi Wo [BUG] Our fuzz testing triggered a blkcg use-after-free issue: BUG: KASAN: slab-use-after-free in _raw_spin_lock+0x75/0xe0 Call Trace: ... blkcg_deactivate_policy+0x244/0x4d0 ioc_rqos_exit+0x44/0xe0 rq_qos_exit+0xba/0x120 __del_gendisk+0x50b/0x800 del_gendisk+0xff/0x190 ... [CAUSE] process1 process2 cgroup_rmdir ... css_killed_work_fn offline_css ... blkcg_destroy_blkgs ... __blkg_release css_put(&blkg->blkcg->css) blkg_free INIT_WORK(xxx, blkg_free_workfn) schedule_work css_put ... blkcg_css_free kfree(blkcg)--------blkcg has been freed!!! ====================================schedule_work blkg_free_workfn __del_gendisk rq_qos_exit ioc_rqos_exit blkcg_deactivate_policy mutex_lock(&q->blkcg_mutex) spin_lock_irq(&q->queue_lock) list_for_each_entry(blkg, xxx) blkcg = blkg->blkcg spin_lock(&blkcg->lock)-------UAF!!! mutex_lock(&q->blkcg_mutex) spin_lock_irq(&q->queue_lock) /* Only then is the blkg removed from the list */ list_del_init(&blkg->q_node) As a result, a blkg can still be reachable through q->blkg_list while its ->blkcg has already been freed. [Fix] Fix this by deferring the blkcg css_put() until after the blkg has been unlinked from q->blkg_list in blkg_free_workfn(). This ensures that the blkcg outlives every blkg still reachable through q->blkg_list, so any iterator holding q->queue_lock is guaranteed to observe a valid blkg->blkcg. While at it, move css_tryget_online() from blkg_create() into blkg_alloc() so that the css reference is owned by the alloc/free pair rather than straddling layers: blkg_alloc() <-> blkg_free() blkg_create() <-> blkg_destroy() Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()") Suggested-by: Hou Tao Signed-off-by: Zizhi Wo --- v2: - Move css_tryget_online() from blkg_create() into blkg_alloc() so the css reference follows the blkg's own lifetime, making the put in blkg_free_workfn() symmetric with the get in blkg_alloc(). v1: https://lore.kernel.org/all/20260518010932.633707-1-wozizhi@huaweicloud.com/ block/blk-cgroup.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bc63bd220865..27414c291e49 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -132,10 +132,15 @@ static void blkg_free_workfn(struct work_struct *work) if (blkg->parent) blkg_put(blkg->parent); spin_lock_irq(&q->queue_lock); list_del_init(&blkg->q_node); spin_unlock_irq(&q->queue_lock); + /* + * Release blkcg css ref only after blkg is removed from q->blkg_list, + * so concurrent iterators won't see a blkg with a freed blkcg. + */ + css_put(&blkg->blkcg->css); mutex_unlock(&q->blkcg_mutex); blk_put_queue(q); free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); @@ -177,12 +182,10 @@ static void __blkg_release(struct rcu_head *rcu) * blkg_stat_lock is for serializing blkg stat update */ for_each_possible_cpu(cpu) __blkcg_rstat_flush(blkcg, cpu); - /* release the blkcg and parent blkg refs this blkg has been holding */ - css_put(&blkg->blkcg->css); blkg_free(blkg); } /* * A group is RCU protected, but having an rcu lock does not mean that one @@ -311,10 +314,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); if (!blkg->iostat_cpu) goto out_exit_refcnt; if (!blk_get_queue(disk->queue)) goto out_free_iostat; + /* blkg holds a reference to blkcg */ + if (!css_tryget_online(&blkcg->css)) + goto out_put_queue; blkg->q = disk->queue; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; blkg->iostat.blkg = blkg; @@ -351,10 +357,12 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, out_free_pds: while (--i >= 0) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + css_put(&blkcg->css); +out_put_queue: blk_put_queue(disk->queue); out_free_iostat: free_percpu(blkg->iostat_cpu); out_exit_refcnt: percpu_ref_exit(&blkg->refcnt); @@ -379,32 +387,26 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, if (blk_queue_dying(disk->queue)) { ret = -ENODEV; goto err_free_blkg; } - /* blkg holds a reference to blkcg */ - if (!css_tryget_online(&blkcg->css)) { - ret = -ENODEV; - goto err_free_blkg; - } - /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto err_put_css; + goto err_free_blkg; } } blkg = new_blkg; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; - goto err_put_css; + goto err_free_blkg; } blkg_get(blkg->parent); } /* invoke per-policy init */ @@ -440,12 +442,10 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); return ERR_PTR(ret); -err_put_css: - css_put(&blkcg->css); err_free_blkg: if (new_blkg) blkg_free(new_blkg); return ERR_PTR(ret); } -- 2.52.0