When a container exits, the following BUG_ON() is occasionally triggered: ================================================================== VFS: Busy inodes after unmount of sdb (ext4) ------------[ cut here ]------------ kernel BUG at fs/super.c:695! CPU: 3 PID: 6 Comm: containerd-shim Tainted: G OE K 6.6 #1 pstate: 63400009 (nZCv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) pc : generic_shutdown_super+0xf0/0x100 lr : generic_shutdown_super+0xf0/0x100 Call trace: generic_shutdown_super+0xf0/0x100 kill_block_super+0x20/0x48 ext4_kill_sb+0x28/0x60 deactivate_locked_super+0x54/0x130 deactivate_super+0x84/0xa0 cleanup_mnt+0xa4/0x140 __cleanup_mnt+0x18/0x28 task_work_run+0x78/0xe0 do_notify_resume+0x204/0x240 ================================================================== The root cause is a race between cgroup_writeback_umount() and inode_switch_wbs()/cleanup_offline_cgwb(). There is a window between inode_prepare_wbs_switch() returning true and the subsequent wb_queue_isw() call. Following is the process that triggers the issue: CPU A (umount) | CPU B (writeback) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ inode_switch_wbs/cleanup_offline_cgwb atomic_inc(&isw_nr_in_flight) inode_prepare_wbs_switch -> passes SB_ACTIVE check __iget(inode) generic_shutdown_super sb->s_flags &= ~SB_ACTIVE cgroup_writeback_umount(sb) smp_mb() atomic_read(&isw_nr_in_flight) rcu_barrier() -> no pending RCU callbacks flush_workqueue(isw_wq) -> nothing queued, returns evict_inodes(sb) -> Inode skipped as isw still holds a ref. sop->put_super(sb) /* destroys percpu counters */ -> VFS: Busy inodes after unmount! wb_queue_isw() queue_work(isw_wq, ...) /* later in work function */ inode_switch_wbs_work_fn process_inode_switch_wbs iput() -> evict percpu_counter_dec() // UAF! A simple approach to synchronize is to have cgroup_writeback_umount() wait for the isw_nr_in_flight count of the current superblock to drop to zero, since no new increments can occur after SB_ACTIVE is cleared. However, isw_nr_in_flight is global, so introduce a new per-sb counter s_isw_nr_in_flight for this purpose, while retaining the global counter for throttling. Fixes: a1a0e23e4903 ("writeback: flush inode cgroup wb switches instead of pinning super_block") Signed-off-by: Baokun Li --- fs/fs-writeback.c | 44 +++++++++++++++++++--------------- include/linux/fs/super_types.h | 3 +++ 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 71b7fd036e97..7924cf3484fa 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -556,8 +556,12 @@ static void process_inode_switch_wbs(struct bdi_writeback *new_wb, wb_put_many(old_wb, nr_switched); } - for (inodep = isw->inodes; *inodep; inodep++) + for (inodep = isw->inodes; *inodep; inodep++) { + struct super_block *sb = (*inodep)->i_sb; + iput(*inodep); + atomic_dec(&sb->s_isw_nr_in_flight); + } wb_put(new_wb); kfree(isw); atomic_dec(&isw_nr_in_flight); @@ -602,14 +606,15 @@ static bool inode_prepare_wbs_switch(struct inode *inode, { /* * Paired with smp_mb() in cgroup_writeback_umount(). - * isw_nr_in_flight must be increased before checking SB_ACTIVE and - * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0 - * in cgroup_writeback_umount() and the isw_wq will be not flushed. + * s_isw_nr_in_flight must be increased before checking SB_ACTIVE and + * grabbing an inode, otherwise s_isw_nr_in_flight can be observed as + * 0 in cgroup_writeback_umount() and the isw_wq will be not flushed. */ + atomic_inc(&inode->i_sb->s_isw_nr_in_flight); smp_mb(); if (IS_DAX(inode)) - return false; + goto out_dec; /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); @@ -617,13 +622,17 @@ static bool inode_prepare_wbs_switch(struct inode *inode, inode_state_read(inode) & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || inode_to_wb(inode) == new_wb) { spin_unlock(&inode->i_lock); - return false; + goto out_dec; } inode_state_set(inode, I_WB_SWITCH); __iget(inode); spin_unlock(&inode->i_lock); return true; + +out_dec: + atomic_dec(&inode->i_sb->s_isw_nr_in_flight); + return false; } static void wb_queue_isw(struct bdi_writeback *wb, @@ -1212,31 +1221,28 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, * @sb: target super_block * * This function is called when a super_block is about to be destroyed and - * flushes in-flight inode wb switches. An inode wb switch goes through - * RCU and then workqueue, so the two need to be flushed in order to ensure - * that all previously scheduled switches are finished. As wb switches are - * rare occurrences and synchronize_rcu() can take a while, perform - * flushing iff wb switches are in flight. + * waits for all in-flight inode wb switches on this sb to complete. Since + * SB_ACTIVE is cleared before this is called, no new switches can start, + * so the per-sb counter will monotonically decrease to zero. */ void cgroup_writeback_umount(struct super_block *sb) { - if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK)) return; /* * SB_ACTIVE should be reliably cleared before checking - * isw_nr_in_flight, see generic_shutdown_super(). + * s_isw_nr_in_flight, see generic_shutdown_super(). + * + * Paired with smp_mb() in inode_prepare_wbs_switch(), which ensures + * that either we see the incremented counter, or the switcher sees + * SB_ACTIVE cleared and aborts. */ smp_mb(); - if (atomic_read(&isw_nr_in_flight)) { - /* - * Use rcu_barrier() to wait for all pending callbacks to - * ensure that all in-flight wb switches are in the workqueue. - */ - rcu_barrier(); + while (atomic_read(&sb->s_isw_nr_in_flight)) { flush_workqueue(isw_wq); + cond_resched(); } } diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 383050e7fdf5..8030c873016d 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -274,6 +274,9 @@ struct super_block { /* number of fserrors that are being sent to fsnotify/filesystems */ refcount_t s_pending_errors; + + /* number of in-flight inode writeback switches on this sb */ + atomic_t s_isw_nr_in_flight; } __randomize_layout; /* -- 2.43.7