Calling iput() on EA inodes while holding xattr_sem or a jbd2 handle can trigger write_inode_now() -> ext4_writepages() -> s_writepages_rwsem, creating a lock ordering issue during mount (!SB_ACTIVE). Add ext4_put_ea_inode(), which safely releases EA inode references: when SB_ACTIVE is set, it calls iput() directly (write_inode_now() cannot be triggered); during mount (!SB_ACTIVE), it queues the inode on a per-sb lock-free llist and schedules a delayed worker (after 1 jiffy) to call iput() in a clean context without holding any ext4 locks. The delay allows multiple inodes to accumulate before the worker runs, reducing context switches. Convert the "Drop the previous xattr block" path in ext4_xattr_block_set() to use ext4_xattr_inode_array_free_deferred(), which releases EA inodes via ext4_put_ea_inode(). This path previously called ext4_xattr_inode_array_free() (synchronous iput()) while holding xattr_sem and a jbd2 handle. The worker is flushed in ext4_put_super() before quota shutdown to ensure all pending EA inode cleanup completes while quota accounting is still active; it is also flushed on the failed_mount_wq error path in __ext4_fill_super().
Signed-off-by: Yun Zhou --- fs/ext4/ext4.h | 5 ++++ fs/ext4/super.c | 6 ++++ fs/ext4/xattr.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/xattr.h | 2 ++ 4 files changed, 86 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 94283a991e5c..e31d60f82a63 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1706,6 +1706,11 @@ struct ext4_sb_info { struct ext4_es_stats s_es_stats; struct mb_cache *s_ea_block_cache; struct mb_cache *s_ea_inode_cache; + + /* Deferred iput for EA inodes to avoid lock ordering issues */ + struct llist_head s_ea_inode_to_free; + struct delayed_work s_ea_inode_work; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Journal triggers for checksum computation */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6a77db4d3124..5dd7c29a70bc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1303,6 +1303,8 @@ static void ext4_put_super(struct super_block *sb) &sb->s_uuid); ext4_unregister_li_request(sb); + /* Flush deferred EA inode iputs while quota is still active */ + flush_delayed_work(&sbi->s_ea_inode_work); ext4_quotas_off(sb, EXT4_MAXQUOTAS); destroy_workqueue(sbi->rsv_conversion_wq); @@ -5535,6 +5537,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) needs_recovery = 0; } + init_llist_head(&sbi->s_ea_inode_to_free); + INIT_DELAYED_WORK(&sbi->s_ea_inode_work, ext4_ea_inode_work); + if (!test_opt(sb, NO_MBCACHE)) { sbi->s_ea_block_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_block_cache) { @@ -5763,6 +5768,7 @@ failed_mount8: __maybe_unused if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: + flush_delayed_work(&sbi->s_ea_inode_work); ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 982a1f831e22..79de182e22e6 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -117,6 +117,8 @@ const struct xattr_handler * const 
ext4_xattr_handlers[] = { static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode); +static void ext4_xattr_inode_array_free_deferred(struct super_block *sb, + struct ext4_xattr_inode_array *array); #ifdef CONFIG_LOCKDEP void ext4_xattr_inode_set_class(struct inode *ea_inode) @@ -2187,7 +2189,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ext4_xattr_release_block(handle, inode, bs->bh, &ea_inode_array, 0 /* extra_credits */); - ext4_xattr_inode_array_free(ea_inode_array); + ext4_xattr_inode_array_free_deferred(inode->i_sb, + ea_inode_array); } error = 0; @@ -3025,6 +3028,75 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) kfree(ea_inode_array); } +static void ext4_xattr_inode_array_free_deferred(struct super_block *sb, + struct ext4_xattr_inode_array *array) +{ + int idx; + + if (array == NULL) + return; + + for (idx = 0; idx < array->count; ++idx) + ext4_put_ea_inode(sb, array->inodes[idx]); + kfree(array); +} + +struct ext4_ea_iput_entry { + struct llist_node node; + struct inode *inode; +}; + +/* + * Worker function for deferred EA inode iput. Processes all inodes queued + * on s_ea_inode_to_free in a context free of xattr_sem/jbd2 handle locks. + */ +void ext4_ea_inode_work(struct work_struct *work) +{ + struct ext4_sb_info *sbi = container_of(to_delayed_work(work), + struct ext4_sb_info, + s_ea_inode_work); + struct llist_node *node = llist_del_all(&sbi->s_ea_inode_to_free); + struct llist_node *next; + + while (node) { + struct ext4_ea_iput_entry *entry = container_of(node, + struct ext4_ea_iput_entry, node); + next = node->next; + iput(entry->inode); + kfree(entry); + node = next; + } +} + +/* + * Release a VFS reference on an EA inode after ext4_xattr_inode_dec_ref() + * may have set i_nlink=0. Must be used instead of iput() in any context + * where xattr_sem or a jbd2 handle is held, because eviction of a nlink=0 + * inode can acquire those same locks. 
+ * + * When SB_ACTIVE, eviction does not call write_inode_now() so direct + * iput() is safe. During mount (!SB_ACTIVE), defer to a workqueue. + * + * For EA inode references dropped without a preceding dec_ref (e.g., + * lookup-only paths where nlink remains >= 1), plain iput() is safe + * and preferred. + */ +void ext4_put_ea_inode(struct super_block *sb, struct inode *inode) +{ + struct ext4_ea_iput_entry *entry; + + if (!inode) + return; + if (sb->s_flags & SB_ACTIVE) { + iput(inode); + return; + } + entry = kmalloc(sizeof(*entry), GFP_NOFS | __GFP_NOFAIL); + entry->inode = inode; + llist_add(&entry->node, &EXT4_SB(sb)->s_ea_inode_to_free); + schedule_delayed_work(&EXT4_SB(sb)->s_ea_inode_work, 1); +} + /* * ext4_xattr_block_cache_insert() * diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 1fedf44d4fb6..52074537dce5 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -190,6 +190,8 @@ extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_inode_array **array, int extra_credits); extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); +extern void ext4_ea_inode_work(struct work_struct *work); +extern void ext4_put_ea_inode(struct super_block *sb, struct inode *inode); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); -- 2.43.0