Calling iput() on EA inodes while holding xattr_sem or a jbd2 handle can trigger write_inode_now() -> ext4_writepages() -> s_writepages_rwsem, creating a lock ordering issue during mount (!SB_ACTIVE). Add ext4_put_ea_inode() which uses iput_if_not_last() as a fast path. If this is not the last reference, it is dropped immediately. If this is the last reference, the inode is linked onto a per-sb lock-free llist via i_ea_iput_node (embedded in ext4_inode_info, sharing space with the unused xattr_sem of EA inodes via a union) and a delayed worker (1 jiffie) performs the final iput() in a clean context. This avoids per-iput memory allocation. Convert the first call site: ext4_xattr_block_set()'s "Drop the previous xattr block" path, which previously called ext4_xattr_inode_array_free() under xattr_sem + jbd2 handle. The worker is drained in ext4_put_super() before quota shutdown using a loop to handle re-arming (evicting an EA inode may queue further EA inodes). Initialization is placed before journal loading since fast commit replay may trigger evictions that call ext4_put_ea_inode(). Signed-off-by: Yun Zhou Suggested-by: Jan Kara --- fs/ext4/ext4.h | 13 ++++++++- fs/ext4/super.c | 18 +++++++++++- fs/ext4/xattr.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/xattr.h | 14 ++++++++++ 4 files changed, 115 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b37c136ea3ab..b9b0ada7774b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1070,8 +1070,14 @@ struct ext4_inode_info { * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. + * + * EA inodes (EXT4_EA_INODE_FL) do not use xattr_sem; they reuse + * the space for deferred iput linkage. */ - struct rw_semaphore xattr_sem; + union { + struct rw_semaphore xattr_sem; + struct llist_node i_ea_iput_node; + }; /* * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise @@ -1770,6 +1776,11 @@ struct ext4_sb_info { struct ext4_es_stats s_es_stats; struct mb_cache *s_ea_block_cache; struct mb_cache *s_ea_inode_cache; + + /* Deferred iput for EA inodes to avoid lock ordering issues */ + struct llist_head s_ea_inode_to_free; + struct delayed_work s_ea_inode_work; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Journal triggers for checksum computation */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 245f67d10ded..97f0e7c1b254 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1303,6 +1303,8 @@ static void ext4_put_super(struct super_block *sb) &sb->s_uuid); ext4_unregister_li_request(sb); + /* Drain deferred EA inode iputs while quota is still active. */ + ext4_drain_ea_inode_work(sbi); ext4_quotas_off(sb, EXT4_MAXQUOTAS); destroy_workqueue(sbi->rsv_conversion_wq); @@ -1423,6 +1425,13 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); #endif ei->jinode = NULL; + /* + * Reinitialize xattr_sem every allocation because EA inodes + * share this space with i_ea_iput_node (via union) which may + * have overwritten the semaphore when the slab object was + * previously used as an EA inode. + */ + init_rwsem(&ei->xattr_sem); INIT_LIST_HEAD(&ei->i_rsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; @@ -1488,7 +1497,6 @@ static void init_once(void *foo) struct ext4_inode_info *ei = foo; INIT_LIST_HEAD(&ei->i_orphan); - init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); @@ -5508,6 +5516,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! */ + ext4_init_ea_inode_work(sbi); + if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { err = ext4_load_and_init_journal(sb, es, ctx); if (err) @@ -5747,6 +5757,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) return 0; failed_mount9: + /* Drain deferred EA inode iputs before quota shutdown */ + ext4_drain_ea_inode_work(sbi); ext4_quotas_off(sb, EXT4_MAXQUOTAS); failed_mount8: __maybe_unused ext4_release_orphan_info(sb); @@ -5767,6 +5779,8 @@ failed_mount8: __maybe_unused if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: + /* Drain deferred EA inode iputs before freeing structures */ + ext4_drain_ea_inode_work(sbi); ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; @@ -5777,6 +5791,8 @@ failed_mount8: __maybe_unused ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: + /* Drain deferred EA inode iputs from journal replay */ + ext4_drain_ea_inode_work(sbi); ext4_es_unregister_shrinker(sbi); failed_mount3: /* flush s_sb_upd_work before sbi destroy */ diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 982a1f831e22..ecdad5920b14 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -117,6 +117,8 @@ const struct xattr_handler * const ext4_xattr_handlers[] = { static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode); +static void ext4_xattr_inode_array_free_deferred(struct super_block *sb, + struct ext4_xattr_inode_array *array); #ifdef CONFIG_LOCKDEP void ext4_xattr_inode_set_class(struct inode *ea_inode) @@ -2187,7 +2189,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ext4_xattr_release_block(handle, inode, bs->bh, &ea_inode_array, 0 /* extra_credits */); - ext4_xattr_inode_array_free(ea_inode_array); + ext4_xattr_inode_array_free_deferred(inode->i_sb, + ea_inode_array); } error = 0; @@ -3025,6 +3028,74 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) kfree(ea_inode_array); } +static void ext4_xattr_inode_array_free_deferred(struct super_block *sb, + struct ext4_xattr_inode_array *array) +{ + int idx; + + if (array == NULL) + return; + + for (idx = 0; idx < array->count; ++idx) + ext4_put_ea_inode(sb, array->inodes[idx]); + kfree(array); +} + +/* + * Worker function for deferred EA inode iput. Processes all inodes queued + * on s_ea_inode_to_free in a context free of xattr_sem/jbd2 handle locks. + */ +static void ext4_ea_inode_work(struct work_struct *work) +{ + struct ext4_sb_info *sbi = container_of(to_delayed_work(work), + struct ext4_sb_info, + s_ea_inode_work); + struct llist_node *node = llist_del_all(&sbi->s_ea_inode_to_free); + struct llist_node *next; + + while (node) { + struct ext4_inode_info *ei = container_of(node, + struct ext4_inode_info, i_ea_iput_node); + next = node->next; + iput(&ei->vfs_inode); + node = next; + } +} + +/* + * Release a VFS reference on an EA inode. Must be used instead of iput() + * in any context where xattr_sem or a jbd2 handle is held. + * + * If this is not the last reference, drops it immediately via + * iput_if_not_last() with no further action needed. + * + * If this is the last reference, the inode is linked onto a per-sb + * llist via i_ea_iput_node (embedded in ext4_inode_info, sharing space + * with the unused xattr_sem) and a delayed worker performs the final + * iput() in a clean context. + */ +void ext4_put_ea_inode(struct super_block *sb, struct inode *inode) +{ + if (!inode) + return; + WARN_ON_ONCE(!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)); + if (iput_if_not_last(inode)) + return; + llist_add(&EXT4_I(inode)->i_ea_iput_node, + &EXT4_SB(sb)->s_ea_inode_to_free); + /* + * Use a short delay to allow multiple EA inodes to accumulate, + * reducing workqueue wakeups when several are released together. + */ + schedule_delayed_work(&EXT4_SB(sb)->s_ea_inode_work, 1); +} + +void ext4_init_ea_inode_work(struct ext4_sb_info *sbi) +{ + init_llist_head(&sbi->s_ea_inode_to_free); + INIT_DELAYED_WORK(&sbi->s_ea_inode_work, ext4_ea_inode_work); +} + /* * ext4_xattr_block_cache_insert() * diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 1fedf44d4fb6..9883ba5569a1 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -190,6 +190,20 @@ extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_inode_array **array, int extra_credits); extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); +extern void ext4_init_ea_inode_work(struct ext4_sb_info *sbi); +extern void ext4_put_ea_inode(struct super_block *sb, struct inode *inode); + +/* + * Drain all pending deferred EA inode iputs. Must be called before + * freeing resources that eviction depends on (quota, block allocator). + * Loops because worker iput may trigger eviction that re-queues. + */ +static inline void ext4_drain_ea_inode_work(struct ext4_sb_info *sbi) +{ + while (flush_delayed_work(&sbi->s_ea_inode_work) || + !llist_empty(&sbi->s_ea_inode_to_free)) + ; +} extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); -- 2.43.0