Add a helper that drops an inode reference only if the caller does not hold the last one. Returns true if the reference was dropped, false otherwise. This is useful for filesystems that need to release inode references in contexts where triggering final iput (and thus eviction) would be unsafe due to lock ordering constraints. The caller can check the return value and defer the final iput to a safe context. Unlike iput_not_last() which BUG_ON's if called with the last ref, this variant is designed to be called speculatively. Signed-off-by: Yun Zhou Suggested-by: Jan Kara --- include/linux/fs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 6da44573ce45..4916a9d54347 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2418,6 +2418,19 @@ static inline void super_set_sysfs_name_generic(struct super_block *sb, const ch extern void ihold(struct inode * inode); extern void iput(struct inode *); void iput_not_last(struct inode *); + +/** + * iput_if_not_last - drop an inode reference only if it is not the last one + * @inode: inode to put + * + * Returns true if the reference was dropped, false if this was the last + * reference and the caller must arrange for final iput() in a safe context. + */ +static inline bool iput_if_not_last(struct inode *inode) +{ + return atomic_add_unless(&inode->i_count, -1, 1); +} + int inode_update_time(struct inode *inode, enum fs_update_time type, unsigned int flags); int generic_update_time(struct inode *inode, enum fs_update_time type, -- 2.43.0 Calling iput() on EA inodes while holding xattr_sem or a jbd2 handle can trigger write_inode_now() -> ext4_writepages() -> s_writepages_rwsem, creating a lock ordering issue during mount (!SB_ACTIVE). Add ext4_put_ea_inode() which uses iput_if_not_last() as a fast path. If this is not the last reference, it is dropped immediately. If this is the last reference, the inode is linked onto a per-sb lock-free llist via i_ea_iput_node (embedded in ext4_inode_info, sharing space with the unused xattr_sem of EA inodes via a union) and a delayed worker (1 jiffie) performs the final iput() in a clean context. This avoids per-iput memory allocation. Convert the first call site: ext4_xattr_block_set()'s "Drop the previous xattr block" path, which previously called ext4_xattr_inode_array_free() under xattr_sem + jbd2 handle. The worker is drained in ext4_put_super() before quota shutdown using a loop to handle re-arming (evicting an EA inode may queue further EA inodes). Initialization is placed before journal loading since fast commit replay may trigger evictions that call ext4_put_ea_inode(). Signed-off-by: Yun Zhou Suggested-by: Jan Kara --- fs/ext4/ext4.h | 13 ++++++++- fs/ext4/super.c | 19 ++++++++++++- fs/ext4/xattr.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/xattr.h | 14 ++++++++++ 4 files changed, 116 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b37c136ea3ab..b9b0ada7774b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1070,8 +1070,14 @@ struct ext4_inode_info { * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. + * + * EA inodes (EXT4_EA_INODE_FL) do not use xattr_sem; they reuse + * the space for deferred iput linkage. */ - struct rw_semaphore xattr_sem; + union { + struct rw_semaphore xattr_sem; + struct llist_node i_ea_iput_node; + }; /* * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise @@ -1770,6 +1776,11 @@ struct ext4_sb_info { struct ext4_es_stats s_es_stats; struct mb_cache *s_ea_block_cache; struct mb_cache *s_ea_inode_cache; + + /* Deferred iput for EA inodes to avoid lock ordering issues */ + struct llist_head s_ea_inode_to_free; + struct delayed_work s_ea_inode_work; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Journal triggers for checksum computation */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 245f67d10ded..ed1d0cad2bc2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1303,6 +1303,8 @@ static void ext4_put_super(struct super_block *sb) &sb->s_uuid); ext4_unregister_li_request(sb); + /* Drain deferred EA inode iputs while quota is still active. */ + ext4_drain_ea_inode_work(sbi); ext4_quotas_off(sb, EXT4_MAXQUOTAS); destroy_workqueue(sbi->rsv_conversion_wq); @@ -1423,6 +1425,13 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); #endif ei->jinode = NULL; + /* + * Reinitialize xattr_sem every allocation because EA inodes + * share this space with i_ea_iput_node (via union) which may + * have overwritten the semaphore when the slab object was + * previously used as an EA inode. + */ + init_rwsem(&ei->xattr_sem); INIT_LIST_HEAD(&ei->i_rsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; @@ -1488,7 +1497,6 @@ static void init_once(void *foo) struct ext4_inode_info *ei = foo; INIT_LIST_HEAD(&ei->i_orphan); - init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); @@ -5497,6 +5505,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); + ext4_init_ea_inode_work(sbi); + if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) @@ -5508,6 +5518,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! */ + if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { err = ext4_load_and_init_journal(sb, es, ctx); if (err) @@ -5747,6 +5758,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) return 0; failed_mount9: + /* Drain deferred EA inode iputs before quota shutdown */ + ext4_drain_ea_inode_work(sbi); ext4_quotas_off(sb, EXT4_MAXQUOTAS); failed_mount8: __maybe_unused ext4_release_orphan_info(sb); @@ -5767,6 +5780,8 @@ failed_mount8: __maybe_unused if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: + /* Drain deferred EA inode iputs before freeing structures */ + ext4_drain_ea_inode_work(sbi); ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; @@ -5777,6 +5792,8 @@ failed_mount8: __maybe_unused ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: + /* Drain deferred EA inode iputs from journal replay */ + ext4_drain_ea_inode_work(sbi); ext4_es_unregister_shrinker(sbi); failed_mount3: /* flush s_sb_upd_work before sbi destroy */ diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 982a1f831e22..ecdad5920b14 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -117,6 +117,8 @@ const struct xattr_handler * const ext4_xattr_handlers[] = { static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode); +static void ext4_xattr_inode_array_free_deferred(struct super_block *sb, + struct ext4_xattr_inode_array *array); #ifdef CONFIG_LOCKDEP void ext4_xattr_inode_set_class(struct inode *ea_inode) @@ -2187,7 +2189,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ext4_xattr_release_block(handle, inode, bs->bh, &ea_inode_array, 0 /* extra_credits */); - ext4_xattr_inode_array_free(ea_inode_array); + ext4_xattr_inode_array_free_deferred(inode->i_sb, + ea_inode_array); } error = 0; @@ -3025,6 +3028,74 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) kfree(ea_inode_array); } +static void ext4_xattr_inode_array_free_deferred(struct super_block *sb, + struct ext4_xattr_inode_array *array) +{ + int idx; + + if (array == NULL) + return; + + for (idx = 0; idx < array->count; ++idx) + ext4_put_ea_inode(sb, array->inodes[idx]); + kfree(array); +} + +/* + * Worker function for deferred EA inode iput. Processes all inodes queued + * on s_ea_inode_to_free in a context free of xattr_sem/jbd2 handle locks. + */ +static void ext4_ea_inode_work(struct work_struct *work) +{ + struct ext4_sb_info *sbi = container_of(to_delayed_work(work), + struct ext4_sb_info, + s_ea_inode_work); + struct llist_node *node = llist_del_all(&sbi->s_ea_inode_to_free); + struct llist_node *next; + + while (node) { + struct ext4_inode_info *ei = container_of(node, + struct ext4_inode_info, i_ea_iput_node); + next = node->next; + iput(&ei->vfs_inode); + node = next; + } +} + +/* + * Release a VFS reference on an EA inode. Must be used instead of iput() + * in any context where xattr_sem or a jbd2 handle is held. + * + * If this is not the last reference, drops it immediately via + * iput_if_not_last() with no further action needed. + * + * If this is the last reference, the inode is linked onto a per-sb + * llist via i_ea_iput_node (embedded in ext4_inode_info, sharing space + * with the unused xattr_sem) and a delayed worker performs the final + * iput() in a clean context. + */ +void ext4_put_ea_inode(struct super_block *sb, struct inode *inode) +{ + if (!inode) + return; + WARN_ON_ONCE(!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)); + if (iput_if_not_last(inode)) + return; + llist_add(&EXT4_I(inode)->i_ea_iput_node, + &EXT4_SB(sb)->s_ea_inode_to_free); + /* + * Use a short delay to allow multiple EA inodes to accumulate, + * reducing workqueue wakeups when several are released together. + */ + schedule_delayed_work(&EXT4_SB(sb)->s_ea_inode_work, 1); +} + +void ext4_init_ea_inode_work(struct ext4_sb_info *sbi) +{ + init_llist_head(&sbi->s_ea_inode_to_free); + INIT_DELAYED_WORK(&sbi->s_ea_inode_work, ext4_ea_inode_work); +} + /* * ext4_xattr_block_cache_insert() * diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 1fedf44d4fb6..9883ba5569a1 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -190,6 +190,20 @@ extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_inode_array **array, int extra_credits); extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); +extern void ext4_init_ea_inode_work(struct ext4_sb_info *sbi); +extern void ext4_put_ea_inode(struct super_block *sb, struct inode *inode); + +/* + * Drain all pending deferred EA inode iputs. Must be called before + * freeing resources that eviction depends on (quota, block allocator). + * Loops because worker iput may trigger eviction that re-queues. + */ +static inline void ext4_drain_ea_inode_work(struct ext4_sb_info *sbi) +{ + while (flush_delayed_work(&sbi->s_ea_inode_work) || + !llist_empty(&sbi->s_ea_inode_to_free)) + ; +} extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); -- 2.43.0 Convert all iput() calls on EA inodes in xattr code paths to use ext4_put_ea_inode(). This establishes a uniform rule: every EA inode reference release in ext4 xattr code goes through ext4_put_ea_inode(), eliminating the need to analyze each call site individually for lock safety. Converted sites: - ext4_xattr_inode_get() read path - ext4_xattr_inode_inc_ref_all() main loop and cleanup path - ext4_xattr_inode_dec_ref_all() error paths - ext4_xattr_inode_create() error path - ext4_xattr_inode_cache_find() mismatch path - ext4_xattr_inode_lookup_create() out_err - ext4_xattr_set_entry() old_ea_inode - ext4_xattr_block_set() new block path, cleanup, and tmp_inode - ext4_xattr_ibody_set() error and success paths - ext4_xattr_delete_inode() quota loop For most of these, iput_if_not_last() will succeed (the EA inode has other references) making the overhead a single atomic operation. Signed-off-by: Yun Zhou --- fs/ext4/xattr.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index ecdad5920b14..90b693b78a45 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -569,7 +569,7 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, ea_inode->i_ino, true /* reusable */); } out: - iput(ea_inode); + ext4_put_ea_inode(inode->i_sb, ea_inode); return err; } @@ -1106,10 +1106,10 @@ static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, err = ext4_xattr_inode_inc_ref(handle, ea_inode); if (err) { ext4_warning_inode(ea_inode, "inc ref error %d", err); - iput(ea_inode); + ext4_put_ea_inode(parent->i_sb, ea_inode); goto cleanup; } - iput(ea_inode); + ext4_put_ea_inode(parent->i_sb, ea_inode); } return 0; @@ -1135,7 +1135,7 @@ static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, if (err) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); - iput(ea_inode); + ext4_put_ea_inode(parent->i_sb, ea_inode); } return saved_err; } @@ -1203,7 +1203,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, if (err) { ext4_warning_inode(ea_inode, "Expand inode array err=%d", err); - iput(ea_inode); + ext4_put_ea_inode(parent->i_sb, ea_inode); continue; } @@ -1507,7 +1507,7 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); - iput(ea_inode); + ext4_put_ea_inode(inode->i_sb, ea_inode); return ERR_PTR(err); } @@ -1566,7 +1566,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, kvfree(ea_data); return ea_inode; } - iput(ea_inode); + ext4_put_ea_inode(inode->i_sb, ea_inode); next_entry: ce = mb_cache_entry_find_next(ea_inode_cache, ce); } @@ -1617,7 +1617,7 @@ static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle, ea_inode->i_ino, true /* reusable */); return ea_inode; out_err: - iput(ea_inode); + ext4_put_ea_inode(inode->i_sb, ea_inode); ext4_xattr_inode_free_quota(inode, NULL, value_len); return ERR_PTR(err); } @@ -1850,7 +1850,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, ret = 0; out: - iput(old_ea_inode); + ext4_put_ea_inode(inode->i_sb, old_ea_inode); return ret; } @@ -2012,7 +2012,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, old_ea_inode_quota = le32_to_cpu( s->here->e_value_size); } - iput(tmp_inode); + ext4_put_ea_inode(inode->i_sb, tmp_inode); s->here->e_value_inum = 0; s->here->e_value_size = 0; @@ -2152,7 +2152,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ext4_warning_inode(ea_inode, "dec ref error=%d", error); - iput(ea_inode); + ext4_put_ea_inode(inode->i_sb, ea_inode); ea_inode = NULL; } @@ -2206,7 +2206,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); } - iput(ea_inode); + ext4_put_ea_inode(inode->i_sb, ea_inode); } if (ce) mb_cache_entry_put(ea_block_cache, ce); @@ -2288,7 +2288,7 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); - iput(ea_inode); + ext4_put_ea_inode(inode->i_sb, ea_inode); } return error; } @@ -2300,7 +2300,7 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, header->h_magic = cpu_to_le32(0); ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } - iput(ea_inode); + ext4_put_ea_inode(inode->i_sb, ea_inode); return 0; } @@ -2989,7 +2989,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, continue; ext4_xattr_inode_free_quota(inode, ea_inode, le32_to_cpu(entry->e_value_size)); - iput(ea_inode); + ext4_put_ea_inode(inode->i_sb, ea_inode); } } -- 2.43.0 Now that ext4_put_ea_inode() handles deferred iput safely for all cases (using iput_if_not_last + embedded llist_node), the ea_inode_array mechanism for batching deferred iputs is redundant. Remove: - ext4_expand_inode_array() and ext4_xattr_inode_array_free() - ext4_xattr_inode_array_free_deferred() - struct ext4_xattr_inode_array and EIA_INCR/EIA_MASK defines - ea_inode_array parameter from ext4_xattr_inode_dec_ref_all(), ext4_xattr_release_block(), and ext4_xattr_delete_inode() - ea_inode_array variable from ext4_evict_inode() Instead, ext4_xattr_inode_dec_ref_all() now calls ext4_put_ea_inode() directly after processing each EA inode. This simplifies the code by eliminating multi-layer parameter threading and removes the need for callers to manage array lifetime. Signed-off-by: Yun Zhou Suggested-by: Jan Kara --- fs/ext4/inode.c | 6 +--- fs/ext4/xattr.c | 95 +++---------------------------------------------- fs/ext4/xattr.h | 7 ---- 3 files changed, 6 insertions(+), 102 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0d131371ad3d..6f1b84e46a2e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -176,7 +176,6 @@ void ext4_evict_inode(struct inode *inode) * (xattr block freeing), bitmap, group descriptor (inode freeing) */ int extra_credits = 6; - struct ext4_xattr_inode_array *ea_inode_array = NULL; bool freeze_protected = false; trace_ext4_evict_inode(inode); @@ -282,8 +281,7 @@ void ext4_evict_inode(struct inode *inode) } /* Remove xattr references. */ - err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, - extra_credits); + err = ext4_xattr_delete_inode(handle, inode, extra_credits); if (err) { ext4_warning(inode->i_sb, "xattr delete (err %d)", err); stop_handle: @@ -291,7 +289,6 @@ void ext4_evict_inode(struct inode *inode) ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); - ext4_xattr_inode_array_free(ea_inode_array); goto no_delete; } @@ -321,7 +318,6 @@ void ext4_evict_inode(struct inode *inode) ext4_journal_stop(handle); if (freeze_protected) sb_end_intwrite(inode->i_sb); - ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: /* diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 90b693b78a45..7f334349bd4f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -114,12 +114,6 @@ const struct xattr_handler * const ext4_xattr_handlers[] = { #define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_inode_cache) -static int -ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, - struct inode *inode); -static void ext4_xattr_inode_array_free_deferred(struct super_block *sb, - struct ext4_xattr_inode_array *array); - #ifdef CONFIG_LOCKDEP void ext4_xattr_inode_set_class(struct inode *ea_inode) { @@ -1162,7 +1156,6 @@ static void ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, struct buffer_head *bh, struct ext4_xattr_entry *first, bool block_csum, - struct ext4_xattr_inode_array **ea_inode_array, int extra_credits, bool skip_quota) { struct inode *ea_inode; @@ -1199,14 +1192,6 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, if (err) continue; - err = ext4_expand_inode_array(ea_inode_array, ea_inode); - if (err) { - ext4_warning_inode(ea_inode, - "Expand inode array err=%d", err); - ext4_put_ea_inode(parent->i_sb, ea_inode); - continue; - } - err = ext4_journal_ensure_credits_fn(handle, credits, credits, ext4_free_metadata_revoke_credits(parent->i_sb, 1), ext4_xattr_restart_fn(handle, parent, bh, block_csum, @@ -1214,6 +1199,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, if (err < 0) { ext4_warning_inode(ea_inode, "Ensure credits err=%d", err); + ext4_put_ea_inode(parent->i_sb, ea_inode); continue; } if (err > 0) { @@ -1223,6 +1209,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, ext4_warning_inode(ea_inode, "Re-get write access err=%d", err); + ext4_put_ea_inode(parent->i_sb, ea_inode); continue; } } @@ -1231,6 +1218,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, if (err) { ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d", err); + ext4_put_ea_inode(parent->i_sb, ea_inode); continue; } @@ -1247,6 +1235,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, entry->e_value_inum = 0; entry->e_value_size = 0; + ext4_put_ea_inode(parent->i_sb, ea_inode); dirty = true; } @@ -1273,7 +1262,6 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh, - struct ext4_xattr_inode_array **ea_inode_array, int extra_credits) { struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); @@ -1315,7 +1303,6 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ext4_xattr_inode_dec_ref_all(handle, inode, bh, BFIRST(bh), true /* block_csum */, - ea_inode_array, extra_credits, true /* skip_quota */); ext4_free_blocks(handle, inode, bh, 0, 1, @@ -2184,13 +2171,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, /* Drop the previous xattr block. */ if (bs->bh && bs->bh != new_bh) { - struct ext4_xattr_inode_array *ea_inode_array = NULL; - ext4_xattr_release_block(handle, inode, bs->bh, - &ea_inode_array, 0 /* extra_credits */); - ext4_xattr_inode_array_free_deferred(inode->i_sb, - ea_inode_array); } error = 0; @@ -2866,46 +2848,6 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, return error; } -#define EIA_INCR 16 /* must be 2^n */ -#define EIA_MASK (EIA_INCR - 1) - -/* Add the large xattr @inode into @ea_inode_array for deferred iput(). - * If @ea_inode_array is new or full it will be grown and the old - * contents copied over. - */ -static int -ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, - struct inode *inode) -{ - if (*ea_inode_array == NULL) { - /* - * Start with 15 inodes, so it fits into a power-of-two size. - */ - (*ea_inode_array) = kmalloc_flex(**ea_inode_array, inodes, - EIA_MASK, GFP_NOFS); - if (*ea_inode_array == NULL) - return -ENOMEM; - (*ea_inode_array)->count = 0; - } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { - /* expand the array once all 15 + n * 16 slots are full */ - struct ext4_xattr_inode_array *new_array = NULL; - - new_array = kmalloc_flex(**ea_inode_array, inodes, - (*ea_inode_array)->count + EIA_INCR, - GFP_NOFS); - if (new_array == NULL) - return -ENOMEM; - memcpy(new_array, *ea_inode_array, - struct_size(*ea_inode_array, inodes, - (*ea_inode_array)->count)); - kfree(*ea_inode_array); - *ea_inode_array = new_array; - } - (*ea_inode_array)->count++; - (*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode; - return 0; -} - /* * ext4_xattr_delete_inode() * @@ -2916,7 +2858,6 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, * references on xattr block and xattr inodes. */ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, - struct ext4_xattr_inode_array **ea_inode_array, int extra_credits) { struct buffer_head *bh = NULL; @@ -2955,7 +2896,6 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh, IFIRST(header), false /* block_csum */, - ea_inode_array, extra_credits, false /* skip_quota */); } @@ -2994,7 +2934,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, } - ext4_xattr_release_block(handle, inode, bh, ea_inode_array, + ext4_xattr_release_block(handle, inode, bh, extra_credits); /* * Update i_file_acl value in the same transaction that releases @@ -3016,31 +2956,6 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, return error; } -void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) -{ - int idx; - - if (ea_inode_array == NULL) - return; - - for (idx = 0; idx < ea_inode_array->count; ++idx) - iput(ea_inode_array->inodes[idx]); - kfree(ea_inode_array); -} - -static void ext4_xattr_inode_array_free_deferred(struct super_block *sb, - struct ext4_xattr_inode_array *array) -{ - int idx; - - if (array == NULL) - return; - - for (idx = 0; idx < array->count; ++idx) - ext4_put_ea_inode(sb, array->inodes[idx]); - kfree(array); -} - /* * Worker function for deferred EA inode iput. Processes all inodes queued * on s_ea_inode_to_free in a context free of xattr_sem/jbd2 handle locks. diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 9883ba5569a1..8214a31fe001 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -131,11 +131,6 @@ struct ext4_xattr_ibody_find { struct ext4_iloc iloc; }; -struct ext4_xattr_inode_array { - unsigned int count; - struct inode *inodes[] __counted_by(count); -}; - extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_security_handler; @@ -187,9 +182,7 @@ extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, bool is_create); extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, - struct ext4_xattr_inode_array **array, int extra_credits); -extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); extern void ext4_init_ea_inode_work(struct ext4_sb_info *sbi); extern void ext4_put_ea_inode(struct super_block *sb, struct inode *inode); -- 2.43.0 On a corrupted filesystem, multiple xattr entries may reference the same EA inode. When ext4_xattr_inode_dec_ref_all() processes such entries, it can dec_ref the EA inode (setting nlink=0) and queue it for deferred iput. If the deferred worker runs before the loop processes the duplicate entry, the second iget() may block on I_FREEING while the worker's eviction waits for the caller's transaction to commit -- classic ABBA deadlock. Fix by tracking successfully processed EA inodes on a per-call llist (reusing i_ea_iput_node) and skipping any ea_ino already in the list. This covers both intra-block duplicates and cross ibody/block duplicates in ext4_xattr_delete_inode(). The actual ext4_put_ea_inode() is deferred until after the processing loop completes (ext4_put_ea_inode_llist), ensuring no EA inode is queued for eviction while the loop is still iterating. Signed-off-by: Yun Zhou --- fs/ext4/xattr.c | 68 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 7 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7f334349bd4f..5c929043e44a 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1152,11 +1152,41 @@ static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode, return 0; } +/* Check if an EA inode number is already in the processed llist. */ +static bool ext4_ea_ino_in_llist(unsigned int ea_ino, + struct llist_head *processed) +{ + struct ext4_inode_info *ei; + + llist_for_each_entry(ei, processed->first, i_ea_iput_node) { + if (ei->vfs_inode.i_ino == ea_ino) + return true; + } + return false; +} + +/* Put all EA inodes on a processed llist via ext4_put_ea_inode. */ +static void ext4_put_ea_inode_llist(struct super_block *sb, + struct llist_head *processed) +{ + struct llist_node *node = llist_del_all(processed); + struct llist_node *next; + + while (node) { + struct ext4_inode_info *ei = container_of(node, + struct ext4_inode_info, i_ea_iput_node); + next = node->next; + ext4_put_ea_inode(sb, &ei->vfs_inode); + node = next; + } +} + static void ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, struct buffer_head *bh, struct ext4_xattr_entry *first, bool block_csum, - int extra_credits, bool skip_quota) + int extra_credits, bool skip_quota, + struct llist_head *processed) { struct inode *ea_inode; struct ext4_xattr_entry *entry; @@ -1186,6 +1216,11 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); + + /* Skip if already processed (duplicate on corrupted fs) */ + if (ext4_ea_ino_in_llist(ea_ino, processed)) + continue; + err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); @@ -1235,7 +1270,12 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, entry->e_value_inum = 0; entry->e_value_size = 0; - ext4_put_ea_inode(parent->i_sb, ea_inode); + /* + * Collect processed EA inodes for dedup and deferred iput. + * ext4_put_ea_inode_llist() handles the actual release + * after the loop, preventing iget deadlocks on duplicates. + */ + llist_add(&EXT4_I(ea_inode)->i_ea_iput_node, processed); dirty = true; } @@ -1262,7 +1302,8 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh, - int extra_credits) + int extra_credits, + struct llist_head *processed) { struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); u32 hash, ref; @@ -1304,7 +1345,8 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, BFIRST(bh), true /* block_csum */, extra_credits, - true /* skip_quota */); + true /* skip_quota */, + processed); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -2171,8 +2213,12 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, /* Drop the previous xattr block. */ if (bs->bh && bs->bh != new_bh) { + LLIST_HEAD(processed); + ext4_xattr_release_block(handle, inode, bs->bh, - 0 /* extra_credits */); + 0 /* extra_credits */, + &processed); + ext4_put_ea_inode_llist(inode->i_sb, &processed); } error = 0; @@ -2866,6 +2912,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_entry *entry; struct inode *ea_inode; int error; + LLIST_HEAD(processed); error = ext4_journal_ensure_credits(handle, extra_credits, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); @@ -2897,7 +2944,8 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, IFIRST(header), false /* block_csum */, extra_credits, - false /* skip_quota */); + false /* skip_quota */, + &processed); } if (EXT4_I(inode)->i_file_acl) { @@ -2921,6 +2969,11 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; + /* Skip EA inodes already dec_ref'd from ibody */ + if (ext4_ea_ino_in_llist( + le32_to_cpu(entry->e_value_inum), + &processed)) + continue; error = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), le32_to_cpu(entry->e_hash), @@ -2935,7 +2988,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, } ext4_xattr_release_block(handle, inode, bh, - extra_credits); + extra_credits, &processed); /* * Update i_file_acl value in the same transaction that releases * block. @@ -2951,6 +3004,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, } error = 0; cleanup: + ext4_put_ea_inode_llist(inode->i_sb, &processed); brelse(iloc.bh); brelse(bh); return error; -- 2.43.0