jbd2_inode fields are updated under journal->j_list_lock, but some paths read them without holding the lock (e.g. fast commit helpers and ordered truncate helpers). READ_ONCE() alone is not sufficient for i_dirty_start/end as they are loff_t and 32-bit platforms can observe torn loads. Store the dirty range in PAGE_SIZE units as pgoff_t so lockless readers can take non-torn snapshots. Use READ_ONCE() on the read side and WRITE_ONCE() on the write side for the dirty range and i_flags to match the existing lockless access pattern. Suggested-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Li Chen --- Changes since v1: - Store i_dirty_start/end in PAGE_SIZE units (pgoff_t) to avoid torn loads on 32-bit (pointed out by Matthew, suggested by Jan). - Use WRITE_ONCE() for i_dirty_* / i_flags updates (per Jan). - Drop pointless READ_ONCE() on i_vfs_inode in jbd2_wait_inode_data (per Jan). fs/jbd2/commit.c | 65 ++++++++++++++++++++++++++++++++++--------- fs/jbd2/journal.c | 3 +- fs/jbd2/transaction.c | 20 ++++++++----- include/linux/jbd2.h | 17 +++++++---- 4 files changed, 78 insertions(+), 27 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7203d2d2624d..d98f4dbde695 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -180,7 +180,13 @@ static int journal_wait_on_commit_record(journal_t *journal, /* Send all the data buffers related to an inode */ int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) { - if (!jinode || !(jinode->i_flags & JI_WRITE_DATA)) + unsigned long flags; + + if (!jinode) + return 0; + + flags = READ_ONCE(jinode->i_flags); + if (!(flags & JI_WRITE_DATA)) return 0; trace_jbd2_submit_inode_data(jinode->i_vfs_inode); @@ -191,12 +197,35 @@ EXPORT_SYMBOL(jbd2_submit_inode_data); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode) { - if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) || - !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping) + struct address_space *mapping; + struct inode *inode; + unsigned long flags; + pgoff_t start, end; + loff_t start_byte, end_byte; + + if (!jinode) + return 0; + + flags = READ_ONCE(jinode->i_flags); + if (!(flags & JI_WAIT_DATA)) + return 0; + + inode = jinode->i_vfs_inode; + if (!inode) + return 0; + + mapping = inode->i_mapping; + start = READ_ONCE(jinode->i_dirty_start); + end = READ_ONCE(jinode->i_dirty_end); + if (end == JBD2_INODE_DIRTY_RANGE_NONE) + return 0; + start_byte = (loff_t)start << PAGE_SHIFT; + end_byte = ((loff_t)end << PAGE_SHIFT) + PAGE_SIZE - 1; + + if (!mapping) return 0; return filemap_fdatawait_range_keep_errors( - jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start, - jinode->i_dirty_end); + mapping, start_byte, end_byte); } EXPORT_SYMBOL(jbd2_wait_inode_data); @@ -218,7 +247,8 @@ static int journal_submit_data_buffers(journal_t *journal, list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { if (!(jinode->i_flags & JI_WRITE_DATA)) continue; - jinode->i_flags |= JI_COMMIT_RUNNING; + WRITE_ONCE(jinode->i_flags, + jinode->i_flags | JI_COMMIT_RUNNING); spin_unlock(&journal->j_list_lock); /* submit the inode data buffers. */ trace_jbd2_submit_inode_data(jinode->i_vfs_inode); @@ -229,7 +259,8 @@ static int journal_submit_data_buffers(journal_t *journal, } spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); - jinode->i_flags &= ~JI_COMMIT_RUNNING; + WRITE_ONCE(jinode->i_flags, + jinode->i_flags & ~JI_COMMIT_RUNNING); smp_mb(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } @@ -240,10 +271,17 @@ static int journal_submit_data_buffers(journal_t *journal, int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + pgoff_t start = READ_ONCE(jinode->i_dirty_start); + pgoff_t end = READ_ONCE(jinode->i_dirty_end); + loff_t start_byte, end_byte; + + if (end == JBD2_INODE_DIRTY_RANGE_NONE) + return 0; + start_byte = (loff_t)start << PAGE_SHIFT; + end_byte = ((loff_t)end << PAGE_SHIFT) + PAGE_SIZE - 1; return filemap_fdatawait_range_keep_errors(mapping, - jinode->i_dirty_start, - jinode->i_dirty_end); + start_byte, end_byte); } /* @@ -262,7 +300,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { if (!(jinode->i_flags & JI_WAIT_DATA)) continue; - jinode->i_flags |= JI_COMMIT_RUNNING; + WRITE_ONCE(jinode->i_flags, jinode->i_flags | JI_COMMIT_RUNNING); spin_unlock(&journal->j_list_lock); /* wait for the inode data buffers writeout. */ if (journal->j_finish_inode_data_buffers) { @@ -272,7 +310,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, } cond_resched(); spin_lock(&journal->j_list_lock); - jinode->i_flags &= ~JI_COMMIT_RUNNING; + WRITE_ONCE(jinode->i_flags, jinode->i_flags & ~JI_COMMIT_RUNNING); smp_mb(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } @@ -288,8 +326,9 @@ static int journal_finish_inode_data_buffers(journal_t *journal, &jinode->i_transaction->t_inode_list); } else { jinode->i_transaction = NULL; - jinode->i_dirty_start = 0; - jinode->i_dirty_end = 0; + WRITE_ONCE(jinode->i_dirty_start, 0); + WRITE_ONCE(jinode->i_dirty_end, + JBD2_INODE_DIRTY_RANGE_NONE); } } spin_unlock(&journal->j_list_lock); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c973162d5b31..9a7477c54dcb 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -3021,7 +3021,7 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) jinode->i_vfs_inode = inode; jinode->i_flags = 0; jinode->i_dirty_start = 0; - jinode->i_dirty_end = 0; + jinode->i_dirty_end = JBD2_INODE_DIRTY_RANGE_NONE; INIT_LIST_HEAD(&jinode->i_list); } @@ -3178,4 +3178,3 @@ MODULE_DESCRIPTION("Generic filesystem journal-writing module"); MODULE_LICENSE("GPL"); module_init(journal_init); module_exit(journal_exit); - diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index dca4b5d8aaaa..bbe47be6c73c 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2646,6 +2646,7 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, { transaction_t *transaction = handle->h_transaction; journal_t *journal; + pgoff_t start, end; if (is_handle_aborted(handle)) return -EROFS; @@ -2654,15 +2655,20 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, transaction->t_tid); + start = (pgoff_t)(start_byte >> PAGE_SHIFT); + end = (pgoff_t)(end_byte >> PAGE_SHIFT); + spin_lock(&journal->j_list_lock); - jinode->i_flags |= flags; + WRITE_ONCE(jinode->i_flags, jinode->i_flags | flags); - if (jinode->i_dirty_end) { - jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte); - jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte); + if (jinode->i_dirty_end != JBD2_INODE_DIRTY_RANGE_NONE) { + WRITE_ONCE(jinode->i_dirty_start, + min(jinode->i_dirty_start, start)); + WRITE_ONCE(jinode->i_dirty_end, + max(jinode->i_dirty_end, end)); } else { - jinode->i_dirty_start = start_byte; - jinode->i_dirty_end = end_byte; + WRITE_ONCE(jinode->i_dirty_start, start); + WRITE_ONCE(jinode->i_dirty_end, end); } /* Is inode already attached where we need it? */ @@ -2739,7 +2745,7 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal, int ret = 0; /* This is a quick check to avoid locking if not necessary */ - if (!jinode->i_transaction) + if (!READ_ONCE(jinode->i_transaction)) goto out; /* Locks are here just to force reading of recent values, it is * enough that the transaction was not committing before we started diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a53a00d36228..81eb58ddc126 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -390,6 +390,8 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) /* Wait for outstanding data writes for this inode before commit */ #define JI_WAIT_DATA (1 << __JI_WAIT_DATA) +#define JBD2_INODE_DIRTY_RANGE_NONE ((pgoff_t)-1) + /** * struct jbd2_inode - The jbd_inode type is the structure linking inodes in * ordered mode present in a transaction so that we can sync them during commit. @@ -431,18 +433,23 @@ struct jbd2_inode { /** * @i_dirty_start: * - * Offset in bytes where the dirty range for this inode starts. + * Dirty range start in PAGE_SIZE units. + * + * The dirty range is empty if @i_dirty_end is set to + * %JBD2_INODE_DIRTY_RANGE_NONE. + * * [j_list_lock] */ - loff_t i_dirty_start; + pgoff_t i_dirty_start; /** * @i_dirty_end: * - * Inclusive offset in bytes where the dirty range for this inode - * ends. [j_list_lock] + * Dirty range end in PAGE_SIZE units (inclusive). + * + * [j_list_lock] */ - loff_t i_dirty_end; + pgoff_t i_dirty_end; }; struct jbd2_revoke_table_s; -- 2.52.0 ext4 journal commit callbacks access jbd2_inode fields such as i_transaction and i_dirty_start/end without holding journal->j_list_lock. Use READ_ONCE() for these reads to match the lockless access pattern. Convert the dirty range from PAGE_SIZE units back to byte offsets before passing it to writeback. Suggested-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Li Chen --- Changes since v1: - Convert the jinode dirty range from PAGE_SIZE units (pgoff_t) back to byte offsets before passing it to writeback. fs/ext4/inode.c | 12 ++++++++++-- fs/ext4/super.c | 19 ++++++++++++++----- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d99296d7315f..5ec60580a2d0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3033,11 +3033,19 @@ static int ext4_writepages(struct address_space *mapping, int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) { + pgoff_t dirty_start = READ_ONCE(jinode->i_dirty_start); + pgoff_t dirty_end = READ_ONCE(jinode->i_dirty_end); + loff_t range_start, range_end; + + if (dirty_end == JBD2_INODE_DIRTY_RANGE_NONE) + return 0; + range_start = (loff_t)dirty_start << PAGE_SHIFT; + range_end = ((loff_t)dirty_end << PAGE_SHIFT) + PAGE_SIZE - 1; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, + .range_start = range_start, + .range_end = range_end, }; struct mpage_da_data mpd = { .inode = jinode->i_vfs_inode, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 69eb63dde983..27062c8ad60a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -521,6 +521,7 @@ static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, { struct buffer_head *bh, *head; struct journal_head *jh; + transaction_t *trans = READ_ONCE(jinode->i_transaction); bh = head = folio_buffers(folio); do { @@ -539,7 +540,7 @@ static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, */ jh = bh2jh(bh); if (buffer_dirty(bh) || - (jh && (jh->b_transaction != jinode->i_transaction || + (jh && (jh->b_transaction != trans || jh->b_next_transaction))) return true; } while ((bh = bh->b_this_page) != head); @@ -550,12 +551,20 @@ static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + pgoff_t dirty_start = READ_ONCE(jinode->i_dirty_start); + pgoff_t dirty_end = READ_ONCE(jinode->i_dirty_end); + loff_t range_start, range_end; + + if (dirty_end == JBD2_INODE_DIRTY_RANGE_NONE) + return 0; + range_start = (loff_t)dirty_start << PAGE_SHIFT; + range_end = ((loff_t)dirty_end << PAGE_SHIFT) + PAGE_SIZE - 1; struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, + .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, - }; + .range_start = range_start, + .range_end = range_end, + }; struct folio *folio = NULL; int error; -- 2.52.0 ocfs2 journal commit callback reads jbd2_inode dirty range fields without holding journal->j_list_lock. Use READ_ONCE() for these reads to match the lockless access pattern. Convert the dirty range from PAGE_SIZE units back to byte offsets before passing it to writeback. Suggested-by: Jan Kara Signed-off-by: Li Chen --- Changes since v1: - Convert the jinode dirty range from PAGE_SIZE units (pgoff_t) back to byte offsets before passing it to writeback. fs/ocfs2/journal.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 85239807dec7..0b40383ebcdf 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -902,8 +902,17 @@ int ocfs2_journal_alloc(struct ocfs2_super *osb) static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { - return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, - jinode->i_dirty_start, jinode->i_dirty_end); + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + pgoff_t dirty_start = READ_ONCE(jinode->i_dirty_start); + pgoff_t dirty_end = READ_ONCE(jinode->i_dirty_end); + loff_t start_byte, end_byte; + + if (dirty_end == JBD2_INODE_DIRTY_RANGE_NONE) + return 0; + start_byte = (loff_t)dirty_start << PAGE_SHIFT; + end_byte = ((loff_t)dirty_end << PAGE_SHIFT) + PAGE_SIZE - 1; + + return filemap_fdatawrite_range(mapping, start_byte, end_byte); } int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) -- 2.52.0