When iomap uses large folios, per-block uptodate tracking is managed
via iomap_folio_state (ifs). A race condition can cause the ifs
uptodate bits to become inconsistent with the folio's uptodate flag.
The race occurs because folio_end_read() uses XOR semantics to
atomically set the uptodate bit and clear the locked bit:

  Thread A (read completion):          Thread B (concurrent write):
  --------------------------------     --------------------------------
  iomap_finish_folio_read()
    spin_lock(state_lock)
    ifs_set_range_uptodate() -> true
    spin_unlock(state_lock)
                                       iomap_set_range_uptodate()
                                         spin_lock(state_lock)
                                         ifs_set_range_uptodate() -> true
                                         spin_unlock(state_lock)
                                         folio_mark_uptodate(folio)
  folio_end_read(folio, true)
    folio_xor_flags()   // XOR CLEARS uptodate!

Result: folio is NOT uptodate, but ifs says all blocks ARE uptodate.

Fix this by checking read_bytes_pending in iomap_set_range_uptodate()
under the state_lock. If a read is in progress, skip calling
folio_mark_uptodate() - the read completion path will handle it via
folio_end_read().

The warning in ifs_free() was triggered during unmount of a FUSE-based
filesystem (e.g. NTFS-3G) while the LTP writev03 test was run:

  WARNING: fs/iomap/buffered-io.c at ifs_free
  Call trace:
   ifs_free
   iomap_invalidate_folio
   truncate_cleanup_folio
   truncate_inode_pages_range
   truncate_inode_pages_final
   fuse_evict_inode
   ...
   fuse_kill_sb_blk

Fixes: 7a4847e54cc1 ("iomap: use folio_end_read()")
Assisted-by: claude-opus-4-5-20251101
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
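Reviewer note (not part of the commit message): the XOR hazard is easy
to see in isolation. Below is a minimal userspace model of the
interleaving; the flag constants and helper names are simplified
stand-ins for the real page-flags code, not the kernel implementation.

  /* cc -o xor-race xor-race.c && ./xor-race */
  #include <assert.h>
  #include <stdio.h>

  #define PG_locked   (1u << 0)
  #define PG_uptodate (1u << 1)

  /* Folio is locked while the read is in flight. */
  static unsigned int folio_flags = PG_locked;

  /* Models folio_mark_uptodate(): a plain set of the uptodate bit. */
  static void mark_uptodate(void)
  {
          folio_flags |= PG_uptodate;
  }

  /*
   * Models folio_end_read(folio, true): a single atomic XOR that
   * assumes uptodate is still clear, so it can set uptodate and
   * clear locked in one operation.
   */
  static void end_read_uptodate(void)
  {
          folio_flags ^= (PG_locked | PG_uptodate);
  }

  int main(void)
  {
          /* Thread B (concurrent write) marks the folio uptodate... */
          mark_uptodate();

          /* ...then Thread A's read completion XORs the same bit. */
          end_read_uptodate();

          printf("locked=%u uptodate=%u\n",
                 !!(folio_flags & PG_locked),
                 !!(folio_flags & PG_uptodate));

          /* The XOR flipped the bit back off: folio is NOT uptodate. */
          assert(!(folio_flags & PG_uptodate));
          return 0;
  }

With the fix, iomap_set_range_uptodate() leaves the bit alone whenever
read_bytes_pending is non-zero, so the XOR in folio_end_read() is the
only place that sets it.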
 fs/fuse/dev.c          |  3 +-
 fs/fuse/file.c         |  6 ++--
 fs/iomap/buffered-io.c | 65 +++++++++++++++++++++++++++++++++++++++---
 include/linux/iomap.h  |  2 ++
 4 files changed, 68 insertions(+), 8 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6d59cbc877c6..50e84e913589 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -11,6 +11,7 @@
 #include "fuse_dev_i.h"
 
 #include <linux/init.h>
+#include <linux/iomap.h>
 #include <linux/module.h>
 #include <linux/poll.h>
 #include <linux/sched/signal.h>
@@ -1820,7 +1821,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
 	if (!folio_test_uptodate(folio) && !err && offset == 0 &&
 	    (nr_bytes == folio_size(folio) || file_size == end)) {
 		folio_zero_segment(folio, nr_bytes, folio_size(folio));
-		folio_mark_uptodate(folio);
+		iomap_set_range_uptodate(folio, 0, folio_size(folio));
 	}
 	folio_unlock(folio);
 	folio_put(folio);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 01bc894e9c2b..3abe38416199 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1216,13 +1216,13 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
 		struct folio *folio = ap->folios[i];
 
 		if (err) {
-			folio_clear_uptodate(folio);
+			iomap_clear_folio_uptodate(folio);
 		} else {
 			if (count >= folio_size(folio) - offset)
 				count -= folio_size(folio) - offset;
 			else {
 				if (short_write)
-					folio_clear_uptodate(folio);
+					iomap_clear_folio_uptodate(folio);
 				count = 0;
 			}
 			offset = 0;
@@ -1305,7 +1305,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
 
 		/* If we copied full folio, mark it uptodate */
 		if (tmp == folio_size(folio))
-			folio_mark_uptodate(folio);
+			iomap_set_range_uptodate(folio, 0, folio_size(folio));
 
 		if (folio_test_uptodate(folio)) {
 			folio_unlock(folio);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e5c1ca440d93..7ceda24cf6a7 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -74,8 +74,7 @@ static bool ifs_set_range_uptodate(struct folio *folio,
 	return ifs_is_fully_uptodate(folio, ifs);
 }
 
-static void iomap_set_range_uptodate(struct folio *folio, size_t off,
-		size_t len)
+void iomap_set_range_uptodate(struct folio *folio, size_t off, size_t len)
 {
 	struct iomap_folio_state *ifs = folio->private;
 	unsigned long flags;
@@ -87,12 +86,50 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
 	if (ifs) {
 		spin_lock_irqsave(&ifs->state_lock, flags);
 		uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
+		/*
+		 * If a read is in progress, we must NOT call folio_mark_uptodate
+		 * here. The read completion path (iomap_finish_folio_read or
+		 * iomap_read_end) will call folio_end_read() which uses XOR
+		 * semantics to set the uptodate bit. If we set it here, the XOR
+		 * in folio_end_read() will clear it, leaving the folio not
+		 * uptodate while the ifs says all blocks are uptodate.
+		 */
+		if (uptodate && ifs->read_bytes_pending)
+			uptodate = false;
 		spin_unlock_irqrestore(&ifs->state_lock, flags);
 	}
 
 	if (uptodate)
 		folio_mark_uptodate(folio);
 }
+EXPORT_SYMBOL_GPL(iomap_set_range_uptodate);
+
+void iomap_clear_folio_uptodate(struct folio *folio)
+{
+	struct iomap_folio_state *ifs = folio->private;
+
+	if (ifs) {
+		struct inode *inode = folio->mapping->host;
+		unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+		unsigned long flags;
+
+		spin_lock_irqsave(&ifs->state_lock, flags);
+		/*
+		 * If a read is in progress, don't clear the uptodate state.
+		 * The read completion path will handle the folio state, and
+		 * clearing here would race with iomap_finish_folio_read()
+		 * potentially causing ifs/folio uptodate state mismatch.
+		 */
+		if (ifs->read_bytes_pending) {
+			spin_unlock_irqrestore(&ifs->state_lock, flags);
+			return;
+		}
+		bitmap_clear(ifs->state, 0, nr_blocks);
+		spin_unlock_irqrestore(&ifs->state_lock, flags);
+	}
+	folio_clear_uptodate(folio);
+}
+EXPORT_SYMBOL_GPL(iomap_clear_folio_uptodate);
 
 /*
  * Find the next dirty block in the folio.  end_blk is inclusive.
@@ -399,8 +436,17 @@ void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len,
 		spin_unlock_irqrestore(&ifs->state_lock, flags);
 	}
 
-	if (finished)
+	if (finished) {
+		/*
+		 * If uptodate is true but the folio is already marked uptodate,
+		 * folio_end_read's XOR semantics would clear the uptodate bit.
+		 * This should never happen because iomap_set_range_uptodate()
+		 * skips calling folio_mark_uptodate() when read_bytes_pending
+		 * is non-zero, ensuring only the read completion path sets it.
+		 */
+		WARN_ON_ONCE(uptodate && folio_test_uptodate(folio));
 		folio_end_read(folio, uptodate);
+	}
 }
 EXPORT_SYMBOL_GPL(iomap_finish_folio_read);
 
@@ -481,8 +527,19 @@ static void iomap_read_end(struct folio *folio, size_t bytes_submitted)
 		if (end_read)
 			uptodate = ifs_is_fully_uptodate(folio, ifs);
 		spin_unlock_irq(&ifs->state_lock);
-		if (end_read)
+		if (end_read) {
+			/*
+			 * If uptodate is true but the folio is already marked
+			 * uptodate, folio_end_read's XOR semantics would clear
+			 * the uptodate bit. This should never happen because
+			 * iomap_set_range_uptodate() skips calling
+			 * folio_mark_uptodate() when read_bytes_pending is
+			 * non-zero, ensuring only the read completion path
+			 * sets it.
+			 */
+			WARN_ON_ONCE(uptodate && folio_test_uptodate(folio));
 			folio_end_read(folio, uptodate);
+		}
 	} else if (!bytes_submitted) {
 		/*
 		 * If there were no bytes submitted, this means we are
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 520e967cb501..3c2ad88d16b6 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -345,6 +345,8 @@ void iomap_read_folio(const struct iomap_ops *ops,
 void iomap_readahead(const struct iomap_ops *ops,
 		struct iomap_read_folio_ctx *ctx);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
+void iomap_set_range_uptodate(struct folio *folio, size_t off, size_t len);
+void iomap_clear_folio_uptodate(struct folio *folio);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
-- 
2.51.0