From: Zhang Yi <yi.zhang@huawei.com>

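For append writes, wait for ordered I/O to complete before updating
i_disksize. This ensures that the zeroed data has been written out
before the metadata update, preventing stale data from being exposed
during unaligned post-EOF append writes.

To do so, record the zeroed EOF block in the inode (i_ordered_lblk and
i_ordered_len) when it is submitted, mark the writeback ioend that
covers it as ordered, and make ext4_iomap_finish_ioend() wait on
i_ordered_wq until that I/O completes if the write starts beyond the
zeroed block. Truncate up and append fallocate are not handled yet.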

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/ext4.h    | 11 +++++++
 fs/ext4/inode.c   | 80 ++++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/page-io.c | 60 +++++++++++++++++++++++++++++++++++
 fs/ext4/super.c   | 23 ++++++++++----
 4 files changed, 161 insertions(+), 13 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 078feda47e36..9ce2128eea3e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1195,6 +1195,15 @@ struct ext4_inode_info {
 #ifdef CONFIG_FS_ENCRYPTION
 	struct fscrypt_inode_info *i_crypt_info;
 #endif
+
+	/*
+	 * Track ordered zeroed data during post-EOF append writes, fallocate,
+	 * and truncate-up operations. These fields are used only in the
+	 * iomap buffered I/O path; i_ordered_len == 0 means nothing is tracked.
+	 */
+	ext4_lblk_t i_ordered_lblk;
+	ext4_lblk_t i_ordered_len;
+	wait_queue_head_t i_ordered_wq;
 };
 
 /*
@@ -3858,6 +3867,8 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 len, __u64 *moved_len);
 
 /* page-io.c */
+#define EXT4_IOMAP_IOEND_ORDER_IO	1UL	/* This I/O is an ordered one */
+
 extern int __init ext4_init_pageio(void);
 extern void ext4_exit_pageio(void);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e013aeb03d7b..11fb369efeb1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4345,6 +4345,7 @@ static int ext4_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
 {
 	struct iomap_ioend *ioend = wpc->wb_ctx;
 	struct ext4_inode_info *ei = EXT4_I(ioend->io_inode);
+	ext4_lblk_t start, end, order_lblk, order_len;
 
 	/*
 	 * After I/O completion, a worker needs to be scheduled when:
@@ -4357,6 +4358,30 @@ static int ext4_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
 	    test_opt(ioend->io_inode->i_sb, DATA_ERR_ABORT))
 		ioend->io_bio.bi_end_io = ext4_iomap_end_bio;
 
+	/*
+	 * Mark the I/O as ordered. Ordered I/O requires separate endio
+	 * handling and must not be merged with regular I/O operations.
+	 */
+	order_len = READ_ONCE(ei->i_ordered_len);
+	if (order_len) {
+		/*
+		 * Pairs with smp_store_release() in
+		 * ext4_iomap_submit_zero_block(). Ensure we see the
+		 * i_ordered_lblk written before that store to i_ordered_len.
+		 */
+		smp_rmb();
+		order_lblk = READ_ONCE(ei->i_ordered_lblk);
+		start = ioend->io_offset >> ioend->io_inode->i_blkbits;
+		end = EXT4_B_TO_LBLK(ioend->io_inode,
+				     ioend->io_offset + ioend->io_size);
+
+		if (start <= order_lblk && end >= order_lblk + order_len) {
+			ioend->io_bio.bi_end_io = ext4_iomap_end_bio;
+			ioend->io_private = (void *)EXT4_IOMAP_IOEND_ORDER_IO;
+			ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
+		}
+	}
+
 	return iomap_ioend_writeback_submit(wpc, error);
 }
 
@@ -4746,8 +4771,10 @@ static int ext4_iomap_submit_zero_block(struct inode *inode,
 					loff_t from, loff_t end)
 {
 	struct address_space *mapping = inode->i_mapping;
+	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct folio *folio;
 	bool do_submit = false;
+	int ret;
 
 	folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT);
 	if (IS_ERR(folio))
@@ -4757,14 +4784,50 @@ static int ext4_iomap_submit_zero_block(struct inode *inode,
 	folio_wait_writeback(folio);
 	WARN_ON_ONCE(folio_test_writeback(folio));
 
-	if (likely(folio_test_dirty(folio)))
+	/*
+	 * Mark the ordered range. It will be cleared upon I/O completion
+	 * in ext4_iomap_end_bio(). Any operation that extends i_disksize
+	 * (including append write end io past the zeroed boundary,
+	 * truncate up and append fallocate) must wait for this I/O to
+	 * complete before updating i_disksize.
+	 *
+	 * When multiple overlapping unaligned EOF writes are in flight, we
+	 * only need to track and wait for the first one. Subsequent writes
+	 * will zero the gap in memory and ensure that the zeroed data is
+	 * written out along with the valid data in the same block before
+	 * i_disksize is updated.
+	 */
+	if (likely(folio_test_dirty(folio) &&
+		   READ_ONCE(ei->i_ordered_len) == 0)) {
+		WRITE_ONCE(ei->i_ordered_lblk,
+			   from >> inode->i_blkbits);
+		/*
+		 * Pairs with smp_rmb() in ext4_iomap_writeback_submit()
+		 * and ext4_iomap_wb_ordered_wait(). Ensure the updated
+		 * i_ordered_lblk is visible when i_ordered_len becomes
+		 * non-zero.
+		 */
+		smp_store_release(&ei->i_ordered_len, 1);
 		do_submit = true;
+	}
 	folio_unlock(folio);
 	folio_put(folio);
 
 	/* Submit zeroed block. */
-	if (do_submit)
-		return filemap_fdatawrite_range(mapping, from, end - 1);
+	if (do_submit) {
+		ret = filemap_fdatawrite_range(mapping, from, end - 1);
+		if (ret) {
+			/*
+			 * Pairs with wait_event() in
+			 * ext4_iomap_wb_ordered_wait(). Ensure
+			 * i_ordered_len = 0 is visible before waking up
+			 * waiters.
+			 */
+			smp_store_release(&ei->i_ordered_len, 0);
+			wake_up_all(&ei->i_ordered_wq);
+			return ret;
+		}
+	}
 	return 0;
 }
 
@@ -4827,10 +4890,13 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
 		 * data=ordered mode. We submit zeroed range directly here.
 		 * Do not wait for I/O completion for performance.
 		 *
-		 * TODO: Any operation that extends i_disksize (including
-		 * append write end io past the zeroed boundary, truncate up,
-		 * and append fallocate) must wait for the relevant I/O to
-		 * complete before updating i_disksize.
+		 * ext4_iomap_finish_ioend() calls ext4_iomap_wb_ordered_wait()
+		 * to wait for this I/O before updating i_disksize if the write
+		 * starts beyond the zeroed block.
+		 *
+		 * TODO: Any other operation that extends i_disksize
+		 * (including truncate up and append fallocate) must wait for
+		 * the relevant I/O to complete before updating i_disksize.
 		 */
 		} else if (ext4_inode_buffered_iomap(inode)) {
 			err = ext4_iomap_submit_zero_block(inode, from, end);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 3050c887329f..ad05ebb49bf6 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -613,6 +613,46 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
 	return 0;
 }
 
+/*
+ * If the old disk size is not block aligned and the writeback range
+ * starts beyond the old EOF block, wait for the zeroed data submitted
+ * via ext4_block_zero_eof() to be written out; otherwise stale data in
+ * that block could be exposed. @end is currently unused.
+ */
+static void ext4_iomap_wb_ordered_wait(struct inode *inode,
+				       loff_t pos, loff_t end)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned int blocksize = i_blocksize(inode);
+	loff_t disksize = READ_ONCE(ei->i_disksize);
+	ext4_lblk_t order_lblk, order_len;
+
+	/*
+	 * Waiting for ordered I/O is unnecessary when:
+	 * - The on-disk size is block-aligned (no stale data exists).
+	 * - The write start is within the block of the old EOF
+	 *   (overwriting, or appending to a block that already contains
+	 *   valid data).
+	 */
+	if (!(disksize & (blocksize - 1)) ||
+	    pos < round_up(disksize, blocksize))
+		return;
+
+	order_len = READ_ONCE(ei->i_ordered_len);
+	if (!order_len)
+		return;
+
+	/*
+	 * Pairs with smp_store_release() in ext4_iomap_submit_zero_block().
+	 * Ensure we see the i_ordered_lblk value that was written before the
+	 * release store that made i_ordered_len non-zero.
+	 */
+	smp_rmb();
+	order_lblk = READ_ONCE(ei->i_ordered_lblk);
+	if ((pos >> inode->i_blkbits) >= order_lblk + order_len)
+		wait_event(ei->i_ordered_wq, READ_ONCE(ei->i_ordered_len) == 0);
+}
+
 static int ext4_iomap_wb_update_disksize(handle_t *handle, struct inode *inode,
 					 loff_t end)
 {
@@ -656,6 +696,9 @@ static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
 		goto out;
 	}
 
+	/* Wait for the ordered zeroed data to be written out. */
+	ext4_iomap_wb_ordered_wait(inode, pos, pos + size);
+
 	/* We may need to convert one extent and dirty the inode. */
 	credits = ext4_chunk_trans_blocks(inode,
 			EXT4_MAX_BLOCKS(size, pos, inode->i_blkbits));
@@ -717,8 +760,25 @@ void ext4_iomap_end_bio(struct bio *bio)
 	struct inode *inode = ioend->io_inode;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned long io_mode = (unsigned long)ioend->io_private;
 	unsigned long flags;
 
+	/*
+	 * This is an ordered I/O: clear the ordered range set by
+	 * ext4_iomap_submit_zero_block() and wake up all waiters that are
+	 * about to update the inode's i_disksize.
+	 */
+	if (io_mode == EXT4_IOMAP_IOEND_ORDER_IO) {
+		/*
+		 * Pairs with wait_event() in ext4_iomap_wb_ordered_wait().
+		 * Ensure i_ordered_len = 0 is visible before waking up
+		 * waiters.
+		 */
+		smp_store_release(&ei->i_ordered_len, 0);
+		wake_up_all(&ei->i_ordered_wq);
+		goto defer;
+	}
+
 	/* Needs to convert unwritten extents or update the i_disksize. */
 	if ((ioend->io_flags & IOMAP_IOEND_UNWRITTEN) ||
 	    ioend->io_offset + ioend->io_size > READ_ONCE(ei->i_disksize))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 62bfe05a64bc..9c0a00e716f3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1444,6 +1444,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ext4_fc_init_inode(&ei->vfs_inode);
 	spin_lock_init(&ei->i_fc_lock);
 	mmb_init(&ei->i_metadata_bhs, &ei->vfs_inode.i_data);
+	ei->i_ordered_lblk = 0;
+	ei->i_ordered_len = 0;
+	init_waitqueue_head(&ei->i_ordered_wq);
 	return &ei->vfs_inode;
 }
 
@@ -1480,12 +1483,20 @@ static void ext4_destroy_inode(struct inode *inode)
 		dump_stack();
 	}
 
-	if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) &&
-	    WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
-		ext4_msg(inode->i_sb, KERN_ERR,
-			 "Inode %llu (%p): i_reserved_data_blocks (%u) not cleared!",
-			 inode->i_ino, EXT4_I(inode),
-			 EXT4_I(inode)->i_reserved_data_blocks);
+	if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS)) {
+		if (WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
+			ext4_msg(inode->i_sb, KERN_ERR,
+				 "Inode %llu (%p): i_reserved_data_blocks (%u) not cleared!",
+				 inode->i_ino, EXT4_I(inode),
+				 EXT4_I(inode)->i_reserved_data_blocks);
+
+		if (WARN_ON_ONCE(EXT4_I(inode)->i_ordered_len))
+			ext4_msg(inode->i_sb, KERN_ERR,
+				 "Inode %llu (%p): i_ordered_lblk (%u) and i_ordered_len (%u) not cleared!",
+				 inode->i_ino, EXT4_I(inode),
+				 EXT4_I(inode)->i_ordered_lblk,
+				 EXT4_I(inode)->i_ordered_len);
+	}
 }
 
 static void ext4_shutdown(struct super_block *sb)
-- 
2.52.0