From: Zhang Yi Introduce a new iomap_ops instance, ext4_iomap_zero_ops, along with ext4_block_iomap_zero_range() to implement block zeroing via the iomap infrastructure for ext4. ext4_block_iomap_zero_range() calls iomap_zero_range() with ext4_iomap_zero_begin() as the callback. The callback locates and zeros out either a mapped partial block or a dirty, unwritten partial block. Important constraints: Zeroing out under an active journal handle can cause deadlock, because the order of acquiring the folio lock and starting a handle is inconsistent with the iomap writeback path. Therefore, ext4_block_iomap_zero_range(): - Must NOT be called under an active handle. - Cannot rely on data=ordered mode to ensure zeroed data persistence before updating i_disksize (for the cases of post-EOF append write, post-EOF fallocate, and truncate up). In subsequent patches, we will address this by submitting the zeroing I/O synchronously but not waiting for its completion, and by updating i_disksize to i_size only after the zeroed data has been written back. 
/*
 * iomap_begin callback for block zeroing via iomap_zero_range().
 *
 * Looks up the mapping covering [offset, offset + length) without
 * allocating new blocks and describes it in @iomap via ext4_set_iomap().
 * Only IOMAP_ZERO operations are accepted; anything else is a caller bug
 * and returns -EINVAL.
 *
 * For unwritten mappings the dirty page cache within the mapping is
 * looked up and the returned extent is trimmed to the dirty range, so
 * that iomap can zero dirty folios over unwritten extents directly
 * instead of flushing to trigger extent conversion first.
 */
static int ext4_iomap_zero_begin(struct inode *inode,
		loff_t offset, loff_t length, unsigned int flags,
		struct iomap *iomap, struct iomap *srcmap)
{
	/* The containing iter is needed by iomap_fill_dirty_folios(). */
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct ext4_map_blocks map;
	u8 blkbits = inode->i_blkbits;
	unsigned int iomap_flags = 0;
	int ret;

	/* Refuse to touch the mapping if the fs is shut down / aborted. */
	ret = ext4_emergency_state(inode->i_sb);
	if (unlikely(ret))
		return ret;

	/* This ops instance is only wired up for zeroing requests. */
	if (WARN_ON_ONCE(!(flags & IOMAP_ZERO)))
		return -EINVAL;

	/*
	 * Map without a journal handle (NULL): zeroing must never run
	 * under an active handle, and no allocation is needed here.
	 */
	ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
	if (ret < 0)
		return ret;

	/*
	 * Look up dirty folios for unwritten mappings within EOF. Providing
	 * this bypasses the flush iomap uses to trigger extent conversion
	 * when unwritten mappings have dirty pagecache in need of zeroing.
	 */
	if (map.m_flags & EXT4_MAP_UNWRITTEN) {
		loff_t start = ((loff_t)map.m_lblk) << blkbits;
		loff_t end = ((loff_t)map.m_lblk + map.m_len) << blkbits;

		/*
		 * NOTE(review): this appears to advance 'start' to the end
		 * of the contiguous dirty folio range and set flags such as
		 * IOMAP_F_DIRTY_CACHE in iomap_flags — confirm against the
		 * iomap_fill_dirty_folios() contract.
		 */
		iomap_fill_dirty_folios(iter, &start, end, &iomap_flags);
		/* Trim the mapping to the dirty range found, if any. */
		if ((start >> blkbits) < map.m_lblk + map.m_len)
			map.m_len = (start >> blkbits) - map.m_lblk;
	}

	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
	/* Propagate flags reported by the dirty-folio lookup. */
	iomap->flags |= iomap_flags;

	return 0;
}

/* iomap_ops used by ext4_block_iomap_zero_range(); begin-only, no end hook. */
static const struct iomap_ops ext4_iomap_zero_ops = {
	.iomap_begin = ext4_iomap_zero_begin,
};
@@ -4616,6 +4661,47 @@ static int ext4_block_journalled_zero_range(struct inode *inode, loff_t from, return err; } +static int ext4_block_iomap_zero_range(struct inode *inode, loff_t from, + loff_t length, bool *did_zero, + bool *zero_written) +{ + int ret; + + /* + * Zeroing out under an active handle can cause deadlock since + * the order of acquiring the folio lock and starting a handle is + * inconsistent with the iomap writeback procedure. + */ + if (WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()))) + return -EINVAL; + + /* The zeroing scope should not extend across a block. */ + if (WARN_ON_ONCE((from >> inode->i_blkbits) != + ((from + length - 1) >> inode->i_blkbits))) + return -EINVAL; + + if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS) && + !(inode_state_read_once(inode) & (I_NEW | I_FREEING))) + WARN_ON_ONCE(!inode_is_locked(inode) && + !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); + + ret = iomap_zero_range(inode, from, length, did_zero, + &ext4_iomap_zero_ops, &ext4_iomap_write_ops, + NULL); + if (ret) + return ret; + + /* + * TODO: The iomap does not distinguish between different types of + * zeroing and always sets zero_written if a zeroing operation is + * performed, which may result in unnecessary order operations. + */ + if (did_zero && zero_written) + *zero_written = *did_zero; + + return 0; +} + /* * Zeros out a mapping of length 'length' starting from file offset * 'from'. The range to be zero'd must be contained with in one block. 
@@ -4642,6 +4728,9 @@ static int ext4_block_zero_range(struct inode *inode, } else if (ext4_should_journal_data(inode)) { return ext4_block_journalled_zero_range(inode, from, length, did_zero); + } else if (ext4_inode_buffered_iomap(inode)) { + return ext4_block_iomap_zero_range(inode, from, length, + did_zero, zero_written); } return ext4_block_do_zero_range(inode, from, length, did_zero, zero_written); @@ -4682,6 +4771,9 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end) * truncating up or performing an append write, because stale on-disk * data might be exposed by a concurrent post-EOF * mmap write during folio writeback. + * + * TODO: In the iomap path, handle this by updating i_disksize to * i_size after the zeroed data has been written back. */ if (ext4_should_order_data(inode) && did_zero && zero_written && !IS_DAX(inode)) { -- 2.52.0