From: Zhang Yi

Introduce two new iomap_ops instances for ext4 buffered writes:

 - ext4_iomap_buffered_da_write_ops: for delayed allocation mode, using
   ext4_da_map_blocks() to map delalloc extents.
 - ext4_iomap_buffered_write_ops: for non-delayed allocation mode, using
   ext4_iomap_get_blocks() to allocate blocks directly.

Also add ext4_iomap_valid() so the iomap infrastructure can check whether
a cached mapping is still valid.

Key changes and considerations:

 - Unwritten extents for new blocks (dioread_nolock always on)

   The iomap buffered write path does not rely on data=ordered mode to
   prevent stale data exposure, so new blocks in the non-delayed
   allocation path are always allocated as unwritten extents.

 - Short write and write failure handling

   a. Delalloc path: on a short write or a write failure, the stale
      delalloc range must be dropped and its space reservation released.
      Otherwise, a clean folio may still cover leftover delalloc extents,
      causing inaccurate space reservation accounting.

   b. Non-delalloc path: no cleanup of allocated blocks is needed on a
      short write, because new blocks are always allocated as unwritten
      extents.

 - Lock ordering reversal

   The ordering of the folio lock and the transaction start is reversed
   compared to the buffer_head buffered write path. To handle this, the
   journal handle must be stopped inside the ->iomap_begin() callbacks,
   before the iomap core takes the folio lock. The lock ordering
   documentation in super.c has been updated accordingly.

Signed-off-by: Zhang Yi
---
 fs/ext4/ext4.h  |   4 ++
 fs/ext4/file.c  |  20 +++++-
 fs/ext4/inode.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/super.c |  10 ++-
 4 files changed, 192 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1e27d73d7427..4832e7f7db82 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3057,6 +3057,7 @@ int ext4_walk_page_buffers(handle_t *handle,
 int do_journal_get_write_access(handle_t *handle, struct inode *inode,
 				struct buffer_head *bh);
 void ext4_set_inode_mapping_order(struct inode *inode);
+int ext4_nonda_switch(struct super_block *sb);
 
 #define FALL_BACK_TO_NONDELALLOC 1
 #define CONVERT_INLINE_DATA 2
@@ -3926,6 +3927,9 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 
 extern const struct iomap_ops ext4_iomap_ops;
 extern const struct iomap_ops ext4_iomap_report_ops;
+extern const struct iomap_ops ext4_iomap_buffered_write_ops;
+extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
+extern const struct iomap_write_ops ext4_iomap_write_ops;
 
 static inline int ext4_buffer_uptodate(struct buffer_head *bh)
 {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index eb1a323962b1..7f9bfbbc4a4e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -299,6 +299,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	return count;
 }
 
+static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
+					 struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	const struct iomap_ops *iomap_ops;
+
+	if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
+		iomap_ops = &ext4_iomap_buffered_da_write_ops;
+	else
+		iomap_ops = &ext4_iomap_buffered_write_ops;
+
+	return iomap_file_buffered_write(iocb, from, iomap_ops,
+					 &ext4_iomap_write_ops, NULL);
+}
+
 static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
 					struct iov_iter *from)
 {
@@ -313,7 +328,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
 	if (ret <= 0)
 		goto out;
 
-	ret = generic_perform_write(iocb, from);
+	if (ext4_inode_buffered_iomap(inode))
+		ret = ext4_iomap_buffered_write(iocb, from);
+	else
+		ret = generic_perform_write(iocb, from);
 
 out:
 	inode_unlock(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 39577a6b65b9..1ae7d3f4a1c8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3097,7 +3097,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
 	return ret;
 }
 
-static int ext4_nonda_switch(struct super_block *sb)
+int ext4_nonda_switch(struct super_block *sb)
 {
 	s64 free_clusters, dirty_clusters;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3467,6 +3467,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
 	return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
 }
 
+static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
+{
+	return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
+}
+
+const struct iomap_write_ops ext4_iomap_write_ops = {
+	.iomap_valid = ext4_iomap_valid,
+};
+
 static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
 			   struct ext4_map_blocks *map, loff_t offset,
 			   loff_t length, unsigned int flags)
@@ -3501,6 +3510,8 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
 	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		iomap->flags |= IOMAP_F_MERGED;
 
+	iomap->validity_cookie = map->m_seq;
+
 	/*
 	 * Flags passed to ext4_map_blocks() for direct I/O writes can result
 	 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
@@ -3908,8 +3919,12 @@ const struct iomap_ops ext4_iomap_report_ops = {
 	.iomap_begin = ext4_iomap_begin_report,
 };
 
+/* Map blocks */
+typedef int (ext4_get_blocks_t)(struct inode *, struct ext4_map_blocks *);
+
 static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
-				 loff_t length, struct ext4_map_blocks *map)
+				 loff_t length, ext4_get_blocks_t get_blocks,
+				 struct ext4_map_blocks *map)
 {
 	u8 blkbits = inode->i_blkbits;
 
@@ -3921,6 +3936,9 @@ static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
 	map->m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
 			   EXT4_MAX_LOGICAL_BLOCK) - map->m_lblk + 1;
 
+	if (get_blocks)
+		return get_blocks(inode, map);
+
 	return ext4_map_blocks(NULL, inode, map, 0);
 }
 
@@ -3938,7 +3956,7 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
 	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
 		return -ERANGE;
 
-	ret = ext4_iomap_map_blocks(inode, offset, length, &map);
+	ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
 	if (ret < 0)
 		return ret;
 
@@ -3946,6 +3964,147 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
 	return 0;
 }
 
+static int ext4_iomap_get_blocks(struct inode *inode,
+				 struct ext4_map_blocks *map)
+{
+	loff_t i_size = i_size_read(inode);
+	handle_t *handle;
+	int ret;
+
+	/*
+	 * Check if the blocks have already been allocated; if so, we can
+	 * avoid starting a new journal transaction and return the mapping
+	 * information directly.
+	 */
+	if ((map->m_lblk + map->m_len) <=
+	    round_up(i_size, i_blocksize(inode)) >> inode->i_blkbits) {
+		ret = ext4_map_blocks(NULL, inode, map, 0);
+		if (ret < 0)
+			return ret;
+		if (map->m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN |
+				    EXT4_MAP_DELAYED))
+			return 0;
+	}
+
+	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+				ext4_chunk_trans_blocks(inode, map->m_len));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	ret = ext4_map_blocks(handle, inode, map,
+			      EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+	/*
+	 * Stop the handle here, following the lock ordering of the folio
+	 * lock and the transaction start.
+	 */
+	ext4_journal_stop(handle);
+
+	return ret;
+}
+
+static int ext4_iomap_buffered_do_write_begin(struct inode *inode,
+		loff_t offset, loff_t length, unsigned int flags,
+		struct iomap *iomap, struct iomap *srcmap, bool delalloc)
+{
+	int ret, retries = 0;
+	struct ext4_map_blocks map;
+	ext4_get_blocks_t *get_blocks;
+
+	ret = ext4_emergency_state(inode->i_sb);
+	if (unlikely(ret))
+		return ret;
+
+	/* Inline data and non-extent are not supported. */
+	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+		return -ERANGE;
+	if (WARN_ON_ONCE(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		return -EINVAL;
+	if (WARN_ON_ONCE(!(flags & IOMAP_WRITE)))
+		return -EINVAL;
+
+	if (delalloc)
+		get_blocks = ext4_da_map_blocks;
+	else
+		get_blocks = ext4_iomap_get_blocks;
+retry:
+	ret = ext4_iomap_map_blocks(inode, offset, length, get_blocks, &map);
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+	if (ret < 0)
+		return ret;
+
+	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
+	return 0;
+}
+
+static int ext4_iomap_buffered_write_begin(struct inode *inode,
+		loff_t offset, loff_t length, unsigned int flags,
+		struct iomap *iomap, struct iomap *srcmap)
+{
+	return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
+						  iomap, srcmap, false);
+}
+
+static int ext4_iomap_buffered_da_write_begin(struct inode *inode,
+		loff_t offset, loff_t length, unsigned int flags,
+		struct iomap *iomap, struct iomap *srcmap)
+{
+	return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
+						  iomap, srcmap, true);
+}
+
+/*
+ * On write failure, drop the stale delayed allocation range and release
+ * its reserved space for both start and end blocks. Otherwise, we may
+ * leave a range of delayed extents covered by a clean folio, which can
+ * result in inaccurate space reservation accounting.
+ */
+static void ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
+				      loff_t length, struct iomap *iomap)
+{
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
+			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
+	up_write(&EXT4_I(inode)->i_data_sem);
+}
+
+static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
+					    loff_t length, ssize_t written,
+					    unsigned int flags,
+					    struct iomap *iomap)
+{
+	loff_t start_byte, end_byte;
+
+	/* If we didn't reserve the blocks, we're not allowed to punch them. */
+	if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
+		return 0;
+
+	/* Nothing to do if we've written the entire delalloc extent */
+	start_byte = iomap_last_written_block(inode, offset, written);
+	end_byte = round_up(offset + length, i_blocksize(inode));
+	if (start_byte >= end_byte)
+		return 0;
+
+	filemap_invalidate_lock(inode->i_mapping);
+	iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
+				     iomap, ext4_iomap_punch_delalloc);
+	filemap_invalidate_unlock(inode->i_mapping);
+	return 0;
+}
+
+/*
+ * Since we always allocate unwritten extents, there is no need for
+ * iomap_end to clean up allocated blocks on a short write.
+ */
+const struct iomap_ops ext4_iomap_buffered_write_ops = {
+	.iomap_begin = ext4_iomap_buffered_write_begin,
+};
+
+const struct iomap_ops ext4_iomap_buffered_da_write_ops = {
+	.iomap_begin = ext4_iomap_buffered_da_write_begin,
+	.iomap_end = ext4_iomap_buffered_da_write_end,
+};
+
 const struct iomap_ops ext4_iomap_buffered_read_ops = {
 	.iomap_begin = ext4_iomap_buffered_read_begin,
 };
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6a77db4d3124..9bc294b769db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -104,9 +104,13 @@ static const struct fs_parameter_spec ext4_param_specs[];
  *   -> page lock -> i_data_sem (rw)
  *
  * buffered write path:
- * sb_start_write -> i_mutex -> mmap_lock
- * sb_start_write -> i_mutex -> transaction start -> page lock ->
- *   i_data_sem (rw)
+ * sb_start_write -> i_rwsem (w) -> mmap_lock
+ * - buffer_head path:
+ *   sb_start_write -> i_rwsem (w) -> transaction start -> folio lock ->
+ *     i_data_sem (rw)
+ * - iomap path:
+ *   sb_start_write -> i_rwsem (w) -> transaction start -> i_data_sem (rw)
+ *   sb_start_write -> i_rwsem (w) -> folio lock (not under an active handle)
  *
  * truncate:
  * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
-- 
2.52.0
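
Note (not part of the patch): for illustration only, a minimal userspace
sketch that drives the buffered write path touched above. The mount point
and file name are placeholders; whether the new iomap branch or the legacy
generic_perform_write() branch is taken depends on
ext4_inode_buffered_iomap() for the inode being written.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char buf[] = "hello, iomap buffered write\n";
	/* Hypothetical path; any regular file on an ext4 filesystem works. */
	int fd = open("/mnt/ext4/testfile", O_CREAT | O_WRONLY | O_TRUNC, 0644);

	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}
	/* Buffered write: enters ext4_buffered_write_iter() in the kernel. */
	if (write(fd, buf, strlen(buf)) < 0) {
		perror("write");
		close(fd);
		return EXIT_FAILURE;
	}
	/* Force writeback so delalloc/unwritten extents get converted. */
	if (fsync(fd) < 0)
		perror("fsync");
	close(fd);
	return EXIT_SUCCESS;
}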