Ext4 buffered writes into large folios still walk every buffer_head in
the folio in ext4_block_write_begin() and again in block_commit_write().
Before regular files used large folios, this was cheap, but a large folio
can contain hundreds of buffer_heads. Small overwrites of an existing
large folio therefore pay work proportional to the folio size instead of
the write size.

This is visible when the page cache is first populated with large folios
and then a small range is overwritten. The numbers below come from a
local libMicro-based microbenchmark. Each round first drops caches,
writes a 10 MiB file with dd to instantiate large page-cache folios, and
then runs libMicro's write, pwrite, or writev benchmark for a small
buffered overwrite. The writev cases use libMicro's default vector count
of 10.

A representative pwrite round is:

  sync
  echo 3 > /proc/sys/vm/drop_caches
  dd if=/dev/zero of=$file bs=1024k count=10
  taskset -c 0 ./bin/pwrite -H -C 50 -D 3 -S -N pwrite_u1k \
          -s 1k -f $file

To avoid comparing this change with an older kernel, the benchmark uses
two kernels built from the same master tree: one with this change and
one with only this change reverted.

With THP=always and 10 dd-prefill rounds, median latencies were:

                  nofix        patched      improvement
  write_u1k       1.418 usec   0.342 usec   75.9%
  write_u10k      1.887 usec   0.409 usec   78.3%
  write_u100k     4.114 usec   2.554 usec   37.9%
  pwrite_u1k      1.677 usec   0.335 usec   80.1%
  pwrite_u10k     1.903 usec   0.410 usec   78.5%
  pwrite_u100k    4.101 usec   2.563 usec   37.5%
  writev_u1k      2.285 usec   0.756 usec   66.9%
  writev_u10k     4.655 usec   3.025 usec   35.0%

Start the ext4 write_begin walk at the first buffer that overlaps the
write. For already-uptodate large folio overwrites, add a partial commit
path which marks only the written buffers uptodate and dirty. Leave
non-uptodate folios on the old full-buffer commit path so BH_New cleanup
and folio-uptodate discovery are preserved.
Partially uptodate large folios remain described by per-buffer state, which
is what block_is_partially_uptodate() and read_folio use for later reads.

Signed-off-by: Jia Zhu
---
 fs/buffer.c     | 51 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/inode.c | 21 ++++++++++----------
 2 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index b0b3792b1496e..e0c5868b088be 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2092,6 +2092,44 @@ int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
 }
 EXPORT_SYMBOL(__block_write_begin);
 
+static struct buffer_head *folio_buffer_seek(struct buffer_head *head,
+					     unsigned int blocksize,
+					     size_t offset,
+					     size_t *block_start)
+{
+	size_t nr = offset / blocksize;
+
+	*block_start = nr * blocksize;
+	while (nr--)
+		head = head->b_this_page;
+	return head;
+}
+
+static void block_commit_write_range(struct buffer_head *head,
+				     unsigned int blocksize, size_t from,
+				     size_t to)
+{
+	size_t block_start, block_end;
+	struct buffer_head *bh;
+
+	if (from == to)
+		return;
+	if (WARN_ON_ONCE(to > folio_size(head->b_folio)))
+		return;
+
+	bh = folio_buffer_seek(head, blocksize, from, &block_start);
+	do {
+		block_end = block_start + blocksize;
+		set_buffer_uptodate(bh);
+		mark_buffer_dirty(bh);
+		if (buffer_new(bh))
+			clear_buffer_new(bh);
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (block_start < to && bh != head);
+}
+
 void block_commit_write(struct folio *folio, size_t from, size_t to)
 {
 	size_t block_start, block_end;
@@ -2104,6 +2142,19 @@ void block_commit_write(struct folio *folio, size_t from, size_t to)
 		return;
 	blocksize = bh->b_size;
 
+	/*
+	 * Large folios can carry hundreds of buffer_heads. For a partial write
+	 * to an already-uptodate large folio, commit only the buffers in
+	 * [from, to); every other case takes the full walk below.
+	 */
+	if (folio_test_large(folio) && from < to &&
+	    folio_test_uptodate(folio) &&
+	    to <= folio_size(folio) &&
+	    (from != 0 || to != folio_size(folio))) {
+		block_commit_write_range(head, blocksize, from, to);
+		return;
+	}
+
 	block_start = 0;
 	do {
 		block_end = block_start + blocksize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d1..e58bba0289eba 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1180,7 +1180,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
 	unsigned int blocksize = i_blocksize(inode);
 	struct buffer_head *bh, *head, *wait[2];
 	int nr_wait = 0;
-	int i;
+	unsigned int i;
 	bool should_journal_data = ext4_should_journal_data(inode);
 
 	BUG_ON(!folio_test_locked(folio));
@@ -1191,17 +1191,18 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
 	head = folio_buffers(folio);
 	if (!head)
 		head = create_empty_buffers(folio, blocksize, 0);
-	block = EXT4_PG_TO_LBLK(inode, folio->index);
+	if (from == to)
+		return 0;
+	block_start = round_down(from, blocksize);
+	block = EXT4_PG_TO_LBLK(inode, folio->index) +
+		(block_start >> inode->i_blkbits);
+	bh = head;
+	for (i = 0; i < block_start; i += blocksize)
+		bh = bh->b_this_page;
 
-	for (bh = head, block_start = 0; bh != head || !block_start;
-	     block++, block_start = block_end, bh = bh->b_this_page) {
+	for (; block_start < to;
+	     block++, block_start = block_end, bh = bh->b_this_page) {
 		block_end = block_start + blocksize;
-		if (block_end <= from || block_start >= to) {
-			if (folio_test_uptodate(folio)) {
-				set_buffer_uptodate(bh);
-			}
-			continue;
-		}
 		if (WARN_ON_ONCE(buffer_new(bh)))
 			clear_buffer_new(bh);
 		if (!buffer_mapped(bh)) {

base-commit: e43ffb69e0438cddd72aaa30898b4dc446f664f8
-- 
2.39.5 (Apple Git-154)