Add an additional arg to __folio_start_writeback() that takes in the number of pages to write back. Signed-off-by: Joanne Koong --- fs/btrfs/subpage.c | 2 +- fs/ext4/page-io.c | 2 +- include/linux/page-flags.h | 4 ++-- mm/page-writeback.c | 10 +++++----- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index cb4f97833dc3..895e0c96a8fc 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -456,7 +456,7 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, * ordering guarantees. */ if (!folio_test_writeback(folio)) - __folio_start_writeback(folio, true); + __folio_start_writeback(folio, true, folio_nr_pages(folio)); if (!folio_test_dirty(folio)) { struct address_space *mapping = folio_mapping(folio); XA_STATE(xas, &mapping->i_pages, folio->index); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 39abfeec5f36..6b12a6b869f8 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -580,7 +580,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, io_folio = page_folio(bounce_page); } - __folio_start_writeback(folio, keep_towrite); + __folio_start_writeback(folio, keep_towrite, folio_nr_pages(folio)); /* Now submit buffers to write */ do { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8d3fa3a91ce4..d1e0743217b7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -832,11 +832,11 @@ static __always_inline void SetPageUptodate(struct page *page) CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) -void __folio_start_writeback(struct folio *folio, bool keep_write); +void __folio_start_writeback(struct folio *folio, bool keep_write, long nr_pages); void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ - __folio_start_writeback(folio, false) + __folio_start_writeback(folio, false, folio_nr_pages(folio)) static __always_inline bool folio_test_head(const struct folio *folio) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index de669636120d..d1b2c91f0619 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -3042,9 +3042,9 @@ bool __folio_end_writeback(struct folio *folio) return ret; } -void __folio_start_writeback(struct folio *folio, bool keep_write) +void __folio_start_writeback(struct folio *folio, bool keep_write, + long nr_pages) { - long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); int access_ret; @@ -3065,7 +3065,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); - wb_stat_mod(wb, WB_WRITEBACK, nr); + wb_stat_mod(wb, WB_WRITEBACK, nr_pages); if (!on_wblist) { wb_inode_writeback_start(wb); /* @@ -3086,8 +3086,8 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) folio_test_set_writeback(folio); } - lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); - zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); + lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr_pages); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr_pages); access_ret = arch_make_folio_accessible(folio); /* -- 2.47.3 Add an additional arg to __folio_end_writeback() that takes in the number of pages that were written back. 
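Existing callers simply pass folio_nr_pages(folio), so this patch is behavior-neutral on its own. To sketch the intended end state (hypothetical numbers, not part of this patch), a filesystem that wrote back only 4 dirty pages of a 16-page folio would end up calling

	/* 16-page folio, but only 4 pages were under writeback */
	__folio_end_writeback(folio, 4);

so that WB_WRITEBACK, NR_WRITEBACK, NR_ZONE_WRITE_PENDING, and NR_WRITTEN are adjusted by 4 pages rather than 16.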
Signed-off-by: Joanne Koong --- mm/filemap.c | 2 +- mm/internal.h | 2 +- mm/page-writeback.c | 13 ++++++------- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 751838ef05e5..cbfb0f085eb6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1657,7 +1657,7 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake_bit(). */ folio_get(folio); - if (__folio_end_writeback(folio)) + if (__folio_end_writeback(folio, folio_nr_pages(folio))) folio_wake_bit(folio, PG_writeback); filemap_end_dropbehind_write(folio); diff --git a/mm/internal.h b/mm/internal.h index 45b725c3dc03..2eb156823d45 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -438,7 +438,7 @@ static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); -bool __folio_end_writeback(struct folio *folio); +bool __folio_end_writeback(struct folio *folio, long nr_pages); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d1b2c91f0619..65002552458a 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -3006,9 +3006,8 @@ static void wb_inode_writeback_end(struct bdi_writeback *wb) spin_unlock_irqrestore(&wb->work_lock, flags); } -bool __folio_end_writeback(struct folio *folio) +bool __folio_end_writeback(struct folio *folio, long nr_pages) { - long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); bool ret; @@ -3022,8 +3021,8 @@ bool __folio_end_writeback(struct folio *folio) __xa_clear_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_WRITEBACK); - wb_stat_mod(wb, WB_WRITEBACK, -nr); - __wb_writeout_add(wb, nr); + wb_stat_mod(wb, WB_WRITEBACK, -nr_pages); + __wb_writeout_add(wb, nr_pages); if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { wb_inode_writeback_end(wb); if (mapping->host) @@ -3035,9 +3034,9 @@ bool __folio_end_writeback(struct folio *folio) ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); } - lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); - zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); - node_stat_mod_folio(folio, NR_WRITTEN, nr); + lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr_pages); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr_pages); + node_stat_mod_folio(folio, NR_WRITTEN, nr_pages); return ret; } -- 2.47.3 Add folio_end_writeback_pages() which takes in the number of pages written back. 
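A minimal usage sketch (demo_end_writeback() is a hypothetical completion handler for a filesystem that tracks which pages of a large folio actually went to disk; nr_dirty is assumed to have been computed by that filesystem):

	static void demo_end_writeback(struct folio *folio, long nr_dirty)
	{
		/* account only the pages that actually went to disk */
		folio_end_writeback_pages(folio, nr_dirty);
	}

folio_end_writeback() becomes a wrapper that passes folio_nr_pages(folio), so existing callers are unaffected.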
Signed-off-by: Joanne Koong --- include/linux/pagemap.h | 1 + mm/filemap.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 12a12dae727d..362900730247 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1221,6 +1221,7 @@ void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); +void folio_end_writeback_pages(struct folio *folio, long nr_pages); void folio_wait_stable(struct folio *folio); void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb); diff --git a/mm/filemap.c b/mm/filemap.c index cbfb0f085eb6..6d50afaff930 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1627,14 +1627,15 @@ static void filemap_end_dropbehind_write(struct folio *folio) } /** - * folio_end_writeback - End writeback against a folio. + * folio_end_writeback_pages - End writeback against a folio. * @folio: The folio. + * @nr_pages: The number of pages written back. * * The folio must actually be under writeback. * * Context: May be called from process or interrupt context. */ -void folio_end_writeback(struct folio *folio) +void folio_end_writeback_pages(struct folio *folio, long nr_pages) { VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); @@ -1657,13 +1658,18 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake_bit(). */ folio_get(folio); - if (__folio_end_writeback(folio, folio_nr_pages(folio))) + if (__folio_end_writeback(folio, nr_pages)) folio_wake_bit(folio, PG_writeback); filemap_end_dropbehind_write(folio); acct_reclaim_writeback(folio); folio_put(folio); } + +void folio_end_writeback(struct folio *folio) +{ + folio_end_writeback_pages(folio, folio_nr_pages(folio)); +} EXPORT_SYMBOL(folio_end_writeback); /** -- 2.47.3 Add an additional arg to __folio_mark_dirty() that takes in the number of pages dirtied, so that this can be passed to folio_account_dirtied() when it updates the stats. 
Signed-off-by: Joanne Koong --- fs/buffer.c | 6 ++++-- include/linux/pagemap.h | 3 ++- mm/page-writeback.c | 10 +++++----- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 6a8752f7bbed..65c96c432800 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -751,7 +751,8 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) spin_unlock(&mapping->i_private_lock); if (newly_dirty) - __folio_mark_dirty(folio, mapping, 1); + __folio_mark_dirty(folio, mapping, 1, + folio_nr_pages(folio)); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1203,7 +1204,8 @@ void mark_buffer_dirty(struct buffer_head *bh) if (!folio_test_set_dirty(folio)) { mapping = folio->mapping; if (mapping) - __folio_mark_dirty(folio, mapping, 0); + __folio_mark_dirty(folio, mapping, 0, + folio_nr_pages(folio)); } if (mapping) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 362900730247..48745f8f6dfe 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1223,7 +1223,8 @@ void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); void folio_end_writeback_pages(struct folio *folio, long nr_pages); void folio_wait_stable(struct folio *folio); -void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); +void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn, + long nr_pages); void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb); void __folio_cancel_dirty(struct folio *folio); static inline void folio_cancel_dirty(struct folio *folio) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 65002552458a..e66eef2d1584 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2675,7 +2675,7 @@ EXPORT_SYMBOL(noop_dirty_folio); * NOTE: This relies on being atomic wrt interrupts. */ static void folio_account_dirtied(struct folio *folio, - struct address_space *mapping) + struct address_space *mapping, long nr) { struct inode *inode = mapping->host; @@ -2683,7 +2683,6 @@ static void folio_account_dirtied(struct folio *folio, if (mapping_can_writeback(mapping)) { struct bdi_writeback *wb; - long nr = folio_nr_pages(folio); inode_attach_wb(inode, folio); wb = inode_to_wb(inode); @@ -2731,14 +2730,14 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) * try_to_free_buffers() to fail. */ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, - int warn) + int warn, long nr_pages) { unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); if (folio->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); - folio_account_dirtied(folio, mapping); + folio_account_dirtied(folio, mapping, nr_pages); __xa_set_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_DIRTY); } @@ -2769,7 +2768,8 @@ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) if (folio_test_set_dirty(folio)) return false; - __folio_mark_dirty(folio, mapping, !folio_test_private(folio)); + __folio_mark_dirty(folio, mapping, !folio_test_private(folio), + folio_nr_pages(folio)); if (mapping->host) { /* !PageAnon && !swapper_space */ -- 2.47.3 Add filemap_dirty_folio_pages() which is equivalent to filemap_dirty_folio() except it takes in the number of pages in the folio to account for as dirty when it updates internal dirty stats instead of accounting all pages in the folio as dirty. 
If the folio is already dirty, calling this function will still update the stats. As such, the caller is responsible for ensuring no overaccounting happens. The same caller responsibilities apply here as for filemap_dirty_folio() (e.g., it should ensure this doesn't race with truncation/writeback). Signed-off-by: Joanne Koong --- fs/buffer.c | 4 ++-- include/linux/pagemap.h | 2 +- include/linux/writeback.h | 2 ++ mm/page-writeback.c | 41 +++++++++++++++++++++++++++++++++++---- 4 files changed, 42 insertions(+), 7 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 65c96c432800..558591254fdb 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -752,7 +752,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) if (newly_dirty) __folio_mark_dirty(folio, mapping, 1, - folio_nr_pages(folio)); + folio_nr_pages(folio), true); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1205,7 +1205,7 @@ void mark_buffer_dirty(struct buffer_head *bh) mapping = folio->mapping; if (mapping) __folio_mark_dirty(folio, mapping, 0, - folio_nr_pages(folio)); + folio_nr_pages(folio), true); } if (mapping) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 48745f8f6dfe..510bc6e0f70b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1224,7 +1224,7 @@ void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); void folio_end_writeback_pages(struct folio *folio, long nr_pages); void folio_wait_stable(struct folio *folio); void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn, - long nr_pages); + long nr_pages, bool newly_dirty); void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb); void __folio_cancel_dirty(struct folio *folio); static inline void folio_cancel_dirty(struct folio *folio) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a2848d731a46..0df11d00cce2 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -372,6 +372,8 @@ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio); +bool filemap_dirty_folio_pages(struct address_space *mapping, + struct folio *folio, long nr_pages); bool folio_redirty_for_writepage(struct writeback_control *, struct folio *); bool redirty_page_for_writepage(struct writeback_control *, struct page *); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 65002552458a..1f862ab3c68d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2730,7 +2730,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) * try_to_free_buffers() to fail. */ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, - int warn, long nr_pages) + int warn, long nr_pages, bool newly_dirty) { unsigned long flags; @@ -2738,8 +2738,9 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, if (folio->mapping) { /* Race with truncate?
*/ WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); folio_account_dirtied(folio, mapping, nr_pages); - __xa_set_mark(&mapping->i_pages, folio_index(folio), - PAGECACHE_TAG_DIRTY); + if (newly_dirty) + __xa_set_mark(&mapping->i_pages, folio_index(folio), + PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } @@ -2769,7 +2770,7 @@ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) return false; __folio_mark_dirty(folio, mapping, !folio_test_private(folio), - folio_nr_pages(folio)); + folio_nr_pages(folio), true); if (mapping->host) { /* !PageAnon && !swapper_space */ @@ -2779,6 +2780,38 @@ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) } EXPORT_SYMBOL(filemap_dirty_folio); +/** + * filemap_dirty_folio_pages - Mark a folio dirty and update stats to account + * for dirtying @nr_pages within the folio. + * @mapping: Address space this folio belongs to. + * @folio: Folio to be marked as dirty. + * @nr_pages: Number of pages to dirty. + * + * This is equivalent to filemap_dirty_folio() except it takes in the number of + * pages in the folio to account for as dirty when it updates internal dirty + * stats instead of accounting all pages in the folio as dirty. If the folio is + * already dirty, calling this function will still update the stats. As such, + * the caller is responsible for ensuring no overaccounting happens. + * + * The same caller responsibilities apply here as for filemap_dirty_folio() + * (e.g., it should ensure this doesn't race with truncation/writeback). + */ +bool filemap_dirty_folio_pages(struct address_space *mapping, + struct folio *folio, long nr_pages) +{ + bool newly_dirty = !folio_test_set_dirty(folio); + + __folio_mark_dirty(folio, mapping, !folio_test_private(folio), + nr_pages, newly_dirty); + + if (newly_dirty && mapping->host) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + + return newly_dirty; +} + /** + * folio_redirty_for_writepage - Decline to write a dirty folio. * @wbc: The writeback control. -- 2.47.3 Add __folio_clear_dirty_for_io(), which takes an arg specifying whether the folio and wb stats should be updated as part of the call. Signed-off-by: Joanne Koong --- mm/page-writeback.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1f862ab3c68d..fe39137f01d6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2955,7 +2955,7 @@ EXPORT_SYMBOL(__folio_cancel_dirty); * This incoherency between the folio's dirty flag and xarray tag is * unfortunate, but it only exists while the folio is locked.
*/ -bool folio_clear_dirty_for_io(struct folio *folio) +static bool __folio_clear_dirty_for_io(struct folio *folio, bool update_stats) { struct address_space *mapping = folio_mapping(folio); bool ret = false; @@ -3004,10 +3004,14 @@ bool folio_clear_dirty_for_io(struct folio *folio) */ wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) { - long nr = folio_nr_pages(folio); - lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); - zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); - wb_stat_mod(wb, WB_RECLAIMABLE, -nr); + if (update_stats) { + long nr = folio_nr_pages(folio); + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, + -nr); + zone_stat_mod_folio(folio, + NR_ZONE_WRITE_PENDING, -nr); + wb_stat_mod(wb, WB_RECLAIMABLE, -nr); + } ret = true; } unlocked_inode_to_wb_end(inode, &cookie); @@ -3015,6 +3019,11 @@ bool folio_clear_dirty_for_io(struct folio *folio) } return folio_test_clear_dirty(folio); } + +bool folio_clear_dirty_for_io(struct folio *folio) +{ + return __folio_clear_dirty_for_io(folio, true); +} EXPORT_SYMBOL(folio_clear_dirty_for_io); static void wb_inode_writeback_start(struct bdi_writeback *wb) -- 2.47.3 Add a no_stats_accounting bitfield to wbc that callers can set. Hook this up to __folio_clear_dirty_for_io() when preparing writeback. This is so that for filesystems that implement granular dirty writeback for their large folios, the stats reflect only the dirty pages that are written back instead of all the pages in the folio, which helps enforce more accurate / less conservative dirty page balancing. Signed-off-by: Joanne Koong --- include/linux/writeback.h | 7 +++++++ mm/page-writeback.c | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 0df11d00cce2..f63a52b56dff 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -71,6 +71,13 @@ struct writeback_control { */ unsigned no_cgroup_owner:1; + /* + * Do not do any stats accounting. The caller will do this itself. + * This is useful for filesystems that implement granular dirty + * writeback for their large folios. + */ + unsigned no_stats_accounting:1; + /* internal fields used by the ->writepages implementation: */ struct folio_batch fbatch; pgoff_t index; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index fe39137f01d6..294339887e55 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2402,6 +2402,7 @@ void tag_pages_for_writeback(struct address_space *mapping, } EXPORT_SYMBOL(tag_pages_for_writeback); +static bool __folio_clear_dirty_for_io(struct folio *folio, bool update_stats); static bool folio_prepare_writeback(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio) { @@ -2428,7 +2429,7 @@ static bool folio_prepare_writeback(struct address_space *mapping, } BUG_ON(folio_test_writeback(folio)); - if (!folio_clear_dirty_for_io(folio)) + if (!__folio_clear_dirty_for_io(folio, !wbc->no_stats_accounting)) return false; return true; -- 2.47.3 Move the logic for clearing dirty stats into a helper function that both folio_account_cleaned() and __folio_clear_dirty_for_io() invoke.
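This keeps the three counters (NR_FILE_DIRTY, NR_ZONE_WRITE_PENDING, WB_RECLAIMABLE) in sync across call sites, and lets the next patch in the series add a third caller with the same pattern (sketch of that upcoming use):

	wb = unlocked_inode_to_wb_begin(inode, &cookie);
	__clear_dirty_for_io_stats(folio, wb, nr_pages);
	unlocked_inode_to_wb_end(inode, &cookie);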
Signed-off-by: Joanne Koong Reviewed-by: Jeff Layton --- mm/page-writeback.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 294339887e55..e0410cfbe480 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2701,6 +2701,14 @@ static void folio_account_dirtied(struct folio *folio, } } +static void __clear_dirty_for_io_stats(struct folio *folio, + struct bdi_writeback *wb, long nr_pages) +{ + lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr_pages); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr_pages); + wb_stat_mod(wb, WB_RECLAIMABLE, -nr_pages); +} + /* * Helper function for deaccounting dirty page without writeback. * @@ -2709,9 +2717,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { long nr = folio_nr_pages(folio); - lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); - zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); - wb_stat_mod(wb, WB_RECLAIMABLE, -nr); + __clear_dirty_for_io_stats(folio, wb, nr); task_io_account_cancelled_write(nr * PAGE_SIZE); } @@ -3005,14 +3011,9 @@ static bool __folio_clear_dirty_for_io(struct folio *folio, bool update_stats) */ wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) { - if (update_stats) { - long nr = folio_nr_pages(folio); - lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, - -nr); - zone_stat_mod_folio(folio, - NR_ZONE_WRITE_PENDING, -nr); - wb_stat_mod(wb, WB_RECLAIMABLE, -nr); - } + if (update_stats) + __clear_dirty_for_io_stats(folio, wb, + folio_nr_pages(folio)); ret = true; } unlocked_inode_to_wb_end(inode, &cookie); -- 2.47.3 Add clear_dirty_for_io_stats() which clears dirty stats corresponding to a folio. The main use case for this is for filesystems that implement granular dirty writeback for large folios. This allows them (after setting the wbc no_stats_accounting bitfield) to update dirty writeback stats only for the pages in the folio that are written back instead of for the entire folio, which helps enforce more accurate / less conservative dirty page balancing. 
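A sketch of the intended call pattern from a filesystem's writeback path (nr_dirty is assumed to have been computed by the filesystem, e.g. from a per-folio dirty bitmap, as iomap does later in this series):

	/* wbc->no_stats_accounting was set, so the core skipped these stats */
	clear_dirty_for_io_stats(folio, nr_dirty);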
Signed-off-by: Joanne Koong --- include/linux/writeback.h | 1 + mm/page-writeback.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f63a52b56dff..2ae0bea03d48 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -375,6 +375,7 @@ int write_cache_pages(struct address_space *mapping, void *data); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); void writeback_set_ratelimit(void); +void clear_dirty_for_io_stats(struct folio *folio, long nr_pages); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e0410cfbe480..726da7611cce 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2709,6 +2709,22 @@ static void __clear_dirty_for_io_stats(struct folio *folio, wb_stat_mod(wb, WB_RECLAIMABLE, -nr_pages); } +void clear_dirty_for_io_stats(struct folio *folio, long nr_pages) +{ + struct address_space *mapping = folio_mapping(folio); + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; + struct inode *inode; + + if (!mapping || !mapping_can_writeback(mapping)) + return; + + inode = mapping->host; + wb = unlocked_inode_to_wb_begin(inode, &cookie); + __clear_dirty_for_io_stats(folio, wb, nr_pages); + unlocked_inode_to_wb_end(inode, &cookie); +} + /* * Helper function for deaccounting dirty page without writeback. * -- 2.47.3 Use find_next_bit()/find_next_zero_bit() for iomap dirty bitmap iteration. This uses __ffs() internally and is more efficient for finding the next dirty or clean bit than manually iterating through the bitmap range testing every bit. Signed-off-by: Joanne Koong Suggested-by: Christoph Hellwig --- fs/iomap/buffered-io.c | 67 ++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index fd827398afd2..dc1a1f371412 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -75,13 +75,42 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off, folio_mark_uptodate(folio); } -static inline bool ifs_block_is_dirty(struct folio *folio, - struct iomap_folio_state *ifs, int block) +/** + * ifs_next_dirty_block - find the next dirty block in the folio + * @folio: The folio + * @start_blk: Block number to begin searching at + * @end_blk: Last block number (inclusive) to search + * + * If no dirty block is found, this will return end_blk + 1. + */ +static unsigned ifs_next_dirty_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) { + struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; - unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int blks = i_blocks_per_folio(inode, folio); + + return find_next_bit(ifs->state, blks + end_blk + 1, + blks + start_blk) - blks; +} + +/** + * ifs_next_clean_block - find the next clean block in the folio + * @folio: The folio + * @start_blk: Block number to begin searching at + * @end_blk: Last block number (inclusive) to search + * + * If no clean block is found, this will return end_blk + 1. 
+ */ +static unsigned ifs_next_clean_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) +{ + struct iomap_folio_state *ifs = folio->private; + struct inode *inode = folio->mapping->host; + unsigned int blks = i_blocks_per_folio(inode, folio); - return test_bit(block + blks_per_folio, ifs->state); + return find_next_zero_bit(ifs->state, blks + end_blk + 1, + blks + start_blk) - blks; } static unsigned ifs_find_dirty_range(struct folio *folio, @@ -92,18 +121,15 @@ static unsigned ifs_find_dirty_range(struct folio *folio, offset_in_folio(folio, *range_start) >> inode->i_blkbits; unsigned end_blk = min_not_zero( offset_in_folio(folio, range_end) >> inode->i_blkbits, - i_blocks_per_folio(inode, folio)); - unsigned nblks = 1; + i_blocks_per_folio(inode, folio)) - 1; + unsigned nblks; - while (!ifs_block_is_dirty(folio, ifs, start_blk)) - if (++start_blk == end_blk) - return 0; + start_blk = ifs_next_dirty_block(folio, start_blk, end_blk); + if (start_blk > end_blk) + return 0; - while (start_blk + nblks < end_blk) { - if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks)) - break; - nblks++; - } + nblks = ifs_next_clean_block(folio, start_blk + 1, end_blk) + - start_blk; *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); return nblks << inode->i_blkbits; @@ -1077,7 +1103,7 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode, struct folio *folio, loff_t start_byte, loff_t end_byte, struct iomap *iomap, iomap_punch_t punch) { - unsigned int first_blk, last_blk, i; + unsigned int first_blk, last_blk; loff_t last_byte; u8 blkbits = inode->i_blkbits; struct iomap_folio_state *ifs; @@ -1096,10 +1122,13 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode, folio_pos(folio) + folio_size(folio) - 1); first_blk = offset_in_folio(folio, start_byte) >> blkbits; last_blk = offset_in_folio(folio, last_byte) >> blkbits; - for (i = first_blk; i <= last_blk; i++) { - if (!ifs_block_is_dirty(folio, ifs, i)) - punch(inode, folio_pos(folio) + (i << blkbits), - 1 << blkbits, iomap); + while (first_blk <= last_blk) { + first_blk = ifs_next_clean_block(folio, first_blk, last_blk); + if (first_blk > last_blk) + break; + punch(inode, folio_pos(folio) + (first_blk << blkbits), + 1 << blkbits, iomap); + first_blk++; } } -- 2.47.3 Use find_next_bit()/find_next_zero_bit() for iomap uptodate bitmap iteration. This uses __ffs() internally and is more efficient for finding the next uptodate or non-uptodate bit than manually iterating through the bitmap range testing every bit. Signed-off-by: Joanne Koong Suggested-by: Christoph Hellwig --- fs/iomap/buffered-io.c | 74 +++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 26 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index dc1a1f371412..4f021dcaaffe 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -37,10 +37,36 @@ static inline bool ifs_is_fully_uptodate(struct folio *folio, return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio)); } -static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, - unsigned int block) +/** + * ifs_next_uptodate_block - find the next uptodate block in the folio + * @folio: The folio + * @start_blk: Block number to begin searching at + * @end_blk: Last block number (inclusive) to search + * + * If no uptodate block is found, this will return end_blk + 1. 
+ */ +static unsigned ifs_next_uptodate_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) { - return test_bit(block, ifs->state); + struct iomap_folio_state *ifs = folio->private; + + return find_next_bit(ifs->state, end_blk + 1, start_blk); +} + +/** + * ifs_next_nonuptodate_block - find the next non-uptodate block in the folio + * @folio: The folio + * @start_blk: Block number to begin searching at + * @end_blk: Last block number (inclusive) to search + * + * If no non-uptodate block is found, this will return end_blk + 1. + */ +static unsigned ifs_next_nonuptodate_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) +{ + struct iomap_folio_state *ifs = folio->private; + + return find_next_zero_bit(ifs->state, end_blk + 1, start_blk); } static bool ifs_set_range_uptodate(struct folio *folio, @@ -266,24 +292,23 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, * to avoid reading in already uptodate ranges. */ if (ifs) { - unsigned int i; - - /* move forward for each leading block marked uptodate */ - for (i = first; i <= last; i++) { - if (!ifs_block_is_uptodate(ifs, i)) - break; - *pos += block_size; - poff += block_size; - plen -= block_size; - first++; - } - - /* truncate len if we find any trailing uptodate block(s) */ - while (++i <= last) { - if (ifs_block_is_uptodate(ifs, i)) { - plen -= (last - i + 1) * block_size; - last = i - 1; - break; + unsigned next, bytes; + + /* find the next non-uptodate block */ + next = ifs_next_nonuptodate_block(folio, first, last); + bytes = (next - first) << block_bits; + *pos += bytes; + poff += bytes; + WARN_ON_ONCE(bytes > plen); + plen -= bytes; + first = next; + + if (++next <= last) { + /* truncate len if we find any trailing uptodate block(s) */ + next = ifs_next_uptodate_block(folio, next, last); + if (next <= last) { + plen -= (last - next + 1) << block_bits; + last = next - 1; } } } @@ -607,7 +632,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; - unsigned first, last, i; + unsigned first, last; if (!ifs) return false; @@ -619,10 +644,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) first = from >> inode->i_blkbits; last = (from + count - 1) >> inode->i_blkbits; - for (i = first; i <= last; i++) - if (!ifs_block_is_uptodate(ifs, i)) - return false; - return true; + return ifs_next_nonuptodate_block(folio, first, last) > last; } EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); -- 2.47.3 Add granular dirty and writeback accounting for large folios. These stats are used by the mm layer for dirty balancing and throttling. Having granular dirty and writeback accounting helps prevent over-aggressive balancing and throttling. There are 4 places in iomap this commit affects: a) filemap dirtying, which now calls filemap_dirty_folio_pages() b) writeback_iter with setting the wbc->no_stats_accounting bit and calling clear_dirty_for_io_stats() c) starting writeback, which now calls __folio_start_writeback() d) ending writeback, which now calls folio_end_writeback_pages() This relies on using the ifs->state dirty bitmap to track dirty pages in the folio. As such, this can only be utilized on filesystems where the block size >= PAGE_SIZE. 
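For example (numbers purely illustrative), take a 64k folio on a filesystem with 4k blocks and 4k pages (16 blocks per folio) where only 3 blocks are dirty:

	iomap_dirty_folio_range()     -> filemap_dirty_folio_pages(mapping, folio, 3)
	iomap_update_dirty_stats()    -> clear_dirty_for_io_stats(folio, 3)
	iomap_folio_start_writeback() -> __folio_start_writeback(folio, false, 3)
	iomap_folio_end_writeback()   -> folio_end_writeback_pages(folio, 3)

instead of accounting all 16 pages at each step.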
Signed-off-by: Joanne Koong --- fs/iomap/buffered-io.c | 140 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 132 insertions(+), 8 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 4f021dcaaffe..bf33a5361a39 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -20,6 +20,8 @@ struct iomap_folio_state { spinlock_t state_lock; unsigned int read_bytes_pending; atomic_t write_bytes_pending; + /* number of pages being currently written back */ + unsigned nr_pages_writeback; /* * Each block has two bits in this bitmap: @@ -139,6 +141,29 @@ static unsigned ifs_next_clean_block(struct folio *folio, blks + start_blk) - blks; } +static unsigned ifs_count_dirty_pages(struct folio *folio) +{ + struct inode *inode = folio->mapping->host; + unsigned block_size = i_blocksize(inode); + unsigned start_blk, end_blk; + unsigned blks, nblks = 0; + + start_blk = 0; + blks = i_blocks_per_folio(inode, folio); + end_blk = (i_size_read(inode) - 1) >> inode->i_blkbits; + end_blk = min(end_blk, i_blocks_per_folio(inode, folio) - 1); + + while (start_blk <= end_blk) { + start_blk = ifs_next_dirty_block(folio, start_blk, end_blk); + if (start_blk > end_blk) + break; + nblks++; + start_blk++; + } + + return nblks * (block_size >> PAGE_SHIFT); +} + static unsigned ifs_find_dirty_range(struct folio *folio, struct iomap_folio_state *ifs, u64 *range_start, u64 range_end) { @@ -220,6 +245,58 @@ static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len) ifs_set_range_dirty(folio, ifs, off, len); } +static long iomap_get_range_newly_dirtied(struct folio *folio, loff_t pos, + unsigned len) +{ + struct inode *inode = folio->mapping->host; + unsigned block_size = i_blocksize(inode); + unsigned start_blk, end_blk; + unsigned nblks = 0; + + start_blk = pos >> inode->i_blkbits; + end_blk = (pos + len - 1) >> inode->i_blkbits; + end_blk = min(end_blk, i_blocks_per_folio(inode, folio) - 1); + + while (start_blk <= end_blk) { + /* count how many clean blocks there are */ + start_blk = ifs_next_clean_block(folio, start_blk, end_blk); + if (start_blk > end_blk) + break; + nblks++; + start_blk++; + } + + return nblks * (block_size >> PAGE_SHIFT); +} + +static bool iomap_granular_dirty_pages(struct folio *folio) +{ + struct iomap_folio_state *ifs = folio->private; + + if (!ifs) + return false; + + return i_blocksize(folio->mapping->host) >= PAGE_SIZE; +} + +static bool iomap_dirty_folio_range(struct address_space *mapping, + struct folio *folio, loff_t pos, unsigned len) +{ + long nr_new_dirty_pages; + + if (!iomap_granular_dirty_pages(folio)) { + iomap_set_range_dirty(folio, pos, len); + return filemap_dirty_folio(mapping, folio); + } + + nr_new_dirty_pages = iomap_get_range_newly_dirtied(folio, pos, len); + if (!nr_new_dirty_pages) + return false; + + iomap_set_range_dirty(folio, pos, len); + return filemap_dirty_folio_pages(mapping, folio, nr_new_dirty_pages); +} + static struct iomap_folio_state *ifs_alloc(struct inode *inode, struct folio *folio, unsigned int flags) { @@ -712,8 +789,7 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio) size_t len = folio_size(folio); ifs_alloc(inode, folio, 0); - iomap_set_range_dirty(folio, 0, len); - return filemap_dirty_folio(mapping, folio); + return iomap_dirty_folio_range(mapping, folio, 0, len); } EXPORT_SYMBOL_GPL(iomap_dirty_folio); @@ -937,8 +1013,8 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, if (unlikely(copied < len && !folio_test_uptodate(folio))) return 
false; iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); - iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied); - filemap_dirty_folio(inode->i_mapping, folio); + iomap_dirty_folio_range(inode->i_mapping, folio, + offset_in_folio(folio, pos), copied); return true; } @@ -1613,6 +1689,29 @@ void iomap_start_folio_write(struct inode *inode, struct folio *folio, } EXPORT_SYMBOL_GPL(iomap_start_folio_write); +static void iomap_folio_start_writeback(struct folio *folio) +{ + struct iomap_folio_state *ifs = folio->private; + + if (!iomap_granular_dirty_pages(folio)) + return folio_start_writeback(folio); + + __folio_start_writeback(folio, false, ifs->nr_pages_writeback); +} + +static void iomap_folio_end_writeback(struct folio *folio) +{ + struct iomap_folio_state *ifs = folio->private; + long nr_pages_writeback; + + if (!iomap_granular_dirty_pages(folio)) + return folio_end_writeback(folio); + + nr_pages_writeback = ifs->nr_pages_writeback; + ifs->nr_pages_writeback = 0; + folio_end_writeback_pages(folio, nr_pages_writeback); +} + void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len) { @@ -1622,7 +1721,7 @@ void iomap_finish_folio_write(struct inode *inode, struct folio *folio, WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) - folio_end_writeback(folio); + iomap_folio_end_writeback(folio); } EXPORT_SYMBOL_GPL(iomap_finish_folio_write); @@ -1710,6 +1809,21 @@ static bool iomap_writeback_handle_eof(struct folio *folio, struct inode *inode, return true; } +static void iomap_update_dirty_stats(struct folio *folio) +{ + struct iomap_folio_state *ifs = folio->private; + long nr_dirty_pages; + + if (iomap_granular_dirty_pages(folio)) { + nr_dirty_pages = ifs_count_dirty_pages(folio); + ifs->nr_pages_writeback = nr_dirty_pages; + } else { + nr_dirty_pages = folio_nr_pages(folio); + } + + clear_dirty_for_io_stats(folio, nr_dirty_pages); +} + int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) { struct iomap_folio_state *ifs = folio->private; @@ -1727,6 +1841,8 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) trace_iomap_writeback_folio(inode, pos, folio_size(folio)); + iomap_update_dirty_stats(folio); + if (!iomap_writeback_handle_eof(folio, inode, &end_pos)) return 0; WARN_ON_ONCE(end_pos <= pos); @@ -1734,6 +1850,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) if (i_blocks_per_folio(inode, folio) > 1) { if (!ifs) { ifs = ifs_alloc(inode, folio, 0); + ifs->nr_pages_writeback = folio_nr_pages(folio); iomap_set_range_dirty(folio, 0, end_pos - pos); } @@ -1751,7 +1868,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) * Set the writeback bit ASAP, as the I/O completion for the single * block per folio case happen hit as soon as we're submitting the bio. */ - folio_start_writeback(folio); + iomap_folio_start_writeback(folio); /* * Walk through the folio to find dirty areas to write back. 
@@ -1784,10 +1901,10 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) */ if (ifs) { if (atomic_dec_and_test(&ifs->write_bytes_pending)) - folio_end_writeback(folio); + iomap_folio_end_writeback(folio); } else { if (!wb_pending) - folio_end_writeback(folio); + iomap_folio_end_writeback(folio); } mapping_set_error(inode->i_mapping, error); return error; @@ -1809,6 +1926,13 @@ iomap_writepages(struct iomap_writepage_ctx *wpc) PF_MEMALLOC)) return -EIO; + /* + * iomap opts out of the default wbc stats accounting because it does + * its own granular dirty/writeback accounting (see + * iomap_update_dirty_stats()). + */ + wpc->wbc->no_stats_accounting = true; + while ((folio = writeback_iter(mapping, wpc->wbc, folio, &error))) { error = iomap_writeback_folio(wpc, folio); folio_unlock(folio); -- 2.47.3