This is a prep patch which introduces a new bdi_writeback_ctx structure that enables us to have multiple writeback contexts for parallel writeback. Each bdi can now have multiple writeback contexts, with each writeback context having its own cgwb tree. Modify all the functions/places that operate on bdi's wb, wb_list, cgwb_tree, wb_switch_rwsem, wb_waitq, as these fields have now been moved to bdi_writeback_ctx. This patch mechanically replaces bdi->wb with bdi->wb_ctx[0]->wb and there is no functional change. Suggested-by: Jan Kara Signed-off-by: Anuj Gupta Signed-off-by: Kundan Kumar --- fs/f2fs/node.c | 4 +- fs/f2fs/segment.h | 2 +- fs/fs-writeback.c | 78 +++++++++++++-------- fs/fuse/file.c | 6 +- fs/gfs2/super.c | 2 +- fs/nfs/internal.h | 3 +- fs/nfs/write.c | 3 +- include/linux/backing-dev-defs.h | 32 +++++---- include/linux/backing-dev.h | 41 +++++++---- include/linux/fs.h | 1 - mm/backing-dev.c | 113 +++++++++++++++++++------------ mm/page-writeback.c | 5 +- 12 files changed, 179 insertions(+), 111 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 27743b93e186..1693da9417f9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -73,7 +73,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) if (excess_cached_nats(sbi)) res = false; } else if (type == DIRTY_DENTS) { - if (sbi->sb->s_bdi->wb.dirty_exceeded) + if (sbi->sb->s_bdi->wb_ctx[0]->wb.dirty_exceeded) return false; mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); @@ -114,7 +114,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) res = false; #endif } else { - if (!sbi->sb->s_bdi->wb.dirty_exceeded) + if (!sbi->sb->s_bdi->wb_ctx[0]->wb.dirty_exceeded) return true; } return res; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5e2ee5c686b1..7e5b7b1a5d2b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -993,7 +993,7 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, 
unsigned int secno) */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { - if (sbi->sb->s_bdi->wb.dirty_exceeded) + if (sbi->sb->s_bdi->wb_ctx[0]->wb.dirty_exceeded) return 0; if (type == DATA) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a07b8cf73ae2..0715a7617391 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -265,23 +265,26 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; + struct bdi_writeback_ctx *bdi_writeback_ctx = bdi->wb_ctx[0]; if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; if (folio) { memcg_css = mem_cgroup_css_from_folio(folio); - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + wb = wb_get_create(bdi, bdi_writeback_ctx, memcg_css, + GFP_ATOMIC); } else { /* must pin memcg_css, see wb_get_create() */ memcg_css = task_get_css(current, memory_cgrp_id); - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + wb = wb_get_create(bdi, bdi_writeback_ctx, memcg_css, + GFP_ATOMIC); css_put(memcg_css); } } if (!wb) - wb = &bdi->wb; + wb = &bdi_writeback_ctx->wb; /* * There may be multiple instances of this function racing to @@ -307,7 +310,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode, WARN_ON_ONCE(inode->i_state & I_FREEING); inode->i_state &= ~I_SYNC_QUEUED; - if (wb != &wb->bdi->wb) + if (wb != &wb->bdi_wb_ctx->wb) list_move(&inode->i_io_list, &wb->b_attached); else list_del_init(&inode->i_io_list); @@ -382,14 +385,16 @@ struct inode_switch_wbs_context { struct inode *inodes[]; }; -static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) +static void +bdi_down_write_wb_ctx_switch_rwsem(struct bdi_writeback_ctx *bdi_wb_ctx) { - down_write(&bdi->wb_switch_rwsem); + down_write(&bdi_wb_ctx->wb_switch_rwsem); } -static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) +static void +bdi_up_write_wb_ctx_switch_rwsem(struct bdi_writeback_ctx 
*bdi_wb_ctx) { - up_write(&bdi->wb_switch_rwsem); + up_write(&bdi_wb_ctx->wb_switch_rwsem); } static bool inode_do_switch_wbs(struct inode *inode, @@ -490,7 +495,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); - struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); + struct bdi_writeback_ctx *bdi_wb_ctx = + fetch_bdi_writeback_ctx(isw->inodes[0]); struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; struct bdi_writeback *new_wb = isw->new_wb; unsigned long nr_switched = 0; @@ -500,7 +506,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * If @inode switches cgwb membership while sync_inodes_sb() is * being issued, sync_inodes_sb() might miss it. Synchronize. */ - down_read(&bdi->wb_switch_rwsem); + down_read(&bdi_wb_ctx->wb_switch_rwsem); /* * By the time control reaches here, RCU grace period has passed @@ -529,7 +535,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); - up_read(&bdi->wb_switch_rwsem); + up_read(&bdi_wb_ctx->wb_switch_rwsem); if (nr_switched) { wb_wakeup(new_wb); @@ -583,6 +589,7 @@ static bool inode_prepare_wbs_switch(struct inode *inode, static void inode_switch_wbs(struct inode *inode, int new_wb_id) { struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback_ctx *bdi_wb_ctx = fetch_bdi_writeback_ctx(inode); struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; @@ -609,7 +616,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (!memcg_css) goto out_free; - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + isw->new_wb = wb_get_create(bdi, bdi_wb_ctx, memcg_css, GFP_ATOMIC); css_put(memcg_css); if (!isw->new_wb) goto out_free; @@ -678,12 +685,14 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) for (memcg_css = wb->memcg_css->parent; 
memcg_css; memcg_css = memcg_css->parent) { - isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); + isw->new_wb = wb_get_create(wb->bdi, wb->bdi_wb_ctx, + memcg_css, GFP_KERNEL); if (isw->new_wb) break; } + /* wb_get() is noop for bdi's wb */ if (unlikely(!isw->new_wb)) - isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ + isw->new_wb = &wb->bdi_wb_ctx->wb; nr = 0; spin_lock(&wb->list_lock); @@ -994,18 +1003,19 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) * total active write bandwidth of @bdi. */ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx, struct wb_writeback_work *base_work, bool skip_if_busy) { struct bdi_writeback *last_wb = NULL; - struct bdi_writeback *wb = list_entry(&bdi->wb_list, + struct bdi_writeback *wb = list_entry(&bdi_wb_ctx->wb_list, struct bdi_writeback, bdi_node); might_sleep(); restart: rcu_read_lock(); - list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) { - DEFINE_WB_COMPLETION(fallback_work_done, bdi); + list_for_each_entry_continue_rcu(wb, &bdi_wb_ctx->wb_list, bdi_node) { + DEFINE_WB_COMPLETION(fallback_work_done, bdi_wb_ctx); struct wb_writeback_work fallback_work; struct wb_writeback_work *work; long nr_pages; @@ -1103,7 +1113,7 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, * And find the associated wb. If the wb isn't there already * there's nothing to flush, don't create one. 
*/ - wb = wb_get_lookup(bdi, memcg_css); + wb = wb_get_lookup(bdi->wb_ctx[0], memcg_css); if (!wb) { ret = -ENOENT; goto out_css_put; @@ -1189,8 +1199,13 @@ fs_initcall(cgroup_writeback_init); #else /* CONFIG_CGROUP_WRITEBACK */ -static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } -static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } +static void +bdi_down_write_wb_ctx_switch_rwsem(struct bdi_writeback_ctx *bdi_wb_ctx) +{ } + +static void +bdi_up_write_wb_ctx_switch_rwsem(struct bdi_writeback_ctx *bdi_wb_ctx) +{ } static void inode_cgwb_move_to_attached(struct inode *inode, struct bdi_writeback *wb) @@ -1231,14 +1246,15 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) } static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx, struct wb_writeback_work *base_work, bool skip_if_busy) { might_sleep(); - if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { + if (!skip_if_busy || !writeback_in_progress(&bdi_wb_ctx->wb)) { base_work->auto_free = 0; - wb_queue_work(&bdi->wb, base_work); + wb_queue_work(&bdi_wb_ctx->wb, base_work); } } @@ -2371,7 +2387,7 @@ static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, if (!bdi_has_dirty_io(bdi)) return; - list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) + list_for_each_entry_rcu(wb, &bdi->wb_ctx[0]->wb_list, bdi_node) wb_start_writeback(wb, reason); } @@ -2427,7 +2443,8 @@ static void wakeup_dirtytime_writeback(struct work_struct *w) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { struct bdi_writeback *wb; - list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) + list_for_each_entry_rcu(wb, &bdi->wb_ctx[0]->wb_list, + bdi_node) if (!list_empty(&wb->b_dirty_time)) wb_wakeup(wb); } @@ -2730,7 +2747,7 @@ static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason, bool skip_if_busy) { struct backing_dev_info *bdi = sb->s_bdi; - 
DEFINE_WB_COMPLETION(done, bdi); + DEFINE_WB_COMPLETION(done, bdi->wb_ctx[0]); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_NONE, @@ -2744,7 +2761,8 @@ static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); + bdi_split_work_to_wbs(sb->s_bdi, bdi->wb_ctx[0], &work, + skip_if_busy); wb_wait_for_completion(&done); } @@ -2808,7 +2826,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb); void sync_inodes_sb(struct super_block *sb) { struct backing_dev_info *bdi = sb->s_bdi; - DEFINE_WB_COMPLETION(done, bdi); + DEFINE_WB_COMPLETION(done, bdi->wb_ctx[0]); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, @@ -2829,10 +2847,10 @@ void sync_inodes_sb(struct super_block *sb) WARN_ON(!rwsem_is_locked(&sb->s_umount)); /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ - bdi_down_write_wb_switch_rwsem(bdi); - bdi_split_work_to_wbs(bdi, &work, false); + bdi_down_write_wb_ctx_switch_rwsem(bdi->wb_ctx[0]); + bdi_split_work_to_wbs(bdi, bdi->wb_ctx[0], &work, false); wb_wait_for_completion(&done); - bdi_up_write_wb_switch_rwsem(bdi); + bdi_up_write_wb_ctx_switch_rwsem(bdi->wb_ctx[0]); wait_sb_inodes(sb); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4adcf09d4b01..8c823a661139 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1833,8 +1833,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) * contention and noticeably improves performance. 
*/ iomap_finish_folio_write(inode, ap->folios[i], 1); - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - wb_writeout_inc(&bdi->wb); + dec_wb_stat(&bdi->wb_ctx[0]->wb, WB_WRITEBACK); + wb_writeout_inc(&bdi->wb_ctx[0]->wb); } wake_up(&fi->page_waitq); @@ -2017,7 +2017,7 @@ static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struc ap->descs[folio_index].offset = offset; ap->descs[folio_index].length = len; - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb_ctx[0]->wb, WB_WRITEBACK); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b42e2110084b..bd11d5e6cf63 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -447,7 +447,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) gfs2_log_flush(GFS2_SB(inode), ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_WRITE_INODE); - if (bdi->wb.dirty_exceeded) + if (bdi->wb_ctx[0]->wb.dirty_exceeded) gfs2_ail1_flush(sdp, wbc); else filemap_fdatawrite(metamapping); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c0a44f389f8f..5b3c84104b5b 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -857,7 +857,8 @@ static inline void nfs_folio_mark_unstable(struct folio *folio, * writeback is happening on the server now. 
*/ node_stat_mod_folio(folio, NR_WRITEBACK, nr); - wb_stat_mod(&inode_to_bdi(inode)->wb, WB_WRITEBACK, nr); + wb_stat_mod(&inode_to_bdi(inode)->wb_ctx[0]->wb, + WB_WRITEBACK, nr); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 647c53d1418a..4317b93bc2af 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -865,9 +865,10 @@ static void nfs_folio_clear_commit(struct folio *folio) { if (folio) { long nr = folio_nr_pages(folio); + struct inode *inode = folio->mapping->host; node_stat_mod_folio(folio, NR_WRITEBACK, -nr); - wb_stat_mod(&inode_to_bdi(folio->mapping->host)->wb, + wb_stat_mod(&inode_to_bdi(inode)->wb_ctx[0]->wb, WB_WRITEBACK, -nr); } } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 2ad261082bba..692ec7be73e2 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -75,10 +75,11 @@ struct wb_completion { * can wait for the completion of all using wb_wait_for_completion(). Work * items which are waited upon aren't freed automatically on completion. 
*/ -#define WB_COMPLETION_INIT(bdi) __WB_COMPLETION_INIT(&(bdi)->wb_waitq) +#define WB_COMPLETION_INIT(bdi_wb_ctx) \ + __WB_COMPLETION_INIT(&(bdi_wb_ctx)->wb_waitq) -#define DEFINE_WB_COMPLETION(cmpl, bdi) \ - struct wb_completion cmpl = WB_COMPLETION_INIT(bdi) +#define DEFINE_WB_COMPLETION(cmpl, bdi_wb_ctx) \ + struct wb_completion cmpl = WB_COMPLETION_INIT(bdi_wb_ctx) /* * Each wb (bdi_writeback) can perform writeback operations, is measured @@ -104,6 +105,7 @@ struct wb_completion { */ struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ + struct bdi_writeback_ctx *bdi_wb_ctx; unsigned long state; /* Always use atomic bitops on this */ unsigned long last_old_flush; /* last old data flush */ @@ -160,6 +162,16 @@ struct bdi_writeback { #endif }; +struct bdi_writeback_ctx { + struct bdi_writeback wb; /* the root writeback info for this bdi */ + struct list_head wb_list; /* list of all wbs */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ + struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */ +#endif + wait_queue_head_t wb_waitq; +}; + struct backing_dev_info { u64 id; struct rb_node rb_node; /* keyed by ->id */ @@ -183,15 +195,11 @@ struct backing_dev_info { */ unsigned long last_bdp_sleep; - struct bdi_writeback wb; /* the root writeback info for this bdi */ - struct list_head wb_list; /* list of all wbs */ + int nr_wb_ctx; + struct bdi_writeback_ctx **wb_ctx; #ifdef CONFIG_CGROUP_WRITEBACK - struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */ - struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */ #endif - wait_queue_head_t wb_waitq; - struct device *dev; char dev_name[64]; struct device *owner; @@ -216,7 +224,7 @@ struct wb_lock_cookie { */ static inline bool wb_tryget(struct bdi_writeback *wb) { - if (wb != &wb->bdi->wb) + if (wb != &wb->bdi_wb_ctx->wb) 
return percpu_ref_tryget(&wb->refcnt); return true; } @@ -227,7 +235,7 @@ static inline bool wb_tryget(struct bdi_writeback *wb) */ static inline void wb_get(struct bdi_writeback *wb) { - if (wb != &wb->bdi->wb) + if (wb != &wb->bdi_wb_ctx->wb) percpu_ref_get(&wb->refcnt); } @@ -246,7 +254,7 @@ static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr) return; } - if (wb != &wb->bdi->wb) + if (wb != &wb->bdi_wb_ctx->wb) percpu_ref_put_many(&wb->refcnt, nr); } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e721148c95d0..92674543ac8a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -148,11 +148,20 @@ static inline bool mapping_can_writeback(struct address_space *mapping) return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } +static inline struct bdi_writeback_ctx * +fetch_bdi_writeback_ctx(struct inode *inode) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + return bdi->wb_ctx[0]; +} + #ifdef CONFIG_CGROUP_WRITEBACK -struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, +struct bdi_writeback *wb_get_lookup(struct bdi_writeback_ctx *bdi_wb_ctx, struct cgroup_subsys_state *memcg_css); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx, struct cgroup_subsys_state *memcg_css, gfp_t gfp); void wb_memcg_offline(struct mem_cgroup *memcg); @@ -187,16 +196,18 @@ static inline bool inode_cgwb_enabled(struct inode *inode) * Must be called under rcu_read_lock() which protects the returend wb. * NULL if not found. 
*/ -static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +static inline struct bdi_writeback * +wb_find_current(struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx) { struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; memcg_css = task_css(current, memory_cgrp_id); if (!memcg_css->parent) - return &bdi->wb; + return &bdi_wb_ctx->wb; - wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + wb = radix_tree_lookup(&bdi_wb_ctx->cgwb_tree, memcg_css->id); /* * %current's blkcg equals the effective blkcg of its memcg. No @@ -217,12 +228,13 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi * wb_find_current(). */ static inline struct bdi_writeback * -wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +wb_get_create_current(struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx, gfp_t gfp) { struct bdi_writeback *wb; rcu_read_lock(); - wb = wb_find_current(bdi); + wb = wb_find_current(bdi, bdi_wb_ctx); if (wb && unlikely(!wb_tryget(wb))) wb = NULL; rcu_read_unlock(); @@ -231,7 +243,7 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) struct cgroup_subsys_state *memcg_css; memcg_css = task_get_css(current, memory_cgrp_id); - wb = wb_get_create(bdi, memcg_css, gfp); + wb = wb_get_create(bdi, bdi_wb_ctx, memcg_css, gfp); css_put(memcg_css); } return wb; @@ -265,7 +277,7 @@ static inline struct bdi_writeback *inode_to_wb_wbc( * If wbc does not have inode attached, it means cgroup writeback was * disabled when wbc started. Just use the default wb in that case. */ - return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb; + return wbc->wb ? 
wbc->wb : &fetch_bdi_writeback_ctx(inode)->wb; } /** @@ -325,20 +337,23 @@ static inline bool inode_cgwb_enabled(struct inode *inode) return false; } -static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +static inline struct bdi_writeback *wb_find_current( + struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx) { - return &bdi->wb; + return &bdi_wb_ctx->wb; } static inline struct bdi_writeback * -wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +wb_get_create_current(struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx, gfp_t gfp) { - return &bdi->wb; + return &bdi_wb_ctx->wb; } static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { - return &inode_to_bdi(inode)->wb; + return &fetch_bdi_writeback_ctx(inode)->wb; } static inline struct bdi_writeback *inode_to_wb_wbc( diff --git a/include/linux/fs.h b/include/linux/fs.h index 601d036a6c78..754fec84f350 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2342,7 +2342,6 @@ struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); void (*free_inode)(struct inode *); - void (*dirty_inode) (struct inode *, int flags); int (*write_inode) (struct inode *, struct writeback_control *wbc); int (*drop_inode) (struct inode *); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef..8b7125349f6c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -84,13 +84,14 @@ static void collect_wb_stats(struct wb_stats *stats, } #ifdef CONFIG_CGROUP_WRITEBACK + static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { struct bdi_writeback *wb; rcu_read_lock(); - list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { + list_for_each_entry_rcu(wb, &bdi->wb_ctx[0]->wb_list, bdi_node) { if (!wb_tryget(wb)) continue; @@ -103,7 +104,7 @@ static void bdi_collect_stats(struct backing_dev_info *bdi, static void bdi_collect_stats(struct 
backing_dev_info *bdi, struct wb_stats *stats) { - collect_wb_stats(stats, &bdi->wb); + collect_wb_stats(stats, &bdi->wb_ctx[0]->wb); } #endif @@ -149,7 +150,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) stats.nr_io, stats.nr_more_io, stats.nr_dirty_time, - !list_empty(&bdi->bdi_list), bdi->wb.state); + !list_empty(&bdi->bdi_list), bdi->wb_ctx[0]->wb.state); return 0; } @@ -193,14 +194,14 @@ static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb, static int cgwb_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; + struct bdi_writeback *wb; unsigned long background_thresh; unsigned long dirty_thresh; - struct bdi_writeback *wb; global_dirty_limits(&background_thresh, &dirty_thresh); rcu_read_lock(); - list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { + list_for_each_entry_rcu(wb, &bdi->wb_ctx[0]->wb_list, bdi_node) { struct wb_stats stats = { .dirty_thresh = dirty_thresh }; if (!wb_tryget(wb)) @@ -520,6 +521,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, memset(wb, 0, sizeof(*wb)); wb->bdi = bdi; + wb->bdi_wb_ctx = bdi->wb_ctx[0]; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); @@ -643,11 +645,12 @@ static void cgwb_release(struct percpu_ref *refcnt) queue_work(cgwb_release_wq, &wb->release_work); } -static void cgwb_kill(struct bdi_writeback *wb) +static void cgwb_kill(struct bdi_writeback *wb, + struct bdi_writeback_ctx *bdi_wb_ctx) { lockdep_assert_held(&cgwb_lock); - WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); + WARN_ON(!radix_tree_delete(&bdi_wb_ctx->cgwb_tree, wb->memcg_css->id)); list_del(&wb->memcg_node); list_del(&wb->blkcg_node); list_add(&wb->offline_node, &offline_cgwbs); @@ -662,6 +665,7 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) } static int cgwb_create(struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx, struct cgroup_subsys_state 
*memcg_css, gfp_t gfp) { struct mem_cgroup *memcg; @@ -678,9 +682,9 @@ static int cgwb_create(struct backing_dev_info *bdi, /* look up again under lock and discard on blkcg mismatch */ spin_lock_irqsave(&cgwb_lock, flags); - wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + wb = radix_tree_lookup(&bdi_wb_ctx->cgwb_tree, memcg_css->id); if (wb && wb->blkcg_css != blkcg_css) { - cgwb_kill(wb); + cgwb_kill(wb, bdi_wb_ctx); wb = NULL; } spin_unlock_irqrestore(&cgwb_lock, flags); @@ -721,12 +725,13 @@ static int cgwb_create(struct backing_dev_info *bdi, */ ret = -ENODEV; spin_lock_irqsave(&cgwb_lock, flags); - if (test_bit(WB_registered, &bdi->wb.state) && + if (test_bit(WB_registered, &bdi_wb_ctx->wb.state) && blkcg_cgwb_list->next && memcg_cgwb_list->next) { /* we might have raced another instance of this function */ - ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); + ret = radix_tree_insert(&bdi_wb_ctx->cgwb_tree, + memcg_css->id, wb); if (!ret) { - list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); + list_add_tail_rcu(&wb->bdi_node, &bdi_wb_ctx->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); blkcg_pin_online(blkcg_css); @@ -779,16 +784,16 @@ static int cgwb_create(struct backing_dev_info *bdi, * each lookup. On mismatch, the existing wb is discarded and a new one is * created. */ -struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, +struct bdi_writeback *wb_get_lookup(struct bdi_writeback_ctx *bdi_wb_ctx, struct cgroup_subsys_state *memcg_css) { struct bdi_writeback *wb; if (!memcg_css->parent) - return &bdi->wb; + return &bdi_wb_ctx->wb; rcu_read_lock(); - wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + wb = radix_tree_lookup(&bdi_wb_ctx->cgwb_tree, memcg_css->id); if (wb) { struct cgroup_subsys_state *blkcg_css; @@ -813,6 +818,7 @@ struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, * create one. See wb_get_lookup() for more details. 
*/ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { @@ -821,8 +827,8 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, might_alloc(gfp); do { - wb = wb_get_lookup(bdi, memcg_css); - } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); + wb = wb_get_lookup(bdi_wb_ctx, memcg_css); + } while (!wb && !cgwb_create(bdi, bdi_wb_ctx, memcg_css, gfp)); return wb; } @@ -830,36 +836,40 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, static int cgwb_bdi_init(struct backing_dev_info *bdi) { int ret; + struct bdi_writeback_ctx *bdi_wb_ctx = bdi->wb_ctx[0]; - INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); + INIT_RADIX_TREE(&bdi_wb_ctx->cgwb_tree, GFP_ATOMIC); mutex_init(&bdi->cgwb_release_mutex); - init_rwsem(&bdi->wb_switch_rwsem); + init_rwsem(&bdi_wb_ctx->wb_switch_rwsem); - ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); + ret = wb_init(&bdi_wb_ctx->wb, bdi, GFP_KERNEL); if (!ret) { - bdi->wb.memcg_css = &root_mem_cgroup->css; - bdi->wb.blkcg_css = blkcg_root_css; + bdi_wb_ctx->wb.memcg_css = &root_mem_cgroup->css; + bdi_wb_ctx->wb.blkcg_css = blkcg_root_css; } return ret; } -static void cgwb_bdi_unregister(struct backing_dev_info *bdi) +/* callers should create a loop and pass bdi_wb_ctx */ +static void cgwb_bdi_unregister(struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx) { struct radix_tree_iter iter; void **slot; struct bdi_writeback *wb; - WARN_ON(test_bit(WB_registered, &bdi->wb.state)); + WARN_ON(test_bit(WB_registered, &bdi_wb_ctx->wb.state)); spin_lock_irq(&cgwb_lock); - radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) - cgwb_kill(*slot); + radix_tree_for_each_slot(slot, &bdi_wb_ctx->cgwb_tree, &iter, 0) + cgwb_kill(*slot, bdi_wb_ctx); spin_unlock_irq(&cgwb_lock); mutex_lock(&bdi->cgwb_release_mutex); spin_lock_irq(&cgwb_lock); - while (!list_empty(&bdi->wb_list)) { - wb = 
list_first_entry(&bdi->wb_list, struct bdi_writeback, + while (!list_empty(&bdi_wb_ctx->wb_list)) { + wb = list_first_entry(&bdi_wb_ctx->wb_list, + struct bdi_writeback, bdi_node); spin_unlock_irq(&cgwb_lock); wb_shutdown(wb); @@ -930,7 +940,7 @@ void wb_memcg_offline(struct mem_cgroup *memcg) spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) - cgwb_kill(wb); + cgwb_kill(wb, wb->bdi_wb_ctx); memcg_cgwb_list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); @@ -950,15 +960,16 @@ void wb_blkcg_offline(struct cgroup_subsys_state *css) spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, list, blkcg_node) - cgwb_kill(wb); + cgwb_kill(wb, wb->bdi_wb_ctx); list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); } -static void cgwb_bdi_register(struct backing_dev_info *bdi) +static void cgwb_bdi_register(struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx) { spin_lock_irq(&cgwb_lock); - list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); + list_add_tail_rcu(&bdi_wb_ctx->wb.bdi_node, &bdi_wb_ctx->wb_list); spin_unlock_irq(&cgwb_lock); } @@ -981,14 +992,18 @@ subsys_initcall(cgwb_init); static int cgwb_bdi_init(struct backing_dev_info *bdi) { - return wb_init(&bdi->wb, bdi, GFP_KERNEL); + return wb_init(&bdi->wb_ctx[0]->wb, bdi, GFP_KERNEL); } -static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } +static void cgwb_bdi_unregister(struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx) +{ } -static void cgwb_bdi_register(struct backing_dev_info *bdi) +/* callers should create a loop and pass bdi_wb_ctx */ +static void cgwb_bdi_register(struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx) { - list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); + list_add_tail_rcu(&bdi_wb_ctx->wb.bdi_node, &bdi_wb_ctx->wb_list); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) @@ -1006,9 +1021,15 @@ int bdi_init(struct backing_dev_info 
*bdi) bdi->min_ratio = 0; bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; + bdi->nr_wb_ctx = 1; + bdi->wb_ctx = kcalloc(bdi->nr_wb_ctx, + sizeof(struct bdi_writeback_ctx *), + GFP_KERNEL); INIT_LIST_HEAD(&bdi->bdi_list); - INIT_LIST_HEAD(&bdi->wb_list); - init_waitqueue_head(&bdi->wb_waitq); + bdi->wb_ctx[0] = (struct bdi_writeback_ctx *) + kzalloc(sizeof(struct bdi_writeback_ctx), GFP_KERNEL); + INIT_LIST_HEAD(&bdi->wb_ctx[0]->wb_list); + init_waitqueue_head(&bdi->wb_ctx[0]->wb_waitq); bdi->last_bdp_sleep = jiffies; return cgwb_bdi_init(bdi); @@ -1023,6 +1044,8 @@ struct backing_dev_info *bdi_alloc(int node_id) return NULL; if (bdi_init(bdi)) { + kfree(bdi->wb_ctx[0]); + kfree(bdi->wb_ctx); kfree(bdi); return NULL; } @@ -1095,11 +1118,11 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) if (IS_ERR(dev)) return PTR_ERR(dev); - cgwb_bdi_register(bdi); + cgwb_bdi_register(bdi, bdi->wb_ctx[0]); + set_bit(WB_registered, &bdi->wb_ctx[0]->wb.state); bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); - set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); @@ -1155,8 +1178,8 @@ void bdi_unregister(struct backing_dev_info *bdi) /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); - wb_shutdown(&bdi->wb); - cgwb_bdi_unregister(bdi); + wb_shutdown(&bdi->wb_ctx[0]->wb); + cgwb_bdi_unregister(bdi, bdi->wb_ctx[0]); /* * If this BDI's min ratio has been set, use bdi_set_min_ratio() to @@ -1183,9 +1206,11 @@ static void release_bdi(struct kref *ref) struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); - WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); WARN_ON_ONCE(bdi->dev); - wb_exit(&bdi->wb); + WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb_ctx[0]->wb.state)); + wb_exit(&bdi->wb_ctx[0]->wb); + kfree(bdi->wb_ctx[0]); + kfree(bdi->wb_ctx); kfree(bdi); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 
3e248d1c3969..6f283a777da6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2048,6 +2048,7 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback_ctx *bdi_wb_ctx = fetch_bdi_writeback_ctx(inode); struct bdi_writeback *wb = NULL; int ratelimit; int ret = 0; @@ -2057,9 +2058,9 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, return ret; if (inode_cgwb_enabled(inode)) - wb = wb_get_create_current(bdi, GFP_KERNEL); + wb = wb_get_create_current(bdi, bdi_wb_ctx, GFP_KERNEL); if (!wb) - wb = &bdi->wb; + wb = &bdi_wb_ctx->wb; ratelimit = current->nr_dirtied_pause; if (wb->dirty_exceeded) -- 2.25.1 Introduce a new macro for_each_bdi_wb_ctx to iterate over multiple writeback contexts. Add logic for allocation, init, free, registration and unregistration of multiple writeback contexts within a bdi. Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta --- include/linux/backing-dev.h | 4 ++ mm/backing-dev.c | 81 +++++++++++++++++++++++++++---------- 2 files changed, 63 insertions(+), 22 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 92674543ac8a..951ab5497500 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -148,6 +148,10 @@ static inline bool mapping_can_writeback(struct address_space *mapping) return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } +#define for_each_bdi_wb_ctx(bdi, wbctx) \ + for (int __i = 0; __i < (bdi)->nr_wb_ctx \ + && ((wbctx) = (bdi)->wb_ctx[__i]) != NULL; __i++) + static inline struct bdi_writeback_ctx * fetch_bdi_writeback_ctx(struct inode *inode) { diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8b7125349f6c..47196d326e16 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -835,17 +835,20 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, static int 
cgwb_bdi_init(struct backing_dev_info *bdi) { - int ret; - struct bdi_writeback_ctx *bdi_wb_ctx = bdi->wb_ctx[0]; + int ret = 0; + struct bdi_writeback_ctx *bdi_wb_ctx; - INIT_RADIX_TREE(&bdi_wb_ctx->cgwb_tree, GFP_ATOMIC); - mutex_init(&bdi->cgwb_release_mutex); - init_rwsem(&bdi_wb_ctx->wb_switch_rwsem); + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) { + INIT_RADIX_TREE(&bdi_wb_ctx->cgwb_tree, GFP_ATOMIC); + mutex_init(&bdi->cgwb_release_mutex); + init_rwsem(&bdi_wb_ctx->wb_switch_rwsem); - ret = wb_init(&bdi_wb_ctx->wb, bdi, GFP_KERNEL); - if (!ret) { - bdi_wb_ctx->wb.memcg_css = &root_mem_cgroup->css; - bdi_wb_ctx->wb.blkcg_css = blkcg_root_css; + ret = wb_init(&bdi_wb_ctx->wb, bdi, GFP_KERNEL); + if (!ret) { + bdi_wb_ctx->wb.memcg_css = &root_mem_cgroup->css; + bdi_wb_ctx->wb.blkcg_css = blkcg_root_css; + } else + return ret; } return ret; } @@ -992,7 +995,16 @@ subsys_initcall(cgwb_init); static int cgwb_bdi_init(struct backing_dev_info *bdi) { - return wb_init(&bdi->wb_ctx[0]->wb, bdi, GFP_KERNEL); + struct bdi_writeback_ctx *bdi_wb_ctx; + + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) { + int ret; + + ret = wb_init(&bdi_wb_ctx->wb, bdi, GFP_KERNEL); + if (ret) + return ret; + } + return 0; } static void cgwb_bdi_unregister(struct backing_dev_info *bdi, @@ -1026,10 +1038,19 @@ int bdi_init(struct backing_dev_info *bdi) sizeof(struct bdi_writeback_ctx *), GFP_KERNEL); INIT_LIST_HEAD(&bdi->bdi_list); - bdi->wb_ctx[0] = (struct bdi_writeback_ctx *) - kzalloc(sizeof(struct bdi_writeback_ctx), GFP_KERNEL); - INIT_LIST_HEAD(&bdi->wb_ctx[0]->wb_list); - init_waitqueue_head(&bdi->wb_ctx[0]->wb_waitq); + for (int i = 0; i < bdi->nr_wb_ctx; i++) { + bdi->wb_ctx[i] = (struct bdi_writeback_ctx *) + kzalloc(sizeof(struct bdi_writeback_ctx), GFP_KERNEL); + if (!bdi->wb_ctx[i]) { + pr_err("Failed to allocate %d", i); + while (--i >= 0) + kfree(bdi->wb_ctx[i]); + kfree(bdi->wb_ctx); + return -ENOMEM; + } + INIT_LIST_HEAD(&bdi->wb_ctx[i]->wb_list); + 
init_waitqueue_head(&bdi->wb_ctx[i]->wb_waitq); + } bdi->last_bdp_sleep = jiffies; return cgwb_bdi_init(bdi); @@ -1038,13 +1059,16 @@ int bdi_init(struct backing_dev_info *bdi) struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; + struct bdi_writeback_ctx *bdi_wb_ctx; bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; if (bdi_init(bdi)) { - kfree(bdi->wb_ctx[0]); + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) { + kfree(bdi_wb_ctx); + } kfree(bdi->wb_ctx); kfree(bdi); return NULL; @@ -1109,6 +1133,7 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { struct device *dev; struct rb_node *parent, **p; + struct bdi_writeback_ctx *bdi_wb_ctx; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; @@ -1118,8 +1143,11 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) if (IS_ERR(dev)) return PTR_ERR(dev); - cgwb_bdi_register(bdi, bdi->wb_ctx[0]); - set_bit(WB_registered, &bdi->wb_ctx[0]->wb.state); + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) { + cgwb_bdi_register(bdi, bdi_wb_ctx); + set_bit(WB_registered, &bdi_wb_ctx->wb.state); + } + bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); @@ -1174,12 +1202,17 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { + struct bdi_writeback_ctx *bdi_wb_ctx; + timer_delete_sync(&bdi->laptop_mode_wb_timer); /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); - wb_shutdown(&bdi->wb_ctx[0]->wb); - cgwb_bdi_unregister(bdi, bdi->wb_ctx[0]); + + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) { + wb_shutdown(&bdi_wb_ctx->wb); + cgwb_bdi_unregister(bdi, bdi_wb_ctx); + } /* * If this BDI's min ratio has been set, use bdi_set_min_ratio() to @@ -1205,11 +1238,15 @@ static void release_bdi(struct kref *ref) { struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); + struct bdi_writeback_ctx 
*bdi_wb_ctx; WARN_ON_ONCE(bdi->dev); - WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb_ctx[0]->wb.state)); - wb_exit(&bdi->wb_ctx[0]->wb); - kfree(bdi->wb_ctx[0]); + + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) { + WARN_ON_ONCE(test_bit(WB_registered, &bdi_wb_ctx->wb.state)); + wb_exit(&bdi_wb_ctx->wb); + kfree(bdi_wb_ctx); + } kfree(bdi->wb_ctx); kfree(bdi); } -- 2.25.1 Introduce a bdi_writeback_ctx field in bdi_writeback. This helps in fetching the writeback context from the bdi_writeback. Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta --- mm/backing-dev.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 47196d326e16..754f2f6c6d7c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -513,15 +513,16 @@ static void wb_update_bandwidth_workfn(struct work_struct *work) */ #define INIT_BW (100 << (20 - PAGE_SHIFT)) -static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, - gfp_t gfp) +static int wb_init(struct bdi_writeback *wb, + struct bdi_writeback_ctx *bdi_wb_ctx, + struct backing_dev_info *bdi, gfp_t gfp) { int err; memset(wb, 0, sizeof(*wb)); wb->bdi = bdi; - wb->bdi_wb_ctx = bdi->wb_ctx[0]; + wb->bdi_wb_ctx = bdi_wb_ctx; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); @@ -698,7 +699,7 @@ static int cgwb_create(struct backing_dev_info *bdi, goto out_put; } - ret = wb_init(wb, bdi, gfp); + ret = wb_init(wb, bdi_wb_ctx, bdi, gfp); if (ret) goto err_free; @@ -843,7 +844,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) mutex_init(&bdi->cgwb_release_mutex); init_rwsem(&bdi_wb_ctx->wb_switch_rwsem); - ret = wb_init(&bdi_wb_ctx->wb, bdi, GFP_KERNEL); + ret = wb_init(&bdi_wb_ctx->wb, bdi_wb_ctx, bdi, GFP_KERNEL); if (!ret) { bdi_wb_ctx->wb.memcg_css = &root_mem_cgroup->css; bdi_wb_ctx->wb.blkcg_css = blkcg_root_css; @@ -1000,7 +1001,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) 
{ int ret; - ret = wb_init(&bdi_wb_ctx->wb, bdi, GFP_KERNEL); + ret = wb_init(&bdi_wb_ctx->wb, bdi_wb_ctx, bdi, GFP_KERNEL); if (ret) return ret; } -- 2.25.1 Affine inode to a writeback context. This helps in minimizing the filesystem fragmentation caused by an inode being processed by different threads. To support parallel writeback, wire up a new superblock operation get_inode_wb_ctx(). Filesystems can override this callback and select the desired writeback context for an inode. FS can use the wb context based on its geometry and also use 64 bit inode numbers. If a filesystem doesn't implement this callback, it defaults to DEFAULT_WB_CTX = 0, maintaining its original behavior. An example implementation for XFS is provided, where XFS selects the writeback context based on its Allocation Group number. Signed-off-by: Anuj Gupta Signed-off-by: Kundan Kumar --- fs/fs-writeback.c | 3 ++- fs/xfs/xfs_super.c | 13 +++++++++++++ include/linux/backing-dev.h | 5 ++++- include/linux/fs.h | 1 + 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0715a7617391..56c048e22f72 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -265,7 +265,8 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; - struct bdi_writeback_ctx *bdi_writeback_ctx = bdi->wb_ctx[0]; + struct bdi_writeback_ctx *bdi_writeback_ctx = + fetch_bdi_writeback_ctx(inode); if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bb0a82635a77..b3ec9141d902 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -53,6 +53,7 @@ #include #include #include +#include static const struct super_operations xfs_super_operations; @@ -1294,6 +1295,17 @@ xfs_fs_show_stats( return 0; } +static struct bdi_writeback_ctx * +xfs_get_inode_wb_ctx( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); +
struct backing_dev_info *bdi = inode_to_bdi(inode); + xfs_agino_t agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + + return bdi->wb_ctx[agno % bdi->nr_wb_ctx]; +} + static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, @@ -1310,6 +1322,7 @@ static const struct super_operations xfs_super_operations = { .free_cached_objects = xfs_fs_free_cached_objects, .shutdown = xfs_fs_shutdown, .show_stats = xfs_fs_show_stats, + .get_inode_wb_ctx = xfs_get_inode_wb_ctx, }; static int diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 951ab5497500..59bbb69d300c 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -148,6 +148,7 @@ static inline bool mapping_can_writeback(struct address_space *mapping) return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } +#define DEFAULT_WB_CTX 0 #define for_each_bdi_wb_ctx(bdi, wbctx) \ for (int __i = 0; __i < (bdi)->nr_wb_ctx \ && ((wbctx) = (bdi)->wb_ctx[__i]) != NULL; __i++) @@ -157,7 +158,9 @@ fetch_bdi_writeback_ctx(struct inode *inode) { struct backing_dev_info *bdi = inode_to_bdi(inode); - return bdi->wb_ctx[0]; + if (inode->i_sb->s_op->get_inode_wb_ctx) + return inode->i_sb->s_op->get_inode_wb_ctx(inode); + return bdi->wb_ctx[DEFAULT_WB_CTX]; } #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/include/linux/fs.h b/include/linux/fs.h index 754fec84f350..5199b0d49fa5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2379,6 +2379,7 @@ struct super_operations { */ int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); void (*shutdown)(struct super_block *sb); + struct bdi_writeback_ctx *(*get_inode_wb_ctx)(struct inode *inode); }; /* -- 2.25.1 Since we have multiple cgwb per bdi, embedded in writeback_ctx now, we iterate over all of them to find the associated writeback. 
Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta --- fs/fs-writeback.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 56c048e22f72..93f8ea340247 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1090,7 +1090,8 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, { struct backing_dev_info *bdi; struct cgroup_subsys_state *memcg_css; - struct bdi_writeback *wb; + struct bdi_writeback *wb = NULL; + struct bdi_writeback_ctx *bdi_wb_ctx; struct wb_writeback_work *work; unsigned long dirty; int ret; @@ -1114,7 +1115,11 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, * And find the associated wb. If the wb isn't there already * there's nothing to flush, don't create one. */ - wb = wb_get_lookup(bdi->wb_ctx[0], memcg_css); + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) { + wb = wb_get_lookup(bdi_wb_ctx, memcg_css); + if (wb) + break; + } if (!wb) { ret = -ENOENT; goto out_css_put; -- 2.25.1 Modify flusher and dirtytime logic to iterate through all the writeback contexts. 
Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta --- fs/fs-writeback.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 93f8ea340247..432f392c8256 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2389,12 +2389,14 @@ static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason) { struct bdi_writeback *wb; + struct bdi_writeback_ctx *bdi_wb_ctx; if (!bdi_has_dirty_io(bdi)) return; - list_for_each_entry_rcu(wb, &bdi->wb_ctx[0]->wb_list, bdi_node) - wb_start_writeback(wb, reason); + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) + list_for_each_entry_rcu(wb, &bdi_wb_ctx->wb_list, bdi_node) + wb_start_writeback(wb, reason); } void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, @@ -2444,15 +2446,17 @@ static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback); static void wakeup_dirtytime_writeback(struct work_struct *w) { struct backing_dev_info *bdi; + struct bdi_writeback_ctx *bdi_wb_ctx; rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { struct bdi_writeback *wb; - list_for_each_entry_rcu(wb, &bdi->wb_ctx[0]->wb_list, - bdi_node) - if (!list_empty(&wb->b_dirty_time)) - wb_wakeup(wb); + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) + list_for_each_entry_rcu(wb, &bdi_wb_ctx->wb_list, + bdi_node) + if (!list_empty(&wb->b_dirty_time)) + wb_wakeup(wb); } rcu_read_unlock(); schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); -- 2.25.1 Modify sync related functions to iterate over all writeback contexts. 
Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta --- fs/fs-writeback.c | 66 +++++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 432f392c8256..7bf1f6c1c0ba 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2753,11 +2753,13 @@ static void wait_sb_inodes(struct super_block *sb) mutex_unlock(&sb->s_sync_lock); } -static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, - enum wb_reason reason, bool skip_if_busy) +static void __writeback_inodes_sb_nr_ctx(struct super_block *sb, + unsigned long nr, + enum wb_reason reason, + bool skip_if_busy, + struct bdi_writeback_ctx *bdi_wb_ctx) { - struct backing_dev_info *bdi = sb->s_bdi; - DEFINE_WB_COMPLETION(done, bdi->wb_ctx[0]); + DEFINE_WB_COMPLETION(done, bdi_wb_ctx); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_NONE, @@ -2767,13 +2769,23 @@ static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, .reason = reason, }; + bdi_split_work_to_wbs(sb->s_bdi, bdi_wb_ctx, &work, skip_if_busy); + wb_wait_for_completion(&done); +} + +static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, + enum wb_reason reason, bool skip_if_busy) +{ + struct backing_dev_info *bdi = sb->s_bdi; + struct bdi_writeback_ctx *bdi_wb_ctx; + if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - bdi_split_work_to_wbs(sb->s_bdi, bdi->wb_ctx[0], &work, - skip_if_busy); - wb_wait_for_completion(&done); + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) + __writeback_inodes_sb_nr_ctx(sb, nr, reason, skip_if_busy, + bdi_wb_ctx); } /** @@ -2826,17 +2838,11 @@ void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) } EXPORT_SYMBOL(try_to_writeback_inodes_sb); -/** - * sync_inodes_sb - sync sb inode pages - * @sb: the superblock - * - * This function writes and waits on any dirty inode belonging 
to this - * super_block. - */ -void sync_inodes_sb(struct super_block *sb) +static void sync_inodes_bdi_wb_ctx(struct super_block *sb, + struct backing_dev_info *bdi, + struct bdi_writeback_ctx *bdi_wb_ctx) { - struct backing_dev_info *bdi = sb->s_bdi; - DEFINE_WB_COMPLETION(done, bdi->wb_ctx[0]); + DEFINE_WB_COMPLETION(done, bdi_wb_ctx); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, @@ -2847,6 +2853,25 @@ void sync_inodes_sb(struct super_block *sb) .for_sync = 1, }; + /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ + bdi_down_write_wb_ctx_switch_rwsem(bdi_wb_ctx); + bdi_split_work_to_wbs(bdi, bdi_wb_ctx, &work, false); + wb_wait_for_completion(&done); + bdi_up_write_wb_ctx_switch_rwsem(bdi_wb_ctx); +} + +/** + * sync_inodes_sb - sync sb inode pages + * @sb: the superblock + * + * This function writes and waits on any dirty inode belonging to this + * super_block. + */ +void sync_inodes_sb(struct super_block *sb) +{ + struct backing_dev_info *bdi = sb->s_bdi; + struct bdi_writeback_ctx *bdi_wb_ctx; + /* * Can't skip on !bdi_has_dirty() because we should wait for !dirty * inodes under writeback and I_DIRTY_TIME inodes ignored by @@ -2856,11 +2881,8 @@ void sync_inodes_sb(struct super_block *sb) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ - bdi_down_write_wb_ctx_switch_rwsem(bdi->wb_ctx[0]); - bdi_split_work_to_wbs(bdi, bdi->wb_ctx[0], &work, false); - wb_wait_for_completion(&done); - bdi_up_write_wb_ctx_switch_rwsem(bdi->wb_ctx[0]); + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) + sync_inodes_bdi_wb_ctx(sb, bdi, bdi_wb_ctx); wait_sb_inodes(sb); } -- 2.25.1 Modified stats collection to collect stats for all the writeback contexts within a bdi. 
Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta --- mm/backing-dev.c | 72 ++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 754f2f6c6d7c..0a772d984ecf 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -50,6 +50,7 @@ struct wb_stats { unsigned long nr_written; unsigned long dirty_thresh; unsigned long wb_thresh; + unsigned long state; }; static struct dentry *bdi_debug_root; @@ -81,6 +82,7 @@ static void collect_wb_stats(struct wb_stats *stats, stats->nr_dirtied += wb_stat(wb, WB_DIRTIED); stats->nr_written += wb_stat(wb, WB_WRITTEN); stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh); + stats->state |= wb->state; } #ifdef CONFIG_CGROUP_WRITEBACK @@ -89,22 +91,27 @@ static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { struct bdi_writeback *wb; + struct bdi_writeback_ctx *bdi_wb_ctx; rcu_read_lock(); - list_for_each_entry_rcu(wb, &bdi->wb_ctx[0]->wb_list, bdi_node) { - if (!wb_tryget(wb)) - continue; + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) + list_for_each_entry_rcu(wb, &bdi_wb_ctx->wb_list, bdi_node) { + if (!wb_tryget(wb)) + continue; - collect_wb_stats(stats, wb); - wb_put(wb); - } + collect_wb_stats(stats, wb); + wb_put(wb); + } rcu_read_unlock(); } #else static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { - collect_wb_stats(stats, &bdi->wb_ctx[0]->wb); + struct bdi_writeback_ctx *bdi_wb_ctx; + + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) + collect_wb_stats(stats, &bdi_wb_ctx->wb); } #endif @@ -150,7 +157,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) stats.nr_io, stats.nr_more_io, stats.nr_dirty_time, - !list_empty(&bdi->bdi_list), bdi->wb_ctx[0]->wb.state); + !list_empty(&bdi->bdi_list), stats.state); return 0; } @@ -195,35 +202,40 @@ static int cgwb_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; struct bdi_writeback 
*wb; + struct bdi_writeback_ctx *bdi_wb_ctx; unsigned long background_thresh; unsigned long dirty_thresh; + struct wb_stats stats; global_dirty_limits(&background_thresh, &dirty_thresh); + stats.dirty_thresh = dirty_thresh; rcu_read_lock(); - list_for_each_entry_rcu(wb, &bdi->wb_ctx[0]->wb_list, bdi_node) { - struct wb_stats stats = { .dirty_thresh = dirty_thresh }; - - if (!wb_tryget(wb)) - continue; - - collect_wb_stats(&stats, wb); - - /* - * Calculate thresh of wb in writeback cgroup which is min of - * thresh in global domain and thresh in cgroup domain. Drop - * rcu lock because cgwb_calc_thresh may sleep in - * cgroup_rstat_flush. We can do so here because we have a ref. - */ - if (mem_cgroup_wb_domain(wb)) { - rcu_read_unlock(); - stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb)); - rcu_read_lock(); + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) { + list_for_each_entry_rcu(wb, &bdi_wb_ctx->wb_list, bdi_node) { + if (!wb_tryget(wb)) + continue; + + collect_wb_stats(&stats, wb); + + /* + * Calculate thresh of wb in writeback cgroup which is + * min of thresh in global domain and thresh in cgroup + * domain. Drop rcu lock because cgwb_calc_thresh may + * sleep in cgroup_rstat_flush. We can do so here + * because we have a ref. + */ + if (mem_cgroup_wb_domain(wb)) { + rcu_read_unlock(); + stats.wb_thresh = min(stats.wb_thresh, + cgwb_calc_thresh(wb)); + rcu_read_lock(); + } + + wb_stats_show(m, wb, &stats); + + wb_put(wb); } - - wb_stats_show(m, wb, &stats); - - wb_put(wb); } rcu_read_unlock(); -- 2.25.1 Add support to handle multiple writeback contexts and check for dirty_exceeded across all the writeback contexts. Made a new helper for same. 
Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta --- fs/f2fs/node.c | 4 ++-- fs/f2fs/segment.h | 2 +- include/linux/backing-dev.h | 18 +++++++++++++++--- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1693da9417f9..cd75aa98a1ca 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -73,7 +73,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) if (excess_cached_nats(sbi)) res = false; } else if (type == DIRTY_DENTS) { - if (sbi->sb->s_bdi->wb_ctx[0]->wb.dirty_exceeded) + if (bdi_wb_dirty_limit_exceeded(sbi->sb->s_bdi)) return false; mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); @@ -114,7 +114,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) res = false; #endif } else { - if (!sbi->sb->s_bdi->wb_ctx[0]->wb.dirty_exceeded) + if (!bdi_wb_dirty_limit_exceeded(sbi->sb->s_bdi)) return true; } return res; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 7e5b7b1a5d2b..8487bd5d4394 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -993,7 +993,7 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { - if (sbi->sb->s_bdi->wb_ctx[0]->wb.dirty_exceeded) + if (bdi_wb_dirty_limit_exceeded(sbi->sb->s_bdi)) return 0; if (type == DATA) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 59bbb69d300c..bb35f8fa4973 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -51,6 +51,21 @@ static inline bool wb_has_dirty_io(struct bdi_writeback *wb) return test_bit(WB_has_dirty_io, &wb->state); } +#define for_each_bdi_wb_ctx(bdi, wbctx) \ + for (int __i = 0; __i < (bdi)->nr_wb_ctx \ + && ((wbctx) = (bdi)->wb_ctx[__i]) != NULL; __i++) + +static inline bool bdi_wb_dirty_limit_exceeded(struct backing_dev_info *bdi) +{ + struct bdi_writeback_ctx *bdi_wb_ctx; + + 
for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) { + if (bdi_wb_ctx->wb.dirty_exceeded) + return true; + } + return false; +} + static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) { /* @@ -149,9 +164,6 @@ static inline bool mapping_can_writeback(struct address_space *mapping) } #define DEFAULT_WB_CTX 0 -#define for_each_bdi_wb_ctx(bdi, wbctx) \ - for (int __i = 0; __i < (bdi)->nr_wb_ctx \ - && ((wbctx) = (bdi)->wb_ctx[__i]) != NULL; __i++) static inline struct bdi_writeback_ctx * fetch_bdi_writeback_ctx(struct inode *inode) -- 2.25.1 Made a helper to fetch writeback context to which an inode is affined. Use it to perform writeback related operations. Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta --- fs/fuse/file.c | 7 +++---- include/linux/backing-dev.h | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8c823a661139..9c7f0e4b741f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1823,7 +1823,6 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode_to_bdi(inode); int i; for (i = 0; i < ap->num_folios; i++) { @@ -1833,8 +1832,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) * contention and noticeably improves performance. 
*/ iomap_finish_folio_write(inode, ap->folios[i], 1); - dec_wb_stat(&bdi->wb_ctx[0]->wb, WB_WRITEBACK); - wb_writeout_inc(&bdi->wb_ctx[0]->wb); + bdi_wb_stat_mod(inode, -1); + bdi_wb_writeout_inc(inode); } wake_up(&fi->page_waitq); @@ -2017,7 +2016,7 @@ static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struc ap->descs[folio_index].offset = offset; ap->descs[folio_index].length = len; - inc_wb_stat(&inode_to_bdi(inode)->wb_ctx[0]->wb, WB_WRITEBACK); + bdi_wb_stat_mod(inode, 1); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index bb35f8fa4973..fb042e593c16 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -46,6 +46,9 @@ extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; +static inline struct bdi_writeback_ctx * +fetch_bdi_writeback_ctx(struct inode *inode); + static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { return test_bit(WB_has_dirty_io, &wb->state); @@ -103,6 +106,20 @@ static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) extern void wb_writeout_inc(struct bdi_writeback *wb); +static inline void bdi_wb_stat_mod(struct inode *inode, s64 amount) +{ + struct bdi_writeback_ctx *bdi_wb_ctx = fetch_bdi_writeback_ctx(inode); + + wb_stat_mod(&bdi_wb_ctx->wb, WB_WRITEBACK, amount); +} + +static inline void bdi_wb_writeout_inc(struct inode *inode) +{ + struct bdi_writeback_ctx *bdi_wb_ctx = fetch_bdi_writeback_ctx(inode); + + wb_writeout_inc(&bdi_wb_ctx->wb); +} + /* * maximal error of a stat counter. 
*/ -- 2.25.1 Add support to handle multiple writeback contexts and check for dirty_exceeded across all the writeback contexts. Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta --- fs/gfs2/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index bd11d5e6cf63..b1e00a64e5ec 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -447,7 +447,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) gfs2_log_flush(GFS2_SB(inode), ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_WRITE_INODE); - if (bdi->wb_ctx[0]->wb.dirty_exceeded) + if (bdi_wb_dirty_limit_exceeded(bdi)) gfs2_ail1_flush(sdp, wbc); else filemap_fdatawrite(metamapping); -- 2.25.1 Fetch writeback context to which an inode is affined. Use it to perform writeback related operations. Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta --- fs/nfs/internal.h | 3 +-- fs/nfs/write.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5b3c84104b5b..99eb6a5d5d01 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -857,8 +857,7 @@ static inline void nfs_folio_mark_unstable(struct folio *folio, * writeback is happening on the server now. */ node_stat_mod_folio(folio, NR_WRITEBACK, nr); - wb_stat_mod(&inode_to_bdi(inode)->wb_ctx[0]->wb, - WB_WRITEBACK, nr); + bdi_wb_stat_mod(inode, nr); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4317b93bc2af..0fe6ae84c4a2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -868,8 +868,7 @@ static void nfs_folio_clear_commit(struct folio *folio) struct inode *inode = folio->mapping->host; node_stat_mod_folio(folio, NR_WRITEBACK, -nr); - wb_stat_mod(&inode_to_bdi(inode)->wb_ctx[0]->wb, - WB_WRITEBACK, -nr); + bdi_wb_stat_mod(inode, -nr); } } -- 2.25.1 The number of writeback contexts can be configured, with a valid range between 1 and the number of online CPUs.
Inodes are then distributed across these contexts, enabling parallel writeback. Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta --- mm/backing-dev.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0a772d984ecf..0a3204a3a3a3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1046,6 +1046,12 @@ int bdi_init(struct backing_dev_info *bdi) bdi->min_ratio = 0; bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; + + /* + * User can configure nr_wb_ctx using the newly introduced sysfs knob. + * echo N > /sys/class/bdi/:/nwritebacks + * Filesystem can also increase same during mount. + */ bdi->nr_wb_ctx = 1; bdi->wb_ctx = kcalloc(bdi->nr_wb_ctx, sizeof(struct bdi_writeback_ctx *), -- 2.25.1 The independent functions of alloc and free will be used while changing the number of writeback contexts. Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta --- mm/backing-dev.c | 72 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 23 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0a3204a3a3a3..2a8f3b683b2d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1038,8 +1038,46 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) #endif /* CONFIG_CGROUP_WRITEBACK */ +static struct bdi_writeback_ctx **wb_ctx_alloc(struct backing_dev_info *bdi, + int num_ctxs) +{ + struct bdi_writeback_ctx **wb_ctx; + + wb_ctx = kcalloc(num_ctxs, sizeof(struct bdi_writeback_ctx *), + GFP_KERNEL); + if (!wb_ctx) + return NULL; + + for (int i = 0; i < num_ctxs; i++) { + wb_ctx[i] = (struct bdi_writeback_ctx *) + kzalloc(sizeof(struct bdi_writeback_ctx), GFP_KERNEL); + if (!wb_ctx[i]) { + pr_err("Failed to allocate %d", i); + while (--i >= 0) + kfree(wb_ctx[i]); + kfree(wb_ctx); + return NULL; + } + INIT_LIST_HEAD(&wb_ctx[i]->wb_list); + init_waitqueue_head(&wb_ctx[i]->wb_waitq); + } + return wb_ctx; +} + +static void wb_ctx_free(struct 
backing_dev_info *bdi) +{ + struct bdi_writeback_ctx *bdi_wb_ctx; + + for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) { + kfree(bdi_wb_ctx); + } + kfree(bdi->wb_ctx); +} + int bdi_init(struct backing_dev_info *bdi) { + int ret; + bdi->dev = NULL; kref_init(&bdi->refcnt); @@ -1047,48 +1085,36 @@ int bdi_init(struct backing_dev_info *bdi) bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; + INIT_LIST_HEAD(&bdi->bdi_list); + /* * User can configure nr_wb_ctx using the newly introduced sysfs knob. * echo N > /sys/class/bdi/:/nwritebacks * Filesystem can also increase same during mount. */ bdi->nr_wb_ctx = 1; - bdi->wb_ctx = kcalloc(bdi->nr_wb_ctx, - sizeof(struct bdi_writeback_ctx *), - GFP_KERNEL); - INIT_LIST_HEAD(&bdi->bdi_list); - for (int i = 0; i < bdi->nr_wb_ctx; i++) { - bdi->wb_ctx[i] = (struct bdi_writeback_ctx *) - kzalloc(sizeof(struct bdi_writeback_ctx), GFP_KERNEL); - if (!bdi->wb_ctx[i]) { - pr_err("Failed to allocate %d", i); - while (--i >= 0) - kfree(bdi->wb_ctx[i]); - kfree(bdi->wb_ctx); - return -ENOMEM; - } - INIT_LIST_HEAD(&bdi->wb_ctx[i]->wb_list); - init_waitqueue_head(&bdi->wb_ctx[i]->wb_waitq); - } + + bdi->wb_ctx = wb_ctx_alloc(bdi, bdi->nr_wb_ctx); + if (!bdi->wb_ctx) + return -ENOMEM; + bdi->last_bdp_sleep = jiffies; - return cgwb_bdi_init(bdi); + ret = cgwb_bdi_init(bdi); + if (ret) + wb_ctx_free(bdi); + return ret; } struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; - struct bdi_writeback_ctx *bdi_wb_ctx; bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; if (bdi_init(bdi)) { - for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) { - kfree(bdi_wb_ctx); - } - kfree(bdi->wb_ctx); kfree(bdi); return NULL; } -- 2.25.1 User can change the number of writeback contexts to any value from 1 to the number of online CPUs using the new sysfs attribute: echo N > /sys/class/bdi/MAJOR:MINOR/nwritebacks The sequence of operations when the number of writeback contexts is changed: - fetch the superblock for a bdi - freeze the filesystem -
iterate through inodes of the superblock and flush the pages - shutdown and free the writeback threads - allocate and register the wb threads - thaw the filesystem Suggested-by: Christoph Hellwig Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta --- fs/super.c | 23 +++++++++ include/linux/backing-dev.h | 1 + include/linux/fs.h | 1 + mm/backing-dev.c | 93 +++++++++++++++++++++++++++++++++++++ mm/page-writeback.c | 8 ++++ 5 files changed, 126 insertions(+) diff --git a/fs/super.c b/fs/super.c index 7f876f32343a..19ae05880888 100644 --- a/fs/super.c +++ b/fs/super.c @@ -2072,6 +2072,29 @@ static inline bool may_unfreeze(struct super_block *sb, enum freeze_holder who, return false; } +struct super_block *freeze_bdi_super(struct backing_dev_info *bdi) +{ + struct super_block *sb_iter; + struct super_block *sb = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb_iter, &super_blocks, s_list) { + if (sb_iter->s_bdi == bdi) { + sb = sb_iter; + break; + } + } + spin_unlock(&sb_lock); + + if (sb) { + atomic_inc(&sb->s_active); + freeze_super(sb, FREEZE_HOLDER_KERNEL, NULL); + } + + return sb; +} +EXPORT_SYMBOL(freeze_bdi_super); + /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fb042e593c16..14f53183b8d1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -144,6 +144,7 @@ int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ra int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); +int bdi_set_nwritebacks(struct backing_dev_info *bdi, unsigned int nwritebacks); /* * Flags in backing_dev_info::capability diff --git a/include/linux/fs.h b/include/linux/fs.h index 5199b0d49fa5..c7ed1c0b79f9 100644 --- a/include/linux/fs.h +++ 
b/include/linux/fs.h
@@ -2770,6 +2770,7 @@ extern int unregister_filesystem(struct file_system_type *);
 extern int vfs_statfs(const struct path *, struct kstatfs *);
 extern int user_statfs(const char __user *, struct kstatfs *);
 extern int fd_statfs(int, struct kstatfs *);
+struct super_block *freeze_bdi_super(struct backing_dev_info *bdi);
 int freeze_super(struct super_block *super, enum freeze_holder who,
 		 const void *freeze_owner);
 int thaw_super(struct super_block *super, enum freeze_holder who,
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 2a8f3b683b2d..5bfb9bf3ce52 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -35,6 +35,17 @@ LIST_HEAD(bdi_list);
 /* bdi_wq serves all asynchronous writeback tasks */
 struct workqueue_struct *bdi_wq;
 
+static int cgwb_bdi_init(struct backing_dev_info *bdi);
+static void cgwb_bdi_register(struct backing_dev_info *bdi,
+		struct bdi_writeback_ctx *bdi_wb_ctx);
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi,
+		struct bdi_writeback_ctx *bdi_wb_ctx);
+static void wb_shutdown(struct bdi_writeback *wb);
+static void wb_exit(struct bdi_writeback *wb);
+static struct bdi_writeback_ctx **wb_ctx_alloc(struct backing_dev_info *bdi,
+					       int num_ctxs);
+static void wb_ctx_free(struct backing_dev_info *bdi);
+
 #ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
@@ -469,6 +480,128 @@ static ssize_t strict_limit_show(struct device *dev,
 }
 static DEVICE_ATTR_RW(strict_limit);
 
+/*
+ * Change the number of writeback contexts of a bdi.
+ *
+ * The filesystem on the bdi is frozen, its inodes are flushed, the current
+ * contexts are shut down and freed, and a freshly allocated set of
+ * @nwritebacks contexts is installed and registered before the filesystem
+ * is thawed again.  Accepted values are 1..num_online_cpus().
+ */
+static ssize_t nwritebacks_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	unsigned int nwritebacks;
+	ssize_t ret;
+	struct super_block *sb = NULL;
+	struct bdi_writeback_ctx **wb_ctx;
+	struct bdi_writeback_ctx *bdi_wb_ctx;
+	struct inode *inode, *toput_inode = NULL;
+
+	ret = kstrtouint(buf, 10, &nwritebacks);
+	if (ret < 0)
+		return ret;
+
+	if (nwritebacks < 1 || nwritebacks > num_online_cpus())
+		return -EINVAL;
+
+	if (nwritebacks == bdi->nr_wb_ctx)
+		return count;
+
+	/* Freeze first so that no exit path has to undo wb_ctx_alloc(). */
+	sb = freeze_bdi_super(bdi);
+	if (!sb)
+		return -EBUSY;
+
+	wb_ctx = wb_ctx_alloc(bdi, nwritebacks);
+	if (!wb_ctx) {
+		ret = -ENOMEM;
+		goto out_thaw;
+	}
+
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		/* The flush below may sleep: never hold the list lock over it. */
+		spin_unlock(&sb->s_inode_list_lock);
+
+		/*
+		 * NOTE(review): truncate_inode_pages_final() and the I_CLEAR
+		 * check look meant for inode eviction; confirm they are valid
+		 * on inodes that are still in use.
+		 */
+		filemap_write_and_wait(inode->i_mapping);
+		truncate_inode_pages_final(inode->i_mapping);
+#ifdef CONFIG_CGROUP_WRITEBACK
+		if (inode->i_wb) {
+			WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
+			wb_put(inode->i_wb);
+			inode->i_wb = NULL;
+		}
+#endif
+		/*
+		 * Keep this inode pinned as the list cursor and release the
+		 * previous one now that no spinlock is held.
+		 */
+		iput(toput_inode);
+		toput_inode = inode;
+
+		cond_resched();
+		spin_lock(&sb->s_inode_list_lock);
+	}
+	spin_unlock(&sb->s_inode_list_lock);
+	iput(toput_inode);
+
+	for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) {
+		wb_shutdown(&bdi_wb_ctx->wb);
+		cgwb_bdi_unregister(bdi, bdi_wb_ctx);
+	}
+
+	for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) {
+		WARN_ON_ONCE(test_bit(WB_registered, &bdi_wb_ctx->wb.state));
+		wb_exit(&bdi_wb_ctx->wb);
+		kfree(bdi_wb_ctx);
+	}
+	kfree(bdi->wb_ctx);
+
+	ret = bdi_set_nwritebacks(bdi, nwritebacks);
+
+	bdi->wb_ctx = wb_ctx;
+
+	/* NOTE(review): the result of cgwb_bdi_init() is still unchecked. */
+	cgwb_bdi_init(bdi);
+	for_each_bdi_wb_ctx(bdi, bdi_wb_ctx) {
+		cgwb_bdi_register(bdi, bdi_wb_ctx);
+		set_bit(WB_registered, &bdi_wb_ctx->wb.state);
+	}
+
+	/* A sysfs store reports success as the number of bytes consumed. */
+	if (!ret)
+		ret = count;
+out_thaw:
+	thaw_super(sb, FREEZE_HOLDER_KERNEL, NULL);
+	deactivate_super(sb);
+
+	return ret;
+}
+
+static ssize_t nwritebacks_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%d\n", bdi->nr_wb_ctx);
+}
+static DEVICE_ATTR_RW(nwritebacks);
+
 static struct attribute *bdi_dev_attrs[] = {
 	&dev_attr_read_ahead_kb.attr,
 	&dev_attr_min_ratio.attr,
@@ -479,6 +612,7 @@ static struct attribute *bdi_dev_attrs[] = {
 	&dev_attr_max_bytes.attr,
 	&dev_attr_stable_pages_required.attr,
 	&dev_attr_strict_limit.attr,
+	&dev_attr_nwritebacks.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(bdi_dev);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6f283a777da6..1a43022affdd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -818,6 +818,14 @@ int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit
 	return 0;
 }
 
+int
bdi_set_nwritebacks(struct backing_dev_info *bdi, unsigned int nwritebacks)
+{
+	spin_lock_bh(&bdi_lock);
+	bdi->nr_wb_ctx = nwritebacks;
+	spin_unlock_bh(&bdi_lock);
+	return 0;
+}
+
 static unsigned long dirty_freerun_ceiling(unsigned long thresh,
 					   unsigned long bg_thresh)
 {
-- 
2.25.1

Implemented bdi_inc_writeback() to increase the writeback context count and
called this function at XFS mount time to set the desired count.

Signed-off-by: Kundan Kumar
Signed-off-by: Anuj Gupta
---
 fs/xfs/xfs_super.c          |  2 ++
 include/linux/backing-dev.h |  1 +
 mm/backing-dev.c            | 58 +++++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b3ec9141d902..aa97b59f53c6 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1783,6 +1783,8 @@ xfs_fs_fill_super(
 	if (error)
 		goto out_free_sb;
 
+	bdi_inc_writeback(sb->s_bdi, mp->m_sb.sb_agcount);
+
 	/*
 	 * V4 support is undergoing deprecation.
 	 *
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 14f53183b8d1..89a465e1964f 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -40,6 +40,7 @@ void wb_start_background_writeback(struct bdi_writeback *wb);
 void wb_workfn(struct work_struct *work);
 
 void wb_wait_for_completion(struct wb_completion *done);
+int bdi_inc_writeback(struct backing_dev_info *bdi, int nwritebacks);
 
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5bfb9bf3ce52..e450b3a9b952 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1219,6 +1219,86 @@ struct backing_dev_info *bdi_alloc(int node_id)
 }
 EXPORT_SYMBOL(bdi_alloc);
 
+/**
+ * bdi_inc_writeback - grow the number of writeback contexts of a bdi
+ * @bdi: backing device to grow
+ * @nwritebacks: requested number of writeback contexts
+ *
+ * The existing contexts are kept and only the missing ones are allocated.
+ * New contexts are registered and published only after all of them have
+ * been set up, so a failure leaves @bdi unchanged.
+ *
+ * NOTE(review): unlike the nwritebacks sysfs knob, the count is not capped
+ * at num_online_cpus() here - confirm that this is intended.
+ *
+ * Return: 0 on success or if @bdi already has at least @nwritebacks
+ * contexts, a negative errno otherwise.
+ */
+int bdi_inc_writeback(struct backing_dev_info *bdi, int nwritebacks)
+{
+	struct bdi_writeback_ctx **wb_ctx, **old_wb_ctx;
+	int nr_old = bdi->nr_wb_ctx;
+	int i, ret;
+
+	if (nwritebacks <= nr_old)
+		return 0;
+
+	wb_ctx = kcalloc(nwritebacks, sizeof(*wb_ctx), GFP_KERNEL);
+	if (!wb_ctx)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_old; i++)
+		wb_ctx[i] = bdi->wb_ctx[i];
+
+	for (i = nr_old; i < nwritebacks; i++) {
+		wb_ctx[i] = kzalloc(sizeof(*wb_ctx[i]), GFP_KERNEL);
+		if (!wb_ctx[i]) {
+			ret = -ENOMEM;
+			goto err_unwind;
+		}
+		INIT_LIST_HEAD(&wb_ctx[i]->wb_list);
+		init_waitqueue_head(&wb_ctx[i]->wb_waitq);
+#ifdef CONFIG_CGROUP_WRITEBACK
+		INIT_RADIX_TREE(&wb_ctx[i]->cgwb_tree, GFP_ATOMIC);
+		init_rwsem(&wb_ctx[i]->wb_switch_rwsem);
+#endif
+		ret = wb_init(&wb_ctx[i]->wb, wb_ctx[i], bdi, GFP_KERNEL);
+		if (ret) {
+			/* assumes a failed wb_init() needs no wb_exit() */
+			kfree(wb_ctx[i]);
+			goto err_unwind;
+		}
+#ifdef CONFIG_CGROUP_WRITEBACK
+		wb_ctx[i]->wb.memcg_css = &root_mem_cgroup->css;
+		wb_ctx[i]->wb.blkcg_css = blkcg_root_css;
+#endif
+	}
+
+	/* cgwb_bdi_register() cannot fail, so nothing is left to unwind. */
+	for (i = nr_old; i < nwritebacks; i++) {
+		cgwb_bdi_register(bdi, wb_ctx[i]);
+		set_bit(WB_registered, &wb_ctx[i]->wb.state);
+	}
+
+	spin_lock_bh(&bdi_lock);
+	old_wb_ctx = bdi->wb_ctx;
+	bdi->wb_ctx = wb_ctx;
+	bdi->nr_wb_ctx = nwritebacks;
+	spin_unlock_bh(&bdi_lock);
+	kfree(old_wb_ctx);
+	return 0;
+
+err_unwind:
+	/* Contexts in [nr_old, i) were fully set up but never registered. */
+	while (--i >= nr_old) {
+		wb_exit(&wb_ctx[i]->wb);
+		kfree(wb_ctx[i]);
+	}
+	kfree(wb_ctx);
+	return ret;
+}
+EXPORT_SYMBOL(bdi_inc_writeback);
+
 static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
 {
 	struct rb_node **p = &bdi_tree.rb_node;
-- 
2.25.1