Btrfs currently tracks its metadata pages in the page cache, using a
fake inode (fs_info->btree_inode) with offsets corresponding to where
the metadata is stored in the filesystem's full logical address space.

A consequence of this is that when btrfs uses filemap_add_folio(),
that memory is charged to the cgroup of whichever task happens to be
running at the time. These folios don't belong to any particular user
cgroup, so I don't think it makes much sense for them to be charged in
that way. This has some negative consequences:

- A task can be holding important btrfs locks, then need to look up
  some metadata and go into reclaim, extending the duration it holds
  those locks and unfairly pushing its own reclaim pain onto other
  cgroups.
- If that cgroup goes into reclaim, it might reclaim these folios that
  a different, non-reclaiming cgroup might need soon. This is
  naturally offset by LRU reclaim, but still.

A very similar proposal to use the root cgroup was previously made by
Qu, but he was advised by Christoph to instead introduce a _nocharge
variant of filemap_add_folio().

Link: https://lore.kernel.org/linux-mm/b5fef5372ae454a7b6da4f2f75c427aeab6a07d6.1727498749.git.wqu@suse.com/
Suggested-by: Christoph Hellwig
Signed-off-by: Boris Burkov
---
 include/linux/pagemap.h |  2 ++
 mm/filemap.c            | 23 +++++++++++++++++------
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e63fbfbd5b0f..acc8d390ecbb 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1237,6 +1237,8 @@ size_t fault_in_readable(const char __user *uaddr, size_t size);
 
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 		pgoff_t index, gfp_t gfp);
+int filemap_add_folio_nocharge(struct address_space *mapping,
+		struct folio *folio, pgoff_t index, gfp_t gfp);
 int filemap_add_folio(struct address_space *mapping, struct folio *folio,
 		pgoff_t index, gfp_t gfp);
 void filemap_remove_folio(struct folio *folio);
diff --git a/mm/filemap.c b/mm/filemap.c
index bada249b9fb7..ccc9cfb4d418 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -955,20 +955,15 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 }
 ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
 
-int filemap_add_folio(struct address_space *mapping, struct folio *folio,
+int filemap_add_folio_nocharge(struct address_space *mapping, struct folio *folio,
 		pgoff_t index, gfp_t gfp)
 {
 	void *shadow = NULL;
 	int ret;
 
-	ret = mem_cgroup_charge(folio, NULL, gfp);
-	if (ret)
-		return ret;
-
 	__folio_set_locked(folio);
 	ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
 	if (unlikely(ret)) {
-		mem_cgroup_uncharge(folio);
 		__folio_clear_locked(folio);
 	} else {
 		/*
@@ -986,6 +981,22 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(filemap_add_folio_nocharge);
+
+int filemap_add_folio(struct address_space *mapping, struct folio *folio,
+		pgoff_t index, gfp_t gfp)
+{
+	int ret;
+
+	ret = mem_cgroup_charge(folio, NULL, gfp);
+	if (ret)
+		return ret;
+
+	ret = filemap_add_folio_nocharge(mapping, folio, index, gfp);
+	if (ret)
+		mem_cgroup_uncharge(folio);
+	return ret;
+}
 EXPORT_SYMBOL_GPL(filemap_add_folio);
 
 #ifdef CONFIG_NUMA
-- 
2.50.1
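(Aside, for illustration: with the patch above applied, a caller that
wants its page cache folios kept out of memcg accounting calls the new
helper in place of filemap_add_folio(). A minimal sketch, not part of
this series; "mapping" and "index" stand in for the caller's own
values:

	struct folio *folio;
	int ret;

	/* Allocate an order-0 folio; GFP_NOFS as a typical fs choice. */
	folio = filemap_alloc_folio(GFP_NOFS, 0);
	if (!folio)
		return -ENOMEM;

	/* Insert into the page cache without charging the running task's memcg. */
	ret = filemap_add_folio_nocharge(mapping, folio, index, GFP_NOFS);
	if (ret)
		folio_put(folio);
	else
		folio_unlock(folio);	/* normally after filling the contents */

Like filemap_add_folio(), the _nocharge variant returns with the folio
locked on success, so the caller unlocks it once initialized, and drops
its own reference on failure.)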
Use filemap_add_folio_nocharge() for btrfs metadata folios: don't block
on cgroup reclaim or subject shared extent_buffers to cgroup reclaim.

Signed-off-by: Boris Burkov
---
 fs/btrfs/extent_io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f9ccbe5a203a..1f25447f1e39 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3258,8 +3258,8 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
 
 retry:
 	existing_folio = NULL;
-	ret = filemap_add_folio(mapping, eb->folios[i], index + i,
-				GFP_NOFS | __GFP_NOFAIL);
+	ret = filemap_add_folio_nocharge(mapping, eb->folios[i], index + i,
+					 GFP_NOFS | __GFP_NOFAIL);
 	if (!ret)
 		goto finish;
 
-- 
2.50.1

If cgroups are configured into the kernel, then uncharged page cache
pages can only come from filemap_add_folio_nocharge(). Track such
uncharged folios in vmstat so that they are still accounted for.

Suggested-by: Shakeel Butt
Signed-off-by: Boris Burkov
---
 include/linux/mmzone.h |  3 +++
 mm/filemap.c           | 18 ++++++++++++++++++
 mm/vmstat.c            |  3 +++
 3 files changed, 24 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 283913d42d7b..a945dec65371 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -241,6 +241,9 @@ enum node_stat_item {
 	NR_HUGETLB,
 #endif
 	NR_BALLOON_PAGES,
+#ifdef CONFIG_MEMCG
+	NR_UNCHARGED_FILE_PAGES,
+#endif
 	NR_VM_NODE_STAT_ITEMS
 };
 
diff --git a/mm/filemap.c b/mm/filemap.c
index ccc9cfb4d418..0a258b4a9246 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -146,6 +146,22 @@ static void page_cache_delete(struct address_space *mapping,
 	mapping->nrpages -= nr;
 }
 
+#ifdef CONFIG_MEMCG
+static void filemap_mod_uncharged_vmstat(struct folio *folio, int sign)
+{
+	long nr = folio_nr_pages(folio) * sign;
+
+	if (!folio_memcg(folio))
+		__lruvec_stat_mod_folio(folio, NR_UNCHARGED_FILE_PAGES, nr);
+}
+#else
+static void filemap_mod_uncharged_vmstat(struct folio *folio, int sign)
+{
+	return;
+}
+#endif
+
+
 static void filemap_unaccount_folio(struct address_space *mapping,
 		struct folio *folio)
 {
@@ -190,6 +206,7 @@ static void filemap_unaccount_folio(struct address_space *mapping,
 		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
 		filemap_nr_thps_dec(mapping);
 	}
+	filemap_mod_uncharged_vmstat(folio, -1);
 
 	/*
 	 * At this point folio must be either written or cleaned by
@@ -978,6 +995,7 @@ int filemap_add_folio_nocharge(struct address_space *mapping, struct folio *foli
 		if (!(gfp & __GFP_WRITE) && shadow)
 			workingset_refault(folio, shadow);
 		folio_add_lru(folio);
+		filemap_mod_uncharged_vmstat(folio, 1);
 	}
 	return ret;
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a78d70ddeacd..63318742ae5a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1281,6 +1281,9 @@ const char * const vmstat_text[] = {
 	"nr_hugetlb",
 #endif
 	"nr_balloon_pages",
+#ifdef CONFIG_MEMCG
+	"nr_uncharged_file_pages",
+#endif
 	/* system-wide enum vm_stat_item counters */
 	"nr_dirty_threshold",
 	"nr_dirty_background_threshold",
-- 
2.50.1
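(With the vmstat patch applied, the uncharged metadata folios remain
visible system-wide rather than vanishing from all accounting. On a
CONFIG_MEMCG kernel with a btrfs filesystem mounted, something like the
following is expected; the count shown is made up:

	$ grep nr_uncharged_file_pages /proc/vmstat
	nr_uncharged_file_pages 8192

The counter is a node stat item, so it also appears in each node's
/sys/devices/system/node/nodeN/vmstat.)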