All the zsmalloc functions that operate on a zsmalloc object (encoded location values) are named "zs_obj_xxx", except for zs_object_copy. Rename zs_object_copy to zs_obj_copy to conform to the pattern. No functional changes intended. Signed-off-by: Joshua Hahn --- mm/zsmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d5d1c27b3852..0ca2e94af5ad 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1416,7 +1416,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_free); -static void zs_object_copy(struct size_class *class, unsigned long dst, +static void zs_obj_copy(struct size_class *class, unsigned long dst, unsigned long src) { struct zpdesc *s_zpdesc, *d_zpdesc; @@ -1537,7 +1537,7 @@ static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, used_obj = handle_to_obj(handle); free_obj = obj_malloc(pool, dst_zspage, handle); - zs_object_copy(class, free_obj, used_obj); + zs_obj_copy(class, free_obj, used_obj); obj_idx++; obj_free(class->size, used_obj); -- 2.47.3 object indices, which describe the location of an object in a zspage, cannot be negative. To reflect this most helpers calculate and return these values as unsigned ints. Convert find_alloced_obj, the only function that calculates obj_idx as a signed int, to use an unsigned int as well. No functional change intended. Signed-off-by: Joshua Hahn --- mm/zsmalloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 0ca2e94af5ad..7846f31bcc8b 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1491,10 +1491,11 @@ static void zs_obj_copy(struct size_class *class, unsigned long dst, * return handle. 
*/ static unsigned long find_alloced_obj(struct size_class *class, - struct zpdesc *zpdesc, int *obj_idx) + struct zpdesc *zpdesc, + unsigned int *obj_idx) { unsigned int offset; - int index = *obj_idx; + unsigned int index = *obj_idx; unsigned long handle = 0; void *addr = kmap_local_zpdesc(zpdesc); @@ -1521,7 +1522,7 @@ static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, { unsigned long used_obj, free_obj; unsigned long handle; - int obj_idx = 0; + unsigned int obj_idx = 0; struct zpdesc *s_zpdesc = get_first_zpdesc(src_zspage); struct size_class *class = pool->size_class[src_zspage->class]; -- 2.47.3 Introduce an array of struct obj_cgroup pointers to zpdesc to keep track of compressed objects' memcg ownership. The 8 bytes required to add the array in struct zpdesc brings its size up from 56 bytes to 64 bytes. However, in the current implementation, struct zpdesc lays on top of struct page[1]. This allows the increased size to remain invisible to the outside, since 64 bytes are used for struct zpdesc anyways. The newly added obj_cgroup array pointer overlays page->memcg_data, which causes problems for functions that try to perform page charging by checking the zeroness of page->memcg_data. To make sure that the backing zpdesc's obj_cgroup ** is not interpreted as a mem_cgroup *, follow SLUB's lead and use the MEMCG_DATA_OBJEXTS bit to tag the pointer. Consumers of zsmalloc that do not perform memcg accounting (i.e. zram) are completely unaffected by this patch, as the array to track the obj_cgroup pointers are only allocated in the zswap path. This patch temporarily increases the memory used by zswap by 8 bytes per zswap_entry, since the obj_cgroup pointer is duplicated in the zpdesc and in zswap_entry. In the following patches, we will redirect memory charging operations to use the zpdesc's obj_cgroup instead, and remove the pointer from zswap_entry. This will leave no net memory usage increase for both zram and zswap. 
In this patch, allocate / free the objcg pointer array for the zswap path, and handle partial object migration and full zpdesc migration. [1] In the (near) future, struct zpdesc may no longer overlay struct page as we shift towards using memdescs. When this happens, the size increase of struct zpdesc will no longer be free. With that said, the difference can be kept minimal. All the changes that are being implemented are currently guarded under CONFIG_MEMCG. We can optionally minimize the impact on zram users by guarding these changes in CONFIG_MEMCG && CONFIG_ZSWAP as well. Suggested-by: Johannes Weiner Signed-off-by: Joshua Hahn --- drivers/block/zram/zram_drv.c | 10 ++--- include/linux/zsmalloc.h | 2 +- mm/zpdesc.h | 25 +++++++++++- mm/zsmalloc.c | 74 +++++++++++++++++++++++++++++------ mm/zswap.c | 2 +- 5 files changed, 93 insertions(+), 20 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 61d3e2c74901..60ee85679730 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2220,8 +2220,8 @@ static int write_incompressible_page(struct zram *zram, struct page *page, * like we do for compressible pages. 
*/ handle = zs_malloc(zram->mem_pool, PAGE_SIZE, - GFP_NOIO | __GFP_NOWARN | - __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page)); + GFP_NOIO | __GFP_NOWARN | __GFP_HIGHMEM | + __GFP_MOVABLE, page_to_nid(page), false); if (IS_ERR_VALUE(handle)) return PTR_ERR((void *)handle); @@ -2283,8 +2283,8 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) } handle = zs_malloc(zram->mem_pool, comp_len, - GFP_NOIO | __GFP_NOWARN | - __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page)); + GFP_NOIO | __GFP_NOWARN | __GFP_HIGHMEM | + __GFP_MOVABLE, page_to_nid(page), false); if (IS_ERR_VALUE(handle)) { zcomp_stream_put(zstrm); return PTR_ERR((void *)handle); @@ -2514,7 +2514,7 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, handle_new = zs_malloc(zram->mem_pool, comp_len_new, GFP_NOIO | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_MOVABLE, - page_to_nid(page)); + page_to_nid(page), false); if (IS_ERR_VALUE(handle_new)) { zcomp_stream_put(zstrm); return PTR_ERR((void *)handle_new); diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 478410c880b1..8ef28b964bb0 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -28,7 +28,7 @@ struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags, - const int nid); + const int nid, bool objcg); void zs_free(struct zs_pool *pool, unsigned long obj); size_t zs_huge_class_size(struct zs_pool *pool); diff --git a/mm/zpdesc.h b/mm/zpdesc.h index b8258dc78548..d10a73e4a90e 100644 --- a/mm/zpdesc.h +++ b/mm/zpdesc.h @@ -20,10 +20,12 @@ * @zspage: Points to the zspage this zpdesc is a part of. * @first_obj_offset: First object offset in zsmalloc pool. * @_refcount: The number of references to this zpdesc. + * @objcgs: Array of objcgs pointers that the stored objs + * belong to. 
Overlayed on top of page->memcg_data, and + * will always have first bit set if it is a valid pointer. * * This struct overlays struct page for now. Do not modify without a good - * understanding of the issues. In particular, do not expand into the overlap - * with memcg_data. + * understanding of the issues. * * Page flags used: * * PG_private identifies the first component page. @@ -47,6 +49,9 @@ struct zpdesc { */ unsigned int first_obj_offset; atomic_t _refcount; +#ifdef CONFIG_MEMCG + unsigned long objcgs; +#endif }; #define ZPDESC_MATCH(pg, zp) \ static_assert(offsetof(struct page, pg) == offsetof(struct zpdesc, zp)) @@ -59,6 +64,9 @@ ZPDESC_MATCH(__folio_index, handle); ZPDESC_MATCH(private, zspage); ZPDESC_MATCH(page_type, first_obj_offset); ZPDESC_MATCH(_refcount, _refcount); +#ifdef CONFIG_MEMCG +ZPDESC_MATCH(memcg_data, objcgs); +#endif #undef ZPDESC_MATCH static_assert(sizeof(struct zpdesc) <= sizeof(struct page)); @@ -171,4 +179,17 @@ static inline bool zpdesc_is_locked(struct zpdesc *zpdesc) { return folio_test_locked(zpdesc_folio(zpdesc)); } + +#ifdef CONFIG_MEMCG +static inline struct obj_cgroup **zpdesc_objcgs(struct zpdesc *zpdesc) +{ + return (struct obj_cgroup **)(zpdesc->objcgs & ~OBJEXTS_FLAGS_MASK); +} + +static inline void zpdesc_set_objcgs(struct zpdesc *zpdesc, + struct obj_cgroup **objcgs) +{ + zpdesc->objcgs = (unsigned long)objcgs | MEMCG_DATA_OBJEXTS; +} +#endif #endif diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7846f31bcc8b..7d56bb700e11 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "zpdesc.h" #define ZSPAGE_MAGIC 0x58 @@ -777,6 +778,10 @@ static void reset_zpdesc(struct zpdesc *zpdesc) ClearPagePrivate(page); zpdesc->zspage = NULL; zpdesc->next = NULL; +#ifdef CONFIG_MEMCG + kfree(zpdesc_objcgs(zpdesc)); + zpdesc->objcgs = 0; +#endif /* PageZsmalloc is sticky until the page is freed to the buddy. 
*/ } @@ -893,6 +898,43 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) set_freeobj(zspage, 0); } +#ifdef CONFIG_MEMCG +static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp, + struct zpdesc *zpdescs[]) +{ + /* + * Add 2 to objcgs_per_zpdesc to account for partial objs that may be + * stored at the beginning or end of the zpdesc. + */ + int objcgs_per_zpdesc = (PAGE_SIZE / class->size) + 2; + int i; + struct obj_cgroup **objcgs; + + for (i = 0; i < class->pages_per_zspage; i++) { + objcgs = kcalloc(objcgs_per_zpdesc, sizeof(struct obj_cgroup *), + gfp & ~__GFP_HIGHMEM); + if (!objcgs) { + while (--i >= 0) { + kfree(zpdesc_objcgs(zpdescs[i])); + zpdescs[i]->objcgs = 0; + } + + return false; + } + + zpdesc_set_objcgs(zpdescs[i], objcgs); + } + + return true; +} +#else +static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp, + struct zpdesc *zpdescs[]) +{ + return true; +} +#endif + static void create_page_chain(struct size_class *class, struct zspage *zspage, struct zpdesc *zpdescs[]) { @@ -931,7 +973,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, */ static struct zspage *alloc_zspage(struct zs_pool *pool, struct size_class *class, - gfp_t gfp, const int nid) + gfp_t gfp, const int nid, bool objcg) { int i; struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE]; @@ -952,24 +994,29 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, struct zpdesc *zpdesc; zpdesc = alloc_zpdesc(gfp, nid); - if (!zpdesc) { - while (--i >= 0) { - zpdesc_dec_zone_page_state(zpdescs[i]); - free_zpdesc(zpdescs[i]); - } - cache_free_zspage(zspage); - return NULL; - } + if (!zpdesc) + goto err; __zpdesc_set_zsmalloc(zpdesc); zpdesc_inc_zone_page_state(zpdesc); zpdescs[i] = zpdesc; } + if (objcg && !alloc_zspage_objcgs(class, gfp, zpdescs)) + goto err; + create_page_chain(class, zspage, zpdescs); init_zspage(class, zspage); return zspage; + +err: + while (--i >= 0) { + 
zpdesc_dec_zone_page_state(zpdescs[i]); + free_zpdesc(zpdescs[i]); + } + cache_free_zspage(zspage); + return NULL; } static struct zspage *find_get_zspage(struct size_class *class) @@ -1289,13 +1336,14 @@ static unsigned long obj_malloc(struct zs_pool *pool, * @size: size of block to allocate * @gfp: gfp flags when allocating object * @nid: The preferred node id to allocate new zspage (if needed) + * @objcg: Whether the zspage should track per-object memory charging. * * On success, handle to the allocated object is returned, * otherwise an ERR_PTR(). * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp, - const int nid) + const int nid, bool objcg) { unsigned long handle; struct size_class *class; @@ -1330,7 +1378,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp, spin_unlock(&class->lock); - zspage = alloc_zspage(pool, class, gfp, nid); + zspage = alloc_zspage(pool, class, gfp, nid, objcg); if (!zspage) { cache_free_handle(handle); return (unsigned long)ERR_PTR(-ENOMEM); @@ -1672,6 +1720,10 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, if (unlikely(ZsHugePage(zspage))) newzpdesc->handle = oldzpdesc->handle; __zpdesc_set_movable(newzpdesc); +#ifdef CONFIG_MEMCG + zpdesc_set_objcgs(newzpdesc, zpdesc_objcgs(oldzpdesc)); + oldzpdesc->objcgs = 0; +#endif } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) diff --git a/mm/zswap.c b/mm/zswap.c index af3f0fbb0558..dd083110bfa0 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -905,7 +905,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, } gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE; - handle = zs_malloc(pool->zs_pool, dlen, gfp, page_to_nid(page)); + handle = zs_malloc(pool->zs_pool, dlen, gfp, page_to_nid(page), true); if (IS_ERR_VALUE(handle)) { alloc_ret = PTR_ERR((void *)handle); goto unlock; -- 2.47.3 With each 
zswap-backing zpdesc now having an array of obj_cgroup pointers, plumb the obj_cgroup pointer from the zswap / zram layer down to zsmalloc. Introduce two helper functions zpdesc_obj_cgroup and zpdesc_set_obj_cgroup, which abstract the conversion of an object's zspage idx to its zpdesc idx and the retrieval of the obj_cgroup pointer from the zpdesc. From the zswap path, store the obj_cgroup pointer after compression when writing the object and free when the object gets freed. Also handle the migration of an object across zpdescs. The lifetime and charging of the obj_cgroup is still handled in the zswap layer. Suggested-by: Johannes Weiner Signed-off-by: Joshua Hahn --- drivers/block/zram/zram_drv.c | 7 ++-- include/linux/zsmalloc.h | 3 +- mm/zsmalloc.c | 71 ++++++++++++++++++++++++++++++++++- mm/zswap.c | 6 +-- 4 files changed, 79 insertions(+), 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 60ee85679730..209668b14428 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2231,7 +2231,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page, } src = kmap_local_page(page); - zs_obj_write(zram->mem_pool, handle, src, PAGE_SIZE); + zs_obj_write(zram->mem_pool, handle, src, PAGE_SIZE, NULL); kunmap_local(src); slot_lock(zram, index); @@ -2296,7 +2296,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) return -ENOMEM; } - zs_obj_write(zram->mem_pool, handle, zstrm->buffer, comp_len); + zs_obj_write(zram->mem_pool, handle, zstrm->buffer, comp_len, NULL); zcomp_stream_put(zstrm); slot_lock(zram, index); @@ -2520,7 +2520,8 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, return PTR_ERR((void *)handle_new); } - zs_obj_write(zram->mem_pool, handle_new, zstrm->buffer, comp_len_new); + zs_obj_write(zram->mem_pool, handle_new, zstrm->buffer, + comp_len_new, NULL); zcomp_stream_put(zstrm); slot_free(zram, index); diff --git 
a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 8ef28b964bb0..22f3baa13f24 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -15,6 +15,7 @@ #define _ZS_MALLOC_H_ #include +#include struct zs_pool_stats { /* How many pages were migrated (freed) */ @@ -48,7 +49,7 @@ void zs_obj_read_sg_begin(struct zs_pool *pool, unsigned long handle, struct scatterlist *sg, size_t mem_len); void zs_obj_read_sg_end(struct zs_pool *pool, unsigned long handle); void zs_obj_write(struct zs_pool *pool, unsigned long handle, - void *handle_mem, size_t mem_len); + void *handle_mem, size_t mem_len, struct obj_cgroup *objcg); extern const struct movable_operations zsmalloc_mops; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7d56bb700e11..e5ae9a0fc78a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -899,6 +899,41 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) } #ifdef CONFIG_MEMCG +/* idx is indexed per-zspage, not per-zpdesc. */ +static inline struct obj_cgroup *zpdesc_obj_cgroup(struct zpdesc *zpdesc, + unsigned int idx, + int size) +{ + struct obj_cgroup **objcgs = zpdesc_objcgs(zpdesc); + unsigned int off = offset_in_page(size * idx); + unsigned int zpdesc_idx = DIV_ROUND_UP(off, size); + + if (!objcgs) + return NULL; + + return objcgs[zpdesc_idx]; +} + +/* idx is indexed per-zspage, not per-zpdesc. 
*/ +static inline void zpdesc_set_obj_cgroup(struct zpdesc *zpdesc, + unsigned int idx, int size, + struct obj_cgroup *objcg) +{ + struct obj_cgroup **objcgs = zpdesc_objcgs(zpdesc); + unsigned int off = offset_in_page(size * idx); + unsigned int zpdesc_idx = DIV_ROUND_UP(off, size); + + if (!objcgs) + return; + + objcgs[zpdesc_idx] = objcg; + if (off + size > PAGE_SIZE) { + /* object spans two pages */ + objcgs = zpdesc_objcgs(get_next_zpdesc(zpdesc)); + objcgs[0] = objcg; + } +} + static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp, struct zpdesc *zpdescs[]) { @@ -927,12 +962,40 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp, return true; } + +static void migrate_obj_objcg(unsigned long used_obj, unsigned long free_obj, + int size) +{ + unsigned int s_obj_idx, d_obj_idx; + struct zpdesc *s_zpdesc, *d_zpdesc; + struct obj_cgroup *objcg; + + obj_to_location(used_obj, &s_zpdesc, &s_obj_idx); + obj_to_location(free_obj, &d_zpdesc, &d_obj_idx); + objcg = zpdesc_obj_cgroup(s_zpdesc, s_obj_idx, size); + + zpdesc_set_obj_cgroup(d_zpdesc, d_obj_idx, size, objcg); + zpdesc_set_obj_cgroup(s_zpdesc, s_obj_idx, size, NULL); +} #else +static inline struct obj_cgroup *zpdesc_obj_cgroup(struct zpdesc *zpdesc, + unsigned int offset, + int size) +{ + return NULL; +} + +static inline void zpdesc_set_obj_cgroup(struct zpdesc *zpdesc, + unsigned int offset, int size, + struct obj_cgroup *objcg) {} static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp, struct zpdesc *zpdescs[]) { return true; } + +static void migrate_obj_objcg(unsigned long used_obj, unsigned long free_obj, + int size) {} #endif static void create_page_chain(struct size_class *class, struct zspage *zspage, @@ -1221,7 +1284,7 @@ void zs_obj_read_sg_end(struct zs_pool *pool, unsigned long handle) EXPORT_SYMBOL_GPL(zs_obj_read_sg_end); void zs_obj_write(struct zs_pool *pool, unsigned long handle, - void *handle_mem, size_t mem_len) + void *handle_mem, size_t mem_len, 
struct obj_cgroup *objcg) { struct zspage *zspage; struct zpdesc *zpdesc; @@ -1242,6 +1305,9 @@ void zs_obj_write(struct zs_pool *pool, unsigned long handle, class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); + if (objcg) + zpdesc_set_obj_cgroup(zpdesc, obj_idx, class->size, objcg); + if (!ZsHugePage(zspage)) off += ZS_HANDLE_SIZE; @@ -1415,6 +1481,8 @@ static void obj_free(int class_size, unsigned long obj) f_offset = offset_in_page(class_size * f_objidx); zspage = get_zspage(f_zpdesc); + zpdesc_set_obj_cgroup(f_zpdesc, f_objidx, class_size, NULL); + vaddr = kmap_local_zpdesc(f_zpdesc); link = (struct link_free *)(vaddr + f_offset); @@ -1587,6 +1655,7 @@ static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, used_obj = handle_to_obj(handle); free_obj = obj_malloc(pool, dst_zspage, handle); zs_obj_copy(class, free_obj, used_obj); + migrate_obj_objcg(used_obj, free_obj, class->size); obj_idx++; obj_free(class->size, used_obj); diff --git a/mm/zswap.c b/mm/zswap.c index dd083110bfa0..1e2d60f47919 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -851,7 +851,7 @@ static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx) } static bool zswap_compress(struct page *page, struct zswap_entry *entry, - struct zswap_pool *pool) + struct zswap_pool *pool, struct obj_cgroup *objcg) { struct crypto_acomp_ctx *acomp_ctx; struct scatterlist input, output; @@ -911,7 +911,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, goto unlock; } - zs_obj_write(pool->zs_pool, handle, dst, dlen); + zs_obj_write(pool->zs_pool, handle, dst, dlen, objcg); entry->handle = handle; entry->length = dlen; @@ -1413,7 +1413,7 @@ static bool zswap_store_page(struct page *page, return false; } - if (!zswap_compress(page, entry, pool)) + if (!zswap_compress(page, entry, pool, objcg)) goto compress_failed; old = xa_store(swap_zswap_tree(page_swpentry), -- 2.47.3 Now that obj_cgroups are tracked in zpdesc, redirect the zswap 
layer to use the pointer stored in the zpdesc and remove the pointer in struct zswap_entry. This offsets the temporary memory increase caused by the duplicate storage of the obj_cgroup pointer and results in a net zero memory footprint change. The lifetime and charging of the obj_cgroup is still handled in the zswap layer. Clean up mem_cgroup_from_entry, which has no more callers. Suggested-by: Johannes Weiner Signed-off-by: Joshua Hahn --- include/linux/zsmalloc.h | 1 + mm/zsmalloc.c | 29 +++++++++++++++++++++++ mm/zswap.c | 51 ++++++++++++++++++---------------------- 3 files changed, 53 insertions(+), 28 deletions(-) diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 22f3baa13f24..05b2b163a427 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -38,6 +38,7 @@ unsigned long zs_get_total_pages(struct zs_pool *pool); unsigned long zs_compact(struct zs_pool *pool); unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size); +struct obj_cgroup *zs_lookup_objcg(struct zs_pool *pool, unsigned long handle); void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e5ae9a0fc78a..067215a6ddcc 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -977,6 +977,30 @@ static void migrate_obj_objcg(unsigned long used_obj, unsigned long free_obj, zpdesc_set_obj_cgroup(d_zpdesc, d_obj_idx, size, objcg); zpdesc_set_obj_cgroup(s_zpdesc, s_obj_idx, size, NULL); } + +struct obj_cgroup *zs_lookup_objcg(struct zs_pool *pool, unsigned long handle) +{ + unsigned long obj; + struct zpdesc *zpdesc; + struct zspage *zspage; + struct size_class *class; + struct obj_cgroup *objcg; + unsigned int obj_idx; + + read_lock(&pool->lock); + obj = handle_to_obj(handle); + obj_to_location(obj, &zpdesc, &obj_idx); + + zspage = get_zspage(zpdesc); + zspage_read_lock(zspage); + read_unlock(&pool->lock); + + class = zspage_class(pool, zspage); + objcg = zpdesc_obj_cgroup(zpdesc, obj_idx, 
class->size); + zspage_read_unlock(zspage); + + return objcg; +} #else static inline struct obj_cgroup *zpdesc_obj_cgroup(struct zpdesc *zpdesc, unsigned int offset, @@ -996,6 +1020,11 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp, static void migrate_obj_objcg(unsigned long used_obj, unsigned long free_obj, int size) {} + +struct obj_cgroup *zs_lookup_objcg(struct zs_pool *pool, unsigned long handle) +{ + return NULL; +} #endif static void create_page_chain(struct size_class *class, struct zspage *zspage, diff --git a/mm/zswap.c b/mm/zswap.c index 1e2d60f47919..55161a5c9d4c 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -193,7 +193,6 @@ struct zswap_entry { bool referenced; struct zswap_pool *pool; unsigned long handle; - struct obj_cgroup *objcg; struct list_head lru; }; @@ -601,25 +600,13 @@ static int zswap_enabled_param_set(const char *val, * lru functions **********************************/ -/* should be called under RCU */ -#ifdef CONFIG_MEMCG -static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) -{ - return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL; -} -#else -static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) -{ - return NULL; -} -#endif - static inline int entry_to_nid(struct zswap_entry *entry) { return page_to_nid(virt_to_page(entry)); } -static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) +static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry, + struct obj_cgroup *objcg) { int nid = entry_to_nid(entry); struct mem_cgroup *memcg; @@ -636,19 +623,20 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) * Similar reasoning holds for list_lru_del(). */ rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); + memcg = objcg ? 
obj_cgroup_memcg(objcg) : NULL; /* will always succeed */ list_lru_add(list_lru, &entry->lru, nid, memcg); rcu_read_unlock(); } -static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) +static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry, + struct obj_cgroup *objcg) { int nid = entry_to_nid(entry); struct mem_cgroup *memcg; rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); + memcg = objcg ? obj_cgroup_memcg(objcg) : NULL; /* will always succeed */ list_lru_del(list_lru, &entry->lru, nid, memcg); rcu_read_unlock(); @@ -716,12 +704,16 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) */ static void zswap_entry_free(struct zswap_entry *entry) { - zswap_lru_del(&zswap_list_lru, entry); + struct obj_cgroup *objcg = zs_lookup_objcg(entry->pool->zs_pool, + entry->handle); + + zswap_lru_del(&zswap_list_lru, entry, objcg); zs_free(entry->pool->zs_pool, entry->handle); zswap_pool_put(entry->pool); - if (entry->objcg) { - obj_cgroup_uncharge_zswap(entry->objcg, entry->length); - obj_cgroup_put(entry->objcg); + + if (objcg) { + obj_cgroup_uncharge_zswap(objcg, entry->length); + obj_cgroup_put(objcg); } if (entry->length == PAGE_SIZE) atomic_long_dec(&zswap_stored_incompressible_pages); @@ -994,6 +986,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, struct mempolicy *mpol; bool folio_was_allocated; struct swap_info_struct *si; + struct obj_cgroup *objcg; int ret = 0; /* try to allocate swap cache folio */ @@ -1043,8 +1036,9 @@ static int zswap_writeback_entry(struct zswap_entry *entry, xa_erase(tree, offset); count_vm_event(ZSWPWB); - if (entry->objcg) - count_objcg_events(entry->objcg, ZSWPWB, 1); + objcg = zs_lookup_objcg(entry->pool->zs_pool, entry->handle); + if (objcg) + count_objcg_events(objcg, ZSWPWB, 1); zswap_entry_free(entry); @@ -1463,11 +1457,10 @@ static bool zswap_store_page(struct page *page, */ entry->pool = pool; entry->swpentry = page_swpentry; - entry->objcg = objcg; 
entry->referenced = true; if (entry->length) { INIT_LIST_HEAD(&entry->lru); - zswap_lru_add(&zswap_list_lru, entry); + zswap_lru_add(&zswap_list_lru, entry, objcg); } return true; @@ -1592,6 +1585,7 @@ int zswap_load(struct folio *folio) bool swapcache = folio_test_swapcache(folio); struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; + struct obj_cgroup *objcg; VM_WARN_ON_ONCE(!folio_test_locked(folio)); @@ -1620,8 +1614,9 @@ int zswap_load(struct folio *folio) folio_mark_uptodate(folio); count_vm_event(ZSWPIN); - if (entry->objcg) - count_objcg_events(entry->objcg, ZSWPIN, 1); + objcg = zs_lookup_objcg(entry->pool->zs_pool, entry->handle); + if (objcg) + count_objcg_events(objcg, ZSWPIN, 1); /* * When reading into the swapcache, invalidate our entry. The -- 2.47.3 Now that zswap_entries do not directly track obj_cgroups of the entries, handle the lifetime management and charging of these entries into the zsmalloc layer. One functional change is that zswap entries are now no longer accounted by the size of the compressed object, but by the size of the size_class slot they occupy. This brings the charging one step closer to an accurate representation of the memory consumed in the zpdesc; even if a compressed object doesn't consume the entirety of a obj slot, the hole it creates between the objects is dead space the obj is accountable for. Thus, account the memory each object makes unusable, not the amount of memory each object takes up. 
Signed-off-by: Joshua Hahn --- include/linux/memcontrol.h | 10 ------- mm/memcontrol.c | 51 ---------------------------------- mm/zsmalloc.c | 57 ++++++++++++++++++++++++++++++++++++-- mm/zswap.c | 8 ------ 4 files changed, 55 insertions(+), 71 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b6c82c8f73e1..dd4278b1ca35 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1824,22 +1824,12 @@ static inline bool memcg_is_dying(struct mem_cgroup *memcg) #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); -void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); -void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg); #else static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) { return true; } -static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, - size_t size) -{ -} -static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, - size_t size) -{ -} static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) { /* if zswap is disabled, do not block pages going to the swapping device */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 007413a53b45..3432e1afc037 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5433,57 +5433,6 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) return ret; } -/** - * obj_cgroup_charge_zswap - charge compression backend memory - * @objcg: the object cgroup - * @size: size of compressed object - * - * This forces the charge after obj_cgroup_may_zswap() allowed - * compression and storage in zswap for this cgroup to go ahead. 
- */ -void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) -{ - struct mem_cgroup *memcg; - - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - return; - - VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC)); - - /* PF_MEMALLOC context, charging must succeed */ - if (obj_cgroup_charge(objcg, GFP_KERNEL, size)) - VM_WARN_ON_ONCE(1); - - rcu_read_lock(); - memcg = obj_cgroup_memcg(objcg); - mod_memcg_state(memcg, MEMCG_ZSWAP_B, size); - mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1); - rcu_read_unlock(); -} - -/** - * obj_cgroup_uncharge_zswap - uncharge compression backend memory - * @objcg: the object cgroup - * @size: size of compressed object - * - * Uncharges zswap memory on page in. - */ -void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) -{ - struct mem_cgroup *memcg; - - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - return; - - obj_cgroup_uncharge(objcg, size); - - rcu_read_lock(); - memcg = obj_cgroup_memcg(objcg); - mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); - mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1); - rcu_read_unlock(); -} - bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) { /* if zswap is disabled, do not block pages going to the swapping device */ diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 067215a6ddcc..88c7cd399261 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -963,6 +963,44 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp, return true; } +static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg, + int size, unsigned long offset) +{ + struct mem_cgroup *memcg; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return; + + VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC)); + + /* PF_MEMALLOC context, charging must succeed */ + if (obj_cgroup_charge(objcg, GFP_KERNEL, size)) + VM_WARN_ON_ONCE(1); + + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); + mod_memcg_state(memcg, MEMCG_ZSWAP_B, size); + mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1); + 
rcu_read_unlock(); +} + +static void zs_uncharge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg, + int size, unsigned long offset) +{ + struct mem_cgroup *memcg; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return; + + obj_cgroup_uncharge(objcg, size); + + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); + mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); + mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1); + rcu_read_unlock(); +} + static void migrate_obj_objcg(unsigned long used_obj, unsigned long free_obj, int size) { @@ -1018,6 +1056,12 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp, return true; } +static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg, + int size, unsigned long offset) {} + +static void zs_uncharge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg, + int size, unsigned long offset) {} + static void migrate_obj_objcg(unsigned long used_obj, unsigned long free_obj, int size) {} @@ -1334,8 +1378,11 @@ void zs_obj_write(struct zs_pool *pool, unsigned long handle, class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); - if (objcg) + if (objcg) { + obj_cgroup_get(objcg); + zs_charge_objcg(zpdesc, objcg, class->size, off); zpdesc_set_obj_cgroup(zpdesc, obj_idx, class->size, objcg); + } if (!ZsHugePage(zspage)) off += ZS_HANDLE_SIZE; @@ -1501,6 +1548,7 @@ static void obj_free(int class_size, unsigned long obj) struct link_free *link; struct zspage *zspage; struct zpdesc *f_zpdesc; + struct obj_cgroup *objcg; unsigned long f_offset; unsigned int f_objidx; void *vaddr; @@ -1510,7 +1558,12 @@ static void obj_free(int class_size, unsigned long obj) f_offset = offset_in_page(class_size * f_objidx); zspage = get_zspage(f_zpdesc); - zpdesc_set_obj_cgroup(f_zpdesc, f_objidx, class_size, NULL); + objcg = zpdesc_obj_cgroup(f_zpdesc, f_objidx, class_size); + if (objcg) { + zs_uncharge_objcg(f_zpdesc, objcg, class_size, f_offset); + obj_cgroup_put(objcg); + 
zpdesc_set_obj_cgroup(f_zpdesc, f_objidx, class_size, NULL); + } vaddr = kmap_local_zpdesc(f_zpdesc); link = (struct link_free *)(vaddr + f_offset); diff --git a/mm/zswap.c b/mm/zswap.c index 55161a5c9d4c..77d3c6516ed3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -711,10 +711,6 @@ static void zswap_entry_free(struct zswap_entry *entry) zs_free(entry->pool->zs_pool, entry->handle); zswap_pool_put(entry->pool); - if (objcg) { - obj_cgroup_uncharge_zswap(objcg, entry->length); - obj_cgroup_put(objcg); - } if (entry->length == PAGE_SIZE) atomic_long_dec(&zswap_stored_incompressible_pages); zswap_entry_cache_free(entry); @@ -1437,10 +1433,6 @@ static bool zswap_store_page(struct page *page, * when the entry is removed from the tree. */ zswap_pool_get(pool); - if (objcg) { - obj_cgroup_get(objcg); - obj_cgroup_charge_zswap(objcg, entry->length); - } atomic_long_inc(&zswap_stored_pages); if (entry->length == PAGE_SIZE) atomic_long_inc(&zswap_stored_incompressible_pages); -- 2.47.3 Zswap compresses and uncompresses in PAGE_SIZE units, which simplifies the accounting for how much memory it has compressed. However, when a compressed object is stored at the boundary of two zspages, accounting at PAGE_SIZE units makes it difficult to fractionally charge each backing zspage with the ratio of memory it backs for the compressed object. To make sub-PAGE_SIZE granularity charging possible for MEMCG_ZSWAPPED, track the value in bytes and adjust its accounting accordingly. No functional changes intended. 
Signed-off-by: Joshua Hahn --- include/linux/memcontrol.h | 2 +- mm/memcontrol.c | 5 +++-- mm/zsmalloc.c | 4 ++-- mm/zswap.c | 6 ++++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dd4278b1ca35..d3952c918fd4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -38,7 +38,7 @@ enum memcg_stat_item { MEMCG_VMALLOC, MEMCG_KMEM, MEMCG_ZSWAP_B, - MEMCG_ZSWAPPED, + MEMCG_ZSWAPPED_B, MEMCG_NR_STAT, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3432e1afc037..b662902d4e03 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -340,7 +340,7 @@ static const unsigned int memcg_stat_items[] = { MEMCG_VMALLOC, MEMCG_KMEM, MEMCG_ZSWAP_B, - MEMCG_ZSWAPPED, + MEMCG_ZSWAPPED_B, }; #define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items) @@ -1345,7 +1345,7 @@ static const struct memory_stat memory_stats[] = { { "shmem", NR_SHMEM }, #ifdef CONFIG_ZSWAP { "zswap", MEMCG_ZSWAP_B }, - { "zswapped", MEMCG_ZSWAPPED }, + { "zswapped", MEMCG_ZSWAPPED_B }, #endif { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, @@ -1393,6 +1393,7 @@ static int memcg_page_state_unit(int item) { switch (item) { case MEMCG_PERCPU_B: case MEMCG_ZSWAP_B: + case MEMCG_ZSWAPPED_B: case NR_SLAB_RECLAIMABLE_B: case NR_SLAB_UNRECLAIMABLE_B: return 1; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 88c7cd399261..6794927c60fb 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -980,7 +980,7 @@ static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg, rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); mod_memcg_state(memcg, MEMCG_ZSWAP_B, size); - mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1); + mod_memcg_state(memcg, MEMCG_ZSWAPPED_B, PAGE_SIZE); rcu_read_unlock(); } @@ -997,7 +997,7 @@ static void zs_uncharge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg, rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); - 
mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1); + mod_memcg_state(memcg, MEMCG_ZSWAPPED_B, -PAGE_SIZE); rcu_read_unlock(); } diff --git a/mm/zswap.c b/mm/zswap.c index 77d3c6516ed3..97f38d0afa86 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1214,8 +1214,10 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, */ if (!mem_cgroup_disabled()) { mem_cgroup_flush_stats(memcg); - nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; - nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); + nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B); + nr_backing >>= PAGE_SHIFT; + nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED_B); + nr_stored >>= PAGE_SHIFT; } else { nr_backing = zswap_total_pages(); nr_stored = atomic_long_read(&zswap_stored_pages); -- 2.47.3 Now that memcg charging happens in the zsmalloc layer where we have both objcg and page information, we can specify which node's memcg lruvec zswapped memory should be accounted to. Move MEMCG_ZSWAP_B and MEMCG_ZSWAPPED_B from enum memcg_stat_item to enum node_stat_item (and add them to the memcg_node_stat_items array). Rename their prefix from MEMCG to NR to reflect this move as well. In addition, decouple the updates of node stats (vmstat) and memcg-lruvec stats, since node stats can only track values at a PAGE_SIZE granularity. Finally, track the moving charges whenever a compressed object migrates from one zspage to another. memcg-lruvec stats are now updated precisely and proportionally when compressed objects are split across pages. Unfortunately for node stats, only NR_ZSWAP_B can be kept accurate. NR_ZSWAPPED_B works as a good best-effort value, but cannot proportionally account for compressed objects split across pages due to the coarse PAGE_SIZE granularity of node stats. For such objects, NR_ZSWAPPED_B is accounted to the first zpdesc's node stats. Note that this is not a new inaccuracy, but one that is simply left unable to be fixed as part of these changes. 
The small inaccuracy is accepted in place of invasive changes across all of vmstat infrastructure to begin tracking stats at byte granularity. Suggested-by: Johannes Weiner Signed-off-by: Joshua Hahn --- include/linux/memcontrol.h | 5 +-- include/linux/mmzone.h | 2 ++ mm/memcontrol.c | 18 +++++----- mm/vmstat.c | 2 ++ mm/zsmalloc.c | 72 ++++++++++++++++++++++++++++++-------- mm/zswap.c | 4 +-- 6 files changed, 76 insertions(+), 27 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d3952c918fd4..ba97b86d9104 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -37,8 +37,6 @@ enum memcg_stat_item { MEMCG_PERCPU_B, MEMCG_VMALLOC, MEMCG_KMEM, - MEMCG_ZSWAP_B, - MEMCG_ZSWAPPED_B, MEMCG_NR_STAT, }; @@ -932,6 +930,9 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val); +void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val); + static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3e51190a55e4..ae16a90491ac 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -258,6 +258,8 @@ enum node_stat_item { #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif + NR_ZSWAP_B, + NR_ZSWAPPED_B, NR_BALLOON_PAGES, NR_KERNEL_FILE_PAGES, NR_VM_NODE_STAT_ITEMS diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b662902d4e03..dc7cfff97296 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -331,6 +331,8 @@ static const unsigned int memcg_node_stat_items[] = { #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif + NR_ZSWAP_B, + NR_ZSWAPPED_B, }; static const unsigned int memcg_stat_items[] = { @@ -339,8 +341,6 @@ static const unsigned int memcg_stat_items[] = { MEMCG_PERCPU_B, MEMCG_VMALLOC, MEMCG_KMEM, - MEMCG_ZSWAP_B, - MEMCG_ZSWAPPED_B, }; #define NR_MEMCG_NODE_STAT_ITEMS 
ARRAY_SIZE(memcg_node_stat_items) @@ -726,7 +726,7 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) } #endif -static void mod_memcg_lruvec_state(struct lruvec *lruvec, +void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { @@ -1344,8 +1344,8 @@ static const struct memory_stat memory_stats[] = { { "vmalloc", MEMCG_VMALLOC }, { "shmem", NR_SHMEM }, #ifdef CONFIG_ZSWAP - { "zswap", MEMCG_ZSWAP_B }, - { "zswapped", MEMCG_ZSWAPPED_B }, + { "zswap", NR_ZSWAP_B }, + { "zswapped", NR_ZSWAPPED_B }, #endif { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, @@ -1392,8 +1392,8 @@ static int memcg_page_state_unit(int item) { switch (item) { case MEMCG_PERCPU_B: - case MEMCG_ZSWAP_B: - case MEMCG_ZSWAPPED_B: + case NR_ZSWAP_B: + case NR_ZSWAPPED_B: case NR_SLAB_RECLAIMABLE_B: case NR_SLAB_UNRECLAIMABLE_B: return 1; @@ -5424,7 +5424,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) /* Force flush to get accurate stats for charging */ __mem_cgroup_flush_stats(memcg, true); - pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE; + pages = memcg_page_state(memcg, NR_ZSWAP_B) / PAGE_SIZE; if (pages < max) continue; ret = false; @@ -5453,7 +5453,7 @@ static u64 zswap_current_read(struct cgroup_subsys_state *css, struct mem_cgroup *memcg = mem_cgroup_from_css(css); mem_cgroup_flush_stats(memcg); - return memcg_page_state(memcg, MEMCG_ZSWAP_B); + return memcg_page_state(memcg, NR_ZSWAP_B); } static int zswap_max_show(struct seq_file *m, void *v) diff --git a/mm/vmstat.c b/mm/vmstat.c index 99270713e0c1..4b10610bd999 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1279,6 +1279,8 @@ const char * const vmstat_text[] = { #ifdef CONFIG_HUGETLB_PAGE [I(NR_HUGETLB)] = "nr_hugetlb", #endif + [I(NR_ZSWAP_B)] = "zswap", + [I(NR_ZSWAPPED_B)] = "zswapped", [I(NR_BALLOON_PAGES)] = "nr_balloon_pages", [I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages", #undef I diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 
6794927c60fb..548e7f4b8bf6 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -810,6 +810,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { struct zpdesc *zpdesc, *next; + bool objcg = !!zpdesc_objcgs(zspage->first_zpdesc); assert_spin_locked(&class->lock); @@ -823,6 +824,8 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, reset_zpdesc(zpdesc); zpdesc_unlock(zpdesc); zpdesc_dec_zone_page_state(zpdesc); + if (objcg) + dec_node_page_state(zpdesc_page(zpdesc), NR_ZSWAP_B); zpdesc_put(zpdesc); zpdesc = next; } while (zpdesc != NULL); @@ -963,11 +966,45 @@ static bool alloc_zspage_objcgs(struct size_class *class, gfp_t gfp, return true; } -static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg, - int size, unsigned long offset) +static void __zs_mod_memcg_lruvec(struct zpdesc *zpdesc, + struct obj_cgroup *objcg, int size, + int sign, unsigned long offset) { struct mem_cgroup *memcg; + struct lruvec *lruvec; + int compressed_size = size, original_size = PAGE_SIZE; + int nid = page_to_nid(zpdesc_page(zpdesc)); + int next_nid = nid; + + if (offset + size > PAGE_SIZE) { + struct zpdesc *next_zpdesc = get_next_zpdesc(zpdesc); + + next_nid = page_to_nid(zpdesc_page(next_zpdesc)); + if (nid != next_nid) { + compressed_size = PAGE_SIZE - offset; + original_size = (PAGE_SIZE * compressed_size) / size; + } + } + + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + mod_memcg_lruvec_state(lruvec, NR_ZSWAP_B, sign * compressed_size); + mod_memcg_lruvec_state(lruvec, NR_ZSWAPPED_B, sign * original_size); + + if (nid != next_nid) { + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(next_nid)); + mod_memcg_lruvec_state(lruvec, NR_ZSWAP_B, + sign * (size - compressed_size)); + mod_memcg_lruvec_state(lruvec, NR_ZSWAPPED_B, + sign * (PAGE_SIZE - original_size)); + } + rcu_read_unlock(); +} +static void zs_charge_objcg(struct zpdesc 
*zpdesc, struct obj_cgroup *objcg, + int size, unsigned long offset) +{ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; @@ -977,28 +1014,30 @@ static void zs_charge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg, if (obj_cgroup_charge(objcg, GFP_KERNEL, size)) VM_WARN_ON_ONCE(1); - rcu_read_lock(); - memcg = obj_cgroup_memcg(objcg); - mod_memcg_state(memcg, MEMCG_ZSWAP_B, size); - mod_memcg_state(memcg, MEMCG_ZSWAPPED_B, PAGE_SIZE); - rcu_read_unlock(); + __zs_mod_memcg_lruvec(zpdesc, objcg, size, 1, offset); + + /* + * Node-level vmstats are charged in PAGE_SIZE units. As a + * best-effort, always charge NR_ZSWAPPED_B to the first zpdesc. + */ + inc_node_page_state(zpdesc_page(zpdesc), NR_ZSWAPPED_B); } static void zs_uncharge_objcg(struct zpdesc *zpdesc, struct obj_cgroup *objcg, int size, unsigned long offset) { - struct mem_cgroup *memcg; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; obj_cgroup_uncharge(objcg, size); - rcu_read_lock(); - memcg = obj_cgroup_memcg(objcg); - mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); - mod_memcg_state(memcg, MEMCG_ZSWAPPED_B, -PAGE_SIZE); - rcu_read_unlock(); + __zs_mod_memcg_lruvec(zpdesc, objcg, size, -1, offset); + + /* + * Node-level vmstats are uncharged in PAGE_SIZE units. As a + * best-effort, always uncharge NR_ZSWAPPED_B to the first zpdesc. 
+ */ + dec_node_page_state(zpdesc_page(zpdesc), NR_ZSWAPPED_B); } static void migrate_obj_objcg(unsigned long used_obj, unsigned long free_obj, @@ -1135,6 +1174,8 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, __zpdesc_set_zsmalloc(zpdesc); zpdesc_inc_zone_page_state(zpdesc); + if (objcg) + inc_node_page_state(zpdesc_page(zpdesc), NR_ZSWAP_B); zpdescs[i] = zpdesc; } @@ -1149,6 +1190,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, err: while (--i >= 0) { zpdesc_dec_zone_page_state(zpdescs[i]); + if (objcg) + dec_node_page_state(zpdesc_page(zpdescs[i]), + NR_ZSWAP_B); free_zpdesc(zpdescs[i]); } cache_free_zspage(zspage); diff --git a/mm/zswap.c b/mm/zswap.c index 97f38d0afa86..9e845e1d7214 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1214,9 +1214,9 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, */ if (!mem_cgroup_disabled()) { mem_cgroup_flush_stats(memcg); - nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B); + nr_backing = memcg_page_state(memcg, NR_ZSWAP_B); nr_backing >>= PAGE_SHIFT; - nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED_B); + nr_stored = memcg_page_state(memcg, NR_ZSWAPPED_B); nr_stored >>= PAGE_SHIFT; } else { nr_backing = zswap_total_pages(); -- 2.47.3