Just like memory.peak, introduce dmem.peak, which uses the page_counter
support for tracking peak usage. It can be written to in order to reset
the peak, but unlike memory.peak, which accepts any write, dmem.peak
expects a region name to be written to it; only that region's peak is
reset. This requires struct cgroup_of_peak to carry a pointer to the
pool that was reset. Writing a different region name resets that other
region and makes the original region's peak revert to its non-reset
value.

Signed-off-by: Thadeu Lima de Souza Cascardo
---
 Documentation/admin-guide/cgroup-v2.rst |  10 +++
 include/linux/cgroup-defs.h             |   1 +
 kernel/cgroup/dmem.c                    | 132 ++++++++++++++++++++++++++++++--
 3 files changed, 137 insertions(+), 6 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 6efd0095ed99..3ba7ab3a36b3 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2808,6 +2808,16 @@ DMEM Interface Files
 	The semantics are the same as for the memory cgroup controller, and are
 	calculated in the same way.
 
+  dmem.peak
+	A read-write nested-keyed file that exists on non-root cgroups.
+
+	The max memory usage recorded for the cgroup and its descendants since
+	either the creation of the cgroup or the most recent reset for that FD.
+
+	A write of a region name to this file resets it to the current memory
+	usage for subsequent reads through the same file descriptor for that
+	region.
+
   dmem.capacity
 	A read-only file that describes maximum region capacity.
 	It only exists on the root cgroup. Not all memory can be
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index a85044cb0553..b536054bd916 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -874,6 +874,7 @@ extern bool cgroup_enable_per_threadgroup_rwsem;
 struct cgroup_of_peak {
 	unsigned long value;
 	struct list_head list;
+	struct dmem_cgroup_pool_state *pool;
 };
 
 void of_peak_reset(struct cgroup_of_peak *ofp, struct page_counter *pc,
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index 1ab1fb47f271..afa380c9839b 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -57,6 +57,9 @@ struct dmemcg_state {
 	struct cgroup_subsys_state css;
 
 	struct list_head pools;
+
+	/** @peaks_lock: Protects access to the pools' peaks lists */
+	spinlock_t peaks_lock;
 };
 
 struct dmem_cgroup_pool_state {
@@ -72,6 +75,10 @@ struct dmem_cgroup_pool_state {
 	struct rcu_head rcu;
 
 	struct page_counter cnt;
+
+	/* Protected by the dmemcg_state peaks_lock */
+	struct list_head peaks;
+
 	struct dmem_cgroup_pool_state *parent;
 
 	refcount_t ref;
@@ -162,26 +169,45 @@ set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
 	page_counter_set_max(&pool->cnt, val);
 }
 
-static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
+static u64 get_resource_low(struct seq_file *sf, struct dmem_cgroup_pool_state *pool)
 {
 	return pool ? READ_ONCE(pool->cnt.low) : 0;
 }
 
-static u64 get_resource_min(struct dmem_cgroup_pool_state *pool)
+static u64 get_resource_min(struct seq_file *sf, struct dmem_cgroup_pool_state *pool)
 {
 	return pool ? READ_ONCE(pool->cnt.min) : 0;
 }
 
-static u64 get_resource_max(struct dmem_cgroup_pool_state *pool)
+static u64 get_resource_max(struct seq_file *sf, struct dmem_cgroup_pool_state *pool)
 {
 	return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX;
 }
 
-static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
+static u64 get_resource_current(struct seq_file *sf, struct dmem_cgroup_pool_state *pool)
 {
 	return pool ? page_counter_read(&pool->cnt) : 0;
 }
 
+static u64 get_resource_peak(struct seq_file *sf, struct dmem_cgroup_pool_state *pool)
+{
+	struct cgroup_of_peak *ofp = of_peak(sf->private);
+	u64 fd_peak, peak;
+	struct dmem_cgroup_pool_state *of_pool;
+
+	if (!pool)
+		return 0;
+
+	of_pool = READ_ONCE(ofp->pool);
+
+	fd_peak = READ_ONCE(ofp->value);
+	if (of_pool != pool || fd_peak == OFP_PEAK_UNSET)
+		peak = pool->cnt.watermark;
+	else
+		peak = max(fd_peak, READ_ONCE(pool->cnt.local_watermark));
+	return peak;
+}
+
 static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
 {
 	set_resource_min(rpool, 0);
@@ -227,6 +253,7 @@ dmemcs_alloc(struct cgroup_subsys_state *parent_css)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&dmemcs->pools);
+	spin_lock_init(&dmemcs->peaks_lock);
 
 	return &dmemcs->css;
 }
@@ -377,6 +404,7 @@ alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region
 			  ppool ? &ppool->cnt : NULL, true);
 	reset_all_resource_limits(pool);
 	refcount_set(&pool->ref, 1);
+	INIT_LIST_HEAD(&pool->peaks);
 	kref_get(&region->ref);
 	if (ppool && !pool->parent) {
 		pool->parent = ppool;
@@ -784,7 +812,7 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
 }
 
 static int dmemcg_limit_show(struct seq_file *sf, void *v,
-			     u64 (*fn)(struct dmem_cgroup_pool_state *))
+			     u64 (*fn)(struct seq_file *, struct dmem_cgroup_pool_state *))
 {
 	struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
 	struct dmem_cgroup_region *region;
@@ -796,7 +824,7 @@ static int dmemcg_limit_show(struct seq_file *sf, void *v,
 
 		seq_puts(sf, region->name);
 
-		val = fn(pool);
+		val = fn(sf, pool);
 		if (val < PAGE_COUNTER_MAX)
 			seq_printf(sf, " %lld\n", val);
 		else
@@ -807,6 +835,90 @@ static int dmemcg_limit_show(struct seq_file *sf, void *v,
 	return 0;
 }
 
+static int dmem_cgroup_region_peak_open(struct kernfs_open_file *of)
+{
+	struct cgroup_of_peak *ofp = of_peak(of);
+
+	ofp->value = OFP_PEAK_UNSET;
+
+	return 0;
+}
+
+static void dmem_cgroup_region_peak_remove(struct cgroup_of_peak *ofp)
+{
+	struct dmem_cgroup_pool_state *pool;
+	struct dmemcg_state *dmemcs;
+
+	pool = xchg(&ofp->pool, NULL);
+	if (!pool)
+		return;
+
+	dmemcs = pool->cs;
+
+	spin_lock(&dmemcs->peaks_lock);
+	list_del(&ofp->list);
+	spin_unlock(&dmemcs->peaks_lock);
+
+	WRITE_ONCE(ofp->value, OFP_PEAK_UNSET);
+
+	dmemcg_pool_put(pool);
+}
+
+static void dmem_cgroup_region_peak_release(struct kernfs_open_file *of)
+{
+	struct cgroup_of_peak *ofp = of_peak(of);
+
+	if (ofp->value == OFP_PEAK_UNSET) {
+		/* fast path (no writes on this fd) */
+		return;
+	}
+
+	dmem_cgroup_region_peak_remove(ofp);
+}
+
+static ssize_t dmem_cgroup_region_peak_write(struct kernfs_open_file *of,
+					     char *buf, size_t nbytes, loff_t off)
+{
+	struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
+	struct cgroup_of_peak *ofp = of_peak(of);
+	struct dmem_cgroup_pool_state *pool = NULL;
+	struct dmem_cgroup_region *region;
+	int err = 0;
+
+	buf = strstrip(buf);
+	if (!buf[0])
+		return -EINVAL;
+
+	rcu_read_lock();
+	region = dmemcg_get_region_by_name(buf);
+	rcu_read_unlock();
+
+	if (!region)
+		return -EINVAL;
+
+	pool = get_cg_pool_unlocked(dmemcs, region);
+	if (IS_ERR(pool)) {
+		err = PTR_ERR(pool);
+		goto out_put;
+	}
+
+	dmem_cgroup_region_peak_remove(ofp);
+
+	xchg(&ofp->pool, pool);
+	spin_lock(&dmemcs->peaks_lock);
+	of_peak_reset(ofp, &pool->cnt, &pool->peaks);
+	spin_unlock(&dmemcs->peaks_lock);
+
+out_put:
+	kref_put(&region->ref, dmemcg_free_region);
+	return err ?: nbytes;
+}
+
+static int dmem_cgroup_region_peak_show(struct seq_file *sf, void *v)
+{
+	return dmemcg_limit_show(sf, v, get_resource_peak);
+}
+
 static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
 {
 	return dmemcg_limit_show(sf, v, get_resource_current);
@@ -855,6 +967,14 @@ static struct cftype files[] = {
 		.name = "current",
 		.seq_show = dmem_cgroup_region_current_show,
 	},
+	{
+		.name = "peak",
+		.open = dmem_cgroup_region_peak_open,
+		.release = dmem_cgroup_region_peak_release,
+		.write = dmem_cgroup_region_peak_write,
+		.seq_show = dmem_cgroup_region_peak_show,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
 	{
 		.name = "min",
 		.write = dmem_cgroup_region_min_write,
-- 
2.47.3
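
For reference, a minimal userspace sketch of the per-fd reset semantics the
patch introduces, not part of the patch itself. The cgroup path and the region
name "drm/card0/vram0" are placeholders; real region names come from whatever
driver registered the dmem region, and error handling is trimmed.

/*
 * Reset and re-read one region's peak through a single open file
 * description of dmem.peak.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *region = "drm/card0/vram0";	/* placeholder region name */
	char buf[4096];
	ssize_t n;
	int fd;

	fd = open("/sys/fs/cgroup/mygroup/dmem.peak", O_RDWR);
	if (fd < 0)
		return 1;

	/* Writing a region name resets that region's peak for this fd only. */
	if (write(fd, region, strlen(region)) < 0)
		return 1;

	/* ... run the workload that allocates device memory ... */

	/*
	 * Reads through the same fd now report the peak since the reset;
	 * other regions (and other fds) still see the full watermark.
	 */
	if (lseek(fd, 0, SEEK_SET) < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}

	close(fd);
	return 0;
}

As with memory.peak, the reset only affects reads through the file descriptor
that performed the write; other open descriptors and other regions keep
reporting their historical watermarks.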