This patch integrates the swap tier infrastructure with cgroup, enabling the selection of specific swap devices per cgroup by configuring allowed swap tiers. The new `memory.swap.tiers` interface controls allowed swap tiers via a mask. By default, the mask is set to include all tiers, allowing specific tiers to be excluded or restored. Note that effective tiers are calculated separately using a dedicated mask to respect the cgroup hierarchy. Consequently, configured tiers may differ from effective ones, because a cgroup's effective tiers must be a subset of its parent's effective tiers. Note that cgroups do not pin swap tiers. This is similar to the `cpuset` controller, which does not prevent CPU hotplug. This approach ensures flexibility by allowing tier configuration changes regardless of cgroup usage. Signed-off-by: Youngjun Park --- Documentation/admin-guide/cgroup-v2.rst | 27 +++++++++ include/linux/memcontrol.h | 3 +- mm/memcontrol.c | 80 +++++++++++++++++++++++++ mm/swap_tier.c | 66 ++++++++++++++++++++ mm/swap_tier.h | 21 +++++++ mm/swapfile.c | 5 ++ 6 files changed, 201 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 7f5b59d95fce..776a908ce1b9 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1848,6 +1848,33 @@ The following nested keys are defined. Swap usage hard limit. If a cgroup's swap usage reaches this limit, anonymous memory of the cgroup will not be swapped out. + memory.swap.tiers + A read-write nested-keyed file which exists on non-root + cgroups. The default is to enable all tiers. + + This interface allows selecting which swap tiers a cgroup can + use for swapping out memory. + + The effective tiers are inherited from the parent. Only tiers + effective in the parent can be effective in the child. However, + the child can explicitly disable tiers allowed by the parent. 
+ + When read, the file shows two lines: + - The first line lists the tiers enabled in this cgroup's + configured mask. + - The second line lists the effective tiers after + restricting them to the parent's effective tiers. + + When writing, the format is: + (+/-)(TIER_NAME) (+/-)(TIER_NAME) ... + + Valid tier names are those configured in + /sys/kernel/mm/swap/tiers. + + Each tier name must be prefixed with: + + Enable this tier + - Disable this tier + memory.swap.events A read-only flat-keyed file which exists on non-root cgroups. The following entries are defined. Unless specified diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b6c82c8f73e1..542bee1b5f60 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -283,7 +283,8 @@ struct mem_cgroup { /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; #endif - + int tier_mask; + int tier_effective_mask; #ifdef CONFIG_MEMCG_V1 /* Legacy consumer-oriented counters */ struct page_counter kmem; /* v1 only */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 007413a53b45..c0a0a957a630 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -68,6 +68,7 @@ #include #include "slab.h" #include "memcontrol-v1.h" +#include "swap_tier.h" #include @@ -3691,6 +3692,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) { lru_gen_exit_memcg(memcg); memcg_wb_domain_exit(memcg); + swap_tiers_memcg_sync_mask(memcg); __mem_cgroup_free(memcg); } @@ -3792,6 +3794,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) WRITE_ONCE(memcg->zswap_writeback, true); #endif page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); + memcg->tier_mask = TIER_ALL_MASK; + swap_tiers_memcg_inherit_mask(memcg, parent); + if (parent) { WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent)); @@ -5352,6 +5357,75 @@ static int swap_events_show(struct seq_file *m, void *v) return 0; } +static int swap_tier_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + 
swap_tiers_mask_show(m, memcg->tier_mask); + swap_tiers_mask_show(m, memcg->tier_effective_mask); + + return 0; +} + +static ssize_t swap_tier_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + char *pos, *token; + int ret = 0; + + pos = strstrip(buf); + + spin_lock(&swap_tier_lock); + if (!*pos) { + memcg->tier_mask = TIER_ALL_MASK; + goto sync; + } + + while ((token = strsep(&pos, " \t\n")) != NULL) { + int mask; + + if (!*token) + continue; + + if (token[0] != '-' && token[0] != '+') { + ret = -EINVAL; + goto err; + } + + mask = swap_tiers_mask_lookup(token+1); + if (!mask) { + ret = -EINVAL; + goto err; + } + + /* + * Only the configured mask changes here; a tier that is not effective + * in the parent stays ineffective once the effective mask is recomputed. + */ + switch (token[0]) { + case '-': + memcg->tier_mask &= ~mask; + break; + case '+': + memcg->tier_mask |= mask; + break; + default: + ret = -EINVAL; + break; + } + + if (ret) + goto err; + } + +sync: + __swap_tiers_memcg_sync_mask(memcg); +err: + spin_unlock(&swap_tier_lock); + return ret ? 
ret : nbytes; +} + static struct cftype swap_files[] = { { .name = "swap.current", @@ -5384,6 +5458,12 @@ static struct cftype swap_files[] = { .file_offset = offsetof(struct mem_cgroup, swap_events_file), .seq_show = swap_events_show, }, + { + .name = "swap.tiers", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = swap_tier_show, + .write = swap_tier_write, + }, { } /* terminate */ }; diff --git a/mm/swap_tier.c b/mm/swap_tier.c index d90f6eccb908..e860c87292e2 100644 --- a/mm/swap_tier.c +++ b/mm/swap_tier.c @@ -384,3 +384,69 @@ bool swap_tiers_update(void) return true; } + +void swap_tiers_mask_show(struct seq_file *m, int mask) +{ + struct swap_tier *tier; + + spin_lock(&swap_tier_lock); + for_each_active_tier(tier) { + if (mask & TIER_MASK(tier)) + seq_printf(m, "%s ", tier->name); + } + spin_unlock(&swap_tier_lock); + seq_puts(m, "\n"); +} + +int swap_tiers_mask_lookup(const char *name) +{ + struct swap_tier *tier; + + lockdep_assert_held(&swap_tier_lock); + + for_each_active_tier(tier) { + if (!strcmp(name, tier->name)) + return TIER_MASK(tier); + } + + return 0; +} + +static void __swap_tier_memcg_inherit_mask(struct mem_cgroup *memcg, + struct mem_cgroup *parent) +{ + int effective_mask + = parent ? 
parent->tier_effective_mask : TIER_ALL_MASK; + + memcg->tier_effective_mask + = effective_mask & memcg->tier_mask; +} + +void swap_tiers_memcg_inherit_mask(struct mem_cgroup *memcg, + struct mem_cgroup *parent) +{ + spin_lock(&swap_tier_lock); + __swap_tier_memcg_inherit_mask(memcg, parent); + spin_unlock(&swap_tier_lock); +} + +void __swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg) +{ + struct mem_cgroup *child; + + lockdep_assert_held(&swap_tier_lock); + + if (memcg == root_mem_cgroup) + return; + + for_each_mem_cgroup_tree(child, memcg) + __swap_tier_memcg_inherit_mask(child, parent_mem_cgroup(child)); +} + +void swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg) +{ + spin_lock(&swap_tier_lock); + memcg->tier_mask = TIER_ALL_MASK; + __swap_tiers_memcg_sync_mask(memcg); + spin_unlock(&swap_tier_lock); +} diff --git a/mm/swap_tier.h b/mm/swap_tier.h index de81d540e3b5..8652a7f993ab 100644 --- a/mm/swap_tier.h +++ b/mm/swap_tier.h @@ -46,4 +46,25 @@ bool swap_tiers_update(void); /* Tier assignment */ void swap_tiers_assign_dev(struct swap_info_struct *swp); +/* Memcg related functions */ +void swap_tiers_mask_show(struct seq_file *m, int mask); +void swap_tiers_memcg_inherit_mask(struct mem_cgroup *memcg, + struct mem_cgroup *parent); +void swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg); +void __swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg); + +/* Mask and tier lookup */ +int swap_tiers_mask_lookup(const char *name); + +/** + * swap_tiers_mask_test - Check whether two tier masks overlap + * @tier_mask: The tier mask to check + * @mask: The mask to compare against + * + * Return: true if @tier_mask and @mask share at least one set bit, false otherwise + */ +static inline bool swap_tiers_mask_test(int tier_mask, int mask) +{ + return tier_mask & mask; +} #endif /* _SWAP_TIER_H */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f8ce021c5bd..dd97e850ea2c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1348,10 +1348,15 @@ static bool swap_alloc_fast(struct folio *folio) static 
void swap_alloc_slow(struct folio *folio) { struct swap_info_struct *si, *next; + int mask = folio_memcg(folio) ? + folio_memcg(folio)->tier_effective_mask : TIER_ALL_MASK; spin_lock(&swap_avail_lock); start_over: plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + if (!swap_tiers_mask_test(si->tier_mask, mask)) + continue; + /* Rotate the device and switch to a new cluster */ plist_requeue(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); -- 2.34.1