DAMOS_COLLAPSE currently collapses into PMD-size THP only. Add a target_order field to express per-order mTHP collapse intent. Zero means system default (PMD order, same as current behavior). Valid values are 0 and 2..HPAGE_PMD_ORDER. Wire up the sysfs interface: a per-scheme rw file "target_order". Validate at store time that the value is in range, and warn at scheme creation time if DAMOS_COLLAPSE is used with an unsupported non-PMD order, resetting to 0. The actual mTHP application via the khugepaged wrapper will be added in subsequent patches. Co-developed-by: Kunwu Chan Signed-off-by: Kunwu Chan Signed-off-by: Wang Lian --- include/linux/damon.h | 5 +++++ mm/damon/sysfs-schemes.c | 45 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 6f7edb3590ef..5a0587556573 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -572,6 +572,11 @@ struct damos_migrate_dests { struct damos { struct damos_access_pattern pattern; enum damos_action action; + /* + * @target_order: target order for mTHP actions (DAMOS_COLLAPSE). + * 0 means system default (PMD order). Valid: 0, 2..HPAGE_PMD_ORDER. + */ + unsigned int target_order; unsigned long apply_interval_us; /* private: internal use only */ /* diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 329cfd0bbe9f..735970717048 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -6,7 +6,9 @@ */ #include +#include #include +#include #include "sysfs-common.h" @@ -2257,6 +2259,7 @@ struct damon_sysfs_scheme { struct damon_sysfs_stats *stats; struct damon_sysfs_scheme_regions *tried_regions; int target_nid; + unsigned int target_order; struct damos_sysfs_dests *dests; }; @@ -2642,6 +2645,34 @@ static ssize_t target_nid_store(struct kobject *kobj, return err ? 
err : count; } +static ssize_t target_order_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%u\n", scheme->target_order); +} + +static ssize_t target_order_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + unsigned int val; + int err; + + err = kstrtouint(buf, 0, &val); + if (err) + return err; + + if (val != 0 && (val < 2 || val > HPAGE_PMD_ORDER)) + return -EINVAL; + + scheme->target_order = val; + return count; +} + static void damon_sysfs_scheme_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); @@ -2656,10 +2687,14 @@ static struct kobj_attribute damon_sysfs_scheme_apply_interval_us_attr = static struct kobj_attribute damon_sysfs_scheme_target_nid_attr = __ATTR_RW_MODE(target_nid, 0600); +static struct kobj_attribute damon_sysfs_scheme_target_order_attr = + __ATTR_RW_MODE(target_order, 0600); + static struct attribute *damon_sysfs_scheme_attrs[] = { &damon_sysfs_scheme_action_attr.attr, &damon_sysfs_scheme_apply_interval_us_attr.attr, &damon_sysfs_scheme_target_nid_attr.attr, + &damon_sysfs_scheme_target_order_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme); @@ -3005,6 +3040,16 @@ static struct damos *damon_sysfs_mk_scheme( if (!scheme) return NULL; + if (sysfs_scheme->action == DAMOS_COLLAPSE && + sysfs_scheme->target_order != 0 && + sysfs_scheme->target_order != HPAGE_PMD_ORDER) { + pr_warn("DAMON collapse: target_order %u not supported, only PMD order (%u) is available. 
Use 0 or %u.\n", + sysfs_scheme->target_order, + HPAGE_PMD_ORDER, HPAGE_PMD_ORDER); + sysfs_scheme->target_order = 0; + } + scheme->target_order = sysfs_scheme->target_order; + err = damos_sysfs_add_quota_score(sysfs_quotas->goals, &scheme->quota); if (err) { damon_destroy_scheme(scheme); -- 2.50.1 (Apple Git-155) Export a thin wrapper around collapse_huge_page() that allows external subsystems such as DAMON to trigger THP collapse on a target address range. Currently restricted to PMD order (HPAGE_PMD_ORDER), since collapse_huge_page() does not yet support arbitrary mTHP orders. The restriction can be relaxed when khugepaged gains mTHP support. The caller must hold a reference to @mm. Do not hold mmap lock: collapse_huge_page() acquires mmap_read_lock for validation, releases it, then acquires mmap_write_lock for the actual collapse. Holding an outer mmap_read_lock would cause a self-deadlock when the same thread attempts the inner mmap_write_lock. Co-developed-by: Kunwu Chan Signed-off-by: Kunwu Chan Signed-off-by: Wang Lian --- include/linux/khugepaged.h | 3 +++ mm/khugepaged.c | 39 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index d7a9053ff4fe..6fb8a6857790 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -20,6 +20,9 @@ extern bool current_is_khugepaged(void); void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd); +int damon_collapse_folio_range(struct mm_struct *mm, unsigned long start_addr, + unsigned int target_order); + static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { if (mm_flags_test(MMF_VM_HUGEPAGE, oldmm)) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 617bca76db49..0387841ba2e7 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -3272,3 +3272,42 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, return thps == ((hend - hstart) >> 
HPAGE_PMD_SHIFT) ? 0 : madvise_collapse_errno(last_fail); } + +/** + * damon_collapse_folio_range() - Collapse base pages in range into a THP + * @mm: mm_struct of the target process + * @start_addr: start address (must be order-aligned) + * @target_order: page order of the collapse result (currently only + * HPAGE_PMD_ORDER is supported) + * + * Thin wrapper around collapse_huge_page() for external callers such as + * DAMON. The caller must hold a reference to @mm. Do not hold mmap + * lock: collapse_huge_page() acquires mmap_read_lock for validation, + * releases it, then acquires mmap_write_lock for the collapse. Holding + * an outer mmap_read_lock would self-deadlock. + * + * Return: 0 on success, -EINVAL on bad arguments, negative error from + * madvise_collapse_errno() otherwise. + */ +int damon_collapse_folio_range(struct mm_struct *mm, unsigned long start_addr, + unsigned int target_order) +{ + struct collapse_control cc = { + .is_khugepaged = false, + }; + enum scan_result result; + + if (target_order != HPAGE_PMD_ORDER) { + pr_warn_once("%s: only PMD order (%u) is supported, got %u\n", + __func__, HPAGE_PMD_ORDER, target_order); + return -EINVAL; + } + if (start_addr & ((PAGE_SIZE << target_order) - 1)) + return -EINVAL; + + result = collapse_huge_page(mm, start_addr, 1, 0, &cc, target_order); + if (result == SCAN_SUCCEED) + return 0; + return madvise_collapse_errno(result); +} +EXPORT_SYMBOL_GPL(damon_collapse_folio_range); -- 2.50.1 (Apple Git-155) When target_order is set (non-zero), the DAMOS_COLLAPSE handler now calls damon_collapse_folio_range() to collapse pages into the requested mTHP size, iterating over the target region in order-aligned chunks. When target_order is 0 (default), the existing madvise(MADV_COLLAPSE) path is used, preserving backwards compatibility. 
Region boundaries are expanded outward to the covering aligned range (ALIGN_DOWN start, ALIGN end) so that collapse works even after kdamond_split_regions reduces region sizes below the chunk size. collapse_huge_page() internally validates VMA bounds, so expanding beyond the original region is safe. No external mmap lock is held: collapse_huge_page() acquires mmap_read_lock internally for validation, releases it, then acquires mmap_write_lock for the actual collapse. Holding an outer mmap_read_lock would cause a self-deadlock when the same thread attempts the inner mmap_write_lock. Co-developed-by: Kunwu Chan Signed-off-by: Kunwu Chan Signed-off-by: Wang Lian --- mm/damon/vaddr.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index d27147603564..2a3757c13bf0 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "../internal.h" #include "ops-common.h" @@ -899,6 +900,40 @@ static unsigned long damos_va_stat(struct damon_target *target, return 0; } +static unsigned long damos_va_collapse(struct damon_target *target, + struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) +{ + unsigned long addr, end, chunk_sz; + unsigned int target_order = s->target_order; + unsigned long applied = 0; + struct mm_struct *mm; + int ret; + + if (target_order < 2 || target_order > HPAGE_PMD_ORDER) + return 0; + + chunk_sz = PAGE_SIZE << target_order; + addr = ALIGN_DOWN(r->ar.start, chunk_sz); + end = ALIGN(r->ar.end, chunk_sz); + + mm = damon_get_mm(target); + if (!mm) + return 0; + + while (addr < end) { + ret = damon_collapse_folio_range(mm, addr, target_order); + if (!ret) + applied += chunk_sz; + *sz_filter_passed += chunk_sz; + addr += chunk_sz; + cond_resched(); + } + + mmput(mm); + return applied; +} + static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, 
struct damos *scheme, unsigned long *sz_filter_passed) @@ -922,6 +957,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, madv_action = MADV_NOHUGEPAGE; break; case DAMOS_COLLAPSE: + if (scheme->target_order) + return damos_va_collapse(t, r, scheme, + sz_filter_passed); madv_action = MADV_COLLAPSE; break; case DAMOS_MIGRATE_HOT: -- 2.50.1 (Apple Git-155) Add a new DAMOS_MTHP_SPLIT action to split a large folio to the specified target_order, and a hot_threshold parameter to control split decisions based on sub-page access heatmap. target_order: For MTHP_SPLIT, valid range is 2..HPAGE_PMD_ORDER-1, allowing splits to e.g. order-2 (16KB) or order-3 (32KB) mTHP. An invalid value (0 or >= HPAGE_PMD_ORDER) defaults to order-2; 0 would mean "split to base page" which defeats the purpose of mTHP split. hot_threshold: Minimum percentage of hot subpages required to preserve a THP. The sysfs file accepts 0-100, but a stored value of 0 is treated as "unset" and replaced by the default at scheme creation time, so the effective range is 1-100. THPs with hot_fraction >= hot_threshold are kept intact; below it, the THP is split to target_order. Default is 30%, based on ARM SPE profiling on Kunpeng 920 which showed: - 97% of THPs have <10% hot subpages (clearly cold, split) - 1-2% have 10-30% (borderline, tunable) - <1% have >30% (genuinely hot, preserve) The 30% default catches genuinely hot THPs while splitting the vast majority of cold THPs. Exposed as sysfs attribute for per-scheme tuning (e.g. lower for memory-pressure scenarios, higher for latency-sensitive workloads). sysfs interface: /sys/kernel/mm/damon/admin/kdamonds/.../schemes/0/hot_threshold The actual split implementation follows in subsequent patches. 
Co-developed-by: Kunwu Chan Signed-off-by: Kunwu Chan Signed-off-by: Wang Lian --- include/linux/damon.h | 17 ++++++++++++-- mm/damon/sysfs-schemes.c | 51 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 5a0587556573..982057bbce3b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -121,6 +121,7 @@ struct damon_target { * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_COLLAPSE: Call ``madvise()`` for the region with MADV_COLLAPSE. + * @DAMOS_MTHP_SPLIT: Split large folios to the target mTHP order. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. * @DAMOS_MIGRATE_HOT: Migrate the regions prioritizing warmer regions. @@ -141,6 +142,7 @@ enum damos_action { DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, DAMOS_COLLAPSE, + DAMOS_MTHP_SPLIT, DAMOS_LRU_PRIO, DAMOS_LRU_DEPRIO, DAMOS_MIGRATE_HOT, @@ -573,10 +575,21 @@ struct damos { struct damos_access_pattern pattern; enum damos_action action; /* - * @target_order: target order for mTHP actions (DAMOS_COLLAPSE). - * 0 means system default (PMD order). Valid: 0, 2..HPAGE_PMD_ORDER. + * @target_order: target mTHP order for DAMOS_COLLAPSE and + * DAMOS_MTHP_SPLIT. For COLLAPSE, 0 means PMD order default, + * valid values: 0, 2..HPAGE_PMD_ORDER. For MTHP_SPLIT, + * valid values: 2..HPAGE_PMD_ORDER-1; 0 and HPAGE_PMD_ORDER + * are rejected at scheme creation time (defaulting to 2). */ unsigned int target_order; + /* + * @hot_threshold: minimum hot subpage percentage (0-100) to + * preserve a THP during DAMOS_MTHP_SPLIT. A THP with + * hot_fraction >= hot_threshold is kept intact; below it, the + * THP is split to @target_order. Default 30 based on SPE + * profiling showing 97% of THPs have <10% hot subpages. 
+ */ + unsigned int hot_threshold; unsigned long apply_interval_us; /* private: internal use only */ /* diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 735970717048..823f1ca9bd90 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -2260,6 +2260,7 @@ struct damon_sysfs_scheme { struct damon_sysfs_scheme_regions *tried_regions; int target_nid; unsigned int target_order; + unsigned int hot_threshold; struct damos_sysfs_dests *dests; }; @@ -2293,6 +2294,10 @@ static struct damos_sysfs_action_name damos_sysfs_action_names[] = { .action = DAMOS_COLLAPSE, .name = "collapse", }, + { + .action = DAMOS_MTHP_SPLIT, + .name = "mthp_split", + }, { .action = DAMOS_LRU_PRIO, .name = "lru_prio", @@ -2673,6 +2678,34 @@ static ssize_t target_order_store(struct kobject *kobj, return count; } +static ssize_t hot_threshold_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%u\n", scheme->hot_threshold); +} + +static ssize_t hot_threshold_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + unsigned int val; + int err; + + err = kstrtouint(buf, 0, &val); + if (err) + return err; + + if (val > 100) + return -EINVAL; + + scheme->hot_threshold = val; + return count; +} + static void damon_sysfs_scheme_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); @@ -2690,11 +2723,15 @@ static struct kobj_attribute damon_sysfs_scheme_target_nid_attr = static struct kobj_attribute damon_sysfs_scheme_target_order_attr = __ATTR_RW_MODE(target_order, 0600); +static struct kobj_attribute damon_sysfs_scheme_hot_threshold_attr = + __ATTR_RW_MODE(hot_threshold, 0600); + static struct attribute *damon_sysfs_scheme_attrs[] = { 
&damon_sysfs_scheme_action_attr.attr, &damon_sysfs_scheme_apply_interval_us_attr.attr, &damon_sysfs_scheme_target_nid_attr.attr, &damon_sysfs_scheme_target_order_attr.attr, + &damon_sysfs_scheme_hot_threshold_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme); @@ -3048,8 +3085,22 @@ static struct damos *damon_sysfs_mk_scheme( HPAGE_PMD_ORDER, HPAGE_PMD_ORDER); sysfs_scheme->target_order = 0; } + if (sysfs_scheme->action == DAMOS_MTHP_SPLIT && + (sysfs_scheme->target_order == 0 || + sysfs_scheme->target_order >= HPAGE_PMD_ORDER)) { + pr_warn("DAMON mthp_split: target_order %u invalid, need 2..%u. Defaulting to 2.\n", + sysfs_scheme->target_order, + HPAGE_PMD_ORDER - 1); + sysfs_scheme->target_order = 2; + } scheme->target_order = sysfs_scheme->target_order; + if (sysfs_scheme->action == DAMOS_MTHP_SPLIT) { + if (sysfs_scheme->hot_threshold == 0) + sysfs_scheme->hot_threshold = 30; + scheme->hot_threshold = sysfs_scheme->hot_threshold; + } + err = damos_sysfs_add_quota_score(sysfs_quotas->goals, &scheme->quota); if (err) { damon_destroy_scheme(scheme); -- 2.50.1 (Apple Git-155) Implement the DAMOS_MTHP_SPLIT action for vaddr-based DAMON operations. Walk the region in PMD-sized aligned chunks, use folio_walk_start() to locate THP folios, and call split_folio_to_order() when the folio order exceeds the target_order. Unlike COLLAPSE which is limited to anonymous memory via collapse_huge_page(), split_folio_to_order() supports both anon and shmem folios. This is critical for tmpfs THP-backed KVM guest memory, where cold and hot pages bundled together in a single PMD THP cause DAMON to overestimate hot regions. The handler holds mmap_read_lock per chunk for VMA lookup and folio_walk_start(), then releases it before the next iteration. split_folio_to_order() does not reacquire mmap locks internally, so this pattern is safe. 
Co-developed-by: Kunwu Chan Signed-off-by: Kunwu Chan Signed-off-by: Wang Lian --- mm/damon/vaddr.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 2a3757c13bf0..1957e390a277 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -934,6 +934,71 @@ static unsigned long damos_va_collapse(struct damon_target *target, return applied; } +static unsigned long damos_va_mthp_split(struct damon_target *target, + struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) +{ + unsigned long addr, end, chunk_sz; + unsigned int target_order = s->target_order; + unsigned long applied = 0; + struct mm_struct *mm; + struct vm_area_struct *vma; + struct folio *folio; + struct folio_walk fw; + + mm = damon_get_mm(target); + if (!mm) + return 0; + + chunk_sz = PAGE_SIZE << HPAGE_PMD_ORDER; + addr = ALIGN_DOWN(r->ar.start, chunk_sz); + end = ALIGN(r->ar.end, chunk_sz); + + while (addr < end) { + mmap_read_lock(mm); + vma = find_vma(mm, addr); + /* + * split_folio_to_order() supports both anon and shmem + * folios, so we accept any VMA that has a folio at @addr. + * This covers important use cases like tmpfs THP-backed + * KVM guest memory where cold and hot pages are bundled + * together in a single PMD THP. 
+ */ + if (!vma || addr < vma->vm_start) + goto unlock; + + folio = folio_walk_start(&fw, vma, addr, 0); + if (!folio) + goto unlock; + + if (folio_order(folio) > target_order) { + if (!folio_trylock(folio)) { + folio_walk_end(&fw, vma); + goto unlock; + } + folio_get(folio); + folio_walk_end(&fw, vma); + + if (!split_folio_to_order(folio, target_order)) + applied += chunk_sz; + + folio_unlock(folio); + folio_put(folio); + } else { + folio_walk_end(&fw, vma); + } + +unlock: + *sz_filter_passed += chunk_sz; + addr += chunk_sz; + mmap_read_unlock(mm); + cond_resched(); + } + + mmput(mm); + return applied; +} + static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme, unsigned long *sz_filter_passed) @@ -967,6 +1032,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, return damos_va_migrate(t, r, scheme, sz_filter_passed); case DAMOS_STAT: return damos_va_stat(t, r, scheme, sz_filter_passed); + case DAMOS_MTHP_SPLIT: + return damos_va_mthp_split(t, r, scheme, + sz_filter_passed); default: /* * DAMOS actions that are not yet supported by 'vaddr'. -- 2.50.1 (Apple Git-155) Add a sub-THP access heatmap that enables data-driven split decisions in DAMOS_MTHP_SPLIT. The split handler queries damon_spe_hot_fraction() and compares against the scheme's configurable hot_threshold (default 30%, set in patch 4) to preserve genuinely hot THPs while splitting cold ones. Key data-driven design decisions from Kunpeng 920 SPE profiling: 1. Signal vs noise threshold (this patch): Raw SPE data shows most THPs have scattered 1-2 sample hits across many subpages — noise, not genuine access patterns. The heatmap now uses a two-pass signal threshold: a subpage chunk must have >= 1/10 of the peak chunk's access count to be considered hot. This reduces false hot classification from ~50% to <5% of subpages. 2. 
hot_threshold 30% (patch 4, sysfs-configurable): With the signal filter applied, 97% of THPs have <10% hot subpages (clearly cold), 1-2% have 10-30% (borderline), and <1% have >30% (genuinely hot). The 30% default catches hot THPs while allowing the vast majority to be split. Architecture (three-phase): Phase 2a (current fallback): Walk PTE access bits via folio_walk for THPs already split to PTEs. For PMD-mapped THPs (the common case), return -EOPNOTSUPP, which causes the split handler to split unconditionally. Phase 2b (userspace daemon -> kernel, ready for validation): Userspace SPE daemon decodes ARM SPE records, feeds PFNs via debugfs (/sys/kernel/debug/damon/spe_feed). The kernel aggregates accesses into a per-folio rbtree keyed by THP-aligned PFN. Phase 2c (kernel-native, future): perf_event_create_kernel_counter for ARM SPE. Overflow handler calls damon_spe_record_access() directly. Data structure (mm/damon/spe.c): - Per-folio rbtree keyed by PFN, storing access_count[512] (one counter per 4KB subpage) - Max 1024 entries, entries older than 30s are pruned periodically - Global spinlock-protected rbtree with GFP_ATOMIC allocation Debugfs interface: - /sys/kernel/debug/damon/spe_feed (write): accept one PFN per line - /sys/kernel/debug/damon/spe_stats (read): rbtree stats + top entries When CONFIG_DAMON_SPE is disabled, all SPE functions are empty stubs returning -EOPNOTSUPP, making the split unconditional. 
Co-developed-by: Kunwu Chan Signed-off-by: Kunwu Chan Signed-off-by: Wang Lian --- mm/damon/Kconfig | 12 ++ mm/damon/Makefile | 1 + mm/damon/core.c | 3 + mm/damon/spe.c | 505 ++++++++++++++++++++++++++++++++++++++++++++++ mm/damon/spe.h | 62 ++++++ mm/damon/vaddr.c | 16 +- 6 files changed, 597 insertions(+), 2 deletions(-) create mode 100644 mm/damon/spe.c create mode 100644 mm/damon/spe.h diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 34631a44cdec..ea75a8dab989 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -121,4 +121,16 @@ config DAMON_STAT_ENABLED_DEFAULT Whether to enable DAMON_STAT by default. Users can disable it in boot or runtime using its 'enabled' parameter. +config DAMON_SPE + bool "DAMON SPE feedback for sub-THP access monitoring (prototype)" + depends on DAMON_VADDR + help + Enable sub-THP access heatmap feedback for DAMOS_MTHP_SPLIT. + Currently a prototype: uses PTE access bits for THPs that have + been split to PTEs, returns "no data" for PMD-mapped THPs. + + On hardware with ARM SPE (e.g. Kunpeng 920), this will be + extended to provide per-subpage access data without needing to + split the PMD first, enabling precise mTHP split decisions. 
+ endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index d8d6bf5f8bff..507b43a9f009 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o obj-$(CONFIG_DAMON_STAT) += modules-common.o stat.o +obj-$(CONFIG_DAMON_SPE) += spe.o diff --git a/mm/damon/core.c b/mm/damon/core.c index 265d51ade25b..0805e71a90d8 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -20,6 +20,7 @@ /* for damon_get_folio() used by node eligible memory metrics */ #include "ops-common.h" +#include "spe.h" #define CREATE_TRACE_POINTS #include @@ -2987,6 +2988,8 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (!has_schemes_to_apply) return; + damon_spe_prune(); + max_region_sz = damon_region_sz_limit(c); mutex_lock(&c->walk_control_lock); damon_for_each_target(t, c) { diff --git a/mm/damon/spe.c b/mm/damon/spe.c new file mode 100644 index 000000000000..98f8d32053e4 --- /dev/null +++ b/mm/damon/spe.c @@ -0,0 +1,505 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON SPE (Statistical Profiling Extension) feedback + * + * Provides sub-THP access heatmap for intelligent split decisions. + * + * Architecture: + * Phase 2a (current): PTE access bits via folio_walk. + * Works only when a THP has been previously split to PTEs. + * Returns -EOPNOTSUPP for PMD-mapped THPs. + * + * Phase 2b (userspace): spe_hist daemon decodes SPE in userspace, + * feeds accessed PFNs via the debugfs spe_feed file into the rbtree below. + * + * Phase 2c (kernel): perf_event_create_kernel_counter for ARM SPE, + * overflow handler aggregates into rbtree. Requires SPE hardware. + * + * Data structure: + * Per-folio rbtree keyed by PFN, storing per-subpage access counts. + * Entries are aged and pruned periodically. 
+ * + * Copyright (C) 2026 Wang Lian + */ + +#define pr_fmt(fmt) "damon-spe: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "spe.h" + +/* Number of order-0 subpages tracked per entry (one PMD-sized THP) */ +#define DAMON_SPE_MAX_CHUNKS 512 + +/* Max folio entries in the rbtree (one global tree) */ +#define DAMON_SPE_MAX_ENTRIES 1024 + +/* Entry considered stale after this many jiffies (default: 30s) */ +#define DAMON_SPE_ENTRY_TTL (30 * HZ) + +/* + * Per-folio access histogram entry. + * Keyed by pfn in an rbtree. Each entry tracks access count per subpage. + * The access_count array holds one counter per 4KB subpage of a THP (512). + */ +struct damon_spe_entry { + struct rb_node node; + unsigned long pfn; /* THP-aligned PFN */ + pid_t pid; /* owner process */ + unsigned long access_count[DAMON_SPE_MAX_CHUNKS]; + unsigned long total_accesses; + unsigned long last_access; /* jiffies of last update */ +}; + +static struct rb_root spe_tree = RB_ROOT; +static DEFINE_SPINLOCK(spe_lock); +static unsigned int spe_nr_entries; + +/* Forward declarations */ +static void __spe_prune(void); + +/* + * Find an entry by PFN. Must be called with spe_lock held. + */ +static struct damon_spe_entry *spe_find(unsigned long pfn) +{ + struct rb_node *node = spe_tree.rb_node; + + while (node) { + struct damon_spe_entry *e = + rb_entry(node, struct damon_spe_entry, node); + + if (pfn < e->pfn) + node = node->rb_left; + else if (pfn > e->pfn) + node = node->rb_right; + else + return e; + } + return NULL; +} + +/* + * Insert a new entry. Must be called with spe_lock held. + * Returns the new entry, or NULL if the tree is full. 
+ */ +static struct damon_spe_entry *spe_insert(unsigned long pfn, pid_t pid) +{ + struct rb_node **new = &spe_tree.rb_node, *parent = NULL; + struct damon_spe_entry *e; + + if (spe_nr_entries >= DAMON_SPE_MAX_ENTRIES) { + __spe_prune(); + if (spe_nr_entries >= DAMON_SPE_MAX_ENTRIES) + return NULL; + } + + e = kzalloc(sizeof(*e), GFP_ATOMIC); + if (!e) + return NULL; + + e->pfn = pfn; + e->pid = pid; + e->last_access = jiffies; + + while (*new) { + struct damon_spe_entry *this = + rb_entry(*new, struct damon_spe_entry, node); + + parent = *new; + if (pfn < this->pfn) + new = &((*new)->rb_left); + else if (pfn > this->pfn) + new = &((*new)->rb_right); + else { + /* Race: another CPU inserted the same PFN */ + kfree(e); + return this; + } + } + + rb_link_node(&e->node, parent, new); + rb_insert_color(&e->node, &spe_tree); + spe_nr_entries++; + return e; +} + +/* + * Prune entries that haven't been updated for DAMON_SPE_ENTRY_TTL. + * Must be called with spe_lock held. + */ +static void __spe_prune(void) +{ + struct rb_node *node, *next; + unsigned long deadline = jiffies - DAMON_SPE_ENTRY_TTL; + + node = rb_first(&spe_tree); + while (node) { + struct damon_spe_entry *e = + rb_entry(node, struct damon_spe_entry, node); + + next = rb_next(node); + + if (time_before(e->last_access, deadline)) { + rb_erase(&e->node, &spe_tree); + spe_nr_entries--; + kfree(e); + } + node = next; + } +} + +/** + * damon_spe_record_access() - Record a single subpage access + * @pfn: Physical page frame number (any page within a THP) + * @pid: Process ID that performed the access + * + * The PFN is automatically aligned to the THP base. The subpage index + * within the THP is derived from the low bits of the PFN. + * + * Context: Can be called from IRQ context. 
+ */ +void damon_spe_record_access(unsigned long pfn, pid_t pid) +{ + unsigned long thp_pfn = pfn & ~(unsigned long)(DAMON_SPE_MAX_CHUNKS - 1); + unsigned int idx = pfn & (DAMON_SPE_MAX_CHUNKS - 1); + struct damon_spe_entry *e; + unsigned long flags; + + spin_lock_irqsave(&spe_lock, flags); + + e = spe_find(thp_pfn); + if (!e) + e = spe_insert(thp_pfn, pid); + + if (e) { + e->access_count[idx]++; + e->total_accesses++; + e->last_access = jiffies; + } + + spin_unlock_irqrestore(&spe_lock, flags); +} +EXPORT_SYMBOL_GPL(damon_spe_record_access); + +/** + * damon_spe_folio_heatmap() - Get sub-THP access bitmap for a folio + * @folio: The folio to query + * @vma: VMA containing the folio + * @addr: Virtual address of the folio start + * @target_order: Page order for each chunk in the bitmap + * @hot_bitmap: Output bitmap with one bit per chunk + * + * Queries the SPE rbtree first. Falls back to PTE access bits if no + * SPE data is available (requires the THP to be split to PTEs). + * + * Return: Number of chunks on success, negative error on failure. + */ +int damon_spe_folio_heatmap(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr, unsigned int target_order, + unsigned long *hot_bitmap) +{ + unsigned long num_chunks = folio_nr_pages(folio) >> target_order; + unsigned long chunk_sz = PAGE_SIZE << target_order; + unsigned long pfn; + unsigned long flags; + struct damon_spe_entry *e; + struct folio_walk fw; + struct folio *sub_folio; + int i; + + if (!folio || !vma || !hot_bitmap) + return -EINVAL; + if (target_order >= folio_order(folio)) + return -EINVAL; + + pfn = folio_pfn(folio); + + /* + * Phase 2b/2c path: query the SPE rbtree. + * If we have aggregated SPE data for this folio, use it. 
+ */ + spin_lock_irqsave(&spe_lock, flags); + e = spe_find(pfn); + if (e && e->total_accesses > 0) { + unsigned long max_sum = 0; + unsigned long sig_thresh; + unsigned int spp = chunk_sz >> PAGE_SHIFT; + + /* First pass: find peak chunk access count */ + for (i = 0; i < num_chunks; i++) { + unsigned long sum = 0; + int j; + + for (j = 0; j < spp; j++) { + unsigned int idx = i * spp + j; + + if (idx < DAMON_SPE_MAX_CHUNKS) + sum += e->access_count[idx]; + } + if (sum > max_sum) + max_sum = sum; + } + + /* + * Signal threshold: a chunk needs >= 1/10 of peak access + * count to be considered hot. This filters SPE noise — + * Kunpeng 920 data shows most THPs have scattered 1-2 + * sample hits across many subpages that don't represent + * genuine hot access patterns. + */ + sig_thresh = max(max_sum / 10, 1UL); + + /* Second pass: build hot bitmap using signal threshold */ + bitmap_zero(hot_bitmap, num_chunks); + for (i = 0; i < num_chunks; i++) { + unsigned long sum = 0; + int j; + + for (j = 0; j < spp; j++) { + unsigned int idx = i * spp + j; + + if (idx < DAMON_SPE_MAX_CHUNKS) + sum += e->access_count[idx]; + } + if (sum >= sig_thresh) + __set_bit(i, hot_bitmap); + } + + spin_unlock_irqrestore(&spe_lock, flags); + return (int)num_chunks; + } + spin_unlock_irqrestore(&spe_lock, flags); + + /* + * Phase 2a fallback: walk PTEs to check access bits. + * Only works when the THP has been split to PTEs. 
+ */ + bitmap_zero(hot_bitmap, num_chunks); + + for (i = 0; i < num_chunks; i++) { + unsigned long chunk_addr = addr + i * chunk_sz; + + sub_folio = folio_walk_start(&fw, vma, chunk_addr, 0); + if (!sub_folio) + return -EOPNOTSUPP; + + if (fw.level == FW_LEVEL_PMD) { + folio_walk_end(&fw, vma); + return -EOPNOTSUPP; + } + + if (fw.level == FW_LEVEL_PTE && pte_young(fw.pte)) + __set_bit(i, hot_bitmap); + + folio_walk_end(&fw, vma); + } + + return (int)num_chunks; +} +EXPORT_SYMBOL_GPL(damon_spe_folio_heatmap); + +/** + * damon_spe_hot_fraction() - Return hot chunk percentage of a folio + * @folio: The folio to query + * @vma: VMA containing the folio + * @addr: Virtual address of the folio start + * @target_order: Page order for each chunk + * + * Return: Percentage (0-100) on success, negative error on failure. + */ +int damon_spe_hot_fraction(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr, unsigned int target_order) +{ + unsigned long num_chunks = folio_nr_pages(folio) >> target_order; + DECLARE_BITMAP(hot_bitmap, DAMON_SPE_MAX_CHUNKS); + int ret, hot; + + if (num_chunks > DAMON_SPE_MAX_CHUNKS) + return -ERANGE; + + ret = damon_spe_folio_heatmap(folio, vma, addr, target_order, + hot_bitmap); + if (ret < 0) + return ret; + + hot = bitmap_weight(hot_bitmap, num_chunks); + return (hot * 100) / (int)num_chunks; +} +EXPORT_SYMBOL_GPL(damon_spe_hot_fraction); + +/** + * damon_spe_prune() - Remove stale entries from the SPE rbtree + * + * Called from DAMON's aggregation cycle. Removes entries not updated + * within DAMON_SPE_ENTRY_TTL jiffies. 
+ */ +void damon_spe_prune(void) +{ + unsigned long flags; + + spin_lock_irqsave(&spe_lock, flags); + __spe_prune(); + spin_unlock_irqrestore(&spe_lock, flags); +} + +/** + * damon_spe_stats() - Return current SPE rbtree statistics + * @nr_entries: Output for number of entries, may be NULL + * @total_accesses: Output for total accumulated accesses, may be NULL + */ +void damon_spe_stats(unsigned int *nr_entries, unsigned long *total_accesses) +{ + struct rb_node *node; + unsigned long flags; + unsigned int count = 0; + unsigned long total = 0; + + spin_lock_irqsave(&spe_lock, flags); + for (node = rb_first(&spe_tree); node; node = rb_next(node)) { + struct damon_spe_entry *e = + rb_entry(node, struct damon_spe_entry, node); + count++; + total += e->total_accesses; + } + spin_unlock_irqrestore(&spe_lock, flags); + + if (nr_entries) + *nr_entries = count; + if (total_accesses) + *total_accesses = total; +} +EXPORT_SYMBOL_GPL(damon_spe_stats); + +/* ---- debugfs interface for Phase 2b (userspace daemon → kernel rbtree) ---- */ + +static struct dentry *damon_spe_dentry; + +/* + * spe_feed write: accept one PFN per line (hex or decimal). + * The PFN is recorded as an access via damon_spe_record_access(). + * + * Usage from userspace: + * echo 0x12345678 > /sys/kernel/debug/damon/spe_feed + * + * For bulk feed from SPE daemon: + * cat spe_pfns.txt > /sys/kernel/debug/damon/spe_feed + */ +static ssize_t spe_feed_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char line[32]; + size_t len = min(count, sizeof(line) - 1); + unsigned long pfn; + + if (copy_from_user(line, buf, len)) + return -EFAULT; + line[len] = '\0'; + + /* Strip trailing newline */ + if (len > 0 && line[len - 1] == '\n') + line[len - 1] = '\0'; + + if (kstrtoul(line, 0, &pfn) == 0 && pfn != 0) + damon_spe_record_access(pfn, 0); + + return count; +} + +/* + * spe_stats read: show current SPE rbtree statistics. 
+ *
+ * Usage:
+ *   cat /sys/kernel/debug/damon/spe_stats
+ *
+ * Output is one summary line ("nr_entries=... total_accesses=...")
+ * followed by at most 10 per-entry lines. The summary and the entry list
+ * are gathered under two separate acquisitions of spe_lock, so they may
+ * not describe the same instant.
+ */
+static int spe_stats_show(struct seq_file *m, void *v)
+{
+	struct rb_node *node;
+	unsigned long flags;
+	unsigned int count = 0;
+	unsigned long total = 0;
+
+	spin_lock_irqsave(&spe_lock, flags);
+	for (node = rb_first(&spe_tree); node; node = rb_next(node)) {
+		struct damon_spe_entry *e =
+			rb_entry(node, struct damon_spe_entry, node);
+		count++;
+		total += e->total_accesses;
+	}
+	spin_unlock_irqrestore(&spe_lock, flags);
+
+	seq_printf(m, "nr_entries=%u total_accesses=%lu\n", count, total);
+
+	/*
+	 * List the first 10 entries in rb_first()/rb_next() order. This is
+	 * tree order, not a ranking by access count; count is reused as the
+	 * number of entries printed so far.
+	 */
+	spin_lock_irqsave(&spe_lock, flags);
+	count = 0;
+	for (node = rb_first(&spe_tree); node; node = rb_next(node)) {
+		struct damon_spe_entry *e =
+			rb_entry(node, struct damon_spe_entry, node);
+		unsigned int hot_pages = 0;	/* non-zero access_count[] slots */
+		int i;
+
+		for (i = 0; i < DAMON_SPE_MAX_CHUNKS; i++)
+			if (e->access_count[i])
+				hot_pages++;
+
+		seq_printf(m, " pfn=0x%lx pid=%d total=%lu hot_pages=%u/%d\n",
+			   e->pfn, e->pid, e->total_accesses,
+			   hot_pages, DAMON_SPE_MAX_CHUNKS);
+		if (++count >= 10)
+			break;
+	}
+	spin_unlock_irqrestore(&spe_lock, flags);
+
+	return 0;
+}
+
+static int spe_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, spe_stats_show, inode->i_private);
+}
+
+static const struct file_operations spe_feed_fops = {
+	.write = spe_feed_write,
+};
+
+static const struct file_operations spe_stats_fops = {
+	.open = spe_stats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static int __init damon_spe_dbgfs_init(void)
+{
+	struct dentry *d;
+
+	d = debugfs_lookup("damon", NULL);	/* reuse the "damon" dir if it already exists */
+	if (!d) {
+		d = debugfs_create_dir("damon", NULL);
+		if (IS_ERR(d))
+			return PTR_ERR(d);
+	}
+	damon_spe_dentry = d;	/* NOTE(review): a dentry from debugfs_lookup() presumably holds a reference that is never dropped here - confirm */
+
+	debugfs_create_file("spe_feed", 0200, damon_spe_dentry,
+			    NULL, &spe_feed_fops);
+	debugfs_create_file("spe_stats", 0400, damon_spe_dentry,
+			    NULL, &spe_stats_fops);
+
+	pr_info("debugfs interface ready: 
/sys/kernel/debug/damon/spe_{feed,stats}\n");
+	return 0;
+}
+
+late_initcall(damon_spe_dbgfs_init);
diff --git a/mm/damon/spe.h b/mm/damon/spe.h
new file mode 100644
index 000000000000..38799688b5af
--- /dev/null
+++ b/mm/damon/spe.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * DAMON SPE (Statistical Profiling Extension) feedback
+ *
+ * Provides sub-THP access heatmap for intelligent split decisions.
+ *
+ * Three-phase architecture:
+ *   Phase 2a: PTE access bits via folio_walk (current fallback)
+ *   Phase 2b: Userspace SPE daemon feeds sampled PFNs via debugfs (spe_feed)
+ *   Phase 2c: Kernel perf_event_create_kernel_counter for ARM SPE
+ *
+ * Copyright (C) 2026 Wang Lian
+ */
+
+#ifndef _DAMON_SPE_H
+#define _DAMON_SPE_H
+
+#include
+#include
+
+#ifdef CONFIG_DAMON_SPE
+
+/* ---- Sub-page heatmap query: both return a negative errno on failure ---- */
+
+int damon_spe_folio_heatmap(struct folio *folio, struct vm_area_struct *vma,
+			    unsigned long addr, unsigned int target_order,
+			    unsigned long *hot_bitmap);
+
+int damon_spe_hot_fraction(struct folio *folio, struct vm_area_struct *vma,
+			   unsigned long addr, unsigned int target_order);
+
+/* ---- Recording (called from SPE event handler or userspace daemon) ---- */
+
+void damon_spe_record_access(unsigned long pfn, pid_t pid);
+
+/* ---- Maintenance ---- */
+
+void damon_spe_prune(void);
+void damon_spe_stats(unsigned int *nr_entries, unsigned long *total_accesses);
+
+#else /* !CONFIG_DAMON_SPE: queries return -EOPNOTSUPP, the rest are no-ops */
+
+static inline int damon_spe_folio_heatmap(struct folio *folio,
+		struct vm_area_struct *vma, unsigned long addr,
+		unsigned int target_order, unsigned long *hot_bitmap)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int damon_spe_hot_fraction(struct folio *folio,
+		struct vm_area_struct *vma, unsigned long addr,
+		unsigned int target_order)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void damon_spe_record_access(unsigned long pfn, pid_t pid) {}
+static inline void damon_spe_prune(void) {}
+static inline void damon_spe_stats(unsigned int *nr, unsigned long *total) {}
+
+#endif /* CONFIG_DAMON_SPE */
+#endif /* _DAMON_SPE_H */
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 1957e390a277..cb3ea2766b9e 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -18,6 +18,7 @@
 
 #include "../internal.h"
 #include "ops-common.h"
+#include "spe.h"
 
 #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
 #undef DAMON_MIN_REGION_SZ
@@ -945,6 +946,7 @@ static unsigned long damos_va_mthp_split(struct damon_target *target,
 	struct vm_area_struct *vma;
 	struct folio *folio;
 	struct folio_walk fw;
+	int hot_pct;
 
 	mm = damon_get_mm(target);
 	if (!mm)
@@ -979,8 +981,18 @@ static unsigned long damos_va_mthp_split(struct damon_target *target,
 		folio_get(folio);
 		folio_walk_end(&fw, vma);
 
-		if (!split_folio_to_order(folio, target_order))
-			applied += chunk_sz;
+		hot_pct = damon_spe_hot_fraction(folio, vma, addr,
+						 target_order);
+		/*
+		 * hot_pct < 0: no heatmap data (no SPE, PMD-mapped), so split
+		 * anyway; DAMON already identified this region as cold.
+		 * Otherwise split only if hot_pct is below s->hot_threshold.
+		 */
+		if (hot_pct < 0 ||
+		    (unsigned int)hot_pct < s->hot_threshold) {
+			if (!split_folio_to_order(folio, target_order))
+				applied += chunk_sz;
+		}
 
 		folio_unlock(folio);
 		folio_put(folio);
-- 
2.50.1 (Apple Git-155)