DAMOS_COLLAPSE currently collapses into PMD-size THP only.  Add a
target_order field to express per-order mTHP collapse intent.  Zero
means system default (PMD order, same as current behavior).  Valid
values are 0 and 2..HPAGE_PMD_ORDER.

Wire up the sysfs interface: a per-scheme rw file "target_order".
Validate at store time that the value is in range, and warn at scheme
creation time if DAMOS_COLLAPSE is used with an unsupported non-PMD
order, resetting to 0.

The actual mTHP application via the khugepaged wrapper will be added
in subsequent patches.

Co-developed-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Wang Lian <lianux.mm@gmail.com>
---
 include/linux/damon.h    |  5 +++++
 mm/damon/sysfs-schemes.c | 45 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 6f7edb3590ef..5a0587556573 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -572,6 +572,11 @@ struct damos_migrate_dests {
 struct damos {
 	struct damos_access_pattern pattern;
 	enum damos_action action;
+	/*
+	 * @target_order: target order for mTHP actions (DAMOS_COLLAPSE).
+	 * 0 means system default (PMD order).  Valid: 0, 2..HPAGE_PMD_ORDER.
+	 */
+	unsigned int target_order;
 	unsigned long apply_interval_us;
 /* private: internal use only */
 	/*
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 329cfd0bbe9f..735970717048 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -6,7 +6,9 @@
  */
 
 #include <linux/slab.h>
+#include <linux/mm.h>
 #include <linux/numa.h>
+#include <linux/huge_mm.h>
 
 #include "sysfs-common.h"
 
@@ -2257,6 +2259,7 @@ struct damon_sysfs_scheme {
 	struct damon_sysfs_stats *stats;
 	struct damon_sysfs_scheme_regions *tried_regions;
 	int target_nid;
+	unsigned int target_order;
 	struct damos_sysfs_dests *dests;
 };
 
@@ -2642,6 +2645,34 @@ static ssize_t target_nid_store(struct kobject *kobj,
 	return err ? err : count;
 }
 
+static ssize_t target_order_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	struct damon_sysfs_scheme *sysfs_scheme;
+
+	sysfs_scheme = container_of(kobj, struct damon_sysfs_scheme, kobj);
+	return sysfs_emit(buf, "%u\n", sysfs_scheme->target_order);
+}
+
+static ssize_t target_order_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	struct damon_sysfs_scheme *sysfs_scheme;
+	unsigned int order;
+	int ret = kstrtouint(buf, 0, &order);
+
+	if (ret)
+		return ret;
+
+	/* Only 0 (default) or an order in 2..HPAGE_PMD_ORDER is accepted. */
+	if (order == 1 || order > HPAGE_PMD_ORDER)
+		return -EINVAL;
+
+	sysfs_scheme = container_of(kobj, struct damon_sysfs_scheme, kobj);
+	sysfs_scheme->target_order = order;
+	return count;
+}
+
 static void damon_sysfs_scheme_release(struct kobject *kobj)
 {
 	kfree(container_of(kobj, struct damon_sysfs_scheme, kobj));
@@ -2656,10 +2687,14 @@ static struct kobj_attribute damon_sysfs_scheme_apply_interval_us_attr =
 static struct kobj_attribute damon_sysfs_scheme_target_nid_attr =
 		__ATTR_RW_MODE(target_nid, 0600);
 
+static struct kobj_attribute damon_sysfs_scheme_target_order_attr =
+		__ATTR_RW_MODE(target_order, 0600);
+
 static struct attribute *damon_sysfs_scheme_attrs[] = {
 	&damon_sysfs_scheme_action_attr.attr,
 	&damon_sysfs_scheme_apply_interval_us_attr.attr,
 	&damon_sysfs_scheme_target_nid_attr.attr,
+	&damon_sysfs_scheme_target_order_attr.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(damon_sysfs_scheme);
@@ -3005,6 +3040,16 @@ static struct damos *damon_sysfs_mk_scheme(
 	if (!scheme)
 		return NULL;
 
+	if (sysfs_scheme->action == DAMOS_COLLAPSE &&
+	    sysfs_scheme->target_order != 0 &&
+	    sysfs_scheme->target_order != HPAGE_PMD_ORDER) {
+		pr_warn("DAMON collapse: target_order %u not supported, only PMD order (%u) is available. Use 0 or %u.\n",
+			sysfs_scheme->target_order,
+			HPAGE_PMD_ORDER, HPAGE_PMD_ORDER);
+		sysfs_scheme->target_order = 0;
+	}
+	scheme->target_order = sysfs_scheme->target_order;
+
 	err = damos_sysfs_add_quota_score(sysfs_quotas->goals, &scheme->quota);
 	if (err) {
 		damon_destroy_scheme(scheme);
-- 
2.50.1 (Apple Git-155)


Export a thin wrapper around collapse_huge_page() that allows external
subsystems such as DAMON to trigger THP collapse on a target address
range.

Currently restricted to PMD order (HPAGE_PMD_ORDER), since
collapse_huge_page() does not yet support arbitrary mTHP orders.
The restriction can be relaxed when khugepaged gains mTHP support.

The caller must hold a reference to @mm.  Do not hold mmap lock:
collapse_huge_page() acquires mmap_read_lock for validation, releases
it, then acquires mmap_write_lock for the actual collapse.  Holding
an outer mmap_read_lock would cause a self-deadlock when the same
thread attempts the inner mmap_write_lock.

Co-developed-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Wang Lian <lianux.mm@gmail.com>
---
 include/linux/khugepaged.h |  3 +++
 mm/khugepaged.c            | 39 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index d7a9053ff4fe..6fb8a6857790 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -20,6 +20,9 @@ extern bool current_is_khugepaged(void);
 void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		bool install_pmd);
 
+int damon_collapse_folio_range(struct mm_struct *mm, unsigned long start_addr,
+			       unsigned int target_order);
+
 static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
 	if (mm_flags_test(MMF_VM_HUGEPAGE, oldmm))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 617bca76db49..0387841ba2e7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -3272,3 +3272,42 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
 	return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
 			: madvise_collapse_errno(last_fail);
 }
+
+/**
+ * damon_collapse_folio_range() - Collapse one aligned range into a THP
+ * @mm:         mm_struct of the target process
+ * @start_addr: start address, aligned to PAGE_SIZE << @target_order
+ * @target_order: page order of the collapse result; any value other than
+ *                HPAGE_PMD_ORDER is rejected with a one-time warning
+ *
+ * Thin wrapper around collapse_huge_page() for external callers such as
+ * DAMON.  The caller must hold a reference to @mm.  Do not hold mmap
+ * lock: collapse_huge_page() is described as taking mmap_read_lock for
+ * validation, dropping it, then taking mmap_write_lock for the collapse,
+ * so an outer mmap_read_lock would self-deadlock.
+ * NOTE(review): confirm this locking contract against the
+ * collapse_huge_page() implementation in the target tree.
+ *
+ * Return: 0 on success, -EINVAL on bad arguments, otherwise the value of
+ *         madvise_collapse_errno() for the scan result.
+ */
+int damon_collapse_folio_range(struct mm_struct *mm, unsigned long start_addr,
+			       unsigned int target_order)
+{
+	struct collapse_control cc = { .is_khugepaged = false };
+	enum scan_result res;
+
+	if (target_order != HPAGE_PMD_ORDER) {
+		pr_warn_once("%s: only PMD order (%u) is supported, got %u\n",
+			     __func__, HPAGE_PMD_ORDER, target_order);
+		return -EINVAL;
+	}
+
+	/* Reject a start address that is not on an order-sized boundary. */
+	if (start_addr & ((PAGE_SIZE << target_order) - 1))
+		return -EINVAL;
+
+	res = collapse_huge_page(mm, start_addr, 1, 0, &cc, target_order);
+	return res == SCAN_SUCCEED ? 0 : madvise_collapse_errno(res);
+}
+EXPORT_SYMBOL_GPL(damon_collapse_folio_range);
-- 
2.50.1 (Apple Git-155)


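When target_order is set (non-zero), the DAMOS_COLLAPSE handler now calls
damon_collapse_folio_range() for each order-aligned chunk of the target
region.  Because damon_collapse_folio_range() currently accepts only
HPAGE_PMD_ORDER, a non-zero target_order is effective only when it equals
HPAGE_PMD_ORDER; other orders make the wrapper return -EINVAL and nothing
is collapsed.  When target_order is 0 (default), the existing
madvise(MADV_COLLAPSE) path is used, preserving backwards compatibility.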

Region boundaries are expanded outward to the covering aligned range
(ALIGN_DOWN start, ALIGN end) so that collapse works even after
kdamond_split_regions reduces region sizes below the chunk size.
collapse_huge_page() internally validates VMA bounds, so expanding
beyond the original region is safe.

No external mmap lock is held: collapse_huge_page() acquires
mmap_read_lock internally for validation, releases it, then acquires
mmap_write_lock for the actual collapse.  Holding an outer
mmap_read_lock would cause a self-deadlock when the same thread
attempts the inner mmap_write_lock.

Co-developed-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Wang Lian <lianux.mm@gmail.com>
---
 mm/damon/vaddr.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index d27147603564..2a3757c13bf0 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -14,6 +14,7 @@
 #include <linux/page_idle.h>
 #include <linux/pagewalk.h>
 #include <linux/sched/mm.h>
+#include <linux/khugepaged.h>
 
 #include "../internal.h"
 #include "ops-common.h"
@@ -899,6 +900,40 @@ static unsigned long damos_va_stat(struct damon_target *target,
 	return 0;
 }
 
+/*
+ * Try to collapse every target_order-aligned chunk overlapping @r.  Each
+ * visited chunk counts into @sz_filter_passed; returns bytes collapsed.
+ */
+static unsigned long damos_va_collapse(struct damon_target *target,
+		struct damon_region *r, struct damos *s,
+		unsigned long *sz_filter_passed)
+{
+	const unsigned int order = s->target_order;
+	unsigned long pos, limit, step, collapsed = 0;
+	struct mm_struct *mm;
+
+	if (order < 2 || order > HPAGE_PMD_ORDER)
+		return 0;
+
+	step = PAGE_SIZE << order;
+	limit = ALIGN(r->ar.end, step);
+
+	mm = damon_get_mm(target);
+	if (!mm)
+		return 0;
+
+	/* Start from the aligned chunk that contains the region start. */
+	for (pos = ALIGN_DOWN(r->ar.start, step); pos < limit; pos += step) {
+		if (!damon_collapse_folio_range(mm, pos, order))
+			collapsed += step;
+		*sz_filter_passed += step;
+		cond_resched();
+	}
+
+	mmput(mm);
+	return collapsed;
+}
+
 static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
 		struct damon_target *t, struct damon_region *r,
 		struct damos *scheme, unsigned long *sz_filter_passed)
@@ -922,6 +957,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
 		madv_action = MADV_NOHUGEPAGE;
 		break;
 	case DAMOS_COLLAPSE:
+		if (scheme->target_order)
+			return damos_va_collapse(t, r, scheme,
+						 sz_filter_passed);
 		madv_action = MADV_COLLAPSE;
 		break;
 	case DAMOS_MIGRATE_HOT:
-- 
2.50.1 (Apple Git-155)


Add a new DAMOS_MTHP_SPLIT action to split a large folio to the
specified target_order, and a hot_threshold parameter to control
split decisions based on sub-page access heatmap.

target_order: For MTHP_SPLIT, valid range is 2..HPAGE_PMD_ORDER-1,
allowing splits to e.g. order-2 (16KB) or order-3 (32KB) mTHP with 4KB
base pages.  An invalid value (0 or >= HPAGE_PMD_ORDER) is reset to
order-2 with a warning at scheme creation time; 0 would mean "split to
base page", which defeats the purpose of mTHP split.

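hot_threshold: Minimum percentage (0-100) of hot subpages required
to preserve a THP.  THPs with hot_fraction >= hot_threshold are
kept intact; below it, the THP is split to target_order.  The sysfs
file accepts 0-100, but 0 is treated as "unset" and replaced by the
default at scheme creation time, so a threshold of 0 cannot be
configured.  Default is 30%, based on ARM SPE profiling on Kunpeng 920
which showed: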

  - 97% of THPs have <10% hot subpages (clearly cold, split)
  - 1-2% have 10-30% (borderline, tunable)
  - <1% have >30% (genuinely hot, preserve)

The 30% default catches genuinely hot THPs while splitting the vast
majority of cold THPs.  Exposed as sysfs attribute for per-scheme
tuning (e.g. lower for memory-pressure scenarios, higher for
latency-sensitive workloads).

sysfs interface:
  /sys/kernel/mm/damon/admin/kdamonds/.../schemes/0/hot_threshold

The actual split implementation follows in subsequent patches.

Co-developed-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Wang Lian <lianux.mm@gmail.com>
---
 include/linux/damon.h    | 17 ++++++++++++--
 mm/damon/sysfs-schemes.c | 51 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 5a0587556573..982057bbce3b 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -121,6 +121,7 @@ struct damon_target {
  * @DAMOS_HUGEPAGE:	Call ``madvise()`` for the region with MADV_HUGEPAGE.
  * @DAMOS_NOHUGEPAGE:	Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
  * @DAMOS_COLLAPSE:	Call ``madvise()`` for the region with MADV_COLLAPSE.
+ * @DAMOS_MTHP_SPLIT:	Split large folios to the target mTHP order.
  * @DAMOS_LRU_PRIO:	Prioritize the region on its LRU lists.
  * @DAMOS_LRU_DEPRIO:	Deprioritize the region on its LRU lists.
  * @DAMOS_MIGRATE_HOT:  Migrate the regions prioritizing warmer regions.
@@ -141,6 +142,7 @@ enum damos_action {
 	DAMOS_HUGEPAGE,
 	DAMOS_NOHUGEPAGE,
 	DAMOS_COLLAPSE,
+	DAMOS_MTHP_SPLIT,
 	DAMOS_LRU_PRIO,
 	DAMOS_LRU_DEPRIO,
 	DAMOS_MIGRATE_HOT,
@@ -573,10 +575,21 @@ struct damos {
 	struct damos_access_pattern pattern;
 	enum damos_action action;
 	/*
-	 * @target_order: target order for mTHP actions (DAMOS_COLLAPSE).
-	 * 0 means system default (PMD order).  Valid: 0, 2..HPAGE_PMD_ORDER.
+	 * @target_order: target mTHP order for DAMOS_COLLAPSE and
+	 * DAMOS_MTHP_SPLIT.  For COLLAPSE, 0 means PMD order default,
+	 * valid values: 0, 2..HPAGE_PMD_ORDER.  For MTHP_SPLIT,
+	 * valid values: 2..HPAGE_PMD_ORDER-1; 0 and HPAGE_PMD_ORDER
+	 * are rejected at scheme creation time (defaulting to 2).
 	 */
 	unsigned int target_order;
+	/*
+	 * @hot_threshold: minimum hot subpage percentage (0-100) to
+	 * preserve a THP during DAMOS_MTHP_SPLIT.  A THP with
+	 * hot_fraction >= hot_threshold is kept intact; below it, the
+	 * THP is split to @target_order.  Default 30 based on SPE
+	 * profiling showing 97% of THPs have <10% hot subpages.
+	 */
+	unsigned int hot_threshold;
 	unsigned long apply_interval_us;
 /* private: internal use only */
 	/*
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 735970717048..823f1ca9bd90 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -2260,6 +2260,7 @@ struct damon_sysfs_scheme {
 	struct damon_sysfs_scheme_regions *tried_regions;
 	int target_nid;
 	unsigned int target_order;
+	unsigned int hot_threshold;
 	struct damos_sysfs_dests *dests;
 };
 
@@ -2293,6 +2294,10 @@ static struct damos_sysfs_action_name damos_sysfs_action_names[] = {
 		.action = DAMOS_COLLAPSE,
 		.name = "collapse",
 	},
+	{
+		.action = DAMOS_MTHP_SPLIT,
+		.name = "mthp_split",
+	},
 	{
 		.action = DAMOS_LRU_PRIO,
 		.name = "lru_prio",
@@ -2673,6 +2678,34 @@ static ssize_t target_order_store(struct kobject *kobj,
 	return count;
 }
 
+static ssize_t hot_threshold_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	struct damon_sysfs_scheme *sysfs_scheme;
+
+	sysfs_scheme = container_of(kobj, struct damon_sysfs_scheme, kobj);
+	return sysfs_emit(buf, "%u\n", sysfs_scheme->hot_threshold);
+}
+
+static ssize_t hot_threshold_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	struct damon_sysfs_scheme *sysfs_scheme;
+	unsigned int percent;
+	int ret = kstrtouint(buf, 0, &percent);
+
+	if (ret)
+		return ret;
+
+	/* The threshold is a percentage, so anything above 100 is invalid. */
+	if (percent > 100)
+		return -EINVAL;
+
+	sysfs_scheme = container_of(kobj, struct damon_sysfs_scheme, kobj);
+	sysfs_scheme->hot_threshold = percent;
+	return count;
+}
+
 static void damon_sysfs_scheme_release(struct kobject *kobj)
 {
 	kfree(container_of(kobj, struct damon_sysfs_scheme, kobj));
@@ -2690,11 +2723,15 @@ static struct kobj_attribute damon_sysfs_scheme_target_nid_attr =
 static struct kobj_attribute damon_sysfs_scheme_target_order_attr =
 		__ATTR_RW_MODE(target_order, 0600);
 
+static struct kobj_attribute damon_sysfs_scheme_hot_threshold_attr =
+		__ATTR_RW_MODE(hot_threshold, 0600);
+
 static struct attribute *damon_sysfs_scheme_attrs[] = {
 	&damon_sysfs_scheme_action_attr.attr,
 	&damon_sysfs_scheme_apply_interval_us_attr.attr,
 	&damon_sysfs_scheme_target_nid_attr.attr,
 	&damon_sysfs_scheme_target_order_attr.attr,
+	&damon_sysfs_scheme_hot_threshold_attr.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(damon_sysfs_scheme);
@@ -3048,8 +3085,22 @@ static struct damos *damon_sysfs_mk_scheme(
 			HPAGE_PMD_ORDER, HPAGE_PMD_ORDER);
 		sysfs_scheme->target_order = 0;
 	}
+	if (sysfs_scheme->action == DAMOS_MTHP_SPLIT &&
+	    (sysfs_scheme->target_order == 0 ||
+	     sysfs_scheme->target_order >= HPAGE_PMD_ORDER)) {
+		pr_warn("DAMON mthp_split: target_order %u invalid, need 2..%u. Defaulting to 2.\n",
+			sysfs_scheme->target_order,
+			HPAGE_PMD_ORDER - 1);
+		sysfs_scheme->target_order = 2;
+	}
 	scheme->target_order = sysfs_scheme->target_order;
 
+	if (sysfs_scheme->action == DAMOS_MTHP_SPLIT) {
+		if (sysfs_scheme->hot_threshold == 0)
+			sysfs_scheme->hot_threshold = 30;
+		scheme->hot_threshold = sysfs_scheme->hot_threshold;
+	}
+
 	err = damos_sysfs_add_quota_score(sysfs_quotas->goals, &scheme->quota);
 	if (err) {
 		damon_destroy_scheme(scheme);
-- 
2.50.1 (Apple Git-155)


Implement the DAMOS_MTHP_SPLIT action for vaddr-based DAMON operations.
Walk the region in PMD-sized aligned chunks, use folio_walk_start() to
locate THP folios, and call split_folio_to_order() when the folio order
exceeds the target_order.

Unlike COLLAPSE which is limited to anonymous memory via
collapse_huge_page(), split_folio_to_order() supports both anon and
shmem folios.  This is critical for tmpfs THP-backed KVM guest memory,
where cold and hot pages bundled together in a single PMD THP cause
DAMON to overestimate hot regions.

The handler takes mmap_read_lock once per chunk and holds it across the
VMA lookup, folio_walk_start() and the split_folio_to_order() call,
releasing it before the next iteration.  split_folio_to_order() does not
reacquire mmap locks internally, so this pattern is safe.

Co-developed-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Wang Lian <lianux.mm@gmail.com>
---
 mm/damon/vaddr.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 2a3757c13bf0..1957e390a277 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -934,6 +934,71 @@ static unsigned long damos_va_collapse(struct damon_target *target,
 	return applied;
 }
 
+static unsigned long damos_va_mthp_split(struct damon_target *target,
+		struct damon_region *r, struct damos *s,
+		unsigned long *sz_filter_passed)
+{
+	unsigned long addr, end, chunk_sz;
+	unsigned int target_order = s->target_order;
+	unsigned long applied = 0;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct folio *folio;
+	struct folio_walk fw;
+
+	mm = damon_get_mm(target);
+	if (!mm)
+		return 0;
+
+	chunk_sz = PAGE_SIZE << HPAGE_PMD_ORDER;
+	addr = ALIGN_DOWN(r->ar.start, chunk_sz);
+	end = ALIGN(r->ar.end, chunk_sz);
+
+	while (addr < end) {
+		mmap_read_lock(mm);
+		vma = find_vma(mm, addr);
+		/*
+		 * mmap_lock stays read-held until the "unlock" label, i.e.
+		 * across the VMA lookup, the folio walk and the split.  Any
+		 * VMA with a folio at @addr is accepted: this relies on
+		 * split_folio_to_order() handling both anon and shmem folios
+		 * (e.g. tmpfs THP-backed KVM guest memory).
+		 */
+		if (!vma || addr < vma->vm_start)
+			goto unlock;
+
+		folio = folio_walk_start(&fw, vma, addr, 0);
+		if (!folio)
+			goto unlock;
+
+		if (folio_order(folio) > target_order) {
+			if (!folio_trylock(folio)) {
+				folio_walk_end(&fw, vma);
+				goto unlock;
+			}
+			folio_get(folio);
+			folio_walk_end(&fw, vma);
+
+			if (!split_folio_to_order(folio, target_order))
+				applied += chunk_sz;
+
+			folio_unlock(folio);
+			folio_put(folio);
+		} else {
+			folio_walk_end(&fw, vma);
+		}
+
+unlock:
+		*sz_filter_passed += chunk_sz;
+		addr += chunk_sz;
+		mmap_read_unlock(mm);
+		cond_resched();
+	}
+
+	mmput(mm);
+	return applied;
+}
+
 static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
 		struct damon_target *t, struct damon_region *r,
 		struct damos *scheme, unsigned long *sz_filter_passed)
@@ -967,6 +1032,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
 		return damos_va_migrate(t, r, scheme, sz_filter_passed);
 	case DAMOS_STAT:
 		return damos_va_stat(t, r, scheme, sz_filter_passed);
+	case DAMOS_MTHP_SPLIT:
+		return damos_va_mthp_split(t, r, scheme,
+					  sz_filter_passed);
 	default:
 		/*
 		 * DAMOS actions that are not yet supported by 'vaddr'.
-- 
2.50.1 (Apple Git-155)


Add a sub-THP access heatmap that enables data-driven split decisions
in DAMOS_MTHP_SPLIT.  The split handler queries damon_spe_hot_fraction()
and compares against the scheme's configurable hot_threshold (default
30%, set in patch 4) to preserve genuinely hot THPs while splitting
cold ones.

Key data-driven design decisions from Kunpeng 920 SPE profiling:

  1. Signal vs noise threshold (this patch):
     Raw SPE data shows most THPs have scattered 1-2 sample hits across
     many subpages — noise, not genuine access patterns.  The heatmap
     now uses a two-pass signal threshold: a subpage chunk must have
     >= 1/10 of the peak chunk's access count to be considered hot.
     This reduces false hot classification from ~50% to <5% of subpages.

  2. hot_threshold 30% (patch 4, sysfs-configurable):
     With the signal filter applied, 97% of THPs have <10% hot
     subpages (clearly cold), 1-2% have 10-30% (borderline), and
     <1% have >30% (genuinely hot).  The 30% default catches hot THPs
     while allowing the vast majority to be split.

Architecture (three-phase):

  Phase 2a (current fallback):
    Walk PTE access bits via folio_walk for THPs already split to PTEs.
    For PMD-mapped THPs (the common case), return -EOPNOTSUPP, which
    causes the split handler to split unconditionally.

  Phase 2b (userspace daemon -> kernel, ready for validation):
    Userspace SPE daemon decodes ARM SPE records, feeds PFNs via debugfs
    (/sys/kernel/debug/damon/spe_feed).  The kernel aggregates accesses
    into a per-folio rbtree keyed by THP-aligned PFN.

  Phase 2c (kernel-native, future):
    perf_event_create_kernel_counter for ARM SPE.  Overflow handler
    calls damon_spe_record_access() directly.

Data structure (mm/damon/spe.c):
  - Per-folio rbtree keyed by PFN, storing access_count[512] (one
    counter per 4KB subpage)
  - Max 1024 entries, entries older than 30s are pruned periodically
  - Global spinlock-protected rbtree with GFP_ATOMIC allocation

Debugfs interface:
  - /sys/kernel/debug/damon/spe_feed  (write): accept one PFN per line
  - /sys/kernel/debug/damon/spe_stats (read):  rbtree stats + top entries

When CONFIG_DAMON_SPE is disabled, all SPE functions are empty stubs
returning -EOPNOTSUPP, making the split unconditional.

Co-developed-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Wang Lian <lianux.mm@gmail.com>
---
 mm/damon/Kconfig  |  12 ++
 mm/damon/Makefile |   1 +
 mm/damon/core.c   |   3 +
 mm/damon/spe.c    | 505 ++++++++++++++++++++++++++++++++++++++++++++++
 mm/damon/spe.h    |  62 ++++++
 mm/damon/vaddr.c  |  16 +-
 6 files changed, 597 insertions(+), 2 deletions(-)
 create mode 100644 mm/damon/spe.c
 create mode 100644 mm/damon/spe.h

diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 34631a44cdec..ea75a8dab989 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -121,4 +121,16 @@ config DAMON_STAT_ENABLED_DEFAULT
 	  Whether to enable DAMON_STAT by default.  Users can disable it in
 	  boot or runtime using its 'enabled' parameter.
 
+config DAMON_SPE
+	bool "DAMON SPE feedback for sub-THP access monitoring (prototype)"
+	depends on DAMON_VADDR
+	help
+	  Enable sub-THP access heatmap feedback for DAMOS_MTHP_SPLIT.
+	  Currently a prototype: uses PTE access bits for THPs that have
+	  been split to PTEs, returns "no data" for PMD-mapped THPs.
+
+	  On hardware with ARM SPE (e.g. Kunpeng 920), this will be
+	  extended to provide per-subpage access data without needing to
+	  split the PMD first, enabling precise mTHP split decisions.
+
 endmenu
diff --git a/mm/damon/Makefile b/mm/damon/Makefile
index d8d6bf5f8bff..507b43a9f009 100644
--- a/mm/damon/Makefile
+++ b/mm/damon/Makefile
@@ -7,3 +7,4 @@ obj-$(CONFIG_DAMON_SYSFS)	+= sysfs-common.o sysfs-schemes.o sysfs.o
 obj-$(CONFIG_DAMON_RECLAIM)	+= modules-common.o reclaim.o
 obj-$(CONFIG_DAMON_LRU_SORT)	+= modules-common.o lru_sort.o
 obj-$(CONFIG_DAMON_STAT)	+= modules-common.o stat.o
+obj-$(CONFIG_DAMON_SPE)		+= spe.o
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 265d51ade25b..0805e71a90d8 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -20,6 +20,7 @@
 
 /* for damon_get_folio() used by node eligible memory metrics */
 #include "ops-common.h"
+#include "spe.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/damon.h>
@@ -2987,6 +2988,8 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
 	if (!has_schemes_to_apply)
 		return;
 
+	damon_spe_prune();
+
 	max_region_sz = damon_region_sz_limit(c);
 	mutex_lock(&c->walk_control_lock);
 	damon_for_each_target(t, c) {
diff --git a/mm/damon/spe.c b/mm/damon/spe.c
new file mode 100644
index 000000000000..98f8d32053e4
--- /dev/null
+++ b/mm/damon/spe.c
@@ -0,0 +1,505 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON SPE (Statistical Profiling Extension) feedback
+ *
+ * Provides sub-THP access heatmap for intelligent split decisions.
+ *
+ * Architecture:
+ *   Phase 2a (current): PTE access bits via folio_walk.
+ *     Works only when a THP has been previously split to PTEs.
+ *     Returns -EOPNOTSUPP for PMD-mapped THPs.
+ *
+ *   Phase 2b (userspace): spe_hist daemon decodes SPE in userspace and
+ *     feeds PFNs (subpage index = low PFN bits) via debugfs into the rbtree.
+ *
+ *   Phase 2c (kernel): perf_event_create_kernel_counter for ARM SPE,
+ *     overflow handler aggregates into rbtree.  Requires SPE hardware.
+ *
+ * Data structure:
+ *   Per-folio rbtree keyed by PFN, storing per-subpage access counts.
+ *   Entries are aged and pruned periodically.
+ *
+ * Copyright (C) 2026 Wang Lian <lianux.mm@gmail.com>
+ */
+
+#define pr_fmt(fmt) "damon-spe: " fmt
+
+#include <linux/mm.h>
+#include <linux/pagewalk.h>
+#include <linux/huge_mm.h>
+#include <linux/bitmap.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/sched.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/init.h>
+#include "spe.h"
+
+/* Max sub-pages when querying at order 0 */
+#define DAMON_SPE_MAX_CHUNKS	512
+
+/* Max folio entries in the single global rbtree (spe_tree) */
+#define DAMON_SPE_MAX_ENTRIES	1024
+
+/* Entry considered stale after this many jiffies (default: 30s) */
+#define DAMON_SPE_ENTRY_TTL	(30 * HZ)
+
+/*
+ * Per-folio access histogram entry, keyed by pfn in an rbtree.
+ * Each entry tracks access count per subpage.  The access_count array has
+ * DAMON_SPE_MAX_CHUNKS (512) slots, one per base page of an order-9 THP.
+ */
+struct damon_spe_entry {
+	struct rb_node		node;
+	unsigned long		pfn;		/* THP-aligned PFN */
+	pid_t			pid;		/* owner process */
+	unsigned long		access_count[DAMON_SPE_MAX_CHUNKS];
+	unsigned long		total_accesses;
+	unsigned long		last_access;	/* jiffies of last update */
+};
+
+static struct rb_root spe_tree = RB_ROOT;
+static DEFINE_SPINLOCK(spe_lock);
+static unsigned int spe_nr_entries;
+
+/* Forward declarations */
+static void __spe_prune(void);
+
+/*
+ * Look up the entry keyed by @pfn; NULL if absent.  Caller holds spe_lock.
+ */
+static struct damon_spe_entry *spe_find(unsigned long pfn)
+{
+	struct rb_node *cur = spe_tree.rb_node;
+
+	while (cur) {
+		struct damon_spe_entry *entry =
+			rb_entry(cur, struct damon_spe_entry, node);
+
+		if (pfn == entry->pfn)
+			return entry;
+		if (pfn < entry->pfn)
+			cur = cur->rb_left;
+		else
+			cur = cur->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Insert an entry for @pfn (spe_lock held).  Returns it, or the existing
+ * entry on a duplicate key; NULL if still full after pruning or on ENOMEM.
+ */
+static struct damon_spe_entry *spe_insert(unsigned long pfn, pid_t pid)
+{
+	struct rb_node **new = &spe_tree.rb_node, *parent = NULL;
+	struct damon_spe_entry *e;
+
+	if (spe_nr_entries >= DAMON_SPE_MAX_ENTRIES) {
+		__spe_prune();
+		if (spe_nr_entries >= DAMON_SPE_MAX_ENTRIES)
+			return NULL;
+	}
+
+	e = kzalloc(sizeof(*e), GFP_ATOMIC);
+	if (!e)
+		return NULL;
+
+	e->pfn = pfn;
+	e->pid = pid;
+	e->last_access = jiffies;
+
+	while (*new) {
+		struct damon_spe_entry *this =
+			rb_entry(*new, struct damon_spe_entry, node);
+
+		parent = *new;
+		if (pfn < this->pfn)
+			new = &((*new)->rb_left);
+		else if (pfn > this->pfn)
+			new = &((*new)->rb_right);
+		else {
+			/* Key already present: reuse that entry */
+			kfree(e);
+			return this;
+		}
+	}
+
+	rb_link_node(&e->node, parent, new);
+	rb_insert_color(&e->node, &spe_tree);
+	spe_nr_entries++;
+	return e;
+}
+
+/*
+ * Drop every entry whose last update is older than DAMON_SPE_ENTRY_TTL.
+ * The caller must hold spe_lock.
+ */
+static void __spe_prune(void)
+{
+	const unsigned long cutoff = jiffies - DAMON_SPE_ENTRY_TTL;
+	struct rb_node *cur, *following;
+
+	for (cur = rb_first(&spe_tree); cur; cur = following) {
+		struct damon_spe_entry *stale =
+			rb_entry(cur, struct damon_spe_entry, node);
+
+		/* Fetch the successor before @cur may be erased. */
+		following = rb_next(cur);
+
+		if (!time_before(stale->last_access, cutoff))
+			continue;
+
+		rb_erase(&stale->node, &spe_tree);
+		spe_nr_entries--;
+		kfree(stale);
+	}
+}
+
+/**
+ * damon_spe_record_access() - Record a single subpage access
+ * @pfn: Physical page frame number (any page within a THP)
+ * @pid: Process ID, stored only when a new entry is created
+ *
+ * @pfn is aligned down to DAMON_SPE_MAX_CHUNKS pages to form the key; its
+ * low bits pick the subpage counter.  Dropped if no entry is available.
+ *
+ * Context: Can be called from IRQ context.
+ */
+void damon_spe_record_access(unsigned long pfn, pid_t pid)
+{
+	unsigned long thp_pfn = pfn & ~(unsigned long)(DAMON_SPE_MAX_CHUNKS - 1);
+	unsigned int idx = pfn & (DAMON_SPE_MAX_CHUNKS - 1);
+	struct damon_spe_entry *e;
+	unsigned long flags;
+
+	spin_lock_irqsave(&spe_lock, flags);
+
+	e = spe_find(thp_pfn);
+	if (!e)
+		e = spe_insert(thp_pfn, pid);
+
+	if (e) {
+		e->access_count[idx]++;
+		e->total_accesses++;
+		e->last_access = jiffies;
+	}
+
+	spin_unlock_irqrestore(&spe_lock, flags);
+}
+EXPORT_SYMBOL_GPL(damon_spe_record_access);
+
+/**
+ * damon_spe_folio_heatmap() - Get sub-THP access bitmap for a folio
+ * @folio: The folio to query
+ * @vma: VMA containing the folio
+ * @addr: Virtual address of the folio start
+ * @target_order: Page order for each chunk; must be below the folio order
+ * @hot_bitmap: Output bitmap, one bit per chunk, >= DAMON_SPE_MAX_CHUNKS bits
+ *
+ * Queries the SPE rbtree first.  Falls back to PTE access bits if no
+ * SPE data is available (requires the THP to be split to PTEs).
+ *
+ * Return: Number of chunks, -EINVAL on bad input, -EOPNOTSUPP if no data.
+ */
+int damon_spe_folio_heatmap(struct folio *folio, struct vm_area_struct *vma,
+			    unsigned long addr, unsigned int target_order,
+			    unsigned long *hot_bitmap)
+{
+	unsigned long chunk_sz = PAGE_SIZE << target_order;
+	unsigned long num_chunks, pfn, flags, i;
+	struct damon_spe_entry *e;
+	struct folio_walk fw;
+	struct folio *sub_folio;
+
+	if (!folio || !vma || !hot_bitmap)
+		return -EINVAL;
+	if (target_order >= folio_order(folio))
+		return -EINVAL;
+
+	/* Derived only after the NULL check; bounded by the bitmap size. */
+	num_chunks = folio_nr_pages(folio) >> target_order;
+	if (num_chunks > DAMON_SPE_MAX_CHUNKS)
+		return -EINVAL;
+
+	pfn = folio_pfn(folio);
+
+	/*
+	 * Phase 2b/2c path: query the SPE rbtree.
+	 * If we have aggregated SPE data for this folio, use it.
+	 */
+	spin_lock_irqsave(&spe_lock, flags);
+	e = spe_find(pfn);
+	if (e && e->total_accesses > 0) {
+		unsigned long max_sum = 0;
+		unsigned long sig_thresh;
+		unsigned int spp = chunk_sz >> PAGE_SHIFT;
+
+		/* First pass: find peak chunk access count */
+		for (i = 0; i < num_chunks; i++) {
+			unsigned long sum = 0, j;
+
+			for (j = 0; j < spp; j++) {
+				unsigned int idx = i * spp + j;
+
+				if (idx < DAMON_SPE_MAX_CHUNKS)
+					sum += e->access_count[idx];
+			}
+			if (sum > max_sum)
+				max_sum = sum;
+		}
+
+		/*
+		 * Signal threshold: a chunk needs >= 1/10 of peak access
+		 * count to be considered hot.  This filters SPE noise —
+		 * Kunpeng 920 data shows most THPs have scattered 1-2
+		 * sample hits across many subpages that don't represent
+		 * genuine hot access patterns.
+		 */
+		sig_thresh = max(max_sum / 10, 1UL);
+
+		/* Second pass: build hot bitmap using signal threshold */
+		bitmap_zero(hot_bitmap, num_chunks);
+		for (i = 0; i < num_chunks; i++) {
+			unsigned long sum = 0, j;
+
+			for (j = 0; j < spp; j++) {
+				unsigned int idx = i * spp + j;
+
+				if (idx < DAMON_SPE_MAX_CHUNKS)
+					sum += e->access_count[idx];
+			}
+			if (sum >= sig_thresh)
+				__set_bit(i, hot_bitmap);
+		}
+
+		spin_unlock_irqrestore(&spe_lock, flags);
+		return (int)num_chunks;
+	}
+	spin_unlock_irqrestore(&spe_lock, flags);
+
+	/*
+	 * Phase 2a fallback: walk PTEs to check access bits.
+	 * Only works when the THP has been split to PTEs.
+	 */
+	bitmap_zero(hot_bitmap, num_chunks);
+
+	for (i = 0; i < num_chunks; i++) {
+		unsigned long chunk_addr = addr + i * chunk_sz;
+
+		sub_folio = folio_walk_start(&fw, vma, chunk_addr, 0);
+		if (!sub_folio)
+			return -EOPNOTSUPP;
+
+		if (fw.level == FW_LEVEL_PMD) {
+			folio_walk_end(&fw, vma);
+			return -EOPNOTSUPP;
+		}
+
+		if (fw.level == FW_LEVEL_PTE && pte_young(fw.pte))
+			__set_bit(i, hot_bitmap);
+
+		folio_walk_end(&fw, vma);
+	}
+
+	return (int)num_chunks;
+}
+EXPORT_SYMBOL_GPL(damon_spe_folio_heatmap);
+
+/**
+ * damon_spe_hot_fraction() - Return hot chunk percentage of a folio
+ * @folio: The folio to query
+ * @vma: VMA containing the folio
+ * @addr: Virtual address of the folio start
+ * @target_order: Page order for each chunk; must be smaller than the order
+ *		  of @folio
+ *
+ * Builds the chunk heatmap with damon_spe_folio_heatmap() on an on-stack
+ * bitmap of DAMON_SPE_MAX_CHUNKS bits and reports how many chunks are hot.
+ *
+ * Return: Percentage (0-100) on success, -EINVAL on a NULL @folio or a
+ * @target_order that is not smaller than the folio order, -ERANGE when the
+ * folio has more than DAMON_SPE_MAX_CHUNKS chunks, or the negative error
+ * returned by damon_spe_folio_heatmap().
+ */
+int damon_spe_hot_fraction(struct folio *folio, struct vm_area_struct *vma,
+			   unsigned long addr, unsigned int target_order)
+{
+	DECLARE_BITMAP(hot_bitmap, DAMON_SPE_MAX_CHUNKS);
+	unsigned long num_chunks;
+	int ret, hot;
+
+	/*
+	 * Check before deriving the chunk count: folio_nr_pages() must not
+	 * see a NULL folio and the shift below must not use an unchecked
+	 * order.  Both conditions yield the same -EINVAL the heatmap helper
+	 * would report.
+	 */
+	if (!folio)
+		return -EINVAL;
+	if (target_order >= folio_order(folio))
+		return -EINVAL;
+
+	num_chunks = folio_nr_pages(folio) >> target_order;
+	/* The on-stack bitmap cannot describe more chunks than this. */
+	if (num_chunks > DAMON_SPE_MAX_CHUNKS)
+		return -ERANGE;
+
+	ret = damon_spe_folio_heatmap(folio, vma, addr, target_order,
+				      hot_bitmap);
+	if (ret < 0)
+		return ret;
+
+	hot = bitmap_weight(hot_bitmap, num_chunks);
+	return (hot * 100) / (int)num_chunks;
+}
+EXPORT_SYMBOL_GPL(damon_spe_hot_fraction);
+
+/**
+ * damon_spe_prune() - Remove stale entries from the SPE rbtree
+ *
+ * Intended to be called from DAMON's aggregation cycle.  Takes spe_lock
+ * with interrupts disabled and lets __spe_prune() drop the entries that
+ * were not updated within DAMON_SPE_ENTRY_TTL jiffies.
+ */
+void damon_spe_prune(void)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&spe_lock, irqflags);
+
+	__spe_prune();
+
+	spin_unlock_irqrestore(&spe_lock, irqflags);
+}
+
+/**
+ * damon_spe_stats() - Return current SPE rbtree statistics
+ * @nr_entries: Output for number of entries, may be NULL
+ * @total_accesses: Output for total accumulated accesses, may be NULL
+ *
+ * Walks the whole tree under spe_lock and stores the results after the
+ * lock has been dropped.
+ */
+void damon_spe_stats(unsigned int *nr_entries, unsigned long *total_accesses)
+{
+	unsigned long irqflags;
+	unsigned long accesses = 0;
+	unsigned int entries = 0;
+	struct rb_node *pos;
+
+	spin_lock_irqsave(&spe_lock, irqflags);
+	pos = rb_first(&spe_tree);
+	while (pos) {
+		struct damon_spe_entry *entry;
+
+		entry = rb_entry(pos, struct damon_spe_entry, node);
+		entries++;
+		accesses += entry->total_accesses;
+		pos = rb_next(pos);
+	}
+	spin_unlock_irqrestore(&spe_lock, irqflags);
+
+	if (nr_entries)
+		*nr_entries = entries;
+	if (total_accesses)
+		*total_accesses = accesses;
+}
+EXPORT_SYMBOL_GPL(damon_spe_stats);
+
+/* ---- debugfs interface for Phase 2b (userspace daemon → kernel rbtree) ---- */
+
+static struct dentry *damon_spe_dentry;
+
+/*
+ * spe_feed write: accept one PFN per line (hex or decimal).
+ * Each parsed, non-zero PFN is recorded as an access via
+ * damon_spe_record_access() with pid 0.
+ *
+ * Every write() consumes at most one line and returns the number of bytes
+ * it consumed (the line plus its newline).  A writer that resubmits the
+ * remainder after a short write therefore feeds a multi-line buffer line by
+ * line.  Blank, unparsable and zero-PFN lines are skipped (best effort).
+ * A line that does not fit the parse buffer is rejected with -EINVAL,
+ * because its end cannot be located.
+ *
+ * Limitation: no state is kept between calls, so a line that the writer
+ * splits across two write() calls is parsed as two separate lines.
+ *
+ * Usage from userspace:
+ *   echo 0x12345678 > /sys/kernel/debug/damon/spe_feed
+ *
+ * For bulk feed from SPE daemon:
+ *   cat spe_pfns.txt > /sys/kernel/debug/damon/spe_feed
+ */
+static ssize_t spe_feed_write(struct file *file, const char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	char line[32];
+	size_t len = min(count, sizeof(line) - 1);
+	size_t consumed;
+	size_t eol;
+	unsigned long pfn;
+
+	if (copy_from_user(line, buf, len))
+		return -EFAULT;
+
+	/* Locate the end of the first line within the copied bytes. */
+	for (eol = 0; eol < len; eol++) {
+		if (line[eol] == '\n')
+			break;
+	}
+
+	if (eol < len) {
+		/* Consume the line together with its newline. */
+		consumed = eol + 1;
+	} else if (len < count) {
+		/* No newline in a full buffer: the line is too long. */
+		return -EINVAL;
+	} else {
+		/* Final line without a trailing newline. */
+		consumed = count;
+	}
+	line[eol] = '\0';
+
+	if (kstrtoul(line, 0, &pfn) == 0 && pfn != 0)
+		damon_spe_record_access(pfn, 0);
+
+	return consumed;
+}
+
+/*
+ * spe_stats read: show current SPE rbtree statistics.
+ *
+ * Prints one summary line (entry count and accumulated accesses) followed
+ * by up to ten entries.  The entries are the first ones in tree order, not
+ * the most frequently accessed ones.  Summary and entries are taken from a
+ * single spe_lock section so that they describe the same tree state.
+ *
+ * Usage:
+ *   cat /sys/kernel/debug/damon/spe_stats
+ */
+static int spe_stats_show(struct seq_file *m, void *v)
+{
+	const unsigned int max_shown = 10;
+	struct rb_node *node;
+	unsigned long flags;
+	unsigned int count = 0;
+	unsigned int shown = 0;
+	unsigned long total = 0;
+
+	spin_lock_irqsave(&spe_lock, flags);
+
+	for (node = rb_first(&spe_tree); node; node = rb_next(node)) {
+		struct damon_spe_entry *e =
+			rb_entry(node, struct damon_spe_entry, node);
+		count++;
+		total += e->total_accesses;
+	}
+
+	seq_printf(m, "nr_entries=%u total_accesses=%lu\n", count, total);
+
+	/* Show the first entries only (limit output) */
+	for (node = rb_first(&spe_tree); node && shown < max_shown;
+	     node = rb_next(node), shown++) {
+		struct damon_spe_entry *e =
+			rb_entry(node, struct damon_spe_entry, node);
+		unsigned int hot_pages = 0;
+		int i;
+
+		/* Count the subpage slots with at least one recorded access. */
+		for (i = 0; i < DAMON_SPE_MAX_CHUNKS; i++)
+			if (e->access_count[i])
+				hot_pages++;
+
+		seq_printf(m, "  pfn=0x%lx pid=%d total=%lu hot_pages=%u/%d\n",
+			   e->pfn, e->pid, e->total_accesses,
+			   hot_pages, DAMON_SPE_MAX_CHUNKS);
+	}
+
+	spin_unlock_irqrestore(&spe_lock, flags);
+
+	return 0;
+}
+
+/* Open handler for spe_stats: bind spe_stats_show() as a single-record seq_file. */
+static int spe_stats_open(struct inode *inode, struct file *file)
+{
+	void *priv = inode->i_private;
+
+	return single_open(file, spe_stats_show, priv);
+}
+
+/* spe_feed: only a write handler is provided (see spe_feed_write()). */
+static const struct file_operations spe_feed_fops = {
+	.write = spe_feed_write,
+};
+
+/* spe_stats: seq_file backed, content produced by spe_stats_show(). */
+static const struct file_operations spe_stats_fops = {
+	.open = spe_stats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+/*
+ * Create the SPE debugfs files at late-init time.
+ *
+ * An existing top-level "damon" debugfs directory is reused; otherwise one
+ * is created, and a failure to create it aborts the initcall.  The results
+ * of the two file creations are not checked.
+ */
+static int __init damon_spe_dbgfs_init(void)
+{
+	struct dentry *dir = debugfs_lookup("damon", NULL);
+
+	if (!dir) {
+		dir = debugfs_create_dir("damon", NULL);
+		if (IS_ERR(dir))
+			return PTR_ERR(dir);
+	}
+
+	/* Keep the directory around as the parent of both files. */
+	damon_spe_dentry = dir;
+
+	/* Write-only feed file and read-only statistics file. */
+	debugfs_create_file("spe_feed", 0200, damon_spe_dentry, NULL,
+			    &spe_feed_fops);
+	debugfs_create_file("spe_stats", 0400, damon_spe_dentry, NULL,
+			    &spe_stats_fops);
+
+	pr_info("debugfs interface ready: /sys/kernel/debug/damon/spe_{feed,stats}\n");
+
+	return 0;
+}
+
+late_initcall(damon_spe_dbgfs_init);
diff --git a/mm/damon/spe.h b/mm/damon/spe.h
new file mode 100644
index 000000000000..38799688b5af
--- /dev/null
+++ b/mm/damon/spe.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * DAMON SPE (Statistical Profiling Extension) feedback
+ *
+ * Provides sub-THP access heatmap for intelligent split decisions.
+ *
+ * Three-phase architecture:
+ *   Phase 2a: PTE access bits via folio_walk (current fallback)
+ *   Phase 2b: Userspace SPE daemon feeds {pfn, subpage} via debugfs
+ *   Phase 2c: Kernel perf_event_create_kernel_counter for ARM SPE
+ *
+ * Copyright (C) 2026 Wang Lian <lianux.mm@gmail.com>
+ */
+
+#ifndef _DAMON_SPE_H
+#define _DAMON_SPE_H
+
+#include <linux/mm_types.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_DAMON_SPE
+
+/* ---- Sub-page heatmap query ---- */
+
+/*
+ * Fill @hot_bitmap with one bit per chunk of 2^@target_order pages of
+ * @folio.  Returns the number of chunks, or a negative error code.
+ */
+int damon_spe_folio_heatmap(struct folio *folio, struct vm_area_struct *vma,
+			    unsigned long addr, unsigned int target_order,
+			    unsigned long *hot_bitmap);
+
+/*
+ * Percentage (0-100) of the chunks of @folio that the heatmap marks hot,
+ * or a negative error code.
+ */
+int damon_spe_hot_fraction(struct folio *folio, struct vm_area_struct *vma,
+			   unsigned long addr, unsigned int target_order);
+
+/* ---- Recording (called from SPE event handler or userspace daemon) ---- */
+
+/* Record one access to @pfn on behalf of @pid. */
+void damon_spe_record_access(unsigned long pfn, pid_t pid);
+
+/* ---- Maintenance ---- */
+
+/* Remove stale entries from the SPE rbtree. */
+void damon_spe_prune(void);
+/* Report entry count and accumulated accesses; either output may be NULL. */
+void damon_spe_stats(unsigned int *nr_entries, unsigned long *total_accesses);
+
+#else /* !CONFIG_DAMON_SPE */
+
+/* Stub for !CONFIG_DAMON_SPE: no heatmap source exists, @hot_bitmap is untouched. */
+static inline int damon_spe_folio_heatmap(struct folio *folio,
+					  struct vm_area_struct *vma,
+					  unsigned long addr,
+					  unsigned int target_order,
+					  unsigned long *hot_bitmap)
+{
+	return -EOPNOTSUPP;
+}
+
+/* Stub for !CONFIG_DAMON_SPE: always reports that no heatmap is available. */
+static inline int damon_spe_hot_fraction(struct folio *folio,
+					 struct vm_area_struct *vma,
+					 unsigned long addr,
+					 unsigned int target_order)
+{
+	return -EOPNOTSUPP;
+}
+
+/* Stubs for !CONFIG_DAMON_SPE: recording and pruning are no-ops. */
+static inline void damon_spe_record_access(unsigned long pfn, pid_t pid)
+{
+}
+
+static inline void damon_spe_prune(void)
+{
+}
+/*
+ * Stub for !CONFIG_DAMON_SPE: there is no tree, so report zero entries and
+ * zero accesses.  The outputs are written (when non-NULL) exactly as the
+ * real implementation does, so callers never read uninitialized values.
+ */
+static inline void damon_spe_stats(unsigned int *nr, unsigned long *total)
+{
+	if (nr)
+		*nr = 0;
+	if (total)
+		*total = 0;
+}
+
+#endif /* CONFIG_DAMON_SPE */
+#endif /* _DAMON_SPE_H */
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 1957e390a277..cb3ea2766b9e 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -18,6 +18,7 @@
 
 #include "../internal.h"
 #include "ops-common.h"
+#include "spe.h"
 
 #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
 #undef DAMON_MIN_REGION_SZ
@@ -945,6 +946,7 @@ static unsigned long damos_va_mthp_split(struct damon_target *target,
 	struct vm_area_struct *vma;
 	struct folio *folio;
 	struct folio_walk fw;
+	int hot_pct;
 
 	mm = damon_get_mm(target);
 	if (!mm)
@@ -979,8 +981,18 @@ static unsigned long damos_va_mthp_split(struct damon_target *target,
 			folio_get(folio);
 			folio_walk_end(&fw, vma);
 
-			if (!split_folio_to_order(folio, target_order))
-				applied += chunk_sz;
+			hot_pct = damon_spe_hot_fraction(folio, vma, addr,
+						 target_order);
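+			/*
+			 * hot_pct < 0: no usable heatmap (e.g. SPE support is
+			 * not built in, there is no SPE data and the folio is
+			 * still PMD-mapped, or the heatmap query rejected its
+			 * arguments).  Split unconditionally — DAMON access
+			 * pattern already identified this region as cold.
+			 */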
+			if (hot_pct < 0 ||
+			    (unsigned int)hot_pct < s->hot_threshold) {
+				if (!split_folio_to_order(folio, target_order))
+					applied += chunk_sz;
+			}
 
 			folio_unlock(folio);
 			folio_put(folio);
-- 
2.50.1 (Apple Git-155)