Add folios_mc_copy() which walks list of src and dst folios in lockstep, and copies folio content via folio_mc_copy(). folios_cnt parameter is unused here, but is part of the offload_copy callback signature used by later patches in the series. Signed-off-by: Shivank Garg --- include/linux/mm.h | 2 ++ mm/util.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5be3d8a8f806..e1ca4d6b7361 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1644,6 +1644,8 @@ void __folio_put(struct folio *folio); void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); int folio_mc_copy(struct folio *dst, struct folio *src); +int folios_mc_copy(struct list_head *dst_list, struct list_head *src_list, + unsigned int __always_unused folios_cnt); unsigned long nr_free_buffer_pages(void); diff --git a/mm/util.c b/mm/util.c index b05ab6f97e11..5bda599168f8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -749,6 +749,37 @@ int folio_mc_copy(struct folio *dst, struct folio *src) } EXPORT_SYMBOL(folio_mc_copy); +/** + * folios_mc_copy - Copy the contents of list of folios. + * @dst_list: destination folio list. + * @src_list: source folio list. + * @folios_cnt: unused here, present for callback signature compatibility. + * + * Walks list of src and dst folios in lockstep and copies folio + * content via folio_mc_copy(). The caller must ensure both lists have + * the same number of entries. This may sleep. + * + * Return: 0 on success, negative errno on failure. 
+ */ +int folios_mc_copy(struct list_head *dst_list, struct list_head *src_list, + unsigned int __always_unused folios_cnt) +{ + struct folio *src, *dst; + int ret; + + dst = list_first_entry(dst_list, struct folio, lru); + list_for_each_entry(src, src_list, lru) { + cond_resched(); + ret = folio_mc_copy(dst, src); + if (ret) + return ret; + dst = list_next_entry(dst, lru); + } + + return 0; +} +EXPORT_SYMBOL_GPL(folios_mc_copy); + int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; static int sysctl_overcommit_ratio __read_mostly = 50; static unsigned long sysctl_overcommit_kbytes __read_mostly; -- 2.43.0 Add a PAGE_ALREADY_COPIED flag to the dst->private migration state. When set, __migrate_folio() skips folio_mc_copy() and performs metadata-only migration. All callers currently pass already_copied=false. The batch-copy path enables it in a later patch. Move the dst->private state enum earlier in the file so __migrate_folio() and move_to_new_folio() can see PAGE_ALREADY_COPIED. Signed-off-by: Shivank Garg --- mm/migrate.c | 52 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 1bf2cf8c44dd..1d8c1fb627c9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -848,6 +848,18 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) } EXPORT_SYMBOL(folio_migrate_flags); +/* + * To record some information during migration, we use unused private + * field of struct folio of the newly allocated destination folio. + * This is safe because nobody is using it except us. 
+ */ +enum { + PAGE_WAS_MAPPED = BIT(0), + PAGE_WAS_MLOCKED = BIT(1), + PAGE_ALREADY_COPIED = BIT(2), + PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED | PAGE_ALREADY_COPIED, +}; + /************************************************************ * Migration functions ***********************************************************/ @@ -857,14 +869,20 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst, enum migrate_mode mode) { int rc, expected_count = folio_expected_ref_count(src) + 1; + bool already_copied = ((unsigned long)dst->private & PAGE_ALREADY_COPIED); + + if (already_copied) + dst->private = NULL; /* Check whether src does not have extra refs before we do more work */ if (folio_ref_count(src) != expected_count) return -EAGAIN; - rc = folio_mc_copy(dst, src); - if (unlikely(rc)) - return rc; + if (!already_copied) { + rc = folio_mc_copy(dst, src); + if (unlikely(rc)) + return rc; + } rc = __folio_migrate_mapping(mapping, dst, src, expected_count); if (rc) @@ -1088,7 +1106,7 @@ static int fallback_migrate_folio(struct address_space *mapping, * 0 - success */ static int move_to_new_folio(struct folio *dst, struct folio *src, - enum migrate_mode mode) + enum migrate_mode mode, bool already_copied) { struct address_space *mapping = folio_mapping(src); int rc = -EAGAIN; @@ -1096,6 +1114,9 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, VM_BUG_ON_FOLIO(!folio_test_locked(src), src); VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); + if (already_copied) + dst->private = (void *)(unsigned long)PAGE_ALREADY_COPIED; + if (!mapping) rc = migrate_folio(mapping, dst, src, mode); else if (mapping_inaccessible(mapping)) @@ -1127,17 +1148,6 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, return rc; } -/* - * To record some information during migration, we use unused private - * field of struct folio of the newly allocated destination folio. - * This is safe because nobody is using it except us. 
- */ -enum { - PAGE_WAS_MAPPED = BIT(0), - PAGE_WAS_MLOCKED = BIT(1), - PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED, -}; - static void __migrate_folio_record(struct folio *dst, int old_page_state, struct anon_vma *anon_vma) @@ -1353,7 +1363,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio *dst, enum migrate_mode mode, enum migrate_reason reason, - struct list_head *ret) + struct list_head *ret, bool already_copied) { int rc; int old_page_state = 0; @@ -1371,7 +1381,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, goto out_unlock_both; } - rc = move_to_new_folio(dst, src, mode); + rc = move_to_new_folio(dst, src, mode, already_copied); if (rc) goto out; @@ -1519,7 +1529,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, } if (!folio_mapped(src)) - rc = move_to_new_folio(dst, src, mode); + rc = move_to_new_folio(dst, src, mode, false); if (page_was_mapped) remove_migration_ptes(src, !rc ? 
dst : src, ttu); @@ -1703,7 +1713,7 @@ static void migrate_folios_move(struct list_head *src_folios, struct list_head *ret_folios, struct migrate_pages_stats *stats, int *retry, int *thp_retry, int *nr_failed, - int *nr_retry_pages) + int *nr_retry_pages, bool already_copied) { struct folio *folio, *folio2, *dst, *dst2; bool is_thp; @@ -1720,7 +1730,7 @@ static void migrate_folios_move(struct list_head *src_folios, rc = migrate_folio_move(put_new_folio, private, folio, dst, mode, - reason, ret_folios); + reason, ret_folios, already_copied); /* * The rules are: * 0: folio will be freed @@ -1977,7 +1987,7 @@ static int migrate_pages_batch(struct list_head *from, migrate_folios_move(&unmap_folios, &dst_folios, put_new_folio, private, mode, reason, ret_folios, stats, &retry, &thp_retry, - &nr_failed, &nr_retry_pages); + &nr_failed, &nr_retry_pages, false); } nr_failed += retry; stats->nr_thp_failed += thp_retry; -- 2.43.0 Split unmapped folios into batch-eligible (src_batch/dst_batch) and standard (src_std/dst_std) lists, gated by the migrate_offload_enabled static branch, which is off by default. So, when no offload driver is active, the branch is never taken and everything goes through the standard path. After the TLB flush, batch-copy the eligible folios via folios_mc_copy() and pass already_copied=true into migrate_folios_move() so __migrate_folio() skips the per-folio copy. On batch copy failure, the already_copied flag stays false and each folio falls back to an individual copy.
Signed-off-by: Shivank Garg --- mm/migrate.c | 55 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 1d8c1fb627c9..69daa16f9cf3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -43,6 +43,7 @@ #include #include #include +#include #include @@ -51,6 +52,8 @@ #include "internal.h" #include "swap.h" +DEFINE_STATIC_KEY_FALSE(migrate_offload_enabled); + static const struct movable_operations *offline_movable_ops; static const struct movable_operations *zsmalloc_movable_ops; @@ -1706,6 +1709,12 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, return nr_failed; } +/* movable_ops folios have their own migrate path */ +static bool folio_supports_batch_copy(struct folio *folio) +{ + return likely(!page_has_movable_ops(&folio->page)); +} + static void migrate_folios_move(struct list_head *src_folios, struct list_head *dst_folios, free_folio_t put_new_folio, unsigned long private, @@ -1805,8 +1814,12 @@ static int migrate_pages_batch(struct list_head *from, bool is_large = false; struct folio *folio, *folio2, *dst = NULL; int rc, rc_saved = 0, nr_pages; - LIST_HEAD(unmap_folios); - LIST_HEAD(dst_folios); + unsigned int nr_batch = 0; + bool batch_copied = false; + LIST_HEAD(src_batch); + LIST_HEAD(dst_batch); + LIST_HEAD(src_std); + LIST_HEAD(dst_std); bool nosplit = (reason == MR_NUMA_MISPLACED); VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC && @@ -1943,7 +1956,7 @@ static int migrate_pages_batch(struct list_head *from, /* nr_failed isn't updated for not used */ stats->nr_thp_failed += thp_retry; rc_saved = rc; - if (list_empty(&unmap_folios)) + if (list_empty(&src_batch) && list_empty(&src_std)) goto out; else goto move; @@ -1953,8 +1966,15 @@ static int migrate_pages_batch(struct list_head *from, nr_retry_pages += nr_pages; break; case 0: - list_move_tail(&folio->lru, &unmap_folios); - list_add_tail(&dst->lru, &dst_folios); + if 
(static_branch_unlikely(&migrate_offload_enabled) && + folio_supports_batch_copy(folio)) { + list_move_tail(&folio->lru, &src_batch); + list_add_tail(&dst->lru, &dst_batch); + nr_batch++; + } else { + list_move_tail(&folio->lru, &src_std); + list_add_tail(&dst->lru, &dst_std); + } break; default: /* @@ -1977,17 +1997,28 @@ static int migrate_pages_batch(struct list_head *from, /* Flush TLBs for all unmapped folios */ try_to_unmap_flush(); + /* Batch-copy eligible folios before the move phase */ + if (!list_empty(&src_batch)) { + rc = folios_mc_copy(&dst_batch, &src_batch, nr_batch); + batch_copied = (rc == 0); + } + retry = 1; for (pass = 0; pass < nr_pass && retry; pass++) { retry = 0; thp_retry = 0; nr_retry_pages = 0; - /* Move the unmapped folios */ - migrate_folios_move(&unmap_folios, &dst_folios, - put_new_folio, private, mode, reason, - ret_folios, stats, &retry, &thp_retry, - &nr_failed, &nr_retry_pages, false); + if (!list_empty(&src_batch)) + migrate_folios_move(&src_batch, &dst_batch, put_new_folio, + private, mode, reason, ret_folios, stats, + &retry, &thp_retry, &nr_failed, + &nr_retry_pages, batch_copied); + if (!list_empty(&src_std)) + migrate_folios_move(&src_std, &dst_std, put_new_folio, + private, mode, reason, ret_folios, stats, + &retry, &thp_retry, &nr_failed, + &nr_retry_pages, false); } nr_failed += retry; stats->nr_thp_failed += thp_retry; @@ -1996,7 +2027,9 @@ static int migrate_pages_batch(struct list_head *from, rc = rc_saved ? : nr_failed; out: /* Cleanup remaining folios */ - migrate_folios_undo(&unmap_folios, &dst_folios, + migrate_folios_undo(&src_batch, &dst_batch, + put_new_folio, private, ret_folios); + migrate_folios_undo(&src_std, &dst_std, put_new_folio, private, ret_folios); return rc; -- 2.43.0 Introduce CONFIG_MIGRATION_COPY_OFFLOAD, which lets offload driver (DMA, multi-threaded CPU copy, etc) take over the batch folio copy in migrate_pages_batch(). 
Offload drivers fill in a struct migrator with their offload_copy() and should_batch() implementations and call migrate_offload_start(), which patches the migrate_offload_copy() static_call and flips the migrate_offload_enabled static branch. The migrate_offload_stop() call reverts both. Only one migrator can be active at a time. A second registration returns -EBUSY, and only the active migrator can stop itself. The static_call dispatch is under SRCU so synchronize_srcu() in the stop path guarantees no in-flight copy before the module reference is dropped. Co-developed-by: Mike Day Signed-off-by: Mike Day Signed-off-by: Shivank Garg --- include/linux/migrate_copy_offload.h | 34 ++++++++++ mm/Kconfig | 9 +++ mm/Makefile | 1 + mm/migrate.c | 30 ++++++++- mm/migrate_copy_offload.c | 99 ++++++++++++++++++++++++++++ 5 files changed, 171 insertions(+), 2 deletions(-) create mode 100644 include/linux/migrate_copy_offload.h create mode 100644 mm/migrate_copy_offload.c diff --git a/include/linux/migrate_copy_offload.h b/include/linux/migrate_copy_offload.h new file mode 100644 index 000000000000..ee112826ebdf --- /dev/null +++ b/include/linux/migrate_copy_offload.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MIGRATE_COPY_OFFLOAD_H +#define _LINUX_MIGRATE_COPY_OFFLOAD_H + +#include +#include +#include + +struct list_head; +struct module; + +#define MIGRATOR_NAME_LEN 32 + +struct migrator { + char name[MIGRATOR_NAME_LEN]; + int (*offload_copy)(struct list_head *dst_list, + struct list_head *src_list, + unsigned int folio_cnt); + bool (*should_batch)(int reason); + struct module *owner; +}; + +#ifdef CONFIG_MIGRATION_COPY_OFFLOAD +extern struct static_key_false migrate_offload_enabled; +extern struct srcu_struct migrate_offload_srcu; +bool migrate_should_batch_default(int reason); +int migrate_offload_start(struct migrator *m); +int migrate_offload_stop(struct migrator *m); +#else +static inline int migrate_offload_start(struct migrator *m) { return 0; }
+static inline int migrate_offload_stop(struct migrator *m) { return 0; } +#endif /* CONFIG_MIGRATION_COPY_OFFLOAD */ + +#endif /* _LINUX_MIGRATE_COPY_OFFLOAD_H */ diff --git a/mm/Kconfig b/mm/Kconfig index ebd8ea353687..faf0cae9991b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -648,6 +648,15 @@ config MIGRATION config DEVICE_MIGRATION def_bool MIGRATION && ZONE_DEVICE +config MIGRATION_COPY_OFFLOAD + bool "Page migration copy offload" + depends on MIGRATION + help + Adds migration copy offload infrastructure which allows + offload engines (DMA, multi-threaded CPU copy, etc.) to + register as the batch-copy provider for page migration + via migrate_offload_start()/migrate_offload_stop(). + config ARCH_ENABLE_HUGEPAGE_MIGRATION bool diff --git a/mm/Makefile b/mm/Makefile index 8ad2ab08244e..db1ac8097089 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -96,6 +96,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_FAIL_PAGE_ALLOC) += fail_page_alloc.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o +obj-$(CONFIG_MIGRATION_COPY_OFFLOAD) += migrate_copy_offload.o obj-$(CONFIG_NUMA) += memory-tiers.o obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o diff --git a/mm/migrate.c b/mm/migrate.c index 69daa16f9cf3..acaaa9cc0d4f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -44,6 +44,8 @@ #include #include #include +#include +#include #include @@ -54,6 +56,17 @@ DEFINE_STATIC_KEY_FALSE(migrate_offload_enabled); +#ifdef CONFIG_MIGRATION_COPY_OFFLOAD +DEFINE_SRCU(migrate_offload_srcu); +DEFINE_STATIC_CALL(migrate_offload_copy, folios_mc_copy); + +bool migrate_should_batch_default(int reason) +{ + return false; +} +DEFINE_STATIC_CALL(migrate_should_batch, migrate_should_batch_default); +#endif + static const struct movable_operations *offline_movable_ops; static const struct movable_operations *zsmalloc_movable_ops; @@ -1820,11 +1833,18 @@ static int migrate_pages_batch(struct list_head
*from, LIST_HEAD(dst_batch); LIST_HEAD(src_std); LIST_HEAD(dst_std); + bool do_batch = false; bool nosplit = (reason == MR_NUMA_MISPLACED); VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC && !list_empty(from) && !list_is_singular(from)); +#ifdef CONFIG_MIGRATION_COPY_OFFLOAD + /* Check if the offload driver wants to batch for this reason */ + if (static_branch_unlikely(&migrate_offload_enabled)) + do_batch = static_call(migrate_should_batch)(reason); +#endif + for (pass = 0; pass < nr_pass && retry; pass++) { retry = 0; thp_retry = 0; @@ -1967,7 +1987,7 @@ static int migrate_pages_batch(struct list_head *from, break; case 0: if (static_branch_unlikely(&migrate_offload_enabled) && - folio_supports_batch_copy(folio)) { + do_batch && folio_supports_batch_copy(folio)) { list_move_tail(&folio->lru, &src_batch); list_add_tail(&dst->lru, &dst_batch); nr_batch++; @@ -1997,11 +2017,17 @@ static int migrate_pages_batch(struct list_head *from, /* Flush TLBs for all unmapped folios */ try_to_unmap_flush(); +#ifdef CONFIG_MIGRATION_COPY_OFFLOAD /* Batch-copy eligible folios before the move phase */ if (!list_empty(&src_batch)) { - rc = folios_mc_copy(&dst_batch, &src_batch, nr_batch); + int idx = srcu_read_lock(&migrate_offload_srcu); + + rc = static_call(migrate_offload_copy)(&dst_batch, + &src_batch, nr_batch); + srcu_read_unlock(&migrate_offload_srcu, idx); batch_copied = (rc == 0); } +#endif retry = 1; for (pass = 0; pass < nr_pass && retry; pass++) { diff --git a/mm/migrate_copy_offload.c b/mm/migrate_copy_offload.c new file mode 100644 index 000000000000..c22068fe09a0 --- /dev/null +++ b/mm/migrate_copy_offload.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(migrator_mutex); +static struct migrator *active_migrator; + +DECLARE_STATIC_CALL(migrate_offload_copy, folios_mc_copy); +DECLARE_STATIC_CALL(migrate_should_batch, migrate_should_batch_default); + +/** + * migrate_offload_start - register 
a batch-copy provider for page migration. + * @m: migrator to install. + * + * Only one provider can be active at a time, returns -EBUSY if another migrator + * is already registered. + * + * Return: 0 on success, negative errno on failure. + */ +int migrate_offload_start(struct migrator *m) +{ + int ret = 0; + + if (!m || !m->offload_copy) + return -EINVAL; + + mutex_lock(&migrator_mutex); + if (active_migrator) { + ret = -EBUSY; + goto unlock; + } + + if (m->owner && !try_module_get(m->owner)) { + ret = -ENODEV; + goto unlock; + } + + static_call_update(migrate_offload_copy, m->offload_copy); + static_call_update(migrate_should_batch, + m->should_batch ? m->should_batch : migrate_should_batch_default); + active_migrator = m; + static_branch_enable(&migrate_offload_enabled); + +unlock: + mutex_unlock(&migrator_mutex); + + if (ret) + pr_err("migrate_offload: %s: failed to register (%d)\n", + m->name, ret); + else + pr_info("migrate_offload: enabled by %s\n", m->name); + return ret; +} +EXPORT_SYMBOL_GPL(migrate_offload_start); + +/** + * migrate_offload_stop - unregister the active batch-copy provider. + * @m: migrator to remove (must be the currently active one). + * + * Reverts static_call targets and waits for SRCU grace period so that + * no in-flight migration is still calling the driver functions before + * releasing the module. + * + * Return: 0 on success, negative errno on failure. + */ +int migrate_offload_stop(struct migrator *m) +{ + struct module *owner; + + mutex_lock(&migrator_mutex); + if (active_migrator != m) { + mutex_unlock(&migrator_mutex); + return -EINVAL; + } + + /* + * Disable the static branch first so new migrate_pages_batch calls + * won't enter the batch copy path. 
+ */ + static_branch_disable(&migrate_offload_enabled); + static_call_update(migrate_offload_copy, folios_mc_copy); + static_call_update(migrate_should_batch, migrate_should_batch_default); + owner = active_migrator->owner; + active_migrator = NULL; + mutex_unlock(&migrator_mutex); + + /* Wait for all in-flight callers to finish before module_put(). */ + synchronize_srcu(&migrate_offload_srcu); + if (owner) + module_put(owner); + + pr_info("migrate_offload: disabled by %s\n", m->name); + return 0; +} +EXPORT_SYMBOL_GPL(migrate_offload_stop); -- 2.43.0 Simple DMAEngine based driver that uses memcpy channels to batch-copy folios during page migration. Primarily for testing the copy offload infrastructure. When DMA fails the callback returns an error and the migration path falls back to per-folio CPU copy. Sysfs interface under /sys/kernel/dcbm/: offloading - enable/disable DMA offload nr_dma_chan - max number of DMA channels to use folios_migrated - folios copied via DMA folios_failures - fallback count Signed-off-by: Shivank Garg --- drivers/Kconfig | 2 + drivers/Makefile | 2 + drivers/migrate_offload/Kconfig | 8 + drivers/migrate_offload/Makefile | 1 + drivers/migrate_offload/dcbm/Makefile | 1 + drivers/migrate_offload/dcbm/dcbm.c | 457 ++++++++++++++++++++++++++ 6 files changed, 471 insertions(+) create mode 100644 drivers/migrate_offload/Kconfig create mode 100644 drivers/migrate_offload/Makefile create mode 100644 drivers/migrate_offload/dcbm/Makefile create mode 100644 drivers/migrate_offload/dcbm/dcbm.c diff --git a/drivers/Kconfig b/drivers/Kconfig index c0f1fb893ec0..3dbea1380603 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -255,4 +255,6 @@ source "drivers/cdx/Kconfig" source "drivers/resctrl/Kconfig" +source "drivers/migrate_offload/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 53fbd2e0acdd..f55bddf490cc 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -42,6 +42,8 @@ obj-y += clk/ # really early. 
obj-$(CONFIG_DMADEVICES) += dma/ +obj-$(CONFIG_MIGRATION_COPY_OFFLOAD) += migrate_offload/ + # SOC specific infrastructure drivers. obj-y += soc/ obj-$(CONFIG_PM_GENERIC_DOMAINS) += pmdomain/ diff --git a/drivers/migrate_offload/Kconfig b/drivers/migrate_offload/Kconfig new file mode 100644 index 000000000000..0bbaedbae4ad --- /dev/null +++ b/drivers/migrate_offload/Kconfig @@ -0,0 +1,8 @@ +config DCBM_DMA + bool "DMA Core Batch Migrator" + depends on MIGRATION_COPY_OFFLOAD && DMA_ENGINE + help + DMA-based batch copy engine for page migration. Uses + DMAEngine memcpy channels to offload folio data copies + during migration. Primarily intended for testing the copy + offload infrastructure. diff --git a/drivers/migrate_offload/Makefile b/drivers/migrate_offload/Makefile new file mode 100644 index 000000000000..9e16018beb15 --- /dev/null +++ b/drivers/migrate_offload/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_DCBM_DMA) += dcbm/ diff --git a/drivers/migrate_offload/dcbm/Makefile b/drivers/migrate_offload/dcbm/Makefile new file mode 100644 index 000000000000..56ba47cce0f1 --- /dev/null +++ b/drivers/migrate_offload/dcbm/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_DCBM_DMA) += dcbm.o diff --git a/drivers/migrate_offload/dcbm/dcbm.c b/drivers/migrate_offload/dcbm/dcbm.c new file mode 100644 index 000000000000..89751d03101e --- /dev/null +++ b/drivers/migrate_offload/dcbm/dcbm.c @@ -0,0 +1,457 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * DMA Core Batch Migrator (DCBM) + * + * Uses DMAEngine memcpy channels to offload batch folio copies during + * page migration. Reference driver meant for testing the offload + * infrastructure. + * + * Copyright (C) 2024-26 Advanced Micro Devices, Inc. 
+ */ + +#include +#include +#include +#include +#include + +#define MAX_DMA_CHANNELS 16 + +static unsigned long long folios_migrated; +static unsigned long long folios_failures; + +static bool offloading_enabled; +static unsigned int nr_dma_channels = 1; +static DEFINE_MUTEX(dcbm_mutex); + +struct dma_work { + struct dma_chan *chan; + struct completion done; + atomic_t pending; + struct sg_table *src_sgt; + struct sg_table *dst_sgt; + bool mapped; +}; + +static void dma_completion_callback(void *data) +{ + struct dma_work *work = data; + + if (atomic_dec_and_test(&work->pending)) + complete(&work->done); +} + +static int setup_sg_tables(struct dma_work *work, struct list_head **src_pos, + struct list_head **dst_pos, int nr) +{ + struct scatterlist *sg_src, *sg_dst; + struct device *dev; + int i, ret; + + work->src_sgt = kmalloc_obj(*work->src_sgt, GFP_KERNEL); + if (!work->src_sgt) + return -ENOMEM; + work->dst_sgt = kmalloc_obj(*work->dst_sgt, GFP_KERNEL); + if (!work->dst_sgt) + goto err_free_src; + + ret = sg_alloc_table(work->src_sgt, nr, GFP_KERNEL); + if (ret) + goto err_free_dst; + ret = sg_alloc_table(work->dst_sgt, nr, GFP_KERNEL); + if (ret) + goto err_free_src_table; + + sg_src = work->src_sgt->sgl; + sg_dst = work->dst_sgt->sgl; + for (i = 0; i < nr; i++) { + struct folio *src = list_entry(*src_pos, struct folio, lru); + struct folio *dst = list_entry(*dst_pos, struct folio, lru); + + sg_set_folio(sg_src, src, folio_size(src), 0); + sg_set_folio(sg_dst, dst, folio_size(dst), 0); + + *src_pos = (*src_pos)->next; + *dst_pos = (*dst_pos)->next; + + if (i < nr - 1) { + sg_src = sg_next(sg_src); + sg_dst = sg_next(sg_dst); + } + } + + dev = dmaengine_get_dma_device(work->chan); + if (!dev) { + ret = -ENODEV; + goto err_free_dst_table; + } + ret = dma_map_sgtable(dev, work->src_sgt, DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING); + if (ret) + goto err_free_dst_table; + ret = dma_map_sgtable(dev, work->dst_sgt, DMA_FROM_DEVICE, + 
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING); + if (ret) + goto err_unmap_src; + + if (work->src_sgt->nents != work->dst_sgt->nents) { + ret = -EINVAL; + goto err_unmap_dst; + } + work->mapped = true; + return 0; + +err_unmap_dst: + dma_unmap_sgtable(dev, work->dst_sgt, DMA_FROM_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING); +err_unmap_src: + dma_unmap_sgtable(dev, work->src_sgt, DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING); +err_free_dst_table: + sg_free_table(work->dst_sgt); +err_free_src_table: + sg_free_table(work->src_sgt); +err_free_dst: + kfree(work->dst_sgt); + work->dst_sgt = NULL; +err_free_src: + kfree(work->src_sgt); + work->src_sgt = NULL; + return ret; +} + +static void cleanup_dma_work(struct dma_work *works, int actual_channels) +{ + struct device *dev; + int i; + + if (!works) + return; + + for (i = 0; i < actual_channels; i++) { + if (!works[i].chan) + continue; + + dev = dmaengine_get_dma_device(works[i].chan); + + if (works[i].mapped) + dmaengine_terminate_sync(works[i].chan); + + if (dev && works[i].mapped) { + if (works[i].src_sgt) { + dma_unmap_sgtable(dev, works[i].src_sgt, + DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_NO_KERNEL_MAPPING); + sg_free_table(works[i].src_sgt); + kfree(works[i].src_sgt); + } + if (works[i].dst_sgt) { + dma_unmap_sgtable(dev, works[i].dst_sgt, + DMA_FROM_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_NO_KERNEL_MAPPING); + sg_free_table(works[i].dst_sgt); + kfree(works[i].dst_sgt); + } + } + dma_release_channel(works[i].chan); + } + kfree(works); +} + +static int submit_dma_transfers(struct dma_work *work) +{ + struct scatterlist *sg_src, *sg_dst; + struct dma_async_tx_descriptor *tx; + unsigned long flags = DMA_CTRL_ACK; + dma_cookie_t cookie; + int i; + + atomic_set(&work->pending, 1); + + sg_src = work->src_sgt->sgl; + sg_dst = work->dst_sgt->sgl; + for_each_sgtable_dma_sg(work->src_sgt, sg_src, i) { + if (i == work->src_sgt->nents - 1) + flags |= 
DMA_PREP_INTERRUPT; + + tx = dmaengine_prep_dma_memcpy(work->chan, + sg_dma_address(sg_dst), + sg_dma_address(sg_src), + sg_dma_len(sg_src), flags); + if (!tx) { + atomic_set(&work->pending, 0); + return -EIO; + } + + if (i == work->src_sgt->nents - 1) { + tx->callback = dma_completion_callback; + tx->callback_param = work; + } + + cookie = dmaengine_submit(tx); + if (dma_submit_error(cookie)) { + atomic_set(&work->pending, 0); + return -EIO; + } + sg_dst = sg_next(sg_dst); + } + return 0; +} + +/** + * folios_copy_dma - copy a batch of folios via DMA memcpy + * @dst_list: destination folio list + * @src_list: source folio list + * @nr_folios: number of folios in each list + * + * Return: 0 on success, negative errno on failure. + */ +static int folios_copy_dma(struct list_head *dst_list, + struct list_head *src_list, unsigned int nr_folios) +{ + struct dma_work *works; + struct list_head *src_pos = src_list->next; + struct list_head *dst_pos = dst_list->next; + int i, folios_per_chan, ret; + dma_cap_mask_t mask; + int actual_channels = 0; + unsigned int max_channels; + + max_channels = min3(nr_dma_channels, nr_folios, + (unsigned int)MAX_DMA_CHANNELS); + + works = kcalloc(max_channels, sizeof(*works), GFP_KERNEL); + if (!works) + return -ENOMEM; + + dma_cap_zero(mask); + dma_cap_set(DMA_MEMCPY, mask); + + for (i = 0; i < max_channels; i++) { + works[actual_channels].chan = dma_request_chan_by_mask(&mask); + if (IS_ERR(works[actual_channels].chan)) + break; + init_completion(&works[actual_channels].done); + actual_channels++; + } + + if (actual_channels == 0) { + kfree(works); + return -ENODEV; + } + + for (i = 0; i < actual_channels; i++) { + folios_per_chan = nr_folios * (i + 1) / actual_channels - + (nr_folios * i) / actual_channels; + if (folios_per_chan == 0) + continue; + + ret = setup_sg_tables(&works[i], &src_pos, &dst_pos, + folios_per_chan); + if (ret) + goto err_cleanup; + } + + for (i = 0; i < actual_channels; i++) { + ret = 
submit_dma_transfers(&works[i]); + if (ret) + goto err_cleanup; + } + + for (i = 0; i < actual_channels; i++) { + if (atomic_read(&works[i].pending) > 0) + dma_async_issue_pending(works[i].chan); + } + + for (i = 0; i < actual_channels; i++) { + if (atomic_read(&works[i].pending) == 0) + continue; + if (!wait_for_completion_timeout(&works[i].done, + msecs_to_jiffies(10000))) { + ret = -ETIMEDOUT; + goto err_cleanup; + } + } + + cleanup_dma_work(works, actual_channels); + + mutex_lock(&dcbm_mutex); + folios_migrated += nr_folios; + mutex_unlock(&dcbm_mutex); + return 0; + +err_cleanup: + pr_warn_ratelimited("dcbm: DMA copy failed (%d), falling back to CPU\n", + ret); + cleanup_dma_work(works, actual_channels); + + mutex_lock(&dcbm_mutex); + folios_failures += nr_folios; + mutex_unlock(&dcbm_mutex); + return ret; +} + +/* TODO: tune based on usecase */ +static bool dma_should_batch(int reason) +{ + if (reason == MR_SYSCALL || reason == MR_COMPACTION || reason == MR_DEMOTION || + reason == MR_NUMA_MISPLACED) + return true; + return false; +} + +static struct migrator dma_migrator = { + .name = "DCBM", + .offload_copy = folios_copy_dma, + .should_batch = dma_should_batch, + .owner = THIS_MODULE, +}; + +static ssize_t offloading_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", offloading_enabled); +} + +static ssize_t offloading_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + bool enable; + int ret; + + ret = kstrtobool(buf, &enable); + if (ret) + return ret; + + mutex_lock(&dcbm_mutex); + + if (enable == offloading_enabled) + goto out; + + if (enable) { + ret = migrate_offload_start(&dma_migrator); + if (ret) { + mutex_unlock(&dcbm_mutex); + return ret; + } + offloading_enabled = true; + } else { + migrate_offload_stop(&dma_migrator); + offloading_enabled = false; + } +out: + mutex_unlock(&dcbm_mutex); + return count; +} + +static ssize_t folios_migrated_show(struct 
kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%llu\n", folios_migrated); +} + +static ssize_t folios_migrated_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + mutex_lock(&dcbm_mutex); + folios_migrated = 0; + mutex_unlock(&dcbm_mutex); + return count; +} + +static ssize_t folios_failures_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%llu\n", folios_failures); +} + +static ssize_t folios_failures_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + mutex_lock(&dcbm_mutex); + folios_failures = 0; + mutex_unlock(&dcbm_mutex); + return count; +} + +static ssize_t nr_dma_chan_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%u\n", nr_dma_channels); +} + +static ssize_t nr_dma_chan_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + unsigned int val; + int ret; + + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + + if (val < 1 || val > MAX_DMA_CHANNELS) + return -EINVAL; + + mutex_lock(&dcbm_mutex); + nr_dma_channels = val; + mutex_unlock(&dcbm_mutex); + return count; +} + +static struct kobj_attribute offloading_attr = __ATTR_RW(offloading); +static struct kobj_attribute nr_dma_chan_attr = __ATTR_RW(nr_dma_chan); +static struct kobj_attribute folios_migrated_attr = __ATTR_RW(folios_migrated); +static struct kobj_attribute folios_failures_attr = __ATTR_RW(folios_failures); + +static struct attribute *dcbm_attrs[] = { + &offloading_attr.attr, + &nr_dma_chan_attr.attr, + &folios_migrated_attr.attr, + &folios_failures_attr.attr, + NULL +}; +ATTRIBUTE_GROUPS(dcbm); + +static struct kobject *dcbm_kobj; + +static int __init dcbm_init(void) +{ + int ret; + + dcbm_kobj = kobject_create_and_add("dcbm", kernel_kobj); + if (!dcbm_kobj) + return -ENOMEM; + + ret = sysfs_create_groups(dcbm_kobj, 
dcbm_groups); + if (ret) { + kobject_put(dcbm_kobj); + return ret; + } + + pr_info("dcbm: DMA Core Batch Migrator initialized\n"); + return 0; +} + +static void __exit dcbm_exit(void) +{ + mutex_lock(&dcbm_mutex); + if (offloading_enabled) { + migrate_offload_stop(&dma_migrator); + offloading_enabled = false; + } + mutex_unlock(&dcbm_mutex); + + sysfs_remove_groups(dcbm_kobj, dcbm_groups); + kobject_put(dcbm_kobj); + pr_info("dcbm: DMA Core Batch Migrator unloaded\n"); +} + +module_init(dcbm_init); +module_exit(dcbm_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Shivank Garg"); +MODULE_DESCRIPTION("DMA Core Batch Migrator"); -- 2.43.0 From: Zi Yan Change NR_MAX_BATCHED_MIGRATION to HPAGE_PUD_NR to allow batching THP copies. These are for testing purpose only. Signed-off-by: Zi Yan Signed-off-by: Shivank Garg --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index acaaa9cc0d4f..8540e303190b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1606,7 +1606,7 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR +#define NR_MAX_BATCHED_MIGRATION HPAGE_PUD_NR #else #define NR_MAX_BATCHED_MIGRATION 512 #endif -- 2.43.0