Consolidate the duplicated stable_node allocation and initialization
code in stable_tree_insert() into a new helper function,
alloc_init_stable_node_dup(). Also refactor write_protect_page() and
replace_page() to expose address-based variants (_addr suffix); the
wrappers keep the existing behavior by calculating the address first
and then calling the _addr variant.

This refactoring prepares for the upcoming memory error recovery
feature, which will need to:

1) Allocate and initialize stable_node duplicates
2) Operate on specific addresses without recalculating them

No functional changes.
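
For illustration, a caller that already knows the address of the page
in a given VMA (for example from a ksm_rmap_item) is expected to call
the _addr variants directly. The helper below is only a sketch, not
part of this patch; it assumes the caller already holds mmap_read_lock
and the folio lock, as the existing callers do:

	/* Hypothetical caller -- illustrative only. */
	static int remap_known_address(struct vm_area_struct *vma,
				       struct folio *folio,
				       struct page *kpage,
				       unsigned long addr)
	{
		pte_t orig_pte = __pte(0);
		int err;

		/* Write-protect the old mapping at the known address. */
		err = write_protect_page_addr(vma, folio, addr, &orig_pte);
		if (err)
			return err;

		/* Repoint the PTE at kpage without recomputing the address. */
		return replace_page_addr(vma, &folio->page, kpage, addr, orig_pte);
	}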

Signed-off-by: Longlong Xia
---
 mm/ksm.c | 89 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 63 insertions(+), 26 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 160787bb121c..13ec057667af 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1247,11 +1247,11 @@ static u32 calc_checksum(struct page *page)
 	return checksum;
 }
 
-static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
-			      pte_t *orig_pte)
+static int write_protect_page_addr(struct vm_area_struct *vma, struct folio *folio,
+				   unsigned long address, pte_t *orig_pte)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0);
+	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
 	int swapped;
 	int err = -EFAULT;
 	struct mmu_notifier_range range;
@@ -1261,10 +1261,10 @@ static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
 	if (WARN_ON_ONCE(folio_test_large(folio)))
 		return err;
 
-	pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma);
-	if (pvmw.address == -EFAULT)
-		goto out;
+	if (address < vma->vm_start || address >= vma->vm_end)
+		return err;
 
+	pvmw.address = address;
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address,
 				pvmw.address + PAGE_SIZE);
 	mmu_notifier_invalidate_range_start(&range);
@@ -1334,21 +1334,26 @@ static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
 		page_vma_mapped_walk_done(&pvmw);
 out_mn:
 	mmu_notifier_invalidate_range_end(&range);
-out:
 	return err;
 }
 
-/**
- * replace_page - replace page in vma by new ksm page
- * @vma: vma that holds the pte pointing to page
- * @page: the page we are replacing by kpage
- * @kpage: the ksm page we replace page by
- * @orig_pte: the original value of the pte
- *
- * Returns 0 on success, -EFAULT on failure.
- */
-static int replace_page(struct vm_area_struct *vma, struct page *page,
-			struct page *kpage, pte_t orig_pte)
+static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
+			      pte_t *orig_pte)
+{
+	unsigned long address;
+
+	if (WARN_ON_ONCE(folio_test_large(folio)))
+		return -EFAULT;
+
+	address = page_address_in_vma(folio, folio_page(folio, 0), vma);
+	if (address == -EFAULT)
+		return -EFAULT;
+
+	return write_protect_page_addr(vma, folio, address, orig_pte);
+}
+
+static int replace_page_addr(struct vm_area_struct *vma, struct page *page,
+			     struct page *kpage, unsigned long addr, pte_t orig_pte)
 {
 	struct folio *kfolio = page_folio(kpage);
 	struct mm_struct *mm = vma->vm_mm;
@@ -1358,17 +1363,16 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	pte_t *ptep;
 	pte_t newpte;
 	spinlock_t *ptl;
-	unsigned long addr;
 	int err = -EFAULT;
 	struct mmu_notifier_range range;
 
-	addr = page_address_in_vma(folio, page, vma);
-	if (addr == -EFAULT)
+	if (addr < vma->vm_start || addr >= vma->vm_end)
 		goto out;
 
 	pmd = mm_find_pmd(mm, addr);
 	if (!pmd)
 		goto out;
+
 	/*
 	 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
 	 * without holding anon_vma lock for write. So when looking for a
@@ -1441,6 +1445,29 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	return err;
 }
 
+
+/**
+ * replace_page - replace page in vma by new ksm page
+ * @vma: vma that holds the pte pointing to page
+ * @page: the page we are replacing by kpage
+ * @kpage: the ksm page we replace page by
+ * @orig_pte: the original value of the pte
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int replace_page(struct vm_area_struct *vma, struct page *page,
+			struct page *kpage, pte_t orig_pte)
+{
+	unsigned long addr;
+	struct folio *folio = page_folio(page);
+
+	addr = page_address_in_vma(folio, page, vma);
+	if (addr == -EFAULT)
+		return -EFAULT;
+
+	return replace_page_addr(vma, page, kpage, addr, orig_pte);
+}
+
 /*
  * try_to_merge_one_page - take two pages and merge them into one
  * @vma: the vma that holds the pte pointing to page
@@ -2007,6 +2034,20 @@ static struct folio *stable_tree_search(struct page *page)
 	goto out;
 }
 
+static struct ksm_stable_node *alloc_init_stable_node_dup(unsigned long kpfn,
+							   int nid __maybe_unused)
+{
+	struct ksm_stable_node *stable_node = alloc_stable_node();
+
+	if (stable_node) {
+		INIT_HLIST_HEAD(&stable_node->hlist);
+		stable_node->kpfn = kpfn;
+		stable_node->rmap_hlist_len = 0;
+		DO_NUMA(stable_node->nid = nid);
+	}
+	return stable_node;
+}
+
 /*
  * stable_tree_insert - insert stable tree node pointing to new ksm page
  * into the stable tree.
@@ -2065,14 +2106,10 @@ static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio)
 		}
 	}
 
-	stable_node_dup = alloc_stable_node();
+	stable_node_dup = alloc_init_stable_node_dup(kpfn, nid);
 	if (!stable_node_dup)
 		return NULL;
 
-	INIT_HLIST_HEAD(&stable_node_dup->hlist);
-	stable_node_dup->kpfn = kpfn;
-	stable_node_dup->rmap_hlist_len = 0;
-	DO_NUMA(stable_node_dup->nid = nid);
 	if (!need_chain) {
 		rb_link_node(&stable_node_dup->node, parent, new);
 		rb_insert_color(&stable_node_dup->node, root);
-- 
2.43.0

When a hardware memory error occurs on a KSM page, the current behavior
is to kill all processes mapping that page. This can be overly
aggressive when KSM has multiple duplicate pages in a chain where other
duplicates are still healthy.

Introduce a recovery mechanism that attempts to migrate mappings from
the failing KSM page to a newly allocated KSM page, or to another
healthy duplicate already present in the same chain, before falling
back to the process-killing procedure.

The recovery process works as follows:

1. Identify whether the failing KSM page belongs to a stable node chain.
2. Locate a healthy duplicate KSM page within the same chain.
3. Choose a migration target:
   a. Try to allocate a new KSM page copied from the healthy duplicate.
      If this succeeds, migrate each mapping of the failing page to the
      new KSM page.
   b. If the allocation fails, migrate each mapping to the existing
      healthy duplicate instead.
4. If all migrations succeed, remove the failing KSM page from the chain.
5. Only if recovery fails (e.g., no healthy duplicate is found or a
   migration error occurs) does the kernel fall back to killing the
   affected processes.
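
In code terms, the flow maps onto the new helpers roughly as follows
(an illustrative outline only: ksm_recovery_outline() is not part of
this patch, and locking, refcounting and error handling are omitted;
see ksm_recover_within_chain() in the diff for the real implementation):

	static bool ksm_recovery_outline(struct ksm_stable_node *failing_node)
	{
		struct folio *failing_folio, *healthy_folio, *target_folio;
		struct ksm_stable_node *chain_head, *healthy_dup, *target_dup;

		failing_folio = ksm_get_folio(failing_node, KSM_GET_FOLIO_NOLOCK);
		chain_head = find_chain_head(failing_node);		/* step 1 */
		healthy_folio = find_healthy_folio(chain_head, failing_node,
						   &healthy_dup);	/* step 2 */
		target_folio = create_new_stable_node_dup(chain_head, healthy_folio,
							  &target_dup);	/* step 3a */
		if (!target_folio) {					/* step 3b */
			target_folio = healthy_folio;
			target_dup = healthy_dup;
		}
		migrate_to_target_dup(failing_node, failing_folio,
				      target_folio, target_dup);
		/* step 4 succeeds when no mapping is left on the failing page */
		return failing_node->rmap_hlist_len == 0;
	}

A false return leaves step 5 to collect_procs_ksm(), which proceeds
with the existing process-killing path.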

Signed-off-by: Longlong Xia
---
 mm/ksm.c | 215 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 215 insertions(+)

diff --git a/mm/ksm.c b/mm/ksm.c
index 13ec057667af..159b486b11f1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3121,6 +3121,215 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
 }
 
 #ifdef CONFIG_MEMORY_FAILURE
+
+static struct rb_node *find_stable_node_in_tree(struct ksm_stable_node *dup_node,
+						const struct rb_root *root)
+{
+	struct rb_node *node;
+	struct ksm_stable_node *stable_node, *dup;
+
+	for (node = rb_first(root); node; node = rb_next(node)) {
+		stable_node = rb_entry(node, struct ksm_stable_node, node);
+		if (!is_stable_node_chain(stable_node))
+			continue;
+		hlist_for_each_entry(dup, &stable_node->hlist, hlist_dup) {
+			if (dup == dup_node)
+				return node;
+		}
+		cond_resched();
+	}
+	return NULL;
+}
+
+static struct ksm_stable_node *find_chain_head(struct ksm_stable_node *dup_node)
+{
+	struct rb_node *node;
+	int nid;
+
+	if (!is_stable_node_dup(dup_node))
+		return NULL;
+
+	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+		node = find_stable_node_in_tree(dup_node, root_stable_tree + nid);
+		if (node)
+			return rb_entry(node, struct ksm_stable_node, node);
+	}
+
+	return NULL;
+}
+
+static struct folio *find_healthy_folio(struct ksm_stable_node *chain_head,
+					struct ksm_stable_node *failing_node,
+					struct ksm_stable_node **healthy_stable_node)
+{
+	struct ksm_stable_node *dup;
+	struct hlist_node *hlist_safe;
+	struct folio *healthy_folio;
+
+	if (!is_stable_node_chain(chain_head) ||
+	    !is_stable_node_dup(failing_node))
+		return NULL;
+
+	hlist_for_each_entry_safe(dup, hlist_safe, &chain_head->hlist,
+				  hlist_dup) {
+		if (dup == failing_node)
+			continue;
+
+		healthy_folio = ksm_get_folio(dup, KSM_GET_FOLIO_TRYLOCK);
+		if (healthy_folio) {
+			*healthy_stable_node = dup;
+			return healthy_folio;
+		}
+	}
+
+	return NULL;
+}
+
+static struct folio *create_new_stable_node_dup(struct ksm_stable_node *chain_head,
+						struct folio *healthy_folio,
+						struct ksm_stable_node **new_stable_node)
+{
+	struct folio *new_folio;
+	struct page *new_page;
+	unsigned long kpfn;
+	int nid;
+
+	if (!is_stable_node_chain(chain_head))
+		return NULL;
+
+	new_page = alloc_page(GFP_HIGHUSER_MOVABLE);
+	if (!new_page)
+		return NULL;
+
+	new_folio = page_folio(new_page);
+	copy_highpage(new_page, folio_page(healthy_folio, 0));
+
+	kpfn = folio_pfn(new_folio);
+	nid = get_kpfn_nid(kpfn);
+	*new_stable_node = alloc_init_stable_node_dup(kpfn, nid);
+	if (!*new_stable_node) {
+		folio_put(new_folio);
+		return NULL;
+	}
+
+	stable_node_chain_add_dup(*new_stable_node, chain_head);
+	folio_set_stable_node(new_folio, *new_stable_node);
+
+	/* Lock the folio before adding to LRU, consistent with ksm_get_folio */
+	folio_lock(new_folio);
+	folio_add_lru(new_folio);
+
+	return new_folio;
+}
+
+static void migrate_to_target_dup(struct ksm_stable_node *failing_node,
+				  struct folio *failing_folio,
+				  struct folio *target_folio,
+				  struct ksm_stable_node *target_dup)
+{
+	struct ksm_rmap_item *rmap_item;
+	struct hlist_node *hlist_safe;
+	struct page *target_page = folio_page(target_folio, 0);
+	int err;
+
+	hlist_for_each_entry_safe(rmap_item, hlist_safe, &failing_node->hlist, hlist) {
+		struct mm_struct *mm = rmap_item->mm;
+		const unsigned long addr = rmap_item->address & PAGE_MASK;
+		struct vm_area_struct *vma;
+		pte_t orig_pte = __pte(0);
+
+		guard(mmap_read_lock)(mm);
+
+		vma = find_mergeable_vma(mm, addr);
+		if (!vma)
+			continue;
+
+		folio_lock(failing_folio);
+
+		err = write_protect_page_addr(vma, failing_folio, addr, &orig_pte);
+		if (err) {
+			folio_unlock(failing_folio);
+			continue;
+		}
+
+		err = replace_page_addr(vma, &failing_folio->page, target_page, addr, orig_pte);
+		if (!err) {
+			hlist_del(&rmap_item->hlist);
+			rmap_item->head = target_dup;
+			DO_NUMA(rmap_item->nid = target_dup->nid);
+			hlist_add_head(&rmap_item->hlist, &target_dup->hlist);
+			target_dup->rmap_hlist_len++;
+			failing_node->rmap_hlist_len--;
+		}
+		folio_unlock(failing_folio);
+	}
+}
+
+static bool ksm_recover_within_chain(struct ksm_stable_node *failing_node)
+{
+	struct folio *failing_folio, *healthy_folio, *target_folio;
+	struct ksm_stable_node *healthy_stable_node, *chain_head, *target_dup;
+	struct folio *new_folio = NULL;
+	struct ksm_stable_node *new_stable_node = NULL;
+
+	if (!is_stable_node_dup(failing_node))
+		return false;
+
+	guard(mutex)(&ksm_thread_mutex);
+
+	failing_folio = ksm_get_folio(failing_node, KSM_GET_FOLIO_NOLOCK);
+	if (!failing_folio)
+		return false;
+
+	chain_head = find_chain_head(failing_node);
+	if (!chain_head) {
+		folio_put(failing_folio);
+		return false;
+	}
+
+	healthy_folio = find_healthy_folio(chain_head, failing_node, &healthy_stable_node);
+	if (!healthy_folio) {
+		folio_put(failing_folio);
+		return false;
+	}
+
+	new_folio = create_new_stable_node_dup(chain_head, healthy_folio, &new_stable_node);
+
+	if (new_folio && new_stable_node) {
+		target_folio = new_folio;
+		target_dup = new_stable_node;
+
+		/* Release healthy_folio since we're using new_folio */
+		folio_unlock(healthy_folio);
+		folio_put(healthy_folio);
+	} else {
+		target_folio = healthy_folio;
+		target_dup = healthy_stable_node;
+	}
+
+	/*
+	 * failing_folio was locked in memory_failure(). Unlock it before
+	 * acquiring mmap_read_lock to avoid lock inversion deadlock.
+	 */
+	folio_unlock(failing_folio);
+	migrate_to_target_dup(failing_node, failing_folio, target_folio, target_dup);
+	folio_lock(failing_folio);
+
+	folio_unlock(target_folio);
+	folio_put(target_folio);
+
+	if (failing_node->rmap_hlist_len == 0) {
+		folio_set_stable_node(failing_folio, NULL);
+		__stable_node_dup_del(failing_node);
+		free_stable_node(failing_node);
+		folio_put(failing_folio);
+		return true;
+	}
+
+	folio_put(failing_folio);
+	return false;
+}
+
 /*
  * Collect processes when the error hit an ksm page.
 */
@@ -3135,6 +3344,12 @@ void collect_procs_ksm(const struct folio *folio, const struct page *page,
 	stable_node = folio_stable_node(folio);
 	if (!stable_node)
 		return;
+
+	if (ksm_recover_within_chain(stable_node)) {
+		pr_info("ksm: recovery successful, no need to kill processes\n");
+		return;
+	}
+
 	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
 		struct anon_vma *av = rmap_item->anon_vma;
 
-- 
2.43.0