From: Lance Yang

Let's skip unsuitable VMAs early in the khugepaged scan; specifically,
mlocked VMAs should not be touched.

Note that the only other user of the VM_NO_KHUGEPAGED mask is
__thp_vma_allowable_orders(), which is also used by the MADV_COLLAPSE
path. Since MADV_COLLAPSE has different rules (e.g., for mlocked VMAs),
we cannot simply make the shared mask stricter, as doing so would break
MADV_COLLAPSE.

So, we also introduce a new VM_NO_THP_COLLAPSE mask for that helper,
leaving the stricter checks to be applied only within the khugepaged
path itself.

Signed-off-by: Lance Yang
---
 include/linux/mm.h | 6 +++++-
 mm/huge_memory.c   | 2 +-
 mm/khugepaged.c    | 14 +++++++++++++-
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index be3e6fb4d0db..cb54d94b2343 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -505,7 +505,11 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP)
 
 /* This mask prevents VMA from being scanned with khugepaged */
-#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
+#define VM_NO_KHUGEPAGED \
+	(VM_SPECIAL | VM_HUGETLB | VM_LOCKED_MASK | VM_NOHUGEPAGE)
+
+/* This mask prevents VMA from being collapsed by any THP path */
+#define VM_NO_THP_COLLAPSE (VM_SPECIAL | VM_HUGETLB)
 
 /* This mask defines which mm->def_flags a process can inherit its parent */
 #define VM_INIT_DEF_MASK	VM_NOHUGEPAGE
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d6fc669e11c1..2e91526a037f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -134,7 +134,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 	 * Must be checked after dax since some dax mappings may have
 	 * VM_MIXEDMAP set.
 	 */
-	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
+	if (!in_pf && !smaps && (vm_flags & VM_NO_THP_COLLAPSE))
 		return 0;
 
 	/*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 7c5ff1b23e93..e54f99bb0b57 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -345,6 +345,17 @@ struct attribute_group khugepaged_attr_group = {
 };
 #endif /* CONFIG_SYSFS */
 
+/**
+ * khugepaged_should_scan_vma - check if a VMA is a candidate for collapse
+ * @vm_flags: The flags of the VMA to check.
+ *
+ * Returns: true if the VMA should be scanned by khugepaged, false otherwise.
+ */
+static inline bool khugepaged_should_scan_vma(vm_flags_t vm_flags)
+{
+	return !(vm_flags & VM_NO_KHUGEPAGED);
+}
+
 int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags,
 		     int advice)
 {
@@ -2443,7 +2454,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 			progress++;
 			break;
 		}
-		if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
+		if (!khugepaged_should_scan_vma(vma->vm_flags) ||
+		    !thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
 skip:
 			progress++;
 			continue;
-- 
2.49.0

From: Lance Yang

is_guard_pte_marker() performs a redundant check because it calls both
is_pte_marker() and is_guard_swp_entry(), both of which internally
check for a PTE marker.

is_guard_pte_marker()
|- is_pte_marker()
|  `- is_pte_marker_entry()  // First check
`- is_guard_swp_entry()
   `- is_pte_marker_entry()  // Second, redundant check

While a modern compiler could likely optimize this away, let's have
clean code and not rely on it ;)

Also, make it available for hugepage collapsing code.
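For reference, the deduplicated helper checks the marker entry just
once (names taken from the new is_guard_pte_marker() in the diff
below):

is_guard_pte_marker()
|- is_swap_pte()
`- is_guard_swp_entry()
   `- is_pte_marker_entry()  // Single check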
Cc: Kairui Song
Signed-off-by: Lance Yang
---
 include/linux/swapops.h | 6 ++++++
 mm/madvise.c            | 6 ------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 59c5889a4d54..7f5684fa043b 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -469,6 +469,12 @@ static inline int is_guard_swp_entry(swp_entry_t entry)
 		(pte_marker_get(entry) & PTE_MARKER_GUARD);
 }
 
+static inline bool is_guard_pte_marker(pte_t ptent)
+{
+	return is_swap_pte(ptent) &&
+	       is_guard_swp_entry(pte_to_swp_entry(ptent));
+}
+
 /*
  * This is a special version to check pte_none() just to cover the case when
  * the pte is a pte marker.  It existed because in many cases the pte marker
diff --git a/mm/madvise.c b/mm/madvise.c
index 35ed4ab0d7c5..bd46e6788fac 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1069,12 +1069,6 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
 	return !(vma->vm_flags & disallowed);
 }
 
-static bool is_guard_pte_marker(pte_t ptent)
-{
-	return is_pte_marker(ptent) &&
-	       is_guard_swp_entry(pte_to_swp_entry(ptent));
-}
-
 static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
 				   unsigned long next, struct mm_walk *walk)
 {
-- 
2.49.0

From: Lance Yang

Guard PTE markers are installed via MADV_GUARD_INSTALL to create
lightweight guard regions.

Currently, any collapse path (khugepaged or MADV_COLLAPSE) will fail
when encountering such a range.

MADV_COLLAPSE fails deep inside the collapse logic when trying to
swap in the special marker in __collapse_huge_page_swapin():

hpage_collapse_scan_pmd()
`- collapse_huge_page()
   `- __collapse_huge_page_swapin() -> fails!

khugepaged's behavior is slightly different due to its max_ptes_swap
limit (default 64). It won't fail as deep, but it will still needlessly
scan up to 64 swap entries before bailing out.

IMHO, we can and should detect this much earlier ;)

Add a check directly inside the PTE scan loop: if a guard marker is
found, abort the scan immediately with a new SCAN_PTE_GUARD status,
avoiding the wasted work.

Signed-off-by: Lance Yang
---
 mm/khugepaged.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e54f99bb0b57..910a6f2ec8a9 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -59,6 +59,7 @@ enum scan_result {
 	SCAN_STORE_FAILED,
 	SCAN_COPY_MC,
 	SCAN_PAGE_FILLED,
+	SCAN_PTE_GUARD,
 };
 
 #define CREATE_TRACE_POINTS
@@ -1317,6 +1318,16 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 				result = SCAN_PTE_UFFD_WP;
 				goto out_unmap;
 			}
+			/*
+			 * Guard PTE markers are installed by
+			 * MADV_GUARD_INSTALL. Any collapse path must
+			 * not touch them, so abort the scan immediately
+			 * if one is found.
+			 */
+			if (is_guard_pte_marker(pteval)) {
+				result = SCAN_PTE_GUARD;
+				goto out_unmap;
+			}
 			continue;
 		} else {
 			result = SCAN_EXCEED_SWAP_PTE;
@@ -2860,6 +2871,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
 		case SCAN_PAGE_COMPOUND:
 		case SCAN_PAGE_LRU:
 		case SCAN_DEL_PAGE_LRU:
+		case SCAN_PTE_GUARD:
 			last_fail = result;
 			break;
 		default:
-- 
2.49.0
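A minimal userspace sketch that exercises the new early bail-out. It is
not part of the series; the fallback MADV_* values, the 2MiB PMD size,
and THP being enabled are assumptions for a recent x86-64 kernel, and
the error handling is illustrative only.

/*
 * Install a guard marker inside a PMD-sized anonymous region, then ask
 * MADV_COLLAPSE to collapse it. The collapse must still fail, but with
 * this series the guard marker is noticed during the PTE scan
 * (SCAN_PTE_GUARD) instead of deep in __collapse_huge_page_swapin().
 */
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_GUARD_INSTALL
#define MADV_GUARD_INSTALL 102	/* available since 6.13 */
#endif
#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* available since 6.1 */
#endif

int main(void)
{
	size_t pmd_size = 2UL << 20;	/* assumed PMD size: 2MiB */
	long page = sysconf(_SC_PAGESIZE);

	/* Over-map so we can carve out one PMD-aligned region. */
	char *raw = mmap(NULL, 2 * pmd_size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED)
		return 1;
	char *buf = (char *)(((uintptr_t)raw + pmd_size - 1) &
			     ~(uintptr_t)(pmd_size - 1));

	/* Install a guard marker on a single page inside the range. */
	if (madvise(buf + page, page, MADV_GUARD_INSTALL))
		perror("MADV_GUARD_INSTALL");

	/* Expected to fail: the range contains a guard PTE marker. */
	if (madvise(buf, pmd_size, MADV_COLLAPSE))
		perror("MADV_COLLAPSE");

	munmap(raw, 2 * pmd_size);
	return 0;
}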