The following data was collected with bpftrace on a desktop system. After
booting, the system was left idle for 10 minutes; during a full scan by
khugepaged, almost every PMD scan returns SCAN_PMD_MAPPED or SCAN_PMD_NONE:

@scan_pmd_status[1]: 1      ## SCAN_SUCCEED
@scan_pmd_status[4]: 158    ## SCAN_PMD_MAPPED
@scan_pmd_status[3]: 174    ## SCAN_PMD_NONE
total progress size: 701 MB
Total time: 440 seconds     ## includes khugepaged_scan_sleep_millisecs sleeps

The khugepaged_scan list holds every task whose memory is eligible for
collapsing into hugepages, and as long as a task is not destroyed,
khugepaged never removes it from that list. As a result, a task may have
already collapsed all of its memory regions into hugepages, yet khugepaged
keeps scanning it. Such scans waste CPU time for no benefit, and because of
khugepaged_scan_sleep_millisecs (10s by default), working through a large
number of these useless tasks delays the scanning of tasks that still have
something to collapse.

After this patch, when all of an mm's memory scans as SCAN_PMD_MAPPED or
SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan list.
If the task later takes a page fault in an eligible region or calls
madvise(MADV_HUGEPAGE) again, the mm is added back to khugepaged.

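To illustrate the re-add path described above, a minimal userspace sketch
is included below. It is not part of this patch and only relies on the
standard mmap(2)/madvise(2) interfaces; the 4MB mapping size and the 4KB
touch stride are arbitrary choices for the example:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL << 20;		/* 4MB anonymous mapping */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * madvise(MADV_HUGEPAGE) marks the VMA as a THP candidate and
	 * registers the mm with khugepaged again, even if a previous
	 * full scan dropped it because nothing was left to collapse.
	 */
	if (madvise(buf, len, MADV_HUGEPAGE)) {
		perror("madvise");
		return 1;
	}

	/* Faulting the memory in also re-adds the mm via the fault path. */
	for (size_t i = 0; i < len; i += 4096)
		buf[i] = 1;

	return 0;
}
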
Signed-off-by: Vernon Yang
---
 mm/khugepaged.c | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0598a19a98cc..1ec1af5be3c8 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -115,6 +115,7 @@ struct khugepaged_scan {
 	struct list_head mm_head;
 	struct mm_slot *mm_slot;
 	unsigned long address;
+	bool maybe_collapse;
 };
 
 static struct khugepaged_scan khugepaged_scan = {
@@ -1420,22 +1421,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	return result;
 }
 
-static void collect_mm_slot(struct mm_slot *slot)
+static void collect_mm_slot(struct mm_slot *slot, bool maybe_collapse)
 {
 	struct mm_struct *mm = slot->mm;
 
 	lockdep_assert_held(&khugepaged_mm_lock);
 
-	if (hpage_collapse_test_exit(mm)) {
+	if (hpage_collapse_test_exit(mm) || !maybe_collapse) {
 		/* free mm_slot */
 		hash_del(&slot->hash);
 		list_del(&slot->mm_node);
 
-		/*
-		 * Not strictly needed because the mm exited already.
-		 *
-		 * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
-		 */
+		if (!maybe_collapse)
+			mm_flags_clear(MMF_VM_HUGEPAGE, mm);
 
 		/* khugepaged_mm_lock actually not necessary for the below */
 		mm_slot_free(mm_slot_cache, slot);
@@ -2397,6 +2395,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 				     struct mm_slot, mm_node);
 		khugepaged_scan.address = 0;
 		khugepaged_scan.mm_slot = slot;
+		khugepaged_scan.maybe_collapse = false;
 	}
 	spin_unlock(&khugepaged_mm_lock);
 
@@ -2470,8 +2469,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 						khugepaged_scan.address, &mmap_locked, cc);
 			}
 
-			if (*result == SCAN_SUCCEED)
+			switch (*result) {
+			case SCAN_PMD_NULL:
+			case SCAN_PMD_NONE:
+			case SCAN_PMD_MAPPED:
+			case SCAN_PTE_MAPPED_HUGEPAGE:
+				break;
+			case SCAN_SUCCEED:
 				++khugepaged_pages_collapsed;
+				fallthrough;
+			default:
+				khugepaged_scan.maybe_collapse = true;
+			}
 
 			/* move to next address */
 			khugepaged_scan.address += HPAGE_PMD_SIZE;
@@ -2500,6 +2509,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 	 * if we scanned all vmas of this mm.
 	 */
 	if (hpage_collapse_test_exit(mm) || !vma) {
+		bool maybe_collapse = khugepaged_scan.maybe_collapse;
+
+		if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
+			maybe_collapse = true;
+
 		/*
 		 * Make sure that if mm_users is reaching zero while
 		 * khugepaged runs here, khugepaged_exit will find
@@ -2508,12 +2522,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 		if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
 			khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
 			khugepaged_scan.address = 0;
+			khugepaged_scan.maybe_collapse = false;
 		} else {
 			khugepaged_scan.mm_slot = NULL;
 			khugepaged_full_scans++;
 		}
 
-		collect_mm_slot(slot);
+		collect_mm_slot(slot, maybe_collapse);
 	}
 
 	trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
@@ -2616,7 +2631,7 @@ static int khugepaged(void *none)
 	slot = khugepaged_scan.mm_slot;
 	khugepaged_scan.mm_slot = NULL;
 	if (slot)
-		collect_mm_slot(slot);
+		collect_mm_slot(slot, true);
 	spin_unlock(&khugepaged_mm_lock);
 	return 0;
 }
-- 
2.51.0