Add an mm_khugepaged_scan trace event to track the total time of a full
scan and the total number of pages scanned by khugepaged.

Signed-off-by: Vernon Yang
---
 include/trace/events/huge_memory.h | 24 ++++++++++++++++++++++++
 mm/khugepaged.c                    |  2 ++
 2 files changed, 26 insertions(+)

diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index dd94d14a2427..b2824c2f8238 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -237,5 +237,29 @@ TRACE_EVENT(mm_khugepaged_collapse_file,
 		  __print_symbolic(__entry->result, SCAN_STATUS))
 );
 
+TRACE_EVENT(mm_khugepaged_scan,
+
+	TP_PROTO(struct mm_struct *mm, int progress, bool full),
+
+	TP_ARGS(mm, progress, full),
+
+	TP_STRUCT__entry(
+		__field(struct mm_struct *, mm)
+		__field(int, progress)
+		__field(bool, full)
+	),
+
+	TP_fast_assign(
+		__entry->mm = mm;
+		__entry->progress = progress;
+		__entry->full = full;
+	),
+
+	TP_printk("mm=%p, progress=%d, full=%d",
+		  __entry->mm,
+		  __entry->progress,
+		  __entry->full)
+);
+
 #endif /* __HUGE_MEMORY_H */
 #include <trace/define_trace.h>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index abe54f0043c7..0598a19a98cc 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2516,6 +2516,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 		collect_mm_slot(slot);
 	}
 
+	trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
+
 	return progress;
 }
-- 
2.51.0

The following data was traced with bpftrace on a desktop system. After
the system had been left idle for 10 minutes after booting, a lot of
SCAN_PMD_MAPPED or SCAN_PMD_NONE results were observed during a full
scan by khugepaged:

  @scan_pmd_status[1]: 1    ## SCAN_SUCCEED
  @scan_pmd_status[4]: 158  ## SCAN_PMD_MAPPED
  @scan_pmd_status[3]: 174  ## SCAN_PMD_NONE

  total progress size: 701 MB
  Total time: 440 seconds   ## includes khugepaged_scan_sleep_millisecs

The khugepaged_scan list holds every task that is eligible for collapse
into hugepages, and as long as a task is not destroyed, khugepaged never
removes it from the list. This leads to a situation where a task has
already collapsed all of its memory regions into hugepages, yet
khugepaged keeps scanning it, wasting CPU time for no benefit. Worse,
because of khugepaged_scan_sleep_millisecs (default 10s) between scan
rounds, working through a large number of such stale tasks takes a long
time, so tasks that actually have something to collapse are scanned much
later.

After applying this patch, once all of an mm's memory scans as either
SCAN_PMD_MAPPED or SCAN_PMD_NONE, the mm is automatically removed from
khugepaged's scan list. If the task later takes a page fault or calls
MADV_HUGEPAGE again, it is added back to khugepaged.
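For anyone wanting to reproduce this kind of measurement, the sketch
below is one hedged way to consume the new tracepoint from userspace
(illustration only, not part of this series). It assumes tracefs is
mounted at /sys/kernel/tracing; older setups may have it under
/sys/kernel/debug/tracing instead:

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  static const char *base = "/sys/kernel/tracing";

  int main(void)
  {
  	char path[256], buf[4096];
  	ssize_t n;
  	int fd;

  	/* enable the mm_khugepaged_scan event added by this patch */
  	snprintf(path, sizeof(path),
  		 "%s/events/huge_memory/mm_khugepaged_scan/enable", base);
  	fd = open(path, O_WRONLY);
  	if (fd < 0 || write(fd, "1", 1) != 1) {
  		perror("enable mm_khugepaged_scan");
  		return 1;
  	}
  	close(fd);

  	/* stream events; each line carries mm=..., progress=..., full=... */
  	snprintf(path, sizeof(path), "%s/trace_pipe", base);
  	fd = open(path, O_RDONLY);
  	if (fd < 0) {
  		perror("trace_pipe");
  		return 1;
  	}
  	while ((n = read(fd, buf, sizeof(buf) - 1)) > 0)
  		write(STDOUT_FILENO, buf, n);
  	return 0;
  }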
Signed-off-by: Vernon Yang
---
 mm/khugepaged.c | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0598a19a98cc..1ec1af5be3c8 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -115,6 +115,7 @@ struct khugepaged_scan {
 	struct list_head mm_head;
 	struct mm_slot *mm_slot;
 	unsigned long address;
+	bool maybe_collapse;
 };
 
 static struct khugepaged_scan khugepaged_scan = {
@@ -1420,22 +1421,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	return result;
 }
 
-static void collect_mm_slot(struct mm_slot *slot)
+static void collect_mm_slot(struct mm_slot *slot, bool maybe_collapse)
 {
 	struct mm_struct *mm = slot->mm;
 
 	lockdep_assert_held(&khugepaged_mm_lock);
 
-	if (hpage_collapse_test_exit(mm)) {
+	if (hpage_collapse_test_exit(mm) || !maybe_collapse) {
 		/* free mm_slot */
 		hash_del(&slot->hash);
 		list_del(&slot->mm_node);
 
-		/*
-		 * Not strictly needed because the mm exited already.
-		 *
-		 * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
-		 */
+		if (!maybe_collapse)
+			mm_flags_clear(MMF_VM_HUGEPAGE, mm);
 
 		/* khugepaged_mm_lock actually not necessary for the below */
 		mm_slot_free(mm_slot_cache, slot);
@@ -2397,6 +2395,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 				     struct mm_slot, mm_node);
 		khugepaged_scan.address = 0;
 		khugepaged_scan.mm_slot = slot;
+		khugepaged_scan.maybe_collapse = false;
 	}
 	spin_unlock(&khugepaged_mm_lock);
 
@@ -2470,8 +2469,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 							  khugepaged_scan.address,
 							  &mmap_locked, cc);
 		}
-		if (*result == SCAN_SUCCEED)
+		switch (*result) {
+		case SCAN_PMD_NULL:
+		case SCAN_PMD_NONE:
+		case SCAN_PMD_MAPPED:
+		case SCAN_PTE_MAPPED_HUGEPAGE:
+			break;
+		case SCAN_SUCCEED:
 			++khugepaged_pages_collapsed;
+			fallthrough;
+		default:
+			khugepaged_scan.maybe_collapse = true;
+		}
 
 		/* move to next address */
 		khugepaged_scan.address += HPAGE_PMD_SIZE;
@@ -2500,6 +2509,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 	 * if we scanned all vmas of this mm.
 	 */
 	if (hpage_collapse_test_exit(mm) || !vma) {
+		bool maybe_collapse = khugepaged_scan.maybe_collapse;
+
+		if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
+			maybe_collapse = true;
+
 		/*
 		 * Make sure that if mm_users is reaching zero while
 		 * khugepaged runs here, khugepaged_exit will find
@@ -2508,12 +2522,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 		if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
 			khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
 			khugepaged_scan.address = 0;
+			khugepaged_scan.maybe_collapse = false;
 		} else {
 			khugepaged_scan.mm_slot = NULL;
 			khugepaged_full_scans++;
 		}
 
-		collect_mm_slot(slot);
+		collect_mm_slot(slot, maybe_collapse);
 	}
 
 	trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
@@ -2616,7 +2631,7 @@ static int khugepaged(void *none)
 	slot = khugepaged_scan.mm_slot;
 	khugepaged_scan.mm_slot = NULL;
 	if (slot)
-		collect_mm_slot(slot);
+		collect_mm_slot(slot, true);
 	spin_unlock(&khugepaged_mm_lock);
 	return 0;
 }
-- 
2.51.0

For example, create three tasks in the order hot1 -> cold -> hot2. After
all three tasks are created, each allocates 128MB of memory. The hot1
and hot2 tasks continuously access their 128MB, while the cold task only
accesses its memory briefly and then calls madvise(MADV_COLD). However,
khugepaged still prioritizes scanning the cold task, and only scans the
hot2 task after it has finished scanning the cold task.
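The scenario above can be reproduced with a small userspace sketch like
the one below (hedged: the sizes, loop structure, and use of pause() are
arbitrary illustration choices, not the exact test program used;
MADV_COLD requires Linux 5.4+):

  #include <string.h>
  #include <sys/mman.h>
  #include <sys/wait.h>
  #include <unistd.h>

  #define MEM_SIZE (128UL << 20)	/* 128MB per task */

  /* hot: touch all pages forever; cold: touch once, then MADV_COLD */
  static void task(int hot)
  {
  	char *p = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE,
  		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

  	if (p == MAP_FAILED)
  		_exit(1);
  	madvise(p, MEM_SIZE, MADV_HUGEPAGE);	/* register with khugepaged */
  	do {
  		memset(p, 1, MEM_SIZE);
  	} while (hot);
  	madvise(p, MEM_SIZE, MADV_COLD);	/* tell the kernel it is cold */
  	pause();				/* stay alive but idle */
  	_exit(0);
  }

  int main(void)
  {
  	/* creation order matches the example: hot1 -> cold -> hot2 */
  	if (fork() == 0) task(1);	/* hot1 */
  	if (fork() == 0) task(0);	/* cold */
  	if (fork() == 0) task(1);	/* hot2 */
  	wait(NULL);			/* workload runs until killed */
  	return 0;
  }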
So if the user has explicitly informed us via MADV_COLD/FREE that this
memory is cold or will be freed, it is appropriate for khugepaged to
scan it only at the latest possible moment, thereby avoiding unnecessary
scan and collapse operations and reducing CPU waste.

Here are the performance test results (higher is better for Throughput,
lower is better for everything else).

Testing on an x86_64 machine:

| task hot2           | without patch | with patch    | delta   |
|---------------------|---------------|---------------|---------|
| total accesses time | 3.14 sec      | 2.92 sec      | -7.01%  |
| cycles per access   | 4.91          | 2.07          | -57.84% |
| Throughput          | 104.38 M/sec  | 112.12 M/sec  | +7.42%  |
| dTLB-load-misses    | 288966432     | 1292908       | -99.55% |

Testing on qemu-system-x86_64 -enable-kvm:

| task hot2           | without patch | with patch    | delta   |
|---------------------|---------------|---------------|---------|
| total accesses time | 3.35 sec      | 2.96 sec      | -11.64% |
| cycles per access   | 7.23          | 2.12          | -70.68% |
| Throughput          | 97.88 M/sec   | 110.76 M/sec  | +13.16% |
| dTLB-load-misses    | 237406497     | 3189194       | -98.66% |

Signed-off-by: Vernon Yang
---
 include/linux/khugepaged.h |  1 +
 mm/khugepaged.c            | 14 ++++++++++++++
 mm/madvise.c               |  3 +++
 3 files changed, 18 insertions(+)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index eb1946a70cff..726e99de84e9 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -15,6 +15,7 @@ extern void __khugepaged_enter(struct mm_struct *mm);
 extern void __khugepaged_exit(struct mm_struct *mm);
 extern void khugepaged_enter_vma(struct vm_area_struct *vma,
 				 vm_flags_t vm_flags);
+void khugepaged_move_tail(struct mm_struct *mm);
 extern void khugepaged_min_free_kbytes_update(void);
 extern bool current_is_khugepaged(void);
 extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 1ec1af5be3c8..91836dda2015 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -468,6 +468,20 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 	}
 }
 
+void khugepaged_move_tail(struct mm_struct *mm)
+{
+	struct mm_slot *slot;
+
+	if (!mm_flags_test(MMF_VM_HUGEPAGE, mm))
+		return;
+
+	spin_lock(&khugepaged_mm_lock);
+	slot = mm_slot_lookup(mm_slots_hash, mm);
+	if (slot && khugepaged_scan.mm_slot != slot)
+		list_move_tail(&slot->mm_node, &khugepaged_scan.mm_head);
+	spin_unlock(&khugepaged_mm_lock);
+}
+
 void __khugepaged_exit(struct mm_struct *mm)
 {
 	struct mm_slot *slot;
diff --git a/mm/madvise.c b/mm/madvise.c
index fb1c86e630b6..3f9ca7af2c82 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -608,6 +608,8 @@ static long madvise_cold(struct madvise_behavior *madv_behavior)
 	madvise_cold_page_range(&tlb, madv_behavior);
 	tlb_finish_mmu(&tlb);
 
+	khugepaged_move_tail(vma->vm_mm);
+
 	return 0;
 }
 
@@ -835,6 +837,7 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior)
 			&walk_ops, tlb);
 	tlb_end_vma(tlb, vma);
 	mmu_notifier_invalidate_range_end(&range);
+	khugepaged_move_tail(mm);
 
 	return 0;
 }
-- 
2.51.0

When an mm with the MMF_DISABLE_THP_COMPLETELY flag is detected during
scanning, advance khugepaged_scan.mm_slot directly to the next mm_slot,
avoiding redundant work. Setting vma to NULL makes the end-of-scan path
treat the mm as fully scanned, so its slot is released instead of being
revisited on the next round.
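For context, a process typically ends up with THP disabled for its whole
mm via prctl(PR_SET_THP_DISABLE). A minimal sketch (assuming, on the
kernel in question, that this prctl is what sets
MMF_DISABLE_THP_COMPLETELY):

  #include <stdio.h>
  #include <sys/prctl.h>

  int main(void)
  {
  	/* disable THP for this mm; khugepaged should now skip it */
  	if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0)) {
  		perror("PR_SET_THP_DISABLE");
  		return 1;
  	}
  	printf("THP disabled: %d\n",
  	       (int)prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0));
  	/* ... memory allocated from here on will not be collapsed ... */
  	return 0;
  }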
Signed-off-by: Vernon Yang
---
 mm/khugepaged.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 91836dda2015..a8723eea12f1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2432,6 +2432,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 		cond_resched();
 
 		if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
+			vma = NULL;
 			progress++;
 			break;
 		}
@@ -2452,8 +2453,10 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 		bool mmap_locked = true;
 
 		cond_resched();
-		if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
+		if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
+			vma = NULL;
 			goto breakouterloop;
+		}
 
 		VM_BUG_ON(khugepaged_scan.address < hstart ||
 			  khugepaged_scan.address + HPAGE_PMD_SIZE >
@@ -2470,8 +2473,10 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 			fput(file);
 			if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
 				mmap_read_lock(mm);
-				if (hpage_collapse_test_exit_or_disable(mm))
+				if (hpage_collapse_test_exit_or_disable(mm)) {
+					vma = NULL;
 					goto breakouterloop;
+				}
 				*result = collapse_pte_mapped_thp(mm,
 						khugepaged_scan.address, false);
 				if (*result == SCAN_PMD_MAPPED)
-- 
2.51.0
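As a rough end-to-end check of the series (hedged sketch; counter
locations as in current mainline sysfs), the global khugepaged counters
can be sampled before and after running a workload to confirm that full
scans complete faster and stale mms stop being rescanned:

  #include <stdio.h>

  #define KHPD_DIR "/sys/kernel/mm/transparent_hugepage/khugepaged/"

  /* read one counter file under the khugepaged sysfs directory */
  static long read_counter(const char *name)
  {
  	char path[256];
  	long val = -1;
  	FILE *f;

  	snprintf(path, sizeof(path), KHPD_DIR "%s", name);
  	f = fopen(path, "r");
  	if (f) {
  		if (fscanf(f, "%ld", &val) != 1)
  			val = -1;
  		fclose(f);
  	}
  	return val;
  }

  int main(void)
  {
  	printf("full_scans:      %ld\n", read_counter("full_scans"));
  	printf("pages_collapsed: %ld\n", read_counter("pages_collapsed"));
  	return 0;
  }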