Add mm_khugepaged_scan event to track the total time for full scan and the total number of pages scanned of khugepaged. Signed-off-by: Vernon Yang Acked-by: David Hildenbrand (Red Hat) --- include/trace/events/huge_memory.h | 24 ++++++++++++++++++++++++ mm/khugepaged.c | 2 ++ 2 files changed, 26 insertions(+) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 4cde53b45a85..01225dd27ad5 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -236,5 +236,29 @@ TRACE_EVENT(mm_khugepaged_collapse_file, __print_symbolic(__entry->result, SCAN_STATUS)) ); +TRACE_EVENT(mm_khugepaged_scan, + + TP_PROTO(struct mm_struct *mm, int progress, bool full_scan_finished), + + TP_ARGS(mm, progress, full_scan_finished), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(int, progress) + __field(bool, full_scan_finished) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->progress = progress; + __entry->full_scan_finished = full_scan_finished; + ), + + TP_printk("mm=%p, progress=%d, full_scan_finished=%d", + __entry->mm, + __entry->progress, + __entry->full_scan_finished) +); + #endif /* __HUGE_MEMORY_H */ #include diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 97d1b2824386..9f99f61689f8 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2533,6 +2533,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, collect_mm_slot(slot); } + trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL); + return progress; } -- 2.51.0 The following data is traced by bpftrace on a desktop system. After the system has been left idle for 10 minutes upon booting, a lot of SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE are observed during a full scan by khugepaged. @scan_pmd_status[1]: 1 ## SCAN_SUCCEED @scan_pmd_status[6]: 2 ## SCAN_EXCEED_SHARED_PTE @scan_pmd_status[3]: 142 ## SCAN_PMD_MAPPED @scan_pmd_status[2]: 178 ## SCAN_NO_PTE_TABLE total progress size: 674 MB Total time : 419 seconds ## include khugepaged_scan_sleep_millisecs The khugepaged_scan list save all task that support collapse into hugepage, as long as the task is not destroyed, khugepaged will not remove it from the khugepaged_scan list. This exist a phenomenon where task has already collapsed all memory regions into hugepage, but khugepaged continues to scan it, which wastes CPU time and invalid, and due to khugepaged_scan_sleep_millisecs (default 10s) causes a long wait for scanning a large number of invalid task, so scanning really valid task is later. After applying this patch, when the memory is either SCAN_PMD_MAPPED or SCAN_NO_PTE_TABLE, just skip it, as follow: @scan_pmd_status[6]: 2 @scan_pmd_status[3]: 147 @scan_pmd_status[2]: 173 total progress size: 45 MB Total time : 20 seconds Signed-off-by: Vernon Yang --- mm/khugepaged.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9f99f61689f8..2b3685b195f5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -66,7 +66,10 @@ enum scan_result { static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); -/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */ +/* + * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas + * every 10 second. + */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; @@ -2487,12 +2490,22 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, khugepaged_scan.address, &mmap_locked, cc); } - if (*result == SCAN_SUCCEED) - ++khugepaged_pages_collapsed; - /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; - progress += HPAGE_PMD_NR; + + switch (*result) { + case SCAN_NO_PTE_TABLE: + case SCAN_PMD_MAPPED: + case SCAN_PTE_MAPPED_HUGEPAGE: + progress++; + break; + case SCAN_SUCCEED: + ++khugepaged_pages_collapsed; + fallthrough; + default: + progress += HPAGE_PMD_NR; + } + if (!mmap_locked) /* * We released mmap_lock so break loop. Note -- 2.51.0 For example, create three task: hot1 -> cold -> hot2. After all three task are created, each allocate memory 128MB. the hot1/hot2 task continuously access 128 MB memory, while the cold task only accesses its memory briefly andthen call madvise(MADV_COLD). However, khugepaged still prioritizes scanning the cold task and only scans the hot2 task after completing the scan of the cold task. So if the user has explicitly informed us via MADV_COLD/FREE that this memory is cold or will be freed, it is appropriate for khugepaged to skip it only, thereby avoiding unnecessary scan and collapse operations to reducing CPU wastage. Here are the performance test results: (Throughput bigger is better, other smaller is better) Testing on x86_64 machine: | task hot2 | without patch | with patch | delta | |---------------------|---------------|---------------|---------| | total accesses time | 3.14 sec | 2.93 sec | -6.69% | | cycles per access | 4.96 | 2.21 | -55.44% | | Throughput | 104.38 M/sec | 111.89 M/sec | +7.19% | | dTLB-load-misses | 284814532 | 69597236 | -75.56% | Testing on qemu-system-x86_64 -enable-kvm: | task hot2 | without patch | with patch | delta | |---------------------|---------------|---------------|---------| | total accesses time | 3.35 sec | 2.96 sec | -11.64% | | cycles per access | 7.29 | 2.07 | -71.60% | | Throughput | 97.67 M/sec | 110.77 M/sec | +13.41% | | dTLB-load-misses | 241600871 | 3216108 | -98.67% | Signed-off-by: Vernon Yang --- mm/madvise.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index b617b1be0f53..3a48d725a3fc 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1360,11 +1360,8 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior) return madvise_remove(madv_behavior); case MADV_WILLNEED: return madvise_willneed(madv_behavior); - case MADV_COLD: - return madvise_cold(madv_behavior); case MADV_PAGEOUT: return madvise_pageout(madv_behavior); - case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: return madvise_dontneed_free(madv_behavior); @@ -1378,6 +1375,18 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior) /* The below behaviours update VMAs via madvise_update_vma(). */ + case MADV_COLD: + error = madvise_cold(madv_behavior); + if (error) + goto out; + new_flags = (new_flags & ~VM_HUGEPAGE) | VM_NOHUGEPAGE; + break; + case MADV_FREE: + error = madvise_dontneed_free(madv_behavior); + if (error) + goto out; + new_flags = (new_flags & ~VM_HUGEPAGE) | VM_NOHUGEPAGE; + break; case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; @@ -1756,7 +1765,6 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi switch (madv_behavior->behavior) { case MADV_REMOVE: case MADV_WILLNEED: - case MADV_COLD: case MADV_PAGEOUT: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: @@ -1766,7 +1774,6 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi case MADV_GUARD_REMOVE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: - case MADV_FREE: return MADVISE_VMA_READ_LOCK; default: return MADVISE_MMAP_WRITE_LOCK; -- 2.51.0 When an mm with the MMF_DISABLE_THP_COMPLETELY flag is detected during scanning, directly set khugepaged_scan.mm_slot to the next mm_slot, reduce redundant operation. Signed-off-by: Vernon Yang --- mm/khugepaged.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2b3685b195f5..72be87ef384b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2439,6 +2439,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, cond_resched(); if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { + vma = NULL; progress++; break; } @@ -2459,8 +2460,10 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, bool mmap_locked = true; cond_resched(); - if (unlikely(hpage_collapse_test_exit_or_disable(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { + vma = NULL; goto breakouterloop; + } VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > @@ -2477,8 +2480,10 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); - if (hpage_collapse_test_exit_or_disable(mm)) + if (hpage_collapse_test_exit_or_disable(mm)) { + vma = NULL; goto breakouterloop; + } *result = collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); if (*result == SCAN_PMD_MAPPED) -- 2.51.0