From: Lance Yang As David suggested, the PTE scanning logic in hpage_collapse_scan_pmd() and __collapse_huge_page_isolate() was almost duplicated. This patch cleans things up by moving all the common PTE checking logic into a new shared helper, thp_collapse_check_pte(). Suggested-by: David Hildenbrand Signed-off-by: Lance Yang --- mm/khugepaged.c | 167 ++++++++++++++++++++++++++++++------------------ 1 file changed, 104 insertions(+), 63 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 808523f92c7b..2a897cfb1d03 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -61,6 +61,12 @@ enum scan_result { SCAN_PAGE_FILLED, }; +enum pte_check_result { + PTE_CHECK_SUCCEED, + PTE_CHECK_CONTINUE, + PTE_CHECK_FAIL, +}; + #define CREATE_TRACE_POINTS #include <trace/events/huge_memory.h> @@ -533,6 +539,87 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, } } +/* + * thp_collapse_check_pte - Check if a PTE is suitable for THP collapse + * @pte: PTE to check + * @vma: VMA the PTE belongs to + * @cc: Collapse control settings + * @scan_swap_pte: Allow scanning of swap PTEs if true + * @none_or_zero: Counter for none/zero PTEs (must be non-NULL) + * @unmapped: Counter for swap PTEs (must be non-NULL if scan_swap_pte + * is true) + * @scan_result: Used to return the failure reason (SCAN_*) on a + * PTE_CHECK_FAIL return. 
Must be non-NULL + * + * Returns: + * PTE_CHECK_SUCCEED - Valid PTE, proceed with collapse + * PTE_CHECK_CONTINUE - Skip this none/zero or swap PTE but continue scanning + * PTE_CHECK_FAIL - Abort collapse scan + */ +static inline int thp_collapse_check_pte(pte_t pte, struct vm_area_struct *vma, + struct collapse_control *cc, bool scan_swap_pte, + int *none_or_zero, int *unmapped, int *scan_result) +{ + VM_BUG_ON(!none_or_zero || !scan_result); + VM_BUG_ON(scan_swap_pte && !unmapped); + + if (pte_none(pte) || is_zero_pfn(pte_pfn(pte))) { + (*none_or_zero)++; + if (!userfaultfd_armed(vma) && + (!cc->is_khugepaged || + *none_or_zero <= khugepaged_max_ptes_none)) { + return PTE_CHECK_CONTINUE; + } else { + *scan_result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); + return PTE_CHECK_FAIL; + } + } else if (!pte_present(pte)) { + if (!scan_swap_pte) { + *scan_result = SCAN_PTE_NON_PRESENT; + return PTE_CHECK_FAIL; + } + + if (non_swap_entry(pte_to_swp_entry(pte))) { + *scan_result = SCAN_PTE_NON_PRESENT; + return PTE_CHECK_FAIL; + } + + (*unmapped)++; + if (!cc->is_khugepaged || + *unmapped <= khugepaged_max_ptes_swap) { + /* + * Always be strict with uffd-wp + * enabled swap entries. Please see + * comment below for pte_uffd_wp(). + */ + if (pte_swp_uffd_wp(pte)) { + *scan_result = SCAN_PTE_UFFD_WP; + return PTE_CHECK_FAIL; + } + return PTE_CHECK_CONTINUE; + } else { + *scan_result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); + return PTE_CHECK_FAIL; + } + } else if (pte_uffd_wp(pte)) { + /* + * Don't collapse the page if any of the small + * PTEs are armed with uffd write protection. + * Here we can also mark the new huge pmd as + * write protected if any of the small ones is + * marked but that could bring unknown + * userfault messages that fall outside of + * the registered range. So, just be simple. 
+ */ + *scan_result = SCAN_PTE_UFFD_WP; + return PTE_CHECK_FAIL; + } + + return PTE_CHECK_SUCCEED; +} + static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long start_addr, pte_t *pte, @@ -544,28 +631,20 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long addr = start_addr; pte_t *_pte; int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; + int pte_check_res; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - ++none_or_zero; - if (!userfaultfd_armed(vma) && - (!cc->is_khugepaged || - none_or_zero <= khugepaged_max_ptes_none)) { - continue; - } else { - result = SCAN_EXCEED_NONE_PTE; - count_vm_event(THP_SCAN_EXCEED_NONE_PTE); - goto out; - } - } else if (!pte_present(pteval)) { - result = SCAN_PTE_NON_PRESENT; - goto out; - } else if (pte_uffd_wp(pteval)) { - result = SCAN_PTE_UFFD_WP; + pte_check_res = thp_collapse_check_pte( + pteval, vma, cc, false, /* scan_swap_pte = false */ + &none_or_zero, NULL, &result); + + if (pte_check_res == PTE_CHECK_CONTINUE) + continue; + else if (pte_check_res == PTE_CHECK_FAIL) goto out; - } + page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; @@ -1260,6 +1339,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, unsigned long addr; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; + int pte_check_res; VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK); @@ -1278,54 +1358,15 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - ++none_or_zero; - if (!userfaultfd_armed(vma) && - (!cc->is_khugepaged || - none_or_zero <= khugepaged_max_ptes_none)) { - continue; - } else { - result = 
SCAN_EXCEED_NONE_PTE; - count_vm_event(THP_SCAN_EXCEED_NONE_PTE); - goto out_unmap; - } - } else if (!pte_present(pteval)) { - if (non_swap_entry(pte_to_swp_entry(pteval))) { - result = SCAN_PTE_NON_PRESENT; - goto out_unmap; - } - ++unmapped; - if (!cc->is_khugepaged || - unmapped <= khugepaged_max_ptes_swap) { - /* - * Always be strict with uffd-wp - * enabled swap entries. Please see - * comment below for pte_uffd_wp(). - */ - if (pte_swp_uffd_wp(pteval)) { - result = SCAN_PTE_UFFD_WP; - goto out_unmap; - } - continue; - } else { - result = SCAN_EXCEED_SWAP_PTE; - count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); - goto out_unmap; - } - } else if (pte_uffd_wp(pteval)) { - /* - * Don't collapse the page if any of the small - * PTEs are armed with uffd write protection. - * Here we can also mark the new huge pmd as - * write protected if any of the small ones is - * marked but that could bring unknown - * userfault messages that falls outside of - * the registered range. So, just be simple. - */ - result = SCAN_PTE_UFFD_WP; + pte_check_res = thp_collapse_check_pte( + pteval, vma, cc, true, /* scan_swap_pte = true */ + &none_or_zero, &unmapped, &result); + + if (pte_check_res == PTE_CHECK_CONTINUE) + continue; + else if (pte_check_res == PTE_CHECK_FAIL) goto out_unmap; - } page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { -- 2.49.0