When MADV_COLLAPSE is called on file-backed mappings (e.g., executable text sections), the pages may still be dirty from recent writes. The current code triggers an async flush via filemap_flush() and returns SCAN_FAIL, requiring userspace to retry the operation. This is problematic for userspace that wants to collapse text pages into THPs to reduce ITLB pressure. The first madvise() call always fails with EINVAL, and only subsequent calls succeed after writeback completes. For direct MADV_COLLAPSE calls (!cc->is_khugepaged), perform a synchronous writeback using filemap_write_and_wait_range() before scanning the folios. This ensures that folios are clean on the first attempt. Reported-by: Branden Moore Closes: https://lore.kernel.org/all/4e26fe5e-7374-467c-a333-9dd48f85d7cc@amd.com Fixes: 34488399fa08 ("mm/madvise: add file and shmem support to MADV_COLLAPSE") Suggested-by: David Hildenbrand Signed-off-by: Shivank Garg --- Applies cleanly on: 6.18-rc5 mm-stable:e9a6fb0bc mm/khugepaged.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index abe54f0043c7..d08ed6eb9ce1 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -1845,6 +1846,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, struct page *dst; struct folio *folio, *tmp, *new_folio; pgoff_t index = 0, end = start + HPAGE_PMD_NR; + loff_t range_start, range_end; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; @@ -1853,6 +1855,21 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); + /* + * For MADV_COLLAPSE on regular files, do a synchronous writeback + * to ensure dirty folios are flushed before we attempt collapse. 
+ * This is a best-effort approach to avoid failing on the first + * attempt when freshly-written executable text is still dirty. + */ + if (!is_shmem && cc && !cc->is_khugepaged && mapping_can_writeback(mapping)) { + range_start = (loff_t)start << PAGE_SHIFT; + range_end = ((loff_t)end << PAGE_SHIFT) - 1; + if (filemap_write_and_wait_range(mapping, range_start, range_end)) { + result = SCAN_FAIL; + goto out; + } + } + result = alloc_charge_folio(&new_folio, mm, cc); if (result != SCAN_SUCCEED) goto out; base-commit: e9a6fb0bcdd7609be6969112f3fbfcce3b1d4a7c -- 2.43.0 When MADV_COLLAPSE encounters dirty file-backed pages, it currently returns -EINVAL. This is misleading: EINVAL suggests invalid arguments, whereas dirty pages are a transient condition that may resolve on retry. Introduce SCAN_PAGE_DIRTY and map it to -EAGAIN. For khugepaged, this is harmless as it will revisit the range after async writeback completes. Signed-off-by: Shivank Garg --- include/trace/events/huge_memory.h | 3 ++- mm/khugepaged.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index dd94d14a2427..9014a9bbe64c 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -38,7 +38,8 @@ EM( SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ EM( SCAN_STORE_FAILED, "store_failed") \ EM( SCAN_COPY_MC, "copy_poisoned_page") \ - EMe(SCAN_PAGE_FILLED, "page_filled") + EM(SCAN_PAGE_FILLED, "page_filled") \ + EMe(SCAN_PAGE_DIRTY, "page_dirty") #undef EM #undef EMe diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d08ed6eb9ce1..7df329c9c87d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -60,6 +60,7 @@ enum scan_result { SCAN_STORE_FAILED, SCAN_COPY_MC, SCAN_PAGE_FILLED, + SCAN_PAGE_DIRTY, }; #define CREATE_TRACE_POINTS @@ -1967,7 +1968,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, */ xas_unlock_irq(&xas); filemap_flush(mapping); - result = 
SCAN_FAIL; + result = SCAN_PAGE_DIRTY; goto xa_unlocked; } else if (folio_test_writeback(folio)) { xas_unlock_irq(&xas); @@ -2747,6 +2748,7 @@ static int madvise_collapse_errno(enum scan_result r) case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: case SCAN_PAGE_FILLED: + case SCAN_PAGE_DIRTY: return -EAGAIN; /* * Other: Trying again likely not to succeed / error intrinsic to -- 2.43.0