collapse_file() calls xas_create_range() to pre-create all slots needed. If collapse_file() finally fails, these pre-created slots are empty nodes and aren't destroyed. I can reproduce it with following steps. 1) create file /tmp/test_madvise_collapse and ftruncate to 4MB size, and then mmap the file 2) memset for the first 2MB 3) madvise(MADV_COLLAPSE) for the second 2MB 4) unlink the file in 3), collapse_file() calls xas_create_range() to expand xarray depth, and fails to collapse due to the whole 2M region is empty. collapse_file() rollback path doesn't destroy the pre-created empty nodes. When the file is deleted, shmem_evict_inode()->shmem_truncate_range() traverses all entries and calls xas_store(xas, NULL) to delete, if the leaf xa_node that stores deleted entry becomes emtry, xas_store() will automatically delete the empty node and delete it's parent is empty too, until parent node isn't empty. shmem_evict_inode() won't traverse the empty nodes created by xas_create_range() due to these nodes doesn't store any entries. As a result, these empty nodes are leaked. We couldn't simply destroy empty nodes in rollback path, because xarray lock is released and re-held several times in collapse_file(). Another collapse_file() call may take concurrently, and those empty nodes may be needed by the another collapse_file() call. To fix it, move xas_create_range() call just before update new_folio to xarray, to guarantee collapse_file() doesn't unlock xarray lock temporarily. Besides, xas_create_range() may fails too, we don't unlock xarray lock and retry again, just destroy the new created empty xa_nodes with xarray lock held to prevent any concurrency. Fixes: 77da9389b9d5 ("mm: Convert collapse_shmem to XArray") Signed-off-by: Jinjiang Tu --- include/linux/xarray.h | 1 + lib/xarray.c | 19 +++++++++++++++++++ mm/khugepaged.c | 36 +++++++++++++++++++----------------- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index be850174e802..972df5ceeb84 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1555,6 +1555,7 @@ void xas_destroy(struct xa_state *); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); +void xas_destroy_range(struct xa_state *xas, unsigned long start, unsigned long end); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); diff --git a/lib/xarray.c b/lib/xarray.c index 9a8b4916540c..e6126052f141 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -752,6 +752,25 @@ void xas_create_range(struct xa_state *xas) } EXPORT_SYMBOL_GPL(xas_create_range); +void xas_destroy_range(struct xa_state *xas, unsigned long start, unsigned long end) +{ + unsigned long index; + void *entry; + + for (index = start; index < end; ++index) { + xas_set(xas, index); + entry = xas_load(xas); + if (entry) + continue; + + if (!xas->xa_node || xas_invalid(xas)) + continue; + + if (!xas->xa_node->count) + xas_delete_node(xas); + } +} + static void update_node(struct xa_state *xas, struct xa_node *node, int count, int values) { diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 97d1b2824386..969058088eee 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1863,7 +1863,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, struct folio *folio, *tmp, *new_folio; pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); - XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); + XA_STATE(xas, &mapping->i_pages, 0); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); @@ -1882,22 +1882,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, new_folio->index = start; new_folio->mapping = mapping; - /* - * Ensure we have slots for all the pages in the range. This is - * almost certainly a no-op because most of the pages must be present - */ - do { - xas_lock_irq(&xas); - xas_create_range(&xas); - if (!xas_error(&xas)) - break; - xas_unlock_irq(&xas); - if (!xas_nomem(&xas, GFP_KERNEL)) { - result = SCAN_FAIL; - goto rollback; - } - } while (1); - + xas_lock_irq(&xas); for (index = start; index < end;) { xas_set(&xas, index); folio = xas_load(&xas); @@ -2194,6 +2179,23 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, xas_lock_irq(&xas); } + xas_set_order(&xas, start, HPAGE_PMD_ORDER); + xas_create_range(&xas); + if (xas_error(&xas)) { + xas_set_order(&xas, start, 0); + if (nr_none) { + for (index = start; index < end; index++) { + if (xas_next(&xas) == XA_RETRY_ENTRY) + xas_store(&xas, NULL); + } + } + xas_destroy_range(&xas, start, end); + xas_unlock_irq(&xas); + result = SCAN_FAIL; + + goto rollback; + } + if (is_shmem) lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); else -- 2.43.0