From: Zhang Peng
Subject: [PATCH] mm: vmscan: batch TLB flushes when paging out dirty folios

Currently we flush the TLB for every dirty folio, which is a bottleneck
for systems with many cores as this causes heavy IPI usage. So instead,
batch the folios and flush once for every 31 folios (one folio_batch).
These folios will be held in a folio_batch after releasing their lock;
then, when the folio_batch is full, do the following steps:

- For each folio: lock
- check still evictable (writeback, mapped, dma_pinned)
- If no longer evictable, put back to LRU
- Flush TLB once for the batch
- Pageout the folios

Suggested-by: Kairui Song
Signed-off-by: Zhang Peng
---
 mm/vmscan.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 69 insertions(+), 7 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 63cc88c875e8..8d18e1c5b53d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1217,6 +1217,55 @@ static void pageout_one(struct folio *folio, struct list_head *ret_folios,
 			folio_test_unevictable(folio), folio);
 }
 
+static void pageout_batch(struct folio_batch *fbatch,
+			  struct list_head *ret_folios,
+			  struct folio_batch *free_folios,
+			  struct scan_control *sc, struct reclaim_stat *stat,
+			  struct swap_iocb **plug, struct list_head *folio_list)
+{
+	int i, count = folio_batch_count(fbatch);
+	struct folio *folio;
+
+	/*
+	 * Reuse fbatch in-place: reinit only clears the count, the
+	 * underlying folios array is still accessible via saved count.
+	 * Filter and re-add valid folios back into the same batch.
+	 */
+	folio_batch_reinit(fbatch);
+	for (i = 0; i < count; ++i) {
+		folio = fbatch->folios[i];
+		if (!folio_trylock(folio)) {
+			list_add(&folio->lru, ret_folios);
+			continue;
+		}
+
+		VM_WARN_ON_FOLIO(folio_test_lru(folio), folio);
+
+		if (folio_test_writeback(folio) || folio_mapped(folio) ||
+		    folio_maybe_dma_pinned(folio)) {
+			folio_unlock(folio);
+			list_add(&folio->lru, ret_folios);
+			continue;
+		}
+
+		folio_batch_add(fbatch, folio);
+	}
+
+	i = 0;
+	count = folio_batch_count(fbatch);
+	if (!count)
+		return;
+	/* One TLB flush for the batch */
+	try_to_unmap_flush_dirty();
+	for (i = 0; i < count; ++i) {
+		folio = fbatch->folios[i];
+		pageout_one(folio, ret_folios, free_folios, sc, stat, plug,
+			    folio_list);
+	}
+	/* Clear the batch for the caller's next use */
+	folio_batch_reinit(fbatch);
+}
+
 static bool folio_try_unmap(struct folio *folio, struct reclaim_stat *stat,
 			    unsigned int nr_pages)
 {
@@ -1264,6 +1313,8 @@ static void shrink_folio_list(struct list_head *folio_list,
 				struct mem_cgroup *memcg)
 {
 	struct folio_batch free_folios;
+	struct folio_batch flush_folios;
+
 	LIST_HEAD(ret_folios);
 	LIST_HEAD(demote_folios);
 	unsigned int nr_demoted = 0;
@@ -1272,6 +1323,8 @@ static void shrink_folio_list(struct list_head *folio_list,
 	struct swap_iocb *plug = NULL;
 
 	folio_batch_init(&free_folios);
+	folio_batch_init(&flush_folios);
+
 	memset(stat, 0, sizeof(*stat));
 	cond_resched();
 	do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
@@ -1565,15 +1618,20 @@ static void shrink_folio_list(struct list_head *folio_list,
 				goto keep_locked;
 			if (!may_enter_fs(folio, sc->gfp_mask))
 				goto keep_locked;
 			if (!sc->may_writepage)
 				goto keep_locked;
-			/*
-			 * Folio is dirty. Flush the TLB if a writable entry
-			 * potentially exists to avoid CPU writes after I/O
-			 * starts and then write it out here.
+			/*
+			 * Unlock while batching: holding the lock until the
+			 * batch fills would stall swap faults that find this
+			 * folio via swap cache lookup. pageout_batch() will
+			 * relock each folio and recheck its state before
+			 * writing it out.
 			 */
-			try_to_unmap_flush_dirty();
-			pageout_one(folio, &ret_folios, &free_folios, sc, stat,
-				    &plug, folio_list);
+			folio_unlock(folio);
+			if (!folio_batch_add(&flush_folios, folio))
+				pageout_batch(&flush_folios,
+					      &ret_folios, &free_folios,
+					      sc, stat, &plug,
+					      folio_list);
 			goto next;
 		}
@@ -1603,6 +1661,10 @@ static void shrink_folio_list(struct list_head *folio_list,
 next:
 		continue;
 	}
+	if (folio_batch_count(&flush_folios)) {
+		pageout_batch(&flush_folios, &ret_folios, &free_folios, sc,
+			      stat, &plug, folio_list);
+	}
 
 	/* 'folio_list' is always empty here */
 	/* Migrate folios selected for demotion */
-- 
2.43.7