folio_end_dropbehind() is called from folio_end_writeback(), which can run in IRQ context through buffer_head completion. Previously, when folio_end_dropbehind() detected !in_task(), it skipped the invalidation entirely. This meant that folios marked for dropbehind via RWF_DONTCACHE would remain in the page cache after writeback when completed from IRQ context, defeating the purpose of the flag. Fix this by deferring the dropbehind invalidation to a work item. When folio_end_dropbehind() is called from IRQ context, the folio is added to a global folio_batch and the work item is scheduled. If the batch is already full, the invalidation is skipped for that folio. The worker drains the batch, trylocking each folio and calling filemap_end_dropbehind() on those it manages to lock, and re-drains if new folios arrived while processing. This unblocks enabling RWF_DONTCACHE for block devices and other buffer_head-based I/O. Signed-off-by: Tal Zussman --- mm/filemap.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 5 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index ebd75684cb0a..6263f35c5d13 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1085,6 +1085,8 @@ static const struct ctl_table filemap_sysctl_table[] = { } }; +static void __init dropbehind_init(void); + void __init pagecache_init(void) { int i; @@ -1092,6 +1094,7 @@ void __init pagecache_init(void) for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) init_waitqueue_head(&folio_wait_table[i]); + dropbehind_init(); page_writeback_init(); register_sysctl_init("vm", filemap_sysctl_table); } @@ -1613,23 +1616,94 @@ static void filemap_end_dropbehind(struct folio *folio) * If folio was marked as dropbehind, then pages should be dropped when writeback * completes. Do that now. If we fail, it's likely because of a big folio - * just reset dropbehind for that case and latter completions should invalidate. + * + * When called from IRQ context (e.g. buffer_head completion), we cannot lock + * the folio and invalidate. 
Defer to a workqueue so that callers like + * end_buffer_async_write() that complete in IRQ context still get their folios + * pruned. */ +static DEFINE_SPINLOCK(dropbehind_lock); +static struct folio_batch dropbehind_fbatch; +static struct work_struct dropbehind_work; + +static void dropbehind_work_fn(struct work_struct *w) +{ + struct folio_batch fbatch; + +again: + spin_lock_irq(&dropbehind_lock); + fbatch = dropbehind_fbatch; + folio_batch_reinit(&dropbehind_fbatch); + spin_unlock_irq(&dropbehind_lock); + + for (int i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + if (folio_trylock(folio)) { + filemap_end_dropbehind(folio); + folio_unlock(folio); + } + folio_put(folio); + } + + /* Drain folios that were added while we were processing. */ + spin_lock_irq(&dropbehind_lock); + if (folio_batch_count(&dropbehind_fbatch)) { + spin_unlock_irq(&dropbehind_lock); + goto again; + } + spin_unlock_irq(&dropbehind_lock); +} + +static void __init dropbehind_init(void) +{ + folio_batch_init(&dropbehind_fbatch); + INIT_WORK(&dropbehind_work, dropbehind_work_fn); +} + +static void folio_end_dropbehind_irq(struct folio *folio) +{ + unsigned long flags; + + spin_lock_irqsave(&dropbehind_lock, flags); + + /* If there is no space in the folio_batch, skip the invalidation. */ + if (!folio_batch_space(&dropbehind_fbatch)) { + spin_unlock_irqrestore(&dropbehind_lock, flags); + return; + } + + folio_get(folio); + folio_batch_add(&dropbehind_fbatch, folio); + spin_unlock_irqrestore(&dropbehind_lock, flags); + + schedule_work(&dropbehind_work); +} + void folio_end_dropbehind(struct folio *folio) { if (!folio_test_dropbehind(folio)) return; /* - * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, - * but can happen if normal writeback just happens to find dirty folios - * that were created as part of uncached writeback, and that writeback - * would otherwise not need non-IRQ handling. Just skip the - * invalidation in that case. 
+ * Hitting !in_task() can happen for IO completed from IRQ contexts or + * if normal writeback just happens to find dirty folios that were + * created as part of uncached writeback, and that writeback would + * otherwise not need non-IRQ handling. */ if (in_task() && folio_trylock(folio)) { filemap_end_dropbehind(folio); folio_unlock(folio); + return; } + + /* + * In IRQ context we cannot lock the folio or call into the + * invalidation path. Defer to a workqueue. This happens for + * buffer_head-based writeback which runs from bio IRQ context. + */ + if (!in_task()) + folio_end_dropbehind_irq(folio); } EXPORT_SYMBOL_GPL(folio_end_dropbehind); -- 2.39.5