folio_end_dropbehind() is called from folio_end_writeback(), which can run in IRQ context through buffer_head completion. Previously, when folio_end_dropbehind() detected !in_task(), it skipped the invalidation entirely. This meant that folios marked for dropbehind via RWF_DONTCACHE would remain in the page cache after writeback when completed from IRQ context, defeating the purpose of using it. Fix this by adding folio_end_dropbehind_irq() which defers the invalidation to a workqueue. The folio is added to a per-cpu folio_batch protected by a local_lock, and a work item pinned to that CPU drains the batch. folio_end_writeback() dispatches between the task and IRQ paths based on in_task(). A CPU hotplug dead callback drains any remaining folios from the departing CPU's batch to avoid leaking folio references. This unblocks enabling RWF_DONTCACHE for block devices and other buffer_head-based I/O. Signed-off-by: Tal Zussman --- include/linux/pagemap.h | 1 + mm/filemap.c | 130 ++++++++++++++++++++++++++++++++++++++++++++---- mm/page_alloc.c | 1 + 3 files changed, 123 insertions(+), 9 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ec442af3f886..ae0632cfdedd 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1260,6 +1260,7 @@ void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); void folio_end_writeback_no_dropbehind(struct folio *folio); void folio_end_dropbehind(struct folio *folio); +void dropbehind_drain_cpu(int cpu); void folio_wait_stable(struct folio *folio); void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb); diff --git a/mm/filemap.c b/mm/filemap.c index ebd75684cb0a..b223dca708df 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1085,6 +1086,8 @@ static const struct ctl_table filemap_sysctl_table[] = { } }; +static void __init dropbehind_init(void); + void __init pagecache_init(void) { int i; @@ -1092,6 +1095,7 @@ void __init pagecache_init(void) for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) init_waitqueue_head(&folio_wait_table[i]); + dropbehind_init(); page_writeback_init(); register_sysctl_init("vm", filemap_sysctl_table); } @@ -1613,26 +1617,131 @@ static void filemap_end_dropbehind(struct folio *folio) * If folio was marked as dropbehind, then pages should be dropped when writeback * completes. Do that now. If we fail, it's likely because of a big folio - * just reset dropbehind for that case and latter completions should invalidate. + * + * When called from IRQ context (e.g. buffer_head completion), we cannot lock + * the folio and invalidate. Defer to a workqueue so that callers like + * end_buffer_async_write() that complete in IRQ context still get their folios + * pruned. + */ +struct dropbehind_batch { + local_lock_t lock_irq; + struct folio_batch fbatch; + struct work_struct work; +}; + +static DEFINE_PER_CPU(struct dropbehind_batch, dropbehind_batch) = { + .lock_irq = INIT_LOCAL_LOCK(lock_irq), +}; + +static void dropbehind_work_fn(struct work_struct *w) +{ + struct dropbehind_batch *db_batch; + struct folio_batch fbatch; + +again: + local_lock_irq(&dropbehind_batch.lock_irq); + db_batch = this_cpu_ptr(&dropbehind_batch); + fbatch = db_batch->fbatch; + folio_batch_reinit(&db_batch->fbatch); + local_unlock_irq(&dropbehind_batch.lock_irq); + + for (int i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + if (folio_trylock(folio)) { + filemap_end_dropbehind(folio); + folio_unlock(folio); + } + folio_put(folio); + } + + /* Drain folios that were added while we were processing. */ + local_lock_irq(&dropbehind_batch.lock_irq); + if (folio_batch_count(&db_batch->fbatch)) { + local_unlock_irq(&dropbehind_batch.lock_irq); + goto again; + } + local_unlock_irq(&dropbehind_batch.lock_irq); +} + +/* + * Drain a dead CPU's dropbehind batch. The CPU is already dead so no + * locking is needed. + */ +void dropbehind_drain_cpu(int cpu) +{ + struct dropbehind_batch *db_batch = per_cpu_ptr(&dropbehind_batch, cpu); + struct folio_batch *fbatch = &db_batch->fbatch; + + for (int i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; + + if (folio_trylock(folio)) { + filemap_end_dropbehind(folio); + folio_unlock(folio); + } + folio_put(folio); + } + folio_batch_reinit(fbatch); +} + +static void __init dropbehind_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct dropbehind_batch *db_batch = per_cpu_ptr(&dropbehind_batch, cpu); + + folio_batch_init(&db_batch->fbatch); + INIT_WORK(&db_batch->work, dropbehind_work_fn); + } +} + +/* + * Must be called from task context. Use folio_end_dropbehind_irq() for + * IRQ context (e.g. buffer_head completion). */ void folio_end_dropbehind(struct folio *folio) { if (!folio_test_dropbehind(folio)) return; - /* - * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, - * but can happen if normal writeback just happens to find dirty folios - * that were created as part of uncached writeback, and that writeback - * would otherwise not need non-IRQ handling. Just skip the - * invalidation in that case. - */ - if (in_task() && folio_trylock(folio)) { + if (folio_trylock(folio)) { filemap_end_dropbehind(folio); folio_unlock(folio); } } EXPORT_SYMBOL_GPL(folio_end_dropbehind); +/* + * In IRQ context we cannot lock the folio or call into the invalidation + * path. Defer to a workqueue. This happens for buffer_head-based writeback + * which runs from bio IRQ context. + */ +static void folio_end_dropbehind_irq(struct folio *folio) +{ + struct dropbehind_batch *db_batch; + unsigned long flags; + + if (!folio_test_dropbehind(folio)) + return; + + local_lock_irqsave(&dropbehind_batch.lock_irq, flags); + db_batch = this_cpu_ptr(&dropbehind_batch); + + /* If there is no space in the folio_batch, skip the invalidation. */ + if (!folio_batch_space(&db_batch->fbatch)) { + local_unlock_irqrestore(&dropbehind_batch.lock_irq, flags); + return; + } + + folio_get(folio); + folio_batch_add(&db_batch->fbatch, folio); + local_unlock_irqrestore(&dropbehind_batch.lock_irq, flags); + + schedule_work_on(smp_processor_id(), &db_batch->work); +} + /** * folio_end_writeback_no_dropbehind - End writeback against a folio. * @folio: The folio. @@ -1685,7 +1794,10 @@ void folio_end_writeback(struct folio *folio) */ folio_get(folio); folio_end_writeback_no_dropbehind(folio); - folio_end_dropbehind(folio); + if (in_task()) + folio_end_dropbehind(folio); + else + folio_end_dropbehind_irq(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbf758e27aa2..8208223fd764 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6277,6 +6277,7 @@ static int page_alloc_cpu_dead(unsigned int cpu) struct zone *zone; lru_add_drain_cpu(cpu); + dropbehind_drain_cpu(cpu); mlock_drain_remote(cpu); drain_pages(cpu); -- 2.39.5 Block device buffered reads and writes already pass through filemap_read() and iomap_file_buffered_write() respectively, both of which handle IOCB_DONTCACHE. Enable RWF_DONTCACHE for block device files by setting FOP_DONTCACHE in def_blk_fops. For CONFIG_BUFFER_HEAD paths, add block_write_begin_iocb() which threads the kiocb through so that buffer_head-based I/O can use DONTCACHE behavior. The existing block_write_begin() is preserved as a wrapper that passes a NULL iocb. This support is useful for databases that operate on raw block devices, among other userspace applications. Signed-off-by: Tal Zussman --- block/fops.c | 5 +++-- fs/buffer.c | 19 ++++++++++++++++--- include/linux/buffer_head.h | 3 +++ 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/block/fops.c b/block/fops.c index 4d32785b31d9..d8165f6ba71c 100644 --- a/block/fops.c +++ b/block/fops.c @@ -505,7 +505,8 @@ static int blkdev_write_begin(const struct kiocb *iocb, unsigned len, struct folio **foliop, void **fsdata) { - return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); + return block_write_begin_iocb(iocb, mapping, pos, len, foliop, + blkdev_get_block); } static int blkdev_write_end(const struct kiocb *iocb, @@ -967,7 +968,7 @@ const struct file_operations def_blk_fops = { .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, .uring_cmd = blkdev_uring_cmd, - .fop_flags = FOP_BUFFER_RASYNC, + .fop_flags = FOP_BUFFER_RASYNC | FOP_DONTCACHE, }; static __init int blkdev_init(void) diff --git a/fs/buffer.c b/fs/buffer.c index 838c0c571022..18f1d128bb19 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2241,14 +2241,19 @@ EXPORT_SYMBOL(block_commit_write); * * The filesystem needs to handle block truncation upon failure. */ -int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, +int block_write_begin_iocb(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block) { pgoff_t index = pos >> PAGE_SHIFT; + fgf_t fgp_flags = FGP_WRITEBEGIN; struct folio *folio; int status; - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + if (iocb && iocb->ki_flags & IOCB_DONTCACHE) + fgp_flags |= FGP_DONTCACHE; + + folio = __filemap_get_folio(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -2263,6 +2268,13 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, *foliop = folio; return status; } + +int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, + struct folio **foliop, get_block_t *get_block) +{ + return block_write_begin_iocb(NULL, mapping, pos, len, foliop, + get_block); +} EXPORT_SYMBOL(block_write_begin); int block_write_end(loff_t pos, unsigned len, unsigned copied, @@ -2591,7 +2603,8 @@ int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping, (*bytes)++; } - return block_write_begin(mapping, pos, len, foliop, get_block); + return block_write_begin_iocb(iocb, mapping, pos, len, foliop, + get_block); } EXPORT_SYMBOL(cont_write_begin); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index b16b88bfbc3e..ddf88ce290f2 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -260,6 +260,9 @@ int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block); +int block_write_begin_iocb(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, + struct folio **foliop, get_block_t *get_block); int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *); -- 2.39.5