folio_end_dropbehind() is called from folio_end_writeback(), which can run in IRQ context through buffer_head completion. Previously, when folio_end_dropbehind() detected !in_task(), it skipped the invalidation entirely. This meant that folios marked for dropbehind via RWF_DONTCACHE would remain in the page cache after writeback when completed from IRQ context, defeating the purpose of using it. Fix this by deferring the dropbehind invalidation to a work item. When folio_end_dropbehind() is called from IRQ context, the folio is added to a global folio_batch and the work item is scheduled. The worker drains the batch, locking each folio and calling filemap_end_dropbehind(), and re-drains if new folios arrived while processing. This unblocks enabling RWF_DONTCACHE for block devices and other buffer_head-based I/O. Signed-off-by: Tal Zussman --- mm/filemap.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 5 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index ebd75684cb0a..6263f35c5d13 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1085,6 +1085,8 @@ static const struct ctl_table filemap_sysctl_table[] = { } }; +static void __init dropbehind_init(void); + void __init pagecache_init(void) { int i; @@ -1092,6 +1094,7 @@ void __init pagecache_init(void) for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) init_waitqueue_head(&folio_wait_table[i]); + dropbehind_init(); page_writeback_init(); register_sysctl_init("vm", filemap_sysctl_table); } @@ -1613,23 +1616,94 @@ static void filemap_end_dropbehind(struct folio *folio) * If folio was marked as dropbehind, then pages should be dropped when writeback * completes. Do that now. If we fail, it's likely because of a big folio - just reset dropbehind for that case and latter completions should invalidate. + * + * When called from IRQ context (e.g. buffer_head completion), we cannot lock + * the folio and invalidate. 
Defer to a workqueue so that callers like + * end_buffer_async_write() that complete in IRQ context still get their folios + * pruned. */ +static DEFINE_SPINLOCK(dropbehind_lock); +static struct folio_batch dropbehind_fbatch; +static struct work_struct dropbehind_work; + +static void dropbehind_work_fn(struct work_struct *w) +{ + struct folio_batch fbatch; + +again: + spin_lock_irq(&dropbehind_lock); + fbatch = dropbehind_fbatch; + folio_batch_reinit(&dropbehind_fbatch); + spin_unlock_irq(&dropbehind_lock); + + for (int i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + if (folio_trylock(folio)) { + filemap_end_dropbehind(folio); + folio_unlock(folio); + } + folio_put(folio); + } + + /* Drain folios that were added while we were processing. */ + spin_lock_irq(&dropbehind_lock); + if (folio_batch_count(&dropbehind_fbatch)) { + spin_unlock_irq(&dropbehind_lock); + goto again; + } + spin_unlock_irq(&dropbehind_lock); +} + +static void __init dropbehind_init(void) +{ + folio_batch_init(&dropbehind_fbatch); + INIT_WORK(&dropbehind_work, dropbehind_work_fn); +} + +static void folio_end_dropbehind_irq(struct folio *folio) +{ + unsigned long flags; + + spin_lock_irqsave(&dropbehind_lock, flags); + + /* If there is no space in the folio_batch, skip the invalidation. */ + if (!folio_batch_space(&dropbehind_fbatch)) { + spin_unlock_irqrestore(&dropbehind_lock, flags); + return; + } + + folio_get(folio); + folio_batch_add(&dropbehind_fbatch, folio); + spin_unlock_irqrestore(&dropbehind_lock, flags); + + schedule_work(&dropbehind_work); +} + void folio_end_dropbehind(struct folio *folio) { if (!folio_test_dropbehind(folio)) return; /* - * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, - * but can happen if normal writeback just happens to find dirty folios - * that were created as part of uncached writeback, and that writeback - * would otherwise not need non-IRQ handling. Just skip the - * invalidation in that case. 
+ * Hitting !in_task() can happen for IO completed from IRQ contexts or + * if normal writeback just happens to find dirty folios that were + * created as part of uncached writeback, and that writeback would + * otherwise not need non-IRQ handling. */ if (in_task() && folio_trylock(folio)) { filemap_end_dropbehind(folio); folio_unlock(folio); + return; } + + /* + * In IRQ context we cannot lock the folio or call into the + * invalidation path. Defer to a workqueue. This happens for + * buffer_head-based writeback which runs from bio IRQ context. + */ + if (!in_task()) + folio_end_dropbehind_irq(folio); } EXPORT_SYMBOL_GPL(folio_end_dropbehind); -- 2.39.5 Block device buffered reads and writes already pass through filemap_read() and iomap_file_buffered_write() respectively, both of which handle IOCB_DONTCACHE. Enable RWF_DONTCACHE for block device files by setting FOP_DONTCACHE in def_blk_fops. For CONFIG_BUFFER_HEAD paths, thread the kiocb through block_write_begin() so that buffer_head-based I/O can use DONTCACHE behavior as well. Callers without a kiocb context (e.g. nilfs2 recovery) pass NULL, which preserves the existing behavior. This support is useful for databases that operate on raw block devices, among other userspace applications. 
Reviewed-by: Jan Kara Signed-off-by: Tal Zussman --- block/fops.c | 4 ++-- fs/bfs/file.c | 2 +- fs/buffer.c | 12 ++++++++---- fs/exfat/inode.c | 2 +- fs/ext2/inode.c | 2 +- fs/jfs/inode.c | 2 +- fs/minix/inode.c | 2 +- fs/nilfs2/inode.c | 2 +- fs/nilfs2/recovery.c | 2 +- fs/ntfs3/inode.c | 2 +- fs/omfs/file.c | 2 +- fs/udf/inode.c | 2 +- fs/ufs/inode.c | 2 +- include/linux/buffer_head.h | 5 +++-- 14 files changed, 24 insertions(+), 19 deletions(-) diff --git a/block/fops.c b/block/fops.c index 4d32785b31d9..6bc727f8b252 100644 --- a/block/fops.c +++ b/block/fops.c @@ -505,7 +505,7 @@ static int blkdev_write_begin(const struct kiocb *iocb, unsigned len, struct folio **foliop, void **fsdata) { - return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); + return block_write_begin(iocb, mapping, pos, len, foliop, blkdev_get_block); } static int blkdev_write_end(const struct kiocb *iocb, @@ -967,7 +967,7 @@ const struct file_operations def_blk_fops = { .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, .uring_cmd = blkdev_uring_cmd, - .fop_flags = FOP_BUFFER_RASYNC, + .fop_flags = FOP_BUFFER_RASYNC | FOP_DONTCACHE, }; static __init int blkdev_init(void) diff --git a/fs/bfs/file.c b/fs/bfs/file.c index d33d6bde992b..f2804e38b8a7 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -177,7 +177,7 @@ static int bfs_write_begin(const struct kiocb *iocb, { int ret; - ret = block_write_begin(mapping, pos, len, foliop, bfs_get_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, bfs_get_block); if (unlikely(ret)) bfs_write_failed(mapping, pos + len); diff --git a/fs/buffer.c b/fs/buffer.c index 838c0c571022..33c3580b85d8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2241,14 +2241,18 @@ EXPORT_SYMBOL(block_commit_write); * * The filesystem needs to handle block truncation upon failure. 
*/ -int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - struct folio **foliop, get_block_t *get_block) +int block_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block) { pgoff_t index = pos >> PAGE_SHIFT; + fgf_t fgp_flags = FGP_WRITEBEGIN; struct folio *folio; int status; - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + if (iocb && iocb->ki_flags & IOCB_DONTCACHE) + fgp_flags |= FGP_DONTCACHE; + + folio = __filemap_get_folio(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -2591,7 +2595,7 @@ int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping, (*bytes)++; } - return block_write_begin(mapping, pos, len, foliop, get_block); + return block_write_begin(iocb, mapping, pos, len, foliop, get_block); } EXPORT_SYMBOL(cont_write_begin); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index f9501c3a3666..39d36e8fdfd6 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -456,7 +456,7 @@ static int exfat_write_begin(const struct kiocb *iocb, if (unlikely(exfat_forced_shutdown(mapping->host->i_sb))) return -EIO; - ret = block_write_begin(mapping, pos, len, foliop, exfat_get_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, exfat_get_block); if (ret < 0) exfat_write_failed(mapping, pos+len); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index dbfe9098a124..11aab03de752 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -930,7 +930,7 @@ ext2_write_begin(const struct kiocb *iocb, struct address_space *mapping, { int ret; - ret = block_write_begin(mapping, pos, len, foliop, ext2_get_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, ext2_get_block); if (ret < 0) ext2_write_failed(mapping, pos + len); return ret; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 4709762713ef..ae52db437771 100644 --- a/fs/jfs/inode.c +++ 
b/fs/jfs/inode.c @@ -303,7 +303,7 @@ static int jfs_write_begin(const struct kiocb *iocb, { int ret; - ret = block_write_begin(mapping, pos, len, foliop, jfs_get_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, jfs_get_block); if (unlikely(ret)) jfs_write_failed(mapping, pos + len); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 51ea9bdc813f..9075c0ba2f20 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -465,7 +465,7 @@ static int minix_write_begin(const struct kiocb *iocb, { int ret; - ret = block_write_begin(mapping, pos, len, foliop, minix_get_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, minix_get_block); if (unlikely(ret)) minix_write_failed(mapping, pos + len); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 51bde45d5865..d9d57eeecc5d 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -230,7 +230,7 @@ static int nilfs_write_begin(const struct kiocb *iocb, if (unlikely(err)) return err; - err = block_write_begin(mapping, pos, len, foliop, nilfs_get_block); + err = block_write_begin(iocb, mapping, pos, len, foliop, nilfs_get_block); if (unlikely(err)) { nilfs_write_failed(mapping, pos + len); nilfs_transaction_abort(inode->i_sb); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index a9c61d0492cb..2f5fe44bf736 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -541,7 +541,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, } pos = rb->blkoff << inode->i_blkbits; - err = block_write_begin(inode->i_mapping, pos, blocksize, + err = block_write_begin(NULL, inode->i_mapping, pos, blocksize, &folio, nilfs_get_block); if (unlikely(err)) { loff_t isize = inode->i_size; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 0a9ac5efeb67..8c788feb319e 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -966,7 +966,7 @@ int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, goto out; } - err = block_write_begin(mapping, pos, len, foliop, + 
err = block_write_begin(iocb, mapping, pos, len, foliop, ntfs_get_block_write_begin); out: diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 49a1de5a827f..3bade632e36e 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -317,7 +317,7 @@ static int omfs_write_begin(const struct kiocb *iocb, { int ret; - ret = block_write_begin(mapping, pos, len, foliop, omfs_get_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, omfs_get_block); if (unlikely(ret)) omfs_write_failed(mapping, pos + len); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7fae8002344a..aec9cdc938be 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -259,7 +259,7 @@ static int udf_write_begin(const struct kiocb *iocb, int ret; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - ret = block_write_begin(mapping, pos, len, foliop, + ret = block_write_begin(iocb, mapping, pos, len, foliop, udf_get_block); if (unlikely(ret)) udf_write_failed(mapping, pos + len); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e2b0a35de2a7..dfba985265a8 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -481,7 +481,7 @@ static int ufs_write_begin(const struct kiocb *iocb, { int ret; - ret = block_write_begin(mapping, pos, len, foliop, ufs_getfrag_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, ufs_getfrag_block); if (unlikely(ret)) ufs_write_failed(mapping, pos + len); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index b16b88bfbc3e..4b07dec5f8eb 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -258,8 +258,9 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc); int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); -int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - struct folio **foliop, get_block_t *get_block); +int block_write_begin(const struct 
kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + get_block_t *get_block); int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *); -- 2.39.5