Block device buffered reads and writes already pass through filemap_read() and iomap_file_buffered_write() respectively, both of which handle IOCB_DONTCACHE. Enable RWF_DONTCACHE for block device files by setting FOP_DONTCACHE in def_blk_fops. For CONFIG_BUFFER_HEAD paths, thread the kiocb through block_write_begin() so that buffer_head-based I/O can use DONTCACHE behavior as well. Callers without a kiocb context (e.g. nilfs2 recovery) pass NULL, which preserves the existing behavior. This support is useful for databases that operate on raw block devices, among other userspace applications. Signed-off-by: Tal Zussman --- This is based on v6.19. Please let me know if there's a different tree I should base this on. I wasn't sure if the block_write_begin() changes were necessary for block device support if CONFIG_BUFFER_HEAD is set (hence the RFC tag). I can remove those if they're not necessary. --- block/fops.c | 4 ++-- fs/bfs/file.c | 2 +- fs/buffer.c | 12 ++++++++---- fs/exfat/inode.c | 2 +- fs/ext2/inode.c | 2 +- fs/jfs/inode.c | 2 +- fs/minix/inode.c | 2 +- fs/nilfs2/inode.c | 2 +- fs/nilfs2/recovery.c | 2 +- fs/ntfs3/inode.c | 2 +- fs/omfs/file.c | 2 +- fs/udf/inode.c | 2 +- fs/ufs/inode.c | 2 +- include/linux/buffer_head.h | 5 +++-- 14 files changed, 24 insertions(+), 19 deletions(-) diff --git a/block/fops.c b/block/fops.c index 4d32785b31d9..6bc727f8b252 100644 --- a/block/fops.c +++ b/block/fops.c @@ -505,7 +505,7 @@ static int blkdev_write_begin(const struct kiocb *iocb, unsigned len, struct folio **foliop, void **fsdata) { - return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); + return block_write_begin(iocb, mapping, pos, len, foliop, blkdev_get_block); } static int blkdev_write_end(const struct kiocb *iocb, @@ -967,7 +967,7 @@ const struct file_operations def_blk_fops = { .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, .uring_cmd = blkdev_uring_cmd, - .fop_flags = FOP_BUFFER_RASYNC, + .fop_flags = FOP_BUFFER_RASYNC | FOP_DONTCACHE, }; static __init int blkdev_init(void) diff --git a/fs/bfs/file.c b/fs/bfs/file.c index d33d6bde992b..f2804e38b8a7 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -177,7 +177,7 @@ static int bfs_write_begin(const struct kiocb *iocb, { int ret; - ret = block_write_begin(mapping, pos, len, foliop, bfs_get_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, bfs_get_block); if (unlikely(ret)) bfs_write_failed(mapping, pos + len); diff --git a/fs/buffer.c b/fs/buffer.c index 838c0c571022..33c3580b85d8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2241,14 +2241,18 @@ EXPORT_SYMBOL(block_commit_write); * * The filesystem needs to handle block truncation upon failure. */ -int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - struct folio **foliop, get_block_t *get_block) +int block_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block) { pgoff_t index = pos >> PAGE_SHIFT; + fgf_t fgp_flags = FGP_WRITEBEGIN; struct folio *folio; int status; - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + if (iocb && iocb->ki_flags & IOCB_DONTCACHE) + fgp_flags |= FGP_DONTCACHE; + + folio = __filemap_get_folio(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -2591,7 +2595,7 @@ int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping, (*bytes)++; } - return block_write_begin(mapping, pos, len, foliop, get_block); + return block_write_begin(iocb, mapping, pos, len, foliop, get_block); } EXPORT_SYMBOL(cont_write_begin); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index f9501c3a3666..39d36e8fdfd6 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -456,7 +456,7 @@ static int exfat_write_begin(const struct kiocb *iocb, if (unlikely(exfat_forced_shutdown(mapping->host->i_sb))) return -EIO; - ret = block_write_begin(mapping, pos, len, foliop, exfat_get_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, exfat_get_block); if (ret < 0) exfat_write_failed(mapping, pos+len); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index dbfe9098a124..11aab03de752 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -930,7 +930,7 @@ ext2_write_begin(const struct kiocb *iocb, struct address_space *mapping, { int ret; - ret = block_write_begin(mapping, pos, len, foliop, ext2_get_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, ext2_get_block); if (ret < 0) ext2_write_failed(mapping, pos + len); return ret; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 4709762713ef..ae52db437771 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -303,7 +303,7 @@ static int jfs_write_begin(const struct kiocb *iocb, { int ret; - ret = block_write_begin(mapping, pos, len, foliop, jfs_get_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, jfs_get_block); if (unlikely(ret)) jfs_write_failed(mapping, pos + len); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 51ea9bdc813f..9075c0ba2f20 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -465,7 +465,7 @@ static int minix_write_begin(const struct kiocb *iocb, { int ret; - ret = block_write_begin(mapping, pos, len, foliop, minix_get_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, minix_get_block); if (unlikely(ret)) minix_write_failed(mapping, pos + len); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 51bde45d5865..d9d57eeecc5d 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -230,7 +230,7 @@ static int nilfs_write_begin(const struct kiocb *iocb, if (unlikely(err)) return err; - err = block_write_begin(mapping, pos, len, foliop, nilfs_get_block); + err = block_write_begin(iocb, mapping, pos, len, foliop, nilfs_get_block); if (unlikely(err)) { nilfs_write_failed(mapping, pos + len); nilfs_transaction_abort(inode->i_sb); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index a9c61d0492cb..2f5fe44bf736 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -541,7 +541,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, } pos = rb->blkoff << inode->i_blkbits; - err = block_write_begin(inode->i_mapping, pos, blocksize, + err = block_write_begin(NULL, inode->i_mapping, pos, blocksize, &folio, nilfs_get_block); if (unlikely(err)) { loff_t isize = inode->i_size; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 0a9ac5efeb67..8c788feb319e 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -966,7 +966,7 @@ int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, goto out; } - err = block_write_begin(mapping, pos, len, foliop, + err = block_write_begin(iocb, mapping, pos, len, foliop, ntfs_get_block_write_begin); out: diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 49a1de5a827f..3bade632e36e 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -317,7 +317,7 @@ static int omfs_write_begin(const struct kiocb *iocb, { int ret; - ret = block_write_begin(mapping, pos, len, foliop, omfs_get_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, omfs_get_block); if (unlikely(ret)) omfs_write_failed(mapping, pos + len); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7fae8002344a..aec9cdc938be 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -259,7 +259,7 @@ static int udf_write_begin(const struct kiocb *iocb, int ret; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - ret = block_write_begin(mapping, pos, len, foliop, + ret = block_write_begin(iocb, mapping, pos, len, foliop, udf_get_block); if (unlikely(ret)) udf_write_failed(mapping, pos + len); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e2b0a35de2a7..dfba985265a8 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -481,7 +481,7 @@ static int ufs_write_begin(const struct kiocb *iocb, { int ret; - ret = block_write_begin(mapping, pos, len, foliop, ufs_getfrag_block); + ret = block_write_begin(iocb, mapping, pos, len, foliop, ufs_getfrag_block); if (unlikely(ret)) ufs_write_failed(mapping, pos + len); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index b16b88bfbc3e..4b07dec5f8eb 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -258,8 +258,9 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc); int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); -int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - struct folio **foliop, get_block_t *get_block); +int block_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + get_block_t *get_block); int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *); --- base-commit: 05f7e89ab9731565d8a62e3b5d1ec206485eeb0b change-id: 20260218-blk-dontcache-338133dd045e Best regards, -- Tal Zussman