Add special handling of the PG_atomic flag to the iomap buffered write
path.

To flag an iomap iter for an atomic write, set IOMAP_ATOMIC. For a folio
associated with a write which has IOMAP_ATOMIC set, set PG_atomic.
Otherwise, when IOMAP_ATOMIC is unset, clear PG_atomic.

This means that an "atomic" folio which has not yet been written back
can lose its "atomicity": if userspace issues a write with RWF_ATOMIC
set and then another write to the same folio with RWF_ATOMIC unset, that
folio is not written back atomically. Such a scenario is considered a
userspace usage error.

To ensure that a buffered atomic write is written back atomically when
the write syscall returns, RWF_SYNC or similar needs to be used (in
conjunction with RWF_ATOMIC).

Only a single BIO should ever be submitted for an atomic write. So
modify iomap_add_to_ioend() to ensure that we don't try to write back an
atomic folio as part of a larger mixed-atomicity BIO.

In iomap_alloc_ioend(), handle an atomic write by setting REQ_ATOMIC for
the allocated BIO. When a folio is written back, clear PG_atomic again,
as it is no longer required.

Currently, RWF_ATOMIC with buffered IO is limited to single block size
writes and has 2 main restrictions:

 1. Only blocksize == pagesize is supported.
 2. Writes where the user buffer is not aligned to PAGE_SIZE are not
    supported.

For more details, refer to the comment in generic_atomic_write_valid().

Co-developed-by: John Garry
Signed-off-by: John Garry
Signed-off-by: Ojaswin Mujoo
---
 fs/iomap/buffered-io.c | 48 ++++++++++++++++++++++++++++++++++++------
 fs/iomap/ioend.c       | 18 ++++++++++++----
 fs/read_write.c        | 34 ++++++++++++++++++++++++++++--
 include/linux/iomap.h  |  2 ++
 4 files changed, 89 insertions(+), 13 deletions(-)
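
Notes: as a rough usage illustration (not applied by this patch), a
buffered atomic write that is also written back atomically before the
syscall returns could look like the sketch below. It assumes a 4k
block/page size and a filesystem that supports RWF_ATOMIC for buffered
IO; the path "file.dat", the offset and the buffer size are placeholder
values, and the RWF_ATOMIC fallback define (value taken from
include/uapi/linux/fs.h) is only needed with older userspace headers.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* from include/uapi/linux/fs.h */
#endif

int main(void)
{
	struct iovec iov;
	void *buf;
	ssize_t ret;
	int fd;

	fd = open("file.dat", O_RDWR);		/* placeholder path */
	if (fd < 0)
		return 1;

	/* Restriction 2: the user buffer must be PAGE_SIZE aligned. */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 0xab, 4096);

	/* Restriction 1: a single block, with blocksize == pagesize. */
	iov.iov_base = buf;
	iov.iov_len = 4096;

	/* RWF_SYNC ensures the atomic write is written back on return. */
	ret = pwritev2(fd, &iov, 1, 0, RWF_ATOMIC | RWF_SYNC);
	if (ret < 0)
		perror("pwritev2");

	free(buf);
	close(fd);
	return ret == 4096 ? 0 : 1;
}

The page-aligned buffer and the single block-sized iovec match the two
restrictions listed above.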
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index f099c086cbe8..947c76c2688a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -850,11 +850,13 @@ static int iomap_write_begin(struct iomap_iter *iter,
 {
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
 	loff_t pos;
-	u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
+	u64 orig_len = min_t(u64, SIZE_MAX, iomap_length(iter));
+	u64 len;
 	struct folio *folio;
 	int status = 0;
+	bool is_atomic = iter->flags & IOMAP_ATOMIC;
 
-	len = min_not_zero(len, *plen);
+	len = min_not_zero(orig_len, *plen);
 	*foliop = NULL;
 	*plen = 0;
 
@@ -922,6 +924,11 @@ static int iomap_write_begin(struct iomap_iter *iter,
 	if (unlikely(status))
 		goto out_unlock;
 
+	if (is_atomic && (len != orig_len)) {
+		status = -EINVAL;
+		goto out_unlock;
+	}
+
 	*foliop = folio;
 	*plen = len;
 	return 0;
@@ -931,7 +938,7 @@ static int iomap_write_begin(struct iomap_iter *iter,
 	return status;
 }
 
-static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
+static bool __iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
 		size_t copied, struct folio *folio)
 {
 	flush_dcache_folio(folio);
@@ -951,7 +958,27 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 		return false;
 	iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
 	iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
-	filemap_dirty_folio(inode->i_mapping, folio);
+	filemap_dirty_folio(iter->inode->i_mapping, folio);
+
+	/*
+	 * Policy: a non-atomic write over a previously atomic range makes
+	 * the range non-atomic. Handle this here.
+	 */
+	if (iter->flags & IOMAP_ATOMIC) {
+		if (copied < len) {
+			/*
+			 * A short atomic write is only okay as long as nothing
+			 * is written at all. If we have a partial write, there
+			 * is a bug in our code.
+			 */
+			WARN_ON_ONCE(copied != 0);
+
+			return false;
+		}
+		folio_set_atomic(folio);
+	} else
+		folio_clear_atomic(folio);
+
 	return true;
 }
 
@@ -997,7 +1024,7 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
 		return bh_written == copied;
 	}
 
-	return __iomap_write_end(iter->inode, pos, len, copied, folio);
+	return __iomap_write_end(iter, pos, len, copied, folio);
 }
 
 static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i,
@@ -1124,6 +1151,8 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 		iter.flags |= IOMAP_NOWAIT;
 	if (iocb->ki_flags & IOCB_DONTCACHE)
 		iter.flags |= IOMAP_DONTCACHE;
+	if (iocb->ki_flags & IOCB_ATOMIC)
+		iter.flags |= IOMAP_ATOMIC;
 
 	while ((ret = iomap_iter(&iter, ops)) > 0)
 		iter.status = iomap_write_iter(&iter, i, write_ops);
@@ -1588,6 +1617,7 @@ static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
 	} else {
 		WARN_ON_ONCE(!folio_test_uptodate(folio));
 		folio_mark_dirty(folio);
+		folio_clear_atomic(folio);
 	}
 
 	return iomap_iter_advance(iter, length);
@@ -1642,8 +1672,10 @@ void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
 	WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
 	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
 
-	if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
+	if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) {
+		folio_clear_atomic(folio);
 		folio_end_writeback(folio);
+	}
 }
 EXPORT_SYMBOL_GPL(iomap_finish_folio_write);
 
@@ -1807,8 +1839,10 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
 		if (atomic_dec_and_test(&ifs->write_bytes_pending))
 			folio_end_writeback(folio);
 	} else {
-		if (!wb_pending)
+		if (!wb_pending) {
+			folio_clear_atomic(folio);
 			folio_end_writeback(folio);
+		}
 	}
 	mapping_set_error(inode->i_mapping, error);
 	return error;
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index b49fa75eab26..c129a695ceca 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -98,13 +98,17 @@ int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error)
 EXPORT_SYMBOL_GPL(iomap_ioend_writeback_submit);
 
 static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
-		loff_t pos, u16 ioend_flags)
+		loff_t pos, u16 ioend_flags,
+		bool atomic)
 {
 	struct bio *bio;
+	blk_opf_t opf = REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc);
+
+	if (atomic)
+		opf |= REQ_ATOMIC;
 
 	bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
-			REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc),
-			GFP_NOFS, &iomap_ioend_bioset);
+			opf, GFP_NOFS, &iomap_ioend_bioset);
 	bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
 	bio->bi_write_hint = wpc->inode->i_write_hint;
 	wbc_init_bio(wpc->wbc, bio);
@@ -122,6 +126,9 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
 	if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
 	    (ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
 		return false;
+	if ((ioend_flags & IOMAP_IOEND_ATOMIC) ||
+	    (ioend->io_flags & IOMAP_IOEND_ATOMIC))
+		return false;
 	if (pos != ioend->io_offset + ioend->io_size)
 		return false;
 	if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
@@ -156,6 +163,7 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
 	unsigned int ioend_flags = 0;
 	unsigned int map_len = min_t(u64, dirty_len,
 			wpc->iomap.offset + wpc->iomap.length - pos);
+	bool is_atomic = folio_test_atomic(folio);
 	int error;
 
 	trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap);
@@ -180,6 +188,8 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
 		ioend_flags |= IOMAP_IOEND_DONTCACHE;
 	if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
 		ioend_flags |= IOMAP_IOEND_BOUNDARY;
+	if (is_atomic)
+		ioend_flags |= IOMAP_IOEND_ATOMIC;
 
 	if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
 new_ioend:
@@ -188,7 +198,7 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
 			if (error)
 				return error;
 		}
-		wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags);
+		wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags, is_atomic);
 	}
 
 	if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
diff --git a/fs/read_write.c b/fs/read_write.c
index 833bae068770..37546aa40f0d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1802,6 +1802,8 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
 
 int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
 {
+	struct super_block *sb = iocb->ki_filp->f_mapping->host->i_sb;
+
 	size_t len = iov_iter_count(iter);
 
 	if (!iter_is_ubuf(iter))
@@ -1813,8 +1815,36 @@ int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
 	if (!IS_ALIGNED(iocb->ki_pos, len))
 		return -EINVAL;
 
-	if (!(iocb->ki_flags & IOCB_DIRECT))
-		return -EOPNOTSUPP;
+	if (!(iocb->ki_flags & IOCB_DIRECT)) {
+		/* Some restrictions apply to buffered IO */
+
+		/*
+		 * We only support block size == page size right now. This is
+		 * to avoid the following:
+		 * 1. A 4kb block atomic write marks the complete 64kb folio
+		 *    as atomic.
+		 * 2. Other writes dirty the whole 64kb folio.
+		 * 3. Writeback sees the whole folio dirty and atomic and
+		 *    tries to send a 64kb atomic write, which might exceed
+		 *    the allowed size and fail.
+		 *
+		 * Once we support sub-page atomic write tracking, we can
+		 * remove this restriction.
+		 */
+		if (sb->s_blocksize != PAGE_SIZE)
+			return -EOPNOTSUPP;
+
+		/*
+		 * If the user buffer of an atomic write crosses a page
+		 * boundary, there is a possibility of a short write, e.g. if
+		 * a user page could not be faulted or got reclaimed before
+		 * the copy operation. For now, don't allow such a scenario by
+		 * ensuring the user buffer is page aligned.
+		 */
+		if (!PAGE_ALIGNED(iov_iter_alignment(iter)))
+			return -EOPNOTSUPP;
+
+	}
 
 	return 0;
 }
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 8b1ac08c7474..693f3e5ad03c 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -390,6 +390,8 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
 #define IOMAP_IOEND_DIRECT		(1U << 3)
 /* is DONTCACHE I/O */
 #define IOMAP_IOEND_DONTCACHE		(1U << 4)
+/* is atomic I/O. These are never merged */
+#define IOMAP_IOEND_ATOMIC		(1U << 5)
 
 /*
  * Flags that if set on either ioend prevent the merge of two ioends.
-- 
2.51.0