As a first step to facilitate efficient post-eof zeroing in tmpfs, zero
post-eof uptodate folios at swap out time. This ensures that post-eof
ranges are zeroed "on disk" (i.e. analogous to traditional pagecache
writeback) and allows zeroing on file size changes to skip swapped-out
folios rather than swap them back in.

Note that shmem_writeout() already zeroes !uptodate folios, so this
introduces some duplicate logic. We'll clean this up in the next patch.

Signed-off-by: Brian Foster
---
 mm/shmem.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 0a25ee095b86..5fb3c911894f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1577,6 +1577,8 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
 	struct inode *inode = mapping->host;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = DIV_ROUND_UP(i_size, PAGE_SIZE);
 	pgoff_t index;
 	int nr_pages;
 	bool split = false;
@@ -1596,8 +1598,7 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
 	 * (unless fallocate has been used to preallocate beyond EOF).
 	 */
 	if (folio_test_large(folio)) {
-		index = shmem_fallocend(inode,
-			DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
+		index = shmem_fallocend(inode, end_index);
 		if ((index > folio->index && index < folio_next_index(folio)) ||
 		    !IS_ENABLED(CONFIG_THP_SWAP))
 			split = true;
@@ -1647,6 +1648,20 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
 		folio_mark_uptodate(folio);
 	}
 
+	/*
+	 * Ranges beyond EOF must be zeroed at writeout time. This mirrors
+	 * traditional writeback behavior and facilitates zeroing on file size
+	 * changes without having to swap back in.
+	 */
+	if (folio_next_index(folio) >= end_index) {
+		size_t from = offset_in_folio(folio, i_size);
+
+		if (index >= end_index) {
+			folio_zero_segment(folio, 0, folio_size(folio));
+		} else if (from)
+			folio_zero_segment(folio, from, folio_size(folio));
+	}
+
 	if (!folio_alloc_swap(folio)) {
 		bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages);
 		int error;
-- 
2.51.1
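
To make the writeout-time decision above concrete, here is a minimal
userspace sketch of the same arithmetic, with the folio modeled as a
plain buffer. The helper name and PAGE_SIZE value are illustrative, not
kernel code:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL

/* Zero the post-EOF part of a "folio" spanning [pos, pos + size). */
static void zero_post_eof(char *buf, unsigned long pos, unsigned long size,
			  unsigned long i_size)
{
	unsigned long end = pos + size;

	if (i_size <= pos) {
		/* folio lies entirely beyond EOF: zero it all */
		memset(buf, 0, size);
	} else if (i_size < end) {
		/* EOF lands inside the folio: zero from EOF to the end */
		memset(buf + (i_size - pos), 0, end - i_size);
	}
}

int main(void)
{
	char folio[4 * PAGE_SIZE];

	memset(folio, 0xab, sizeof(folio));
	/* i_size = 10 KiB inside a 16 KiB folio starting at offset 0 */
	zero_post_eof(folio, 0, sizeof(folio), 10 * 1024);
	printf("byte 10239: %#x, byte 10240: %#x\n",
	       (unsigned char)folio[10 * 1024 - 1],
	       (unsigned char)folio[10 * 1024]);
	return 0;
}

With i_size at 10 KiB inside a 16 KiB folio, only the 6 KiB tail is
zeroed; a folio that sits wholly beyond EOF is zeroed in full, matching
the index >= end_index branch in the patch.
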
shmem_writeout() zeroes folios that are !uptodate (before marking them
uptodate) or that extend beyond EOF, to preserve data integrity
according to POSIX. This is currently handled in two separate blocks.
Fold the !uptodate zeroing into the post-eof block so all zeroing
happens in one place.

Signed-off-by: Brian Foster
---
 mm/shmem.c | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 5fb3c911894f..7925ced8a05d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1627,25 +1627,20 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
 	 * good idea to continue anyway, once we're pushing into swap. So
 	 * reactivate the folio, and let shmem_fallocate() quit when too many.
 	 */
-	if (!folio_test_uptodate(folio)) {
-		if (inode->i_private) {
-			struct shmem_falloc *shmem_falloc;
-			spin_lock(&inode->i_lock);
-			shmem_falloc = inode->i_private;
-			if (shmem_falloc &&
-			    !shmem_falloc->waitq &&
-			    index >= shmem_falloc->start &&
-			    index < shmem_falloc->next)
-				shmem_falloc->nr_unswapped += nr_pages;
-			else
-				shmem_falloc = NULL;
-			spin_unlock(&inode->i_lock);
-			if (shmem_falloc)
-				goto redirty;
-		}
-		folio_zero_range(folio, 0, folio_size(folio));
-		flush_dcache_folio(folio);
-		folio_mark_uptodate(folio);
+	if (!folio_test_uptodate(folio) && inode->i_private) {
+		struct shmem_falloc *shmem_falloc;
+		spin_lock(&inode->i_lock);
+		shmem_falloc = inode->i_private;
+		if (shmem_falloc &&
+		    !shmem_falloc->waitq &&
+		    index >= shmem_falloc->start &&
+		    index < shmem_falloc->next)
+			shmem_falloc->nr_unswapped += nr_pages;
+		else
+			shmem_falloc = NULL;
+		spin_unlock(&inode->i_lock);
+		if (shmem_falloc)
+			goto redirty;
 	}
 
 	/*
@@ -1653,11 +1648,14 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
 	 * traditional writeback behavior and facilitates zeroing on file size
 	 * changes without having to swap back in.
 	 */
-	if (folio_next_index(folio) >= end_index) {
+	if (!folio_test_uptodate(folio) ||
+	    folio_next_index(folio) >= end_index) {
 		size_t from = offset_in_folio(folio, i_size);
 
-		if (index >= end_index) {
+		if (!folio_test_uptodate(folio) || index >= end_index) {
 			folio_zero_segment(folio, 0, folio_size(folio));
+			flush_dcache_folio(folio);
+			folio_mark_uptodate(folio);
 		} else if (from)
 			folio_zero_segment(folio, from, folio_size(folio));
 	}
-- 
2.51.1
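
After the fold, the block makes one of three choices per folio. A
compact userspace sketch of that decision follows; the enum, the helper
name, and the "from" parameter (standing in for
offset_in_folio(folio, i_size)) are hypothetical:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum zero_action { ZERO_NONE, ZERO_WHOLE, ZERO_TAIL };

static enum zero_action zero_action(bool uptodate, unsigned long index,
				    unsigned long next_index,
				    unsigned long end_index, size_t from)
{
	if (uptodate && next_index < end_index)
		return ZERO_NONE;		/* uptodate, fully below EOF */
	if (!uptodate || index >= end_index)
		return ZERO_WHOLE;		/* also marked uptodate */
	return from ? ZERO_TAIL : ZERO_NONE;	/* EOF may land mid-folio */
}

int main(void)
{
	/* !uptodate folio: zeroed whole, then marked uptodate */
	printf("%d\n", zero_action(false, 3, 4, 10, 0));
	/* uptodate folio straddling a non-page-aligned EOF */
	printf("%d\n", zero_action(true, 9, 11, 10, 2048));
	return 0;
}

Note that ZERO_WHOLE is the only case that also marks the folio
uptodate, which is what lets the old !uptodate block drop its zeroing.
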
POSIX requires that "If the file size is increased, the extended area
shall appear as if it were zero-filled". On tmpfs it is possible to use
mmap to write past EOF, and that data becomes visible instead of zeroes
once the file is extended. This behavior is reproduced by fstests
generic/363.

Most traditional filesystems zero any post-eof portion of a folio at
writeback time or when the file size is extended by truncate or
extending writes. This ensures that the previously post-eof range of
the folio is zeroed before it is exposed to the file.

The tmpfs writeout path has been updated to zero post-eof folio ranges
analogously to traditional writeback. This ensures post-eof ranges are
zeroed "on disk" and allows size extension zeroing to skip over swap
entries, as they are already appropriately zeroed.

To that end, introduce a new zeroing helper for proper zeroing on file
extending operations. This looks up resident folios between the
original and new eof and, for those that are uptodate, zeroes them
before the associated ranges are exposed to the file. This preserves
POSIX semantics and allows generic/363 to pass on tmpfs.

Signed-off-by: Brian Foster
---
 mm/shmem.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 7925ced8a05d..a4aceb474377 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1101,6 +1101,78 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
 	return folio;
 }
 
+/*
+ * Zero a post-EOF range about to be exposed by size extension. Zero from the
+ * current i_size through lend, the latter of which typically refers to the
+ * start offset of an extending operation. Skip swap entries because associated
+ * folios were zeroed at swapout time.
+ */
+static void shmem_zero_eof(struct inode *inode, loff_t lend)
+{
+	struct address_space *mapping = inode->i_mapping;
+	loff_t lstart = i_size_read(inode);
+	pgoff_t index = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	pgoff_t end = lend >> PAGE_SHIFT;
+	struct folio_batch fbatch;
+	struct folio *folio;
+	int i;
+	bool same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
+
+	folio = filemap_lock_folio(mapping, lstart >> PAGE_SHIFT);
+	if (!IS_ERR(folio)) {
+		same_folio = lend < folio_next_pos(folio);
+		index = folio_next_index(folio);
+
+		if (folio_test_uptodate(folio)) {
+			size_t from = offset_in_folio(folio, lstart);
+			size_t len = min_t(loff_t, folio_size(folio) - from,
+					   lend - lstart);
+
+			folio_zero_range(folio, from, len);
+		}
+
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+	if (!same_folio) {
+		folio = filemap_lock_folio(mapping, lend >> PAGE_SHIFT);
+		if (!IS_ERR(folio)) {
+			end = folio->index;
+
+			if (folio_test_uptodate(folio)) {
+				size_t len = lend - folio_pos(folio);
+				folio_zero_range(folio, 0, len);
+			}
+
+			folio_unlock(folio);
+			folio_put(folio);
+		}
+	}
+
+	/*
+	 * Zero uptodate folios fully within the target range. Uptodate folios
+	 * beyond EOF are generally unexpected, but can exist if a larger
+	 * falloc'd and uptodate EOF folio is split.
+	 */
+	folio_batch_init(&fbatch);
+	while (index < end) {
+		if (!filemap_get_folios(mapping, &index, end - 1, &fbatch))
+			break;
+		for (i = 0; i < folio_batch_count(&fbatch); i++) {
+			folio = fbatch.folios[i];
+
+			folio_lock(folio);
+			if (folio_test_uptodate(folio) &&
+			    folio->mapping == mapping) {
+				folio_zero_segment(folio, 0, folio_size(folio));
+			}
+			folio_unlock(folio);
+		}
+		folio_batch_release(&fbatch);
+	}
+}
+
 /*
  * Remove range of pages and swap entries from page cache, and free them.
  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
@@ -1331,6 +1403,8 @@ static int shmem_setattr(struct mnt_idmap *idmap,
 						  oldsize, newsize);
 			if (error)
 				return error;
+			if (newsize > oldsize)
+				shmem_zero_eof(inode, newsize);
 			i_size_write(inode, newsize);
 			update_mtime = true;
 		} else {
@@ -3512,6 +3586,8 @@ static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ret = file_update_time(file);
 	if (ret)
 		goto unlock;
+	if (iocb->ki_pos > i_size_read(inode))
+		shmem_zero_eof(inode, iocb->ki_pos);
 	ret = generic_perform_write(iocb, from);
 unlock:
 	inode_unlock(inode);
@@ -3844,8 +3920,10 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		cond_resched();
 	}
 
-	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
+		shmem_zero_eof(inode, offset + len);
 		i_size_write(inode, offset + len);
+	}
 undone:
 	spin_lock(&inode->i_lock);
 	inode->i_private = NULL;
-- 
2.51.1
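
For context, a rough userspace reproducer of the semantic this series
enforces, loosely modeled on what generic/363 exercises (illustrative
only, not the actual test; assumes /dev/shm is a tmpfs mount):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/dev/shm/zero-eof-test";
	char buf[4096];
	int fd = open(path, O_CREAT | O_TRUNC | O_RDWR, 0600);
	char *map;

	if (fd < 0)
		return 1;
	/* one byte of data; EOF lands at offset 1 within the first page */
	if (write(fd, "x", 1) != 1)
		return 1;

	map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;
	/* dirty the post-EOF portion of the page through the mapping */
	memset(map + 1, 0xff, 4095);
	munmap(map, 4096);

	/* extend the file; POSIX says the new range must read as zeroes */
	if (ftruncate(fd, 4096) != 0)
		return 1;
	if (pread(fd, buf, 4096, 0) != 4096)
		return 1;
	printf("byte 1 after extension: %#x (expect 0)\n",
	       (unsigned char)buf[1]);

	close(fd);
	unlink(path);
	return 0;
}

Without the above zeroing, the final read can observe the 0xff bytes
written through the mapping; with post-eof zeroing at writeout plus
shmem_zero_eof() at size extension, the extended range reads back as
zeroes.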