Move the VFS-level generic write path out of mm/filemap.c into fs/read_write.c next to the just-relocated read path: - generic_file_write_iter() - __generic_file_write_iter() - generic_file_direct_write() - generic_perform_write() - kiocb_invalidate_pages() - kiocb_invalidate_post_direct_write() - dio_warn_stale_pagecache() The kiocb_invalidate_* prototypes move from include/linux/pagemap.h to include/linux/fs.h, joining kiocb_write_and_wait() and the other generic read/write declarations. Drop extern from the three generic_file_* prototypes in include/linux/fs.h that still carried it. Reflow the generic_file_direct_write() definition to fit on one line. Signed-off-by: Tal Zussman --- fs/read_write.c | 276 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 8 +- include/linux/pagemap.h | 2 - mm/filemap.c | 277 ------------------------------------------------ 4 files changed, 281 insertions(+), 282 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 59ceea85c163..cea5f79fdacf 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1071,6 +1071,282 @@ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } EXPORT_SYMBOL(generic_file_read_iter); +int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + + return filemap_invalidate_pages(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1, + iocb->ki_flags & IOCB_NOWAIT); +} +EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); + +/* + * Warn about a page cache invalidation failure during a direct I/O write. + */ +static void dio_warn_stale_pagecache(struct file *filp) +{ + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); + char pathname[128]; + char *path; + + errseq_set(&filp->f_mapping->wb_err, -EIO); + if (__ratelimit(&_rs)) { + path = file_path(filp, pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + pr_crit("Page cache invalidation failure on direct I/O. 
Possible data corruption due to collision with buffered I/O!\n"); + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, + current->comm); + } +} + +void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + + if (mapping->nrpages && + invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, + (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) + dio_warn_stale_pagecache(iocb->ki_filp); +} + +ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + size_t write_len = iov_iter_count(from); + ssize_t written; + + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. + */ + written = kiocb_invalidate_pages(iocb, write_len); + if (written) { + if (written == -EBUSY) + return 0; + return written; + } + + written = mapping->a_ops->direct_IO(iocb, from); + + /* + * Finally, try again to invalidate clean pages which might have been + * cached by non-direct readahead, or faulted in by get_user_pages() + * if the source of the write was an mmap'ed region of the file + * we're writing. Either one is a pretty crazy thing to do, + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... + * + * Most of the time we do not need this since dio_complete() will do + * the invalidation for us. However there are some file systems that + * do not end up with dio_complete() being called, so let's not break + * them by removing it completely. + * + * Noticeable example is a blkdev_direct_IO(). + * + * Skip invalidation for async writes or if mapping has no pages. 
+ */ + if (written > 0) { + struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos; + + kiocb_invalidate_post_direct_write(iocb, written); + pos += written; + write_len -= written; + if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + iocb->ki_pos = pos; + } + if (written != -EIOCBQUEUED) + iov_iter_revert(from, write_len - iov_iter_count(from)); + return written; +} +EXPORT_SYMBOL(generic_file_direct_write); + +ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) +{ + struct file *file = iocb->ki_filp; + loff_t pos = iocb->ki_pos; + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + size_t chunk = mapping_max_folio_size(mapping); + long status = 0; + ssize_t written = 0; + + do { + struct folio *folio; + size_t offset; /* Offset into folio */ + size_t bytes; /* Bytes to write to folio */ + size_t copied; /* Bytes copied from user */ + void *fsdata = NULL; + + bytes = iov_iter_count(i); +retry: + offset = pos & (chunk - 1); + bytes = min(chunk - offset, bytes); + balance_dirty_pages_ratelimited(mapping); + + if (fatal_signal_pending(current)) { + status = -EINTR; + break; + } + + status = a_ops->write_begin(iocb, mapping, pos, bytes, + &folio, &fsdata); + if (unlikely(status < 0)) + break; + + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) + bytes = folio_size(folio) - offset; + + if (mapping_writably_mapped(mapping)) + flush_dcache_folio(folio); + + /* + * Faults here on mmap()s can recurse into arbitrary + * filesystem code. Lots of locks are held that can + * deadlock. Use an atomic copy to avoid deadlocking + * in page fault handling. 
+ */ + copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); + flush_dcache_folio(folio); + + status = a_ops->write_end(iocb, mapping, pos, bytes, copied, + folio, fsdata); + if (unlikely(status != copied)) { + iov_iter_revert(i, copied - max(status, 0L)); + if (unlikely(status < 0)) + break; + } + cond_resched(); + + if (unlikely(status == 0)) { + /* + * A short copy made ->write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. + */ + if (chunk > PAGE_SIZE) + chunk /= 2; + if (copied) { + bytes = copied; + goto retry; + } + + /* + * 'folio' is now unlocked and faults on it can be + * handled. Ensure forward progress by trying to + * fault it in now. + */ + if (fault_in_iov_iter_readable(i, bytes) == bytes) { + status = -EFAULT; + break; + } + } else { + pos += status; + written += status; + } + } while (iov_iter_count(i)); + + if (!written) + return status; + iocb->ki_pos += written; + return written; +} +EXPORT_SYMBOL(generic_perform_write); + +/** + * __generic_file_write_iter - write data to a file + * @iocb: IO state structure (file, offset, etc.) + * @from: iov_iter with data to write + * + * This function does all the work needed for actually writing data to a + * file. It does all basic checks, removes SUID from the file, updates + * modification times and calls proper subroutines depending on whether we + * do direct IO or a standard buffered write. + * + * It expects i_rwsem to be grabbed unless we work on a block device or similar + * object which does not need locking at all. + * + * This function does *not* take care of syncing data in case of O_SYNC write. + * A caller has to handle it. This is mainly due to the fact that we want to + * avoid syncing under i_rwsem. 
+ * + * Return: + * * number of bytes written, even for truncated writes + * * negative error code if no data has been written at all + */ +ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + ret = file_remove_privs(file); + if (ret) + return ret; + + ret = file_update_time(file); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = generic_file_direct_write(iocb, from); + /* + * If the write stopped short of completing, fall back to + * buffered writes. Some filesystems do this for writes to + * holes, for example. For DAX files, a buffered write will + * not succeed (even if it did, DAX does not handle dirty + * page-cache pages correctly). + */ + if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) + return ret; + return direct_write_fallback(iocb, from, ret, + generic_perform_write(iocb, from)); + } + + return generic_perform_write(iocb, from); +} +EXPORT_SYMBOL(__generic_file_write_iter); + +/** + * generic_file_write_iter - write data to a file + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * This is a wrapper around __generic_file_write_iter() to be used by most + * filesystems. It takes care of syncing the file in case of O_SYNC file + * and acquires i_rwsem as needed. 
+ * Return: + * * negative error code if no data has been written at all of + * vfs_fsync_range() failed for a synchronous write + * * number of bytes written, even for truncated writes + */ +ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret > 0) + ret = __generic_file_write_iter(iocb, from); + inode_unlock(inode); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} +EXPORT_SYMBOL(generic_file_write_iter); + static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { diff --git a/include/linux/fs.h b/include/linux/fs.h index c0151ced8e7a..6cfb9e46bc37 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3057,9 +3057,11 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to, ssize_t already_read); ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); int kiocb_write_and_wait(struct kiocb *iocb, size_t count); -extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); -extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); -extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); +int kiocb_invalidate_pages(struct kiocb *iocb, size_t count); +void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count); +ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); +ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); +ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); ssize_t generic_perform_write(struct kiocb *, struct iov_iter *); ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, ssize_t direct_written, ssize_t buffered_written); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 
46cefd552a51..b7c2dc8076ab 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -31,8 +31,6 @@ static inline void invalidate_remote_inode(struct inode *inode) int invalidate_inode_pages2(struct address_space *mapping); int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); -int kiocb_invalidate_pages(struct kiocb *iocb, size_t count); -void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count); int filemap_invalidate_pages(struct address_space *mapping, loff_t pos, loff_t end, bool nowait); diff --git a/mm/filemap.c b/mm/filemap.c index db7c53cd681b..284c0296a011 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2276,17 +2276,6 @@ int filemap_invalidate_pages(struct address_space *mapping, end >> PAGE_SHIFT); } -int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - - return filemap_invalidate_pages(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1, - iocb->ki_flags & IOCB_NOWAIT); -} -EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); - - /* * Splice subpages from a folio into a pipe. */ @@ -3500,272 +3489,6 @@ struct page *read_cache_page_gfp(struct address_space *mapping, } EXPORT_SYMBOL(read_cache_page_gfp); -/* - * Warn about a page cache invalidation failure during a direct I/O write. - */ -static void dio_warn_stale_pagecache(struct file *filp) -{ - static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); - char pathname[128]; - char *path; - - errseq_set(&filp->f_mapping->wb_err, -EIO); - if (__ratelimit(&_rs)) { - path = file_path(filp, pathname, sizeof(pathname)); - if (IS_ERR(path)) - path = "(unknown)"; - pr_crit("Page cache invalidation failure on direct I/O. 
Possible data corruption due to collision with buffered I/O!\n"); - pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, - current->comm); - } -} - -void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - - if (mapping->nrpages && - invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, - (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) - dio_warn_stale_pagecache(iocb->ki_filp); -} - -ssize_t -generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - size_t write_len = iov_iter_count(from); - ssize_t written; - - /* - * If a page can not be invalidated, return 0 to fall back - * to buffered write. - */ - written = kiocb_invalidate_pages(iocb, write_len); - if (written) { - if (written == -EBUSY) - return 0; - return written; - } - - written = mapping->a_ops->direct_IO(iocb, from); - - /* - * Finally, try again to invalidate clean pages which might have been - * cached by non-direct readahead, or faulted in by get_user_pages() - * if the source of the write was an mmap'ed region of the file - * we're writing. Either one is a pretty crazy thing to do, - * so we don't support it 100%. If this invalidation - * fails, tough, the write still worked... - * - * Most of the time we do not need this since dio_complete() will do - * the invalidation for us. However there are some file systems that - * do not end up with dio_complete() being called, so let's not break - * them by removing it completely. - * - * Noticeable example is a blkdev_direct_IO(). - * - * Skip invalidation for async writes or if mapping has no pages. 
- */ - if (written > 0) { - struct inode *inode = mapping->host; - loff_t pos = iocb->ki_pos; - - kiocb_invalidate_post_direct_write(iocb, written); - pos += written; - write_len -= written; - if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - iocb->ki_pos = pos; - } - if (written != -EIOCBQUEUED) - iov_iter_revert(from, write_len - iov_iter_count(from)); - return written; -} -EXPORT_SYMBOL(generic_file_direct_write); - -ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) -{ - struct file *file = iocb->ki_filp; - loff_t pos = iocb->ki_pos; - struct address_space *mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - size_t chunk = mapping_max_folio_size(mapping); - long status = 0; - ssize_t written = 0; - - do { - struct folio *folio; - size_t offset; /* Offset into folio */ - size_t bytes; /* Bytes to write to folio */ - size_t copied; /* Bytes copied from user */ - void *fsdata = NULL; - - bytes = iov_iter_count(i); -retry: - offset = pos & (chunk - 1); - bytes = min(chunk - offset, bytes); - balance_dirty_pages_ratelimited(mapping); - - if (fatal_signal_pending(current)) { - status = -EINTR; - break; - } - - status = a_ops->write_begin(iocb, mapping, pos, bytes, - &folio, &fsdata); - if (unlikely(status < 0)) - break; - - offset = offset_in_folio(folio, pos); - if (bytes > folio_size(folio) - offset) - bytes = folio_size(folio) - offset; - - if (mapping_writably_mapped(mapping)) - flush_dcache_folio(folio); - - /* - * Faults here on mmap()s can recurse into arbitrary - * filesystem code. Lots of locks are held that can - * deadlock. Use an atomic copy to avoid deadlocking - * in page fault handling. 
- */ - copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); - flush_dcache_folio(folio); - - status = a_ops->write_end(iocb, mapping, pos, bytes, copied, - folio, fsdata); - if (unlikely(status != copied)) { - iov_iter_revert(i, copied - max(status, 0L)); - if (unlikely(status < 0)) - break; - } - cond_resched(); - - if (unlikely(status == 0)) { - /* - * A short copy made ->write_end() reject the - * thing entirely. Might be memory poisoning - * halfway through, might be a race with munmap, - * might be severe memory pressure. - */ - if (chunk > PAGE_SIZE) - chunk /= 2; - if (copied) { - bytes = copied; - goto retry; - } - - /* - * 'folio' is now unlocked and faults on it can be - * handled. Ensure forward progress by trying to - * fault it in now. - */ - if (fault_in_iov_iter_readable(i, bytes) == bytes) { - status = -EFAULT; - break; - } - } else { - pos += status; - written += status; - } - } while (iov_iter_count(i)); - - if (!written) - return status; - iocb->ki_pos += written; - return written; -} -EXPORT_SYMBOL(generic_perform_write); - -/** - * __generic_file_write_iter - write data to a file - * @iocb: IO state structure (file, offset, etc.) - * @from: iov_iter with data to write - * - * This function does all the work needed for actually writing data to a - * file. It does all basic checks, removes SUID from the file, updates - * modification times and calls proper subroutines depending on whether we - * do direct IO or a standard buffered write. - * - * It expects i_rwsem to be grabbed unless we work on a block device or similar - * object which does not need locking at all. - * - * This function does *not* take care of syncing data in case of O_SYNC write. - * A caller has to handle it. This is mainly due to the fact that we want to - * avoid syncing under i_rwsem. 
- * - * Return: - * * number of bytes written, even for truncated writes - * * negative error code if no data has been written at all - */ -ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - ret = file_remove_privs(file); - if (ret) - return ret; - - ret = file_update_time(file); - if (ret) - return ret; - - if (iocb->ki_flags & IOCB_DIRECT) { - ret = generic_file_direct_write(iocb, from); - /* - * If the write stopped short of completing, fall back to - * buffered writes. Some filesystems do this for writes to - * holes, for example. For DAX files, a buffered write will - * not succeed (even if it did, DAX does not handle dirty - * page-cache pages correctly). - */ - if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) - return ret; - return direct_write_fallback(iocb, from, ret, - generic_perform_write(iocb, from)); - } - - return generic_perform_write(iocb, from); -} -EXPORT_SYMBOL(__generic_file_write_iter); - -/** - * generic_file_write_iter - write data to a file - * @iocb: IO state structure - * @from: iov_iter with data to write - * - * This is a wrapper around __generic_file_write_iter() to be used by most - * filesystems. It takes care of syncing the file in case of O_SYNC file - * and acquires i_rwsem as needed. 
- * Return: - * * negative error code if no data has been written at all of - * vfs_fsync_range() failed for a synchronous write - * * number of bytes written, even for truncated writes - */ -ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - ssize_t ret; - - inode_lock(inode); - ret = generic_write_checks(iocb, from); - if (ret > 0) - ret = __generic_file_write_iter(iocb, from); - inode_unlock(inode); - - if (ret > 0) - ret = generic_write_sync(iocb, ret); - return ret; -} -EXPORT_SYMBOL(generic_file_write_iter); /** * filemap_release_folio() - Release fs-specific metadata on a folio. -- 2.39.5