From: Chi Zhiling Add a new sgp_type SGP_GET which is similar to SGP_READ but returns the folio unlocked with an increased refcount. This eliminates the lock/unlock overhead for read-only operations. SGP_GET skips folio lock and mapping check, suitable only for short-lived access. Caller must not rely on folio->mapping validity as it can become invalid due to concurrent truncate. Safety relies on refcount and uptodate flag (truncate doesn't clear content). Signed-off-by: Chi Zhiling --- include/linux/shmem_fs.h | 3 ++- mm/shmem.c | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 93a0ba872ebe..24698faea5a4 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -164,7 +164,8 @@ extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, /* Flag allocation requirements to shmem_get_folio */ enum sgp_type { - SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_GET, /* don't exceed i_size, don't allocate page, don't lock */ + SGP_READ, /* don't exceed i_size, don't allocate page, lock folio */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ diff --git a/mm/shmem.c b/mm/shmem.c index 3b5dc21b323c..ef19968cc51c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2504,6 +2504,13 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, } if (folio) { + if (sgp == SGP_GET) { + if (!folio_test_uptodate(folio)) { + folio_put(folio); + folio = NULL; + } + goto out; + } folio_lock(folio); /* Has the folio been truncated or swapped out? */ @@ -2524,11 +2531,11 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, } /* - * SGP_READ: succeed on hole, with NULL folio, letting caller zero. + * SGP_READ/SGP_GET: succeed on hole, with NULL folio, letting caller zero. 
* SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. */ *foliop = NULL; - if (sgp == SGP_READ) + if (sgp == SGP_READ || sgp == SGP_GET) return 0; if (sgp == SGP_NOALLOC) return -ENOENT; @@ -2649,13 +2656,15 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, * @sgp: SGP_* flags to control behavior * * Looks up the page cache entry at @inode & @index. If a folio is - * present, it is returned locked with an increased refcount. + * present, it is returned locked with an increased refcount, except + * for SGP_GET which returns the folio unlocked with an increased refcount. * * If the caller modifies data in the folio, it must call folio_mark_dirty() * before unlocking the folio to ensure that the folio is not reclaimed. * There is no need to reserve space before calling folio_mark_dirty(). * * When no folio is found, the behavior depends on @sgp: + * - for SGP_GET, *@foliop is %NULL and 0 is returned * - for SGP_READ, *@foliop is %NULL and 0 is returned * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned * - for all other flags a new folio is allocated, inserted into the -- 2.43.0 From: Chi Zhiling Replace SGP_READ with SGP_GET in shmem_file_read_iter(), shmem_file_splice_read(), and shmem_get_link(). These functions immediately unlock the folio after getting it, making the lock acquisition redundant. Although the folio lock protects folio data consistency and prevents truncation while it is held, concurrent modification or truncation can still happen once the folio is unlocked. Since these functions continue reading data after unlocking, the lock does not provide effective protection. The folio reference count is what actually prevents reclamation during access, making the lock unnecessary. 
Signed-off-by: Chi Zhiling --- mm/shmem.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ef19968cc51c..767610f78d0d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3370,15 +3370,13 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; index = iocb->ki_pos >> PAGE_SHIFT; - error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); + error = shmem_get_folio(inode, index, 0, &folio, SGP_GET); if (error) { if (error == -EINVAL) error = 0; break; } if (folio) { - folio_unlock(folio); - page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); @@ -3561,15 +3559,13 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, break; index = *ppos >> PAGE_SHIFT; - error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); + error = shmem_get_folio(inode, index, 0, &folio, SGP_GET); if (error) { if (error == -EINVAL) error = 0; break; } if (folio) { - folio_unlock(folio); - page = folio_file_page(folio, index); if (PageHWPoison(page)) { error = -EIO; @@ -4170,17 +4166,15 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, return ERR_PTR(-ECHILD); } } else { - error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ); + error = shmem_get_folio(inode, 0, 0, &folio, SGP_GET); if (error) return ERR_PTR(error); if (!folio) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0))) { - folio_unlock(folio); folio_put(folio); return ERR_PTR(-ECHILD); } - folio_unlock(folio); } set_delayed_call(done, shmem_put_link, folio); return folio_address(folio); -- 2.43.0 From: Chi Zhiling Optimize shmem file read by implementing folio batching in the read iteration path. Only uptodate folios are added to the batch, ensuring all folios in the batch are valid and ready for use without additional checking. 
Signed-off-by: Chi Zhiling --- mm/shmem.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 93 insertions(+), 9 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 767610f78d0d..4bc4e463ca97 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3348,16 +3348,100 @@ shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, return copied; } +static pgoff_t shmem_get_read_batch(struct address_space *mapping, + pgoff_t index, pgoff_t max, struct folio_batch *fbatch) +{ + XA_STATE(xas, &mapping->i_pages, index); + struct folio *folio; + pgoff_t end = max; + + rcu_read_lock(); + xas_for_each(&xas, folio, max) { + if (xas_retry(&xas, folio)) + continue; + if (xa_is_value(folio)) { + end = xas.xa_index; + break; + } + if (!folio_try_get(folio)) + goto retry; + + if (unlikely(folio != xas_reload(&xas))) + goto put_folio; + + end = folio_next_index(folio); + + if (!folio_test_uptodate(folio)) { + xas_advance(&xas, end - 1); + folio_put(folio); + continue; + } + if (!folio_batch_add(fbatch, folio)) + break; + xas_advance(&xas, end - 1); + continue; +put_folio: + folio_put(folio); +retry: + xas_reset(&xas); + } + rcu_read_unlock(); + + return end; +} + +static inline int shmem_get_folio_from_batch(struct inode *inode, + pgoff_t index, pgoff_t last_index, struct folio **folio, + struct folio_batch *fbatch, pgoff_t *batch_end) +{ + struct folio *next; + int error; + + if (*batch_end > index) { +retry: + next = folio_batch_next(fbatch); + if (next) { + if (next->index > index) { + next = NULL; + fbatch->i--; /* revert folio_batch_next */ + } + } + *folio = next; + return 0; + } + + for (int i = 0; i < folio_batch_count(fbatch); i++) + folio_put(fbatch->folios[i]); + folio_batch_init(fbatch); + + *batch_end = shmem_get_read_batch(inode->i_mapping, index, + last_index, fbatch); + if (*batch_end > index) + goto retry; + + error = shmem_get_folio(inode, index, 0, folio, SGP_GET); + if (unlikely(error)) + return error; + if (*folio) { + 
folio_batch_add(fbatch, *folio); + goto retry; + } + return 0; +} + static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; - pgoff_t index; + struct folio_batch fbatch; + pgoff_t index, last_index, fbatch_end = 0; unsigned long offset; int error = 0; ssize_t retval = 0; + folio_batch_init(&fbatch); + for (;;) { struct folio *folio = NULL; struct page *page = NULL; @@ -3370,8 +3454,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; index = iocb->ki_pos >> PAGE_SHIFT; - error = shmem_get_folio(inode, index, 0, &folio, SGP_GET); - if (error) { + last_index = (iocb->ki_pos + to->count - 1) >> PAGE_SHIFT; + error = shmem_get_folio_from_batch(inode, index, last_index, &folio, + &fbatch, &fbatch_end); + if (unlikely(error)) { if (error == -EINVAL) error = 0; break; @@ -3379,7 +3465,6 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (folio) { page = folio_file_page(folio, index); if (PageHWPoison(page)) { - folio_put(folio); error = -EIO; break; } @@ -3394,11 +3479,9 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * are called without i_rwsem protection against truncate */ i_size = i_size_read(inode); - if (unlikely(iocb->ki_pos >= i_size)) { - if (folio) - folio_put(folio); + if (unlikely(iocb->ki_pos >= i_size)) break; - } + end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); if (folio && likely(!fallback_page_copy)) fsize = folio_size(folio); @@ -3433,7 +3516,6 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) ret = copy_folio_to_iter(folio, offset, nr, to); else ret = copy_page_to_iter(page, offset, nr, to); - folio_put(folio); } else if (user_backed_iter(to)) { /* * Copy to user tends to be so well optimized, but @@ -3462,6 +3544,8 @@ static ssize_t shmem_file_read_iter(struct 
kiocb *iocb, struct iov_iter *to) cond_resched(); } + for (int i = 0; i < folio_batch_count(&fbatch); i++) + folio_put(fbatch.folios[i]); file_accessed(file); return retval ? retval : error; } -- 2.43.0 From: Chi Zhiling Change SGP_NOALLOC to return 0 with NULL folio on hole, matching SGP_READ/SGP_GET behavior. This simplifies the sgp_type handling by unifying hole semantics across these three types. Previously, SGP_NOALLOC returned -ENOENT on hole, while SGP_READ/SGP_GET returned 0. This inconsistency required special handling in callers like khugepaged and userfaultfd. After this change: - khugepaged: behavior unchanged (checks both error and NULL folio) - userfaultfd: behavior unchanged (both -ENOENT and NULL are converted to -EFAULT before returning to userspace) Signed-off-by: Chi Zhiling --- include/linux/shmem_fs.h | 2 +- mm/khugepaged.c | 2 +- mm/shmem.c | 9 +++------ 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 24698faea5a4..6f7f8b9e2a10 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -166,7 +166,7 @@ extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, enum sgp_type { SGP_GET, /* don't exceed i_size, don't allocate page, don't lock */ SGP_READ, /* don't exceed i_size, don't allocate page, lock folio */ - SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ + SGP_NOALLOC, /* like SGP_READ, but accept fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b8452dbdb043..3309d1c094df 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1950,7 +1950,7 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if 
(shmem_get_folio(mapping->host, index, 0, - &folio, SGP_NOALLOC)) { + &folio, SGP_NOALLOC) || !folio) { result = SCAN_FAIL; goto xa_unlocked; } diff --git a/mm/shmem.c b/mm/shmem.c index 4bc4e463ca97..41f5e251f7ed 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2531,14 +2531,11 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, } /* - * SGP_READ/SGP_GET: succeed on hole, with NULL folio, letting caller zero. - * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. + * SGP_READ/SGP_GET/SGP_NOALLOC: succeed on hole, with NULL folio. */ *foliop = NULL; - if (sgp == SGP_READ || sgp == SGP_GET) + if (sgp <= SGP_NOALLOC) return 0; - if (sgp == SGP_NOALLOC) - return -ENOENT; /* * Fast cache lookup and swap lookup did not find it: allocate. @@ -2666,7 +2663,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, * When no folio is found, the behavior depends on @sgp: * - for SGP_GET, *@foliop is %NULL and 0 is returned * - for SGP_READ, *@foliop is %NULL and 0 is returned - * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned + * - for SGP_NOALLOC, *@foliop is %NULL and 0 is returned * - for all other flags a new folio is allocated, inserted into the * page cache and returned locked in @foliop. * -- 2.43.0