From: Kiryl Shutsemau

Accesses within VMA, but beyond i_size rounded up to PAGE_SIZE are
supposed to generate SIGBUS.

Recent changes attempted to fault in the full folio where possible. They
did not respect i_size, which led to populating PTEs beyond i_size and
breaking SIGBUS semantics.

Darrick reported generic/749 breakage because of this.

However, the problem existed before the recent changes. With huge=always
tmpfs, any write to a file leads to a PMD-size allocation, and the
following fault-in of the folio will install a PMD mapping regardless of
i_size.

Fix filemap_map_pages() and finish_fault() to not install:
 - PTEs beyond i_size;
 - PMD mappings across i_size.

Signed-off-by: Kiryl Shutsemau
Fixes: 19773df031bc ("mm/fault: try to map the entire file folio in finish_fault()")
Fixes: 357b92761d94 ("mm/filemap: map entire large folio faultaround")
Fixes: 800d8c63b2e9 ("shmem: add huge pages support")
Reported-by: "Darrick J. Wong"
---
 mm/filemap.c | 18 ++++++++++--------
 mm/memory.c  | 12 ++++++++++--
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 13f0259d993c..0d251f6ab480 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3681,7 +3681,8 @@ static struct folio *next_uptodate_folio(struct xa_state *xas,
 static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 			struct folio *folio, unsigned long start,
 			unsigned long addr, unsigned int nr_pages,
-			unsigned long *rss, unsigned short *mmap_miss)
+			unsigned long *rss, unsigned short *mmap_miss,
+			pgoff_t file_end)
 {
 	unsigned int ref_from_caller = 1;
 	vm_fault_t ret = 0;
@@ -3697,7 +3698,8 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 	 */
 	addr0 = addr - start * PAGE_SIZE;
 	if (folio_within_vma(folio, vmf->vma) &&
-	    (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) {
+	    (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK) &&
+	    file_end >= folio_next_index(folio)) {
 		vmf->pte -= start;
 		page -= start;
 		addr = addr0;
@@ -3817,7 +3819,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 	if (!folio)
 		goto out;
 
-	if (filemap_map_pmd(vmf, folio, start_pgoff)) {
+	file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
+	end_pgoff = min(end_pgoff, file_end);
+
+	if (file_end >= folio_next_index(folio) &&
+	    filemap_map_pmd(vmf, folio, start_pgoff)) {
 		ret = VM_FAULT_NOPAGE;
 		goto out;
 	}
@@ -3830,10 +3836,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		goto out;
 	}
 
-	file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
-	if (end_pgoff > file_end)
-		end_pgoff = file_end;
-
 	folio_type = mm_counter_file(folio);
 	do {
 		unsigned long end;
@@ -3850,7 +3852,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		else
 			ret |= filemap_map_folio_range(vmf, folio,
 					xas.xa_index - folio->index, addr,
-					nr_pages, &rss, &mmap_miss);
+					nr_pages, &rss, &mmap_miss, file_end);
 
 		folio_unlock(folio);
 	} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
diff --git a/mm/memory.c b/mm/memory.c
index 74b45e258323..dfa5b437c9d9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5480,6 +5480,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	int type, nr_pages;
 	unsigned long addr;
 	bool needs_fallback = false;
+	pgoff_t file_end = -1UL;
 
 fallback:
 	addr = vmf->address;
@@ -5501,8 +5502,14 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 			return ret;
 	}
 
+	if (vma->vm_file) {
+		struct inode *inode = vma->vm_file->f_mapping->host;
+		file_end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+	}
+
 	if (pmd_none(*vmf->pmd)) {
-		if (folio_test_pmd_mappable(folio)) {
+		if (folio_test_pmd_mappable(folio) &&
+		    file_end >= folio_next_index(folio)) {
 			ret = do_set_pmd(vmf, folio, page);
 			if (ret != VM_FAULT_FALLBACK)
 				return ret;
@@ -5533,7 +5540,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	if (unlikely(vma_off < idx ||
 		     vma_off + (nr_pages - idx) > vma_pages(vma) ||
 		     pte_off < idx ||
-		     pte_off + (nr_pages - idx) > PTRS_PER_PTE)) {
+		     pte_off + (nr_pages - idx) > PTRS_PER_PTE ||
+		     file_end < folio_next_index(folio))) {
 		nr_pages = 1;
 	} else {
 		/* Now we can set mappings for the whole large folio. */
-- 
2.50.1

From: Kiryl Shutsemau

Accesses within VMA, but beyond i_size rounded up to PAGE_SIZE are
supposed to generate SIGBUS. This behavior might not be respected on
truncation.

During truncation, the kernel splits a large folio in order to reclaim
memory. As a side effect, it unmaps the folio and destroys the folio's
PMD mappings. The folio is then refaulted as PTEs and SIGBUS semantics
are preserved.

However, if the split fails, PMD mappings are preserved and the user
will not receive SIGBUS on any accesses within the PMD.

Unmap the folio on split failure. This leads to a refault as PTEs and
preserves SIGBUS semantics.

Signed-off-by: Kiryl Shutsemau
---
 mm/truncate.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/mm/truncate.c b/mm/truncate.c
index 91eb92a5ce4f..cdb698b5f7fa 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -177,6 +177,28 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
 	return 0;
 }
 
+static int try_folio_split_or_unmap(struct folio *folio, struct page *split_at)
+{
+	enum ttu_flags ttu_flags =
+		TTU_RMAP_LOCKED |
+		TTU_SYNC |
+		TTU_BATCH_FLUSH |
+		TTU_SPLIT_HUGE_PMD |
+		TTU_IGNORE_MLOCK;
+	int ret;
+
+	ret = try_folio_split(folio, split_at, NULL);
+
+	/*
+	 * If the split fails, unmap the folio, so it will be refaulted
+	 * with PTEs to respect SIGBUS semantics.
+	 */
+	if (ret)
+		try_to_unmap(folio, ttu_flags);
+
+	return ret;
+}
+
 /*
  * Handle partial folios. The folio may be entirely within the
  * range if a split has raced with us. If not, we zero the part of the
@@ -224,7 +246,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
 		return true;
 
 	split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
-	if (!try_folio_split(folio, split_at, NULL)) {
+	if (!try_folio_split_or_unmap(folio, split_at)) {
 		/*
 		 * try to split at offset + length to make sure folios within
 		 * the range can be dropped, especially to avoid memory waste
@@ -249,12 +271,13 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
 		goto out;
 
 	/*
+	 * Split the folio.
+	 *
 	 * make sure folio2 is large and does not change its mapping.
-	 * Its split result does not matter here.
 	 */
 	if (folio_test_large(folio2) &&
 	    folio2->mapping == folio->mapping)
-		try_folio_split(folio2, split_at2, NULL);
+		try_folio_split_or_unmap(folio2, split_at2);
 
 	folio_unlock(folio2);
 out:
-- 
2.50.1
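
For reference, below is a minimal userspace sketch (illustrative only,
not part of the series) of the SIGBUS semantics that both patches
enforce: it maps two pages of a one-page file and expects the store
beyond i_size rounded up to PAGE_SIZE to fault. The file path is an
arbitrary assumption.

/*
 * Sketch: an access within the VMA but beyond i_size rounded up to
 * PAGE_SIZE must raise SIGBUS.
 */
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static void on_sigbus(int sig)
{
	/* Expected outcome: the store past EOF raised SIGBUS. */
	(void)sig;
	write(STDOUT_FILENO, "got SIGBUS\n", 11);
	_exit(0);
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("/tmp/sigbus-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	char *map;

	if (fd < 0 || ftruncate(fd, page))	/* i_size = one page */
		return 1;

	/* Map two pages so the VMA extends one page past EOF. */
	map = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	signal(SIGBUS, on_sigbus);

	map[0] = 1;	/* within i_size: must succeed */
	map[page] = 1;	/* beyond i_size rounded to PAGE_SIZE: must SIGBUS */

	fprintf(stderr, "no SIGBUS: semantics broken\n");
	return 1;
}

To exercise the PMD path the patches fix, point the file at a tmpfs
mount with huge=always: there, a PMD mapping installed across i_size
can let the second store wrongly succeed instead of raising SIGBUS.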