Inflating a VM's balloon while vhost-user-net fork+exec's a helper
triggers "still mapped when deleted" on the memfd backing guest RAM:

  BUG: Bad page cache in process __balloon  pfn:6520704
  page dumped because: still mapped when deleted
  ...
  shmem_undo_range+0x3fa/0x570
  shmem_fallocate+0x366/0x4d0
  vfs_fallocate+0x13c/0x310

This BUG also resulted in guests seeing stale mappings backed by a
zeroed page, causing guest kernel panics. I was unable to trace that
specific interaction, but it appears to be related to THP splitting.

Two races allow PTEs to be re-installed for a folio that fallocate is
about to remove from page cache:

Race 1 — fault-around (filemap_map_pages):

  fallocate                 fault-around            fork
  ---------                 ------------            ----
  set i_private
  unmap_mapping_range()
    # zaps PTEs
                            filemap_map_pages()
                              # re-maps folio!
                                                    dup_mmap()
                                                      # child VMA
                                                      # in tree
  shmem_undo_range()
  lock folio
  unmap_mapping_folio()
    # child VMA:
    # no PTE, skip
                                                    copy_page_range()
                                                      # copies PTE
    # parent VMA:
    # zaps PTE
  filemap_remove_folio()
    # mapcount=1, BUG!

filemap_map_pages() is called directly as .map_pages, bypassing
shmem_fault()'s i_private synchronization.

Race 2 — shmem_fault TOCTOU:

  fallocate                 shmem_fault
  ---------                 -----------
                            check i_private → NULL
  set i_private
  unmap_mapping_range()
    # zaps PTEs
                            shmem_get_folio_gfp()
                              # finds folio in cache
                            finish_fault()
                              # installs PTE
  shmem_undo_range()
  truncate_inode_folio()
    # mapcount=1, BUG!

Fix both races with invalidate_lock. This matches the existing pattern
used by secretmem_fault(), udf_page_mkwrite(), and
zonefs_filemap_page_mkwrite(), all of which take invalidate_lock shared
under mmap_lock in their fault handlers.

This also requires removing the rcu_read_lock() from do_fault_around()
so that .map_pages may use sleeping locks. The outer rcu_read_lock is
redundant for all in-tree .map_pages implementations: every one either
IS filemap_map_pages (which takes rcu_read_lock) or is a thin wrapper
around it.
Fixes: d7c1755179b8 ("mm: implement ->map_pages for shmem/tmpfs")
Cc: stable@vger.kernel.org
Signed-off-by: Gregory Price <gourry@gourry.net>
---
 mm/memory.c |  2 --
 mm/shmem.c  | 33 ++++++++++++++++++++++++++++++---
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index e44469f9cf65..838583591fdf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5900,11 +5900,9 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
 			return VM_FAULT_OOM;
 	}
 
-	rcu_read_lock();
 	ret = vmf->vma->vm_ops->map_pages(vmf,
 			vmf->pgoff + from_pte - pte_off,
 			vmf->pgoff + to_pte - pte_off);
-	rcu_read_unlock();
 
 	return ret;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index 4ecefe02881d..5c654b86f3cf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2731,7 +2731,8 @@ static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
 static vm_fault_t shmem_fault(struct vm_fault *vmf)
 {
 	struct inode *inode = file_inode(vmf->vma->vm_file);
-	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
+	struct address_space *mapping = inode->i_mapping;
+	gfp_t gfp = mapping_gfp_mask(mapping);
 	struct folio *folio = NULL;
 	vm_fault_t ret = 0;
 	int err;
@@ -2747,8 +2748,15 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 	}
 
 	WARN_ON_ONCE(vmf->page != NULL);
+	/*
+	 * shmem_fallocate(PUNCH_HOLE) holds invalidate_lock exclusive across
+	 * unmap+truncate. Take it shared here so shmem_fault cannot obtain
+	 * a folio in the process of being punched.
+	 */
+	filemap_invalidate_lock_shared(mapping);
 	err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
 				  gfp, vmf, &ret);
+	filemap_invalidate_unlock_shared(mapping);
 	if (err)
 		return vmf_error(err);
 	if (folio) {
@@ -3683,11 +3691,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		inode->i_private = &shmem_falloc;
 		spin_unlock(&inode->i_lock);
 
+		filemap_invalidate_lock(mapping);
 		if ((u64)unmap_end > (u64)unmap_start)
 			unmap_mapping_range(mapping, unmap_start,
 					    1 + unmap_end - unmap_start, 0);
 		shmem_truncate_range(inode, offset, offset + len - 1);
 		/* No need to unmap again: hole-punching leaves COWed pages */
+		filemap_invalidate_unlock(mapping);
 
 		spin_lock(&inode->i_lock);
 		inode->i_private = NULL;
@@ -5268,9 +5278,26 @@ static const struct super_operations shmem_ops = {
 #endif
 };
 
+/*
+ * shmem_fallocate(PUNCH_HOLE) holds invalidate_lock for write across
+ * unmap+truncate. Take it for read here so fault-around cannot re-map
+ * pages being punched.
+ */
+static vm_fault_t shmem_map_pages(struct vm_fault *vmf,
+				  pgoff_t start_pgoff, pgoff_t end_pgoff)
+{
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	vm_fault_t ret;
+
+	filemap_invalidate_lock_shared(mapping);
+	ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
+	filemap_invalidate_unlock_shared(mapping);
+	return ret;
+}
+
 static const struct vm_operations_struct shmem_vm_ops = {
 	.fault = shmem_fault,
-	.map_pages = filemap_map_pages,
+	.map_pages = shmem_map_pages,
 #ifdef CONFIG_NUMA
 	.set_policy = shmem_set_policy,
 	.get_policy = shmem_get_policy,
@@ -5282,7 +5309,7 @@ static const struct vm_operations_struct shmem_vm_ops = {
 
 static const struct vm_operations_struct shmem_anon_vm_ops = {
 	.fault = shmem_fault,
-	.map_pages = filemap_map_pages,
+	.map_pages = shmem_map_pages,
 #ifdef CONFIG_NUMA
 	.set_policy = shmem_set_policy,
 	.get_policy = shmem_get_policy,
-- 
2.53.0