From: Shivam Kalra

Extract page freeing and NR_VMALLOC stat accounting from vfree() into a
reusable vm_area_free_pages() helper. The helper operates on a range
[start_idx, end_idx) of pages of a vm_struct, making it suitable for
both a full free (vfree) and a partial free (the upcoming vrealloc
shrink).

Freed page pointers in vm->pages[] are set to NULL to prevent stale
references when the vm_struct outlives the free (as in vrealloc
shrink).

Signed-off-by: Shivam Kalra
---
 mm/vmalloc.c | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 878c5b7bf837..1073abb6094e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3416,6 +3416,32 @@ void vfree_atomic(const void *addr)
 	schedule_work(&p->wq);
 }
 
+/*
+ * vm_area_free_pages - free a range of pages from a vmalloc allocation
+ * @vm: the vm_struct containing the pages
+ * @start_idx: first page index to free (inclusive)
+ * @end_idx: end of the index range to free (exclusive)
+ *
+ * Free pages [start_idx, end_idx), updating NR_VMALLOC stat accounting.
+ * Freed vm->pages[] entries are set to NULL.
+ * The caller is responsible for unmapping (vunmap_range) and KASAN
+ * poisoning before calling this.
+ */
+static void vm_area_free_pages(struct vm_struct *vm, unsigned int start_idx,
+			       unsigned int end_idx)
+{
+	unsigned int i;
+
+	if (!(vm->flags & VM_MAP_PUT_PAGES)) {
+		for (i = start_idx; i < end_idx; i++)
+			mod_lruvec_page_state(vm->pages[i], NR_VMALLOC, -1);
+	}
+	free_pages_bulk(vm->pages + start_idx, end_idx - start_idx);
+
+	for (i = start_idx; i < end_idx; i++)
+		vm->pages[i] = NULL;
+}
+
 /**
  * vfree - Release memory allocated by vmalloc()
  * @addr: Memory base address
@@ -3436,7 +3462,6 @@ void vfree_atomic(const void *addr)
 void vfree(const void *addr)
 {
 	struct vm_struct *vm;
-	int i;
 
 	if (unlikely(in_interrupt())) {
 		vfree_atomic(addr);
@@ -3460,12 +3485,7 @@ void vfree(const void *addr)
 	if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
 		vm_reset_perms(vm);
 
-	if (!(vm->flags & VM_MAP_PUT_PAGES)) {
-		for (i = 0; i < vm->nr_pages; i++)
-			mod_lruvec_page_state(vm->pages[i], NR_VMALLOC, -1);
-	}
-	free_pages_bulk(vm->pages, vm->nr_pages);
-
+	vm_area_free_pages(vm, 0, vm->nr_pages);
 	kvfree(vm->pages);
 	kfree(vm);
 }
-- 
2.43.0

From: Shivam Kalra

Update the grow-in-place check in vrealloc() to compare the requested
size against the size actually backed by physical pages (vm->nr_pages)
rather than the virtual area size (alloced_size, derived from
get_vm_area_size()).

Currently both values are equivalent, but the upcoming vrealloc()
shrink functionality will free pages without reducing the virtual
reservation size. After such a shrink, the old alloced_size-based
comparison would incorrectly let a grow-in-place operation succeed and
access freed pages. Switch to vm->nr_pages now so the check remains
correct once shrink support is added.

Signed-off-by: Shivam Kalra
---
 mm/vmalloc.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1073abb6094e..9cb3e287a1e8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4338,6 +4338,12 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
 		if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
 		    nid != page_to_nid(vmalloc_to_page(p)))
 			goto need_realloc;
+	} else {
+		/*
+		 * If p is NULL, vrealloc() behaves exactly like vmalloc().
+		 * Skip the shrink and in-place grow paths.
+		 */
+		goto need_realloc;
 	}
 
 	/*
@@ -4356,7 +4362,7 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
 	/*
 	 * We already have the bytes available in the allocation; use them.
 	 */
-	if (size <= alloced_size) {
+	if (size <= (unsigned long)vm->nr_pages << PAGE_SHIFT) {
 		/*
 		 * No need to zero memory here, as unused memory will have
 		 * already been zeroed at initial allocation time or during
-- 
2.43.0
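To make the divergence concrete, here is a sketch (not part of the
patch; "backed_size" is an invented name for illustration) of the state
after a hypothetical shrink of an 8-page allocation down to 2 pages:

	/* Hypothetical post-shrink state; names mirror the vrealloc() locals. */
	size_t alloced_size = get_vm_area_size(vm);	/* still 8 * PAGE_SIZE */
	size_t backed_size = (unsigned long)vm->nr_pages << PAGE_SHIFT;
							/* now 2 * PAGE_SIZE */

	/*
	 * A request for size = 4 * PAGE_SIZE passes the old
	 * "size <= alloced_size" check and would grow in place into pages
	 * that were already freed; "size <= backed_size" correctly falls
	 * through to a full reallocation instead.
	 */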
From: Shivam Kalra

Update vread_iter() to derive the vm area size from vm->nr_pages
rather than get_vm_area_size().

Currently both values are equivalent, but the upcoming vrealloc()
shrink functionality will free pages without reducing the virtual
reservation size. After such a shrink, the old get_vm_area_size()
based calculation would overestimate the mapped range, causing
vread_iter() to attempt reading from unmapped addresses. Switch to
vm->nr_pages now so the reader remains correct once shrink support is
added.

Signed-off-by: Shivam Kalra
---
 mm/vmalloc.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9cb3e287a1e8..65e0a23efb3b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4661,7 +4661,14 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 		smp_rmb();
 
 		vaddr = (char *) va->va_start;
-		size = vm ? get_vm_area_size(vm) : va_size(va);
+		if (vm)
+			/*
+			 * Cannot use get_vm_area_size() because vrealloc()
+			 * may shrink the mapping and vm->size may be outdated.
+			 */
+			size = (unsigned long)vm->nr_pages << PAGE_SHIFT;
+		else
+			size = va_size(va);
 
 		if (addr >= vaddr + size)
 			goto next_va;
-- 
2.43.0
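For context, get_vm_area_size() returns the virtual reservation minus
the guard page (unless VM_NO_GUARD), so it keeps reporting the original
span after a shrink. A sketch of the two ranges vread_iter() has to
distinguish (variable names are illustrative, not from the patch):

	size_t backed = (unsigned long)vm->nr_pages << PAGE_SHIFT; /* live PTEs */
	size_t reserved = get_vm_area_size(vm);			   /* >= backed */

	/*
	 * Reads must be clamped to [vaddr, vaddr + backed); the tail
	 * [vaddr + backed, vaddr + reserved) has been unmapped by
	 * vunmap_range() after a shrink and would fault.
	 */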
From: Shivam Kalra

When vrealloc() shrinks an allocation and the new size crosses a page
boundary, unmap and free the tail pages that are no longer needed.
This reclaims physical memory that was previously wasted for the
lifetime of the allocation.

The heuristic is simple: always free when at least one full page
becomes unused. Huge page allocations (page_order > 0) are skipped, as
partial freeing would require splitting. Allocations with
VM_FLUSH_RESET_PERMS are also skipped, as their direct-map permissions
must be reset before pages are returned to the page allocator, which
is handled by vm_reset_perms() during vfree(). Additionally,
allocations with VM_USERMAP are skipped because
remap_vmalloc_range_partial() validates mapping requests against the
unchanged vm->size; freeing tail pages would cause vmalloc_to_page()
to return NULL for the unmapped range.

To protect concurrent readers, the shrink path takes the vmap node
lock while updating vm->nr_pages, synchronizing with them before the
pages are freed. Finally, kmemleak is notified of the reduced
allocation size via kmemleak_free_part(), so the kmemleak scanner does
not fault on the newly unmapped virtual addresses.

The virtual address reservation (vm->size / vmap_area) is
intentionally kept unchanged, preserving the address range for
potential future grow-in-place support.

Suggested-by: Danilo Krummrich
Signed-off-by: Shivam Kalra
---
 mm/vmalloc.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 65e0a23efb3b..9f810d306db9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4346,14 +4346,62 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
 		goto need_realloc;
 	}
 
-	/*
-	 * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What
-	 * would be a good heuristic for when to shrink the vm_area?
-	 */
 	if (size <= old_size) {
+		unsigned int new_nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
 		/* Zero out "freed" memory, potentially for future realloc. */
 		if (want_init_on_free() || want_init_on_alloc(flags))
 			memset((void *)p + size, 0, old_size - size);
+
+		/*
+		 * Free tail pages when the shrink crosses a page boundary.
+		 *
+		 * Skip huge page allocations (page_order > 0), as partial
+		 * freeing would require splitting.
+		 *
+		 * Skip VM_FLUSH_RESET_PERMS, as direct-map permissions must
+		 * be reset before pages are returned to the allocator.
+		 *
+		 * Skip VM_USERMAP, as remap_vmalloc_range_partial() validates
+		 * mapping requests against the unchanged vm->size; freeing
+		 * tail pages would cause vmalloc_to_page() to return NULL for
+		 * the unmapped range.
+		 *
+		 * Skip if either GFP_NOFS or GFP_NOIO is in effect, as
+		 * kmemleak_free_part() internally allocates with
+		 * GFP_KERNEL, which could trigger a recursive deadlock
+		 * if we are under filesystem or I/O reclaim.
+		 */
+		if (new_nr_pages < vm->nr_pages && !vm_area_page_order(vm) &&
+		    !(vm->flags & (VM_FLUSH_RESET_PERMS | VM_USERMAP)) &&
+		    gfp_has_io_fs(flags)) {
+			unsigned long addr = (unsigned long)kasan_reset_tag(p);
+			unsigned int old_nr_pages = vm->nr_pages;
+
+			/*
+			 * Use the node lock to synchronize with concurrent
+			 * readers (vmalloc_info_show).
+			 */
+			struct vmap_node *vn = addr_to_node(addr);
+
+			spin_lock(&vn->busy.lock);
+			vm->nr_pages = new_nr_pages;
+			spin_unlock(&vn->busy.lock);
+
+			/* Tell kmemleak the allocation shrank before unmapping. */
+			kmemleak_free_part(
+				(void *)addr + ((unsigned long)new_nr_pages
+						<< PAGE_SHIFT),
+				(unsigned long)(old_nr_pages - new_nr_pages)
+					<< PAGE_SHIFT);
+
+			vunmap_range(addr + ((unsigned long)new_nr_pages
+					     << PAGE_SHIFT),
+				     addr + ((unsigned long)old_nr_pages
+					     << PAGE_SHIFT));
+
+			vm_area_free_pages(vm, new_nr_pages, old_nr_pages);
+		}
 		vm->requested_size = size;
 		kasan_vrealloc(p, old_size, size);
 		return (void *)p;
-- 
2.43.0
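A worked example of the boundary heuristic (a sketch; the numbers are
illustrative): shrinking a 4-page allocation to PAGE_SIZE + 1 bytes
still needs two pages, so only the last two are reclaimed, while a
shrink that stays within the last kept page frees nothing:

	size_t size = PAGE_SIZE + 1;			/* requested new size */
	unsigned int old_nr_pages = 4;			/* 4-page allocation  */
	unsigned int new_nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; /* = 2 */

	/*
	 * new_nr_pages < old_nr_pages, so vunmap_range() clears the PTEs
	 * of pages [2, 4) and vm_area_free_pages(vm, 2, 4) returns them to
	 * the page allocator. vm->size is untouched, keeping the VA
	 * reservation available for a later in-place grow.
	 */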
From: Shivam Kalra

Introduce a new test case, "vrealloc_test", that exercises the
vrealloc() shrink and in-place grow paths:

- Grow beyond the allocated pages (triggers a full reallocation).
- Shrink across a page boundary (frees tail pages).
- Shrink within the same page (no page freeing).
- Grow within the already allocated page count (in-place).

Data integrity is validated after each realloc step by checking that
the first byte of the original allocation is preserved.

The test is gated behind run_test_mask bit 12 (id 4096).

Reviewed-by: Uladzislau Rezki (Sony)
Signed-off-by: Shivam Kalra
---
 lib/test_vmalloc.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index 876c72c18a0c..b23f85e8f8ca 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -55,6 +55,7 @@ __param(int, run_test_mask, 7,
 		"\t\tid: 512, name: kvfree_rcu_2_arg_vmalloc_test\n"
 		"\t\tid: 1024, name: vm_map_ram_test\n"
 		"\t\tid: 2048, name: no_block_alloc_test\n"
+		"\t\tid: 4096, name: vrealloc_test\n"
 		/* Add a new test case description here. */
 );
 
@@ -421,6 +422,66 @@ vm_map_ram_test(void)
 	return nr_allocated != map_nr_pages;
 }
 
+static int vrealloc_test(void)
+{
+	void *ptr, *tmp;
+	int i;
+
+	for (i = 0; i < test_loop_count; i++) {
+		int err = -1;
+
+		ptr = vrealloc(NULL, PAGE_SIZE, GFP_KERNEL);
+		if (!ptr)
+			return -1;
+
+		*((__u8 *)ptr) = 'a';
+
+		/* Grow: beyond the allocated pages, triggers a full realloc. */
+		tmp = vrealloc(ptr, 4 * PAGE_SIZE, GFP_KERNEL);
+		if (!tmp)
+			goto error;
+		ptr = tmp;
+
+		if (*((__u8 *)ptr) != 'a')
+			goto error;
+
+		/* Shrink: crosses a page boundary, frees tail pages. */
+		tmp = vrealloc(ptr, PAGE_SIZE, GFP_KERNEL);
+		if (!tmp)
+			goto error;
+		ptr = tmp;
+
+		if (*((__u8 *)ptr) != 'a')
+			goto error;
+
+		/* Shrink: within the same page, no page freeing. */
+		tmp = vrealloc(ptr, PAGE_SIZE / 2, GFP_KERNEL);
+		if (!tmp)
+			goto error;
+		ptr = tmp;
+
+		if (*((__u8 *)ptr) != 'a')
+			goto error;
+
+		/* Grow: within the allocated page count, in-place. */
+		tmp = vrealloc(ptr, PAGE_SIZE, GFP_KERNEL);
+		if (!tmp)
+			goto error;
+		ptr = tmp;
+
+		if (*((__u8 *)ptr) != 'a')
+			goto error;
+
+		err = 0;
+error:
+		vfree(ptr);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 struct test_case_desc {
 	const char *test_name;
 	int (*test_func)(void);
@@ -440,6 +501,7 @@ static struct test_case_desc test_case_array[] = {
 	{ "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test, },
 	{ "vm_map_ram_test", vm_map_ram_test, },
 	{ "no_block_alloc_test", no_block_alloc_test, true },
+	{ "vrealloc_test", vrealloc_test, },
 	/* Add a new test case here. */
 };
-- 
2.43.0
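Once the series is applied, the new case can be run on its own by
loading the module with just that mask bit set; results are printed to
the kernel log. The exact invocation below is an assumption based on
the module parameters visible in lib/test_vmalloc.c:

	modprobe test_vmalloc run_test_mask=4096 test_loop_count=10
	dmesg | tail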