From: Shivam Kalra

Extract the page-freeing loop and NR_VMALLOC stat accounting from
vfree() into a reusable vm_area_free_pages() helper. The helper
operates on a range [start_idx, end_idx) of pages from a vm_struct,
making it suitable for both full free (vfree) and partial free
(upcoming vrealloc shrink).

Freed page pointers in vm->pages[] are set to NULL to prevent stale
references when the vm_struct outlives the free (as in vrealloc
shrink).

Reviewed-by: Alice Ryhl
Signed-off-by: Shivam Kalra
---
 mm/vmalloc.c | 47 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d75151649c97..79a57955345d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3416,6 +3416,38 @@ void vfree_atomic(const void *addr)
 	schedule_work(&p->wq);
 }
 
+/*
+ * vm_area_free_pages - free a range of pages from a vmalloc allocation
+ * @vm: the vm_struct containing the pages
+ * @start_idx: first page index to free (inclusive)
+ * @end_idx: last page index to free (exclusive)
+ *
+ * Free pages [start_idx, end_idx) updating NR_VMALLOC stat accounting.
+ * Freed vm->pages[] entries are set to NULL.
+ * Caller is responsible for unmapping (vunmap_range) and KASAN
+ * poisoning before calling this.
+ */
+static void vm_area_free_pages(struct vm_struct *vm, unsigned int start_idx,
+			       unsigned int end_idx)
+{
+	unsigned int i;
+
+	for (i = start_idx; i < end_idx; i++) {
+		struct page *page = vm->pages[i];
+
+		BUG_ON(!page);
+		/*
+		 * High-order allocs for huge vmallocs are split, so
+		 * can be freed as an array of order-0 allocations
+		 */
+		if (!(vm->flags & VM_MAP_PUT_PAGES))
+			mod_lruvec_page_state(page, NR_VMALLOC, -1);
+		__free_page(page);
+		vm->pages[i] = NULL;
+		cond_resched();
+	}
+}
+
 /**
  * vfree - Release memory allocated by vmalloc()
  * @addr:  Memory base address
@@ -3436,7 +3468,6 @@ void vfree_atomic(const void *addr)
 void vfree(const void *addr)
 {
 	struct vm_struct *vm;
-	int i;
 
 	if (unlikely(in_interrupt())) {
 		vfree_atomic(addr);
@@ -3459,19 +3490,7 @@ void vfree(const void *addr)
 	if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
 		vm_reset_perms(vm);
 
-	for (i = 0; i < vm->nr_pages; i++) {
-		struct page *page = vm->pages[i];
-
-		BUG_ON(!page);
-		/*
-		 * High-order allocs for huge vmallocs are split, so
-		 * can be freed as an array of order-0 allocations
-		 */
-		if (!(vm->flags & VM_MAP_PUT_PAGES))
-			mod_lruvec_page_state(page, NR_VMALLOC, -1);
-		__free_page(page);
-		cond_resched();
-	}
+	vm_area_free_pages(vm, 0, vm->nr_pages);
 	kvfree(vm->pages);
 	kfree(vm);
 }
-- 
2.43.0

From: Shivam Kalra

Fix the grow-in-place check in vrealloc() to compare the requested
size against the actual physical page count (vm->nr_pages) rather
than the virtual area size (alloced_size, derived from
get_vm_area_size()).

The virtual reservation size (get_vm_area_size()) does not decrease
when pages are freed during a shrink operation. Consequently, without
this fix, a subsequent grow-in-place operation after a shrink would
incorrectly succeed and attempt to access freed pages. Correcting
this check is a prerequisite for the upcoming vrealloc() shrink
functionality.
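To illustrate the difference (an editorial sketch assuming 4 KiB
pages, not part of the change itself): an allocation created with
four pages and later shrunk to one keeps get_vm_area_size() at
16 KiB, while (size_t)vm->nr_pages << PAGE_SHIFT drops to 4 KiB, so
a subsequent vrealloc() to 8 KiB must fall through to the realloc
path:

	/* stale: still true for any size <= 16 KiB after the shrink */
	if (size <= alloced_size)

	/* correct: only true while backing pages actually exist */
	if (size <= (size_t)vm->nr_pages << PAGE_SHIFT)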
Signed-off-by: Shivam Kalra
---
 mm/vmalloc.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 79a57955345d..133c3b0418fe 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4343,6 +4343,12 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
 		if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
 		    nid != page_to_nid(vmalloc_to_page(p)))
 			goto need_realloc;
+	} else {
+		/*
+		 * If p is NULL, vrealloc behaves exactly like vmalloc.
+		 * Skip the shrink and in-place grow paths.
+		 */
+		goto need_realloc;
 	}
 
 	/*
@@ -4361,7 +4367,7 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
 	/*
 	 * We already have the bytes available in the allocation; use them.
 	 */
-	if (size <= alloced_size) {
+	if (size <= (size_t)vm->nr_pages << PAGE_SHIFT) {
 		/*
 		 * No need to zero memory here, as unused memory will have
 		 * already been zeroed at initial allocation time or during
-- 
2.43.0

From: Shivam Kalra

When growing an existing vmalloc allocation in-place, zero the newly
exposed memory region [old_size, size) if the caller requested it via
__GFP_ZERO (checked via want_init_on_alloc(flags)).

Previously, the code assumed that the unused capacity in the
vm_struct was already zeroed either at initial allocation time or
during a prior shrink. However, if an intermediate shrink operation
occurred without __GFP_ZERO and without init_on_free enabled, the
"freed" portion of the allocation would retain its old data. If a
subsequent grow-in-place operation then explicitly requests
__GFP_ZERO, failing to zero the memory here would violate the
allocation flags and leak the previously discarded, potentially
sensitive data.

Signed-off-by: Shivam Kalra
---
 mm/vmalloc.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 133c3b0418fe..ddb689bf9ba5 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4368,13 +4368,16 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
 	 * We already have the bytes available in the allocation; use them.
 	 */
 	if (size <= (size_t)vm->nr_pages << PAGE_SHIFT) {
-		/*
-		 * No need to zero memory here, as unused memory will have
-		 * already been zeroed at initial allocation time or during
-		 * realloc shrink time.
-		 */
 		vm->requested_size = size;
 		kasan_vrealloc(p, old_size, size);
+
+		/*
+		 * Zero the newly exposed bytes if requested.
+		 * The region [old_size, size) may contain stale data from
+		 * a previous shrink that did not use __GFP_ZERO.
+		 */
+		if (want_init_on_alloc(flags))
+			memset((void *)p + old_size, 0, size - old_size);
 		return (void *)p;
 	}
 
-- 
2.43.0

From: Shivam Kalra

The vmalloc status readers (vmalloc_info_show(), show_numa_info(),
and vmalloc_dump_obj()) currently read v->nr_pages and the v->pages
array without any concurrent protection. In preparation for
vrealloc() shrink support, where v->nr_pages can be decreased and
entries in the v->pages array can be nulled out concurrently, these
readers must be protected to prevent use-after-free or NULL pointer
dereferences.

Update these functions to use READ_ONCE() when accessing v->nr_pages
and v->pages[nr]. This ensures the compiler does not re-fetch these
values and provides a consistent view of the vmap area's state.
Additionally, in show_numa_info(), explicitly check for a NULL page
pointer before dereferencing it to avoid potential crashes if a page
was concurrently removed during a shrink operation.
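As an illustrative sketch of the pairing this enables (the
WRITE_ONCE() counterpart lands in the follow-up shrink patch):

	/* writer (vrealloc shrink), under the vmap node lock: */
	spin_lock(&vn->busy.lock);
	WRITE_ONCE(vm->nr_pages, new_nr_pages);
	spin_unlock(&vn->busy.lock);

	/* lockless reader (show_numa_info), tolerant of NULL entries: */
	nr_pages = READ_ONCE(v->nr_pages);
	for (nr = 0; nr < nr_pages; nr += step) {
		struct page *page = READ_ONCE(v->pages[nr]);

		if (page)
			counters[page_to_nid(page)] += step;
	}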
Signed-off-by: Shivam Kalra
---
 mm/vmalloc.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ddb689bf9ba5..c6bdddee6266 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -5189,7 +5189,7 @@ bool vmalloc_dump_obj(void *object)
 	vm = va->vm;
 	addr = (unsigned long) vm->addr;
 	caller = vm->caller;
-	nr_pages = vm->nr_pages;
+	nr_pages = READ_ONCE(vm->nr_pages);
 	spin_unlock(&vn->busy.lock);
 
 	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
@@ -5210,7 +5210,7 @@
 static void show_numa_info(struct seq_file *m, struct vm_struct *v,
 			   unsigned int *counters)
 {
-	unsigned int nr;
+	unsigned int nr, nr_pages;
 	unsigned int step = 1U << vm_area_page_order(v);
 
 	if (!counters)
@@ -5218,8 +5218,13 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v,
 
 	memset(counters, 0, nr_node_ids * sizeof(unsigned int));
 
-	for (nr = 0; nr < v->nr_pages; nr += step)
-		counters[page_to_nid(v->pages[nr])] += step;
+	nr_pages = READ_ONCE(v->nr_pages);
+	for (nr = 0; nr < nr_pages; nr += step) {
+		struct page *page = READ_ONCE(v->pages[nr]);
+
+		if (page)
+			counters[page_to_nid(page)] += step;
+	}
 	for_each_node_state(nr, N_HIGH_MEMORY)
 		if (counters[nr])
 			seq_printf(m, " N%u=%u", nr, counters[nr]);
@@ -5247,6 +5252,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
 	struct vmap_area *va;
 	struct vm_struct *v;
 	unsigned int *counters;
+	unsigned int nr_pages;
 
 	if (IS_ENABLED(CONFIG_NUMA))
 		counters = kmalloc_array(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
@@ -5276,8 +5282,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
 		if (v->caller)
 			seq_printf(m, " %pS", v->caller);
 
-		if (v->nr_pages)
-			seq_printf(m, " pages=%d", v->nr_pages);
+		nr_pages = READ_ONCE(v->nr_pages);
+		if (nr_pages)
+			seq_printf(m, " pages=%d", nr_pages);
 
 		if (v->phys_addr)
 			seq_printf(m, " phys=%pa", &v->phys_addr);
-- 
2.43.0

From: Shivam Kalra

When vrealloc() shrinks an allocation and the new size crosses a page
boundary, unmap and free the tail pages that are no longer needed.
This reclaims physical memory that was previously wasted for the
lifetime of the allocation.

The heuristic is simple: always free when at least one full page
becomes unused.

Huge page allocations (page_order > 0) are skipped, as partial
freeing would require splitting. Allocations with
VM_FLUSH_RESET_PERMS are also skipped, as their direct-map
permissions must be reset before pages are returned to the page
allocator, which is handled by vm_reset_perms() during vfree().
Additionally, allocations with VM_USERMAP are skipped because
remap_vmalloc_range_partial() validates mapping requests against the
unchanged vm->size; freeing tail pages would cause vmalloc_to_page()
to return NULL for the unmapped range.

To protect concurrent readers, the shrink path takes the vmap node
lock (vn->busy.lock) before publishing the reduced vm->nr_pages and
freeing the pages.

Finally, we notify kmemleak of the reduced allocation size using
kmemleak_free_part() to prevent the kmemleak scanner from faulting on
the newly unmapped virtual addresses. The virtual address reservation
(vm->size / vmap_area) is intentionally kept unchanged, preserving
the address for potential future grow-in-place support.
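A caller-visible sketch of the new behaviour (illustrative only; buf
is a hypothetical 8-page allocation):

	void *buf = vmalloc(8 * PAGE_SIZE);

	/*
	 * Crosses a page boundary: the six tail pages are unmapped
	 * and freed, while the 8-page virtual reservation is kept
	 * for a possible later grow-in-place.
	 */
	buf = vrealloc(buf, 2 * PAGE_SIZE, GFP_KERNEL);

	/* Stays within the remaining two pages: no pages are freed. */
	buf = vrealloc(buf, PAGE_SIZE + 1, GFP_KERNEL);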
Suggested-by: Danilo Krummrich
Signed-off-by: Shivam Kalra
---
 mm/vmalloc.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c6bdddee6266..57aec552e038 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4351,14 +4351,62 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
 		goto need_realloc;
 	}
 
-	/*
-	 * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What
-	 * would be a good heuristic for when to shrink the vm_area?
-	 */
 	if (size <= old_size) {
+		unsigned int new_nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
 		/* Zero out "freed" memory, potentially for future realloc. */
 		if (want_init_on_free() || want_init_on_alloc(flags))
 			memset((void *)p + size, 0, old_size - size);
+
+		/*
+		 * Free tail pages when shrink crosses a page boundary.
+		 *
+		 * Skip huge page allocations (page_order > 0) as partial
+		 * freeing would require splitting.
+		 *
+		 * Skip VM_FLUSH_RESET_PERMS, as direct-map permissions must
+		 * be reset before pages are returned to the allocator.
+		 *
+		 * Skip VM_USERMAP, as remap_vmalloc_range_partial() validates
+		 * mapping requests against the unchanged vm->size; freeing
+		 * tail pages would cause vmalloc_to_page() to return NULL for
+		 * the unmapped range.
+		 *
+		 * Skip if either GFP_NOFS or GFP_NOIO are used.
+		 * kmemleak_free_part() internally allocates with
+		 * GFP_KERNEL, which could trigger a recursive deadlock
+		 * if we are under filesystem or I/O reclaim.
+		 */
+		if (new_nr_pages < vm->nr_pages && !vm_area_page_order(vm) &&
+		    !(vm->flags & (VM_FLUSH_RESET_PERMS | VM_USERMAP)) &&
+		    gfp_has_io_fs(flags)) {
+			unsigned long addr = (unsigned long)kasan_reset_tag(p);
+			unsigned int old_nr_pages = vm->nr_pages;
+
+			/* Notify kmemleak of the reduced allocation size before unmapping. */
+			kmemleak_free_part(
+				(void *)addr + ((unsigned long)new_nr_pages
+						<< PAGE_SHIFT),
+				(unsigned long)(old_nr_pages - new_nr_pages)
+					<< PAGE_SHIFT);
+
+			vunmap_range(addr + ((unsigned long)new_nr_pages
+					     << PAGE_SHIFT),
+				     addr + ((unsigned long)old_nr_pages
+					     << PAGE_SHIFT));
+
+			/*
+			 * Use the node lock to synchronize with concurrent
+			 * readers (vmalloc_info_show).
+			 */
+			struct vmap_node *vn = addr_to_node(addr);
+
+			spin_lock(&vn->busy.lock);
+			WRITE_ONCE(vm->nr_pages, new_nr_pages);
+			spin_unlock(&vn->busy.lock);
+
+			vm_area_free_pages(vm, new_nr_pages, old_nr_pages);
+		}
 		vm->requested_size = size;
 		kasan_vrealloc(p, old_size, size);
 		return (void *)p;
-- 
2.43.0

From: Shivam Kalra

Introduce a new test case "vrealloc_test" that exercises the
vrealloc() shrink and in-place grow paths:

 - Grow beyond allocated pages (triggers full reallocation).
 - Shrink crossing a page boundary (frees tail pages).
 - Shrink within the same page (no page freeing).
 - Grow within the already allocated page count (in-place).

Data integrity is validated after each realloc step by checking that
the first byte of the original allocation is preserved. The test is
gated behind run_test_mask bit 12 (id 4096).
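The new case can be run in isolation by loading the module with only
this bit set, e.g. (exact invocation may vary by setup):

	modprobe test_vmalloc run_test_mask=4096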
Signed-off-by: Shivam Kalra
---
 lib/test_vmalloc.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index 876c72c18a0c..b23f85e8f8ca 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -55,6 +55,7 @@ __param(int, run_test_mask, 7,
 		"\t\tid: 512, name: kvfree_rcu_2_arg_vmalloc_test\n"
 		"\t\tid: 1024, name: vm_map_ram_test\n"
 		"\t\tid: 2048, name: no_block_alloc_test\n"
+		"\t\tid: 4096, name: vrealloc_test\n"
 		/* Add a new test case description here. */
 );
 
@@ -421,6 +422,66 @@ vm_map_ram_test(void)
 	return nr_allocated != map_nr_pages;
 }
 
+static int vrealloc_test(void)
+{
+	void *ptr, *tmp;
+	int i;
+
+	for (i = 0; i < test_loop_count; i++) {
+		int err = -1;
+
+		ptr = vrealloc(NULL, PAGE_SIZE, GFP_KERNEL);
+		if (!ptr)
+			return -1;
+
+		*((__u8 *)ptr) = 'a';
+
+		/* Grow: beyond allocated pages, triggers full realloc. */
+		tmp = vrealloc(ptr, 4 * PAGE_SIZE, GFP_KERNEL);
+		if (!tmp)
+			goto error;
+		ptr = tmp;
+
+		if (*((__u8 *)ptr) != 'a')
+			goto error;
+
+		/* Shrink: crosses page boundary, frees tail pages. */
+		tmp = vrealloc(ptr, PAGE_SIZE, GFP_KERNEL);
+		if (!tmp)
+			goto error;
+		ptr = tmp;
+
+		if (*((__u8 *)ptr) != 'a')
+			goto error;
+
+		/* Shrink: within same page, no page freeing. */
+		tmp = vrealloc(ptr, PAGE_SIZE / 2, GFP_KERNEL);
+		if (!tmp)
+			goto error;
+		ptr = tmp;
+
+		if (*((__u8 *)ptr) != 'a')
+			goto error;
+
+		/* Grow: within allocated page, in-place, no realloc. */
+		tmp = vrealloc(ptr, PAGE_SIZE, GFP_KERNEL);
+		if (!tmp)
+			goto error;
+		ptr = tmp;
+
+		if (*((__u8 *)ptr) != 'a')
+			goto error;
+
+		err = 0;
+error:
+		vfree(ptr);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 struct test_case_desc {
 	const char *test_name;
 	int (*test_func)(void);
@@ -440,6 +501,7 @@ static struct test_case_desc test_case_array[] = {
 	{ "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test, },
 	{ "vm_map_ram_test", vm_map_ram_test, },
 	{ "no_block_alloc_test", no_block_alloc_test, true },
+	{ "vrealloc_test", vrealloc_test, },
 	/* Add a new test case here. */
 };
-- 
2.43.0