Introduce a new test case, "no_block_alloc_test", that verifies non-blocking
allocations using __vmalloc() with the GFP_ATOMIC and GFP_NOWAIT flags. It is
recommended to build the kernel with CONFIG_DEBUG_ATOMIC_SLEEP enabled to help
catch "sleeping while atomic" issues. This test ensures that memory allocation
logic under atomic constraints does not inadvertently sleep.

Signed-off-by: Uladzislau Rezki (Sony)
---
 lib/test_vmalloc.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index 2815658ccc37..aae5f4910aff 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -54,6 +54,7 @@ __param(int, run_test_mask, 7,
         "\t\tid: 256, name: kvfree_rcu_1_arg_vmalloc_test\n"
         "\t\tid: 512, name: kvfree_rcu_2_arg_vmalloc_test\n"
         "\t\tid: 1024, name: vm_map_ram_test\n"
+        "\t\tid: 2048, name: no_block_alloc_test\n"
         /* Add a new test case description here. */
 );

@@ -283,6 +284,30 @@ static int fix_size_alloc_test(void)
         return 0;
 }

+static int no_block_alloc_test(void)
+{
+        void *ptr;
+        int i;
+
+        for (i = 0; i < test_loop_count; i++) {
+                bool use_atomic = !!(get_random_u8() % 2);
+                gfp_t gfp = use_atomic ? GFP_ATOMIC : GFP_NOWAIT;
+                unsigned long size = (nr_pages > 0 ? nr_pages : 1) * PAGE_SIZE;
+
+                preempt_disable();
+                ptr = __vmalloc(size, gfp);
+                preempt_enable();
+
+                if (!ptr)
+                        return -1;
+
+                *((__u8 *)ptr) = 0;
+                vfree(ptr);
+        }
+
+        return 0;
+}
+
 static int pcpu_alloc_test(void)
 {

@@ -411,6 +436,7 @@ static struct test_case_desc test_case_array[] = {
         { "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test, },
         { "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test, },
         { "vm_map_ram_test", vm_map_ram_test, },
+        { "no_block_alloc_test", no_block_alloc_test, true },
         /* Add a new test case here. */
 };

--
2.39.5

A test marked with "xfail = true" is expected to fail, but that does not mean
it is predetermined to fail. Remove the "xfail" check from the pass condition
so that such tests are counted as passed when they succeed.

Signed-off-by: Uladzislau Rezki (Sony)
---
 lib/test_vmalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index aae5f4910aff..6521c05c7816 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -500,7 +500,7 @@ static int test_func(void *private)
                 for (j = 0; j < test_repeat_count; j++) {
                         ret = test_case_array[index].test_func();

-                        if (!ret && !test_case_array[index].xfail)
+                        if (!ret)
                                 t->data[index].test_passed++;
                         else if (ret && test_case_array[index].xfail)
                                 t->data[index].test_xfailed++;
--
2.39.5

alloc_vmap_area() currently assumes that sleeping is allowed during
allocation. This is not true for callers which pass non-blocking GFP flags,
such as GFP_ATOMIC or GFP_NOWAIT.

This patch adds logic to detect whether the given gfp_mask permits blocking,
and avoids invoking might_sleep() or falling back to the reclaim path if
blocking is not allowed.

This makes alloc_vmap_area() safe for use in non-sleeping contexts, where it
could previously sleep unexpectedly and trigger warnings.
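As an illustration only (the helper below is a hypothetical sketch, not part
of the diff), the call pattern this series is meant to support mirrors the
"no_block_alloc_test" case added earlier:

  #include <linux/gfp.h>
  #include <linux/preempt.h>
  #include <linux/vmalloc.h>

  /*
   * Sketch: a non-blocking vmalloc attempt from a context that cannot
   * sleep. The allocation may still fail and return NULL; the point is
   * that it must not sleep or enter direct reclaim.
   */
  static void *try_atomic_vmalloc(unsigned long size)
  {
          void *p;

          preempt_disable();
          p = __vmalloc(size, GFP_NOWAIT);        /* must not sleep */
          preempt_enable();

          return p;       /* may be NULL, the caller must handle that */
  }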
Signed-off-by: Uladzislau Rezki (Sony)
---
 mm/vmalloc.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6dbcdceecae1..81b6d3bde719 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2017,6 +2017,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
         unsigned long freed;
         unsigned long addr;
         unsigned int vn_id;
+        bool allow_block;
         int purged = 0;
         int ret;

@@ -2026,7 +2027,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
         if (unlikely(!vmap_initialized))
                 return ERR_PTR(-EBUSY);

-        might_sleep();
+        allow_block = gfpflags_allow_blocking(gfp_mask);
+        might_sleep_if(allow_block);

         /*
          * If a VA is obtained from a global heap(if it fails here)
@@ -2065,8 +2067,16 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
          * If an allocation fails, the error value is
          * returned. Therefore trigger the overflow path.
          */
-        if (IS_ERR_VALUE(addr))
-                goto overflow;
+        if (IS_ERR_VALUE(addr)) {
+                if (allow_block)
+                        goto overflow;
+
+                /*
+                 * We can not trigger any reclaim logic because
+                 * sleeping is not allowed, thus fail an allocation.
+                 */
+                goto error;
+        }

         va->va_start = addr;
         va->va_end = addr + size;
@@ -2116,6 +2126,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
                 pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n",
                         size, vstart, vend);

+error:
         kmem_cache_free(vmap_area_cachep, va);
         return ERR_PTR(-EBUSY);
 }
--
2.39.5

The vm_area_alloc_pages() function uses cond_resched() to yield the CPU during
potentially long-running loops. However, these loops are not long-running
under normal conditions, and calling cond_resched() is also not allowed in
non-blocking contexts.

Remove these calls to ensure correctness in both blocking and non-blocking
contexts and to simplify the code path. The slow path of the page allocator
already includes reschedule points to mitigate latency.

This patch was tested on a !CONFIG_PREEMPT kernel with large allocation chunks
(~1GB) without triggering any "BUG: soft lockup" warnings.

Signed-off-by: Uladzislau Rezki (Sony)
---
 mm/vmalloc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 81b6d3bde719..b0255e0c74b3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3633,7 +3633,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                                                 pages + nr_allocated);

                 nr_allocated += nr;
-                cond_resched();

                 /*
                  * If zero or pages were obtained partly,
@@ -3675,7 +3674,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                 for (i = 0; i < (1U << order); i++)
                         pages[nr_allocated + i] = page + i;

-                cond_resched();
                 nr_allocated += 1U << order;
         }
--
2.39.5

The function kasan_populate_vmalloc() internally allocates a page using a
hardcoded GFP_KERNEL flag. This is not safe in contexts that require
non-blocking allocation flags, such as GFP_ATOMIC or GFP_NOWAIT, for example
on atomic vmalloc paths.

This patch modifies kasan_populate_vmalloc() and its helpers to accept a
gfp_mask argument and use it for page allocations, which allows the caller to
specify the correct allocation context. Also, when non-blocking flags are
used, apply_to_page_range() is wrapped in memalloc_noreclaim_save/restore() to
suppress potential reclaim behavior that may otherwise violate atomic
constraints.
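In isolation, that scope pattern looks roughly like the sketch below (the
helper and its callback are hypothetical, shown only to illustrate the
save/restore pairing driven by the caller's gfp_mask):

  #include <linux/gfp.h>
  #include <linux/sched/mm.h>

  /*
   * Hypothetical helper: if the caller's gfp_mask does not allow
   * blocking, suppress reclaim around a callback that performs
   * internal allocations (such as apply_to_page_range()).
   */
  static int run_populate_noblock(gfp_t gfp_mask,
                                  int (*populate)(void *arg), void *arg)
  {
          bool noblock = !gfpflags_allow_blocking(gfp_mask);
          unsigned int flags = 0;
          int ret;

          if (noblock)
                  flags = memalloc_noreclaim_save();

          ret = populate(arg);

          if (noblock)
                  memalloc_noreclaim_restore(flags);

          return ret;
  }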
Cc: Andrey Ryabinin
Cc: Alexander Potapenko
Signed-off-by: Uladzislau Rezki (Sony)
---
 include/linux/kasan.h |  6 +++---
 mm/kasan/shadow.c     | 22 +++++++++++++++-------
 mm/vmalloc.c          |  4 ++--
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 890011071f2b..fe5ce9215821 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -562,7 +562,7 @@ static inline void kasan_init_hw_tags(void) { }
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)

 void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
-int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
+int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask);
 void kasan_release_vmalloc(unsigned long start, unsigned long end,
                            unsigned long free_region_start,
                            unsigned long free_region_end,
@@ -574,7 +574,7 @@ static inline void kasan_populate_early_vm_area_shadow(void *start,
                                                        unsigned long size)
 { }
 static inline int kasan_populate_vmalloc(unsigned long start,
                                         unsigned long size)
-                                        unsigned long size)
+                                        unsigned long size, gfp_t gfp_mask)
 {
         return 0;
 }
@@ -610,7 +610,7 @@ static __always_inline void kasan_poison_vmalloc(const void *start,
 static inline void kasan_populate_early_vm_area_shadow(void *start,
                                                        unsigned long size) { }
 static inline int kasan_populate_vmalloc(unsigned long start,
-                                        unsigned long size)
+                                        unsigned long size, gfp_t gfp_mask)
 {
         return 0;
 }
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d2c70cd2afb1..5edfc1f6b53e 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -335,13 +335,13 @@ static void ___free_pages_bulk(struct page **pages, int nr_pages)
         }
 }

-static int ___alloc_pages_bulk(struct page **pages, int nr_pages)
+static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask)
 {
         unsigned long nr_populated, nr_total = nr_pages;
         struct page **page_array = pages;

         while (nr_pages) {
-                nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages);
+                nr_populated = alloc_pages_bulk(gfp_mask, nr_pages, pages);
                 if (!nr_populated) {
                         ___free_pages_bulk(page_array, nr_total - nr_pages);
                         return -ENOMEM;
@@ -353,25 +353,33 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages)
         return 0;
 }

-static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
+static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask)
 {
         unsigned long nr_pages, nr_total = PFN_UP(end - start);
+        bool noblock = !gfpflags_allow_blocking(gfp_mask);
         struct vmalloc_populate_data data;
+        unsigned int flags;
         int ret = 0;

-        data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+        data.pages = (struct page **)__get_free_page(gfp_mask | __GFP_ZERO);
         if (!data.pages)
                 return -ENOMEM;

         while (nr_total) {
                 nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
-                ret = ___alloc_pages_bulk(data.pages, nr_pages);
+                ret = ___alloc_pages_bulk(data.pages, nr_pages, gfp_mask);
                 if (ret)
                         break;

                 data.start = start;
+                if (noblock)
+                        flags = memalloc_noreclaim_save();
+
                 ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
                                           kasan_populate_vmalloc_pte, &data);
+                if (noblock)
+                        memalloc_noreclaim_restore(flags);
+
                 ___free_pages_bulk(data.pages, nr_pages);
                 if (ret)
                         break;
@@ -385,7 +393,7 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
         return ret;
 }

-int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
+int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask)
 {
         unsigned long shadow_start, shadow_end;
         int ret;

@@ -414,7 +422,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
         shadow_start = PAGE_ALIGN_DOWN(shadow_start);
         shadow_end = PAGE_ALIGN(shadow_end);

-        ret = __kasan_populate_vmalloc(shadow_start, shadow_end);
+        ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask);
         if (ret)
                 return ret;

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b0255e0c74b3..7f48a54ec108 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2099,7 +2099,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
         BUG_ON(va->va_start < vstart);
         BUG_ON(va->va_end > vend);

-        ret = kasan_populate_vmalloc(addr, size);
+        ret = kasan_populate_vmalloc(addr, size, gfp_mask);
         if (ret) {
                 free_vmap_area(va);
                 return ERR_PTR(ret);
@@ -4835,7 +4835,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,

         /* populate the kasan shadow space */
         for (area = 0; area < nr_vms; area++) {
-                if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
+                if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL))
                         goto err_free_shadow;
         }
--
2.39.5

__vmalloc_area_node() may call free_vmap_area() or vfree() on error paths,
both of which can sleep. This becomes problematic if the function is invoked
from an atomic context, such as when GFP_ATOMIC or GFP_NOWAIT is passed via
gfp_mask.

To fix this, unify the error paths and defer the cleanup of partially
initialized vm_struct objects to a workqueue. This ensures that freeing
happens in process context and avoids invalid sleeps in atomic regions.
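The deferral itself follows the usual llist-plus-workqueue pattern. A
standalone sketch of that pattern, with hypothetical names and independent of
the vm_struct specifics in the diff below, looks like this:

  #include <linux/llist.h>
  #include <linux/slab.h>
  #include <linux/workqueue.h>

  struct deferred_item {
          struct llist_node llnode;
          /* payload that must be freed in process context */
  };

  static LLIST_HEAD(pending_items);

  static void cleanup_items_work(struct work_struct *work)
  {
          struct llist_node *head = llist_del_all(&pending_items);
          struct deferred_item *item, *tmp;

          llist_for_each_entry_safe(item, tmp, head, llnode)
                  kfree(item);    /* sleeping is fine here */
  }
  static DECLARE_WORK(cleanup_items, cleanup_items_work);

  /* Safe to call from atomic context. */
  static void defer_item_cleanup(struct deferred_item *item)
  {
          /* llist_add() returns true if the list was previously empty. */
          if (llist_add(&item->llnode, &pending_items))
                  schedule_work(&cleanup_items);
  }

Scheduling the work only on the empty-to-non-empty transition keeps the number
of schedule_work() calls low; the work handler then drains whatever has
accumulated.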
Signed-off-by: Uladzislau Rezki (Sony)
---
 include/linux/vmalloc.h |  6 +++++-
 mm/vmalloc.c            | 34 +++++++++++++++++++++++++++++++---
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index fdc9aeb74a44..b1425fae8cbf 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -50,7 +50,11 @@ struct iov_iter;        /* in uio.h */
 #endif

 struct vm_struct {
-        struct vm_struct        *next;
+        union {
+                struct vm_struct *next;   /* Early registration of vm_areas. */
+                struct llist_node llnode; /* Asynchronous freeing on error paths. */
+        };
+
         void                    *addr;
         unsigned long           size;
         unsigned long           flags;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7f48a54ec108..2424f80d524a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3680,6 +3680,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
         return nr_allocated;
 }

+static LLIST_HEAD(pending_vm_area_cleanup);
+static void cleanup_vm_area_work(struct work_struct *work)
+{
+        struct vm_struct *area, *tmp;
+        struct llist_node *head;
+
+        head = llist_del_all(&pending_vm_area_cleanup);
+        if (!head)
+                return;
+
+        llist_for_each_entry_safe(area, tmp, head, llnode) {
+                if (!area->pages)
+                        free_vm_area(area);
+                else
+                        vfree(area->addr);
+        }
+}
+
+/*
+ * Helper for __vmalloc_area_node() to defer cleanup
+ * of partially initialized vm_struct in error paths.
+ */
+static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work);
+static void defer_vm_area_cleanup(struct vm_struct *area)
+{
+        if (llist_add(&area->llnode, &pending_vm_area_cleanup))
+                schedule_work(&cleanup_vm_area);
+}
+
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                                  pgprot_t prot, unsigned int page_shift,
                                  int node)
@@ -3711,8 +3740,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                 warn_alloc(gfp_mask, NULL,
                         "vmalloc error: size %lu, failed to allocated page array size %lu",
                         nr_small_pages * PAGE_SIZE, array_size);
-                free_vm_area(area);
-                return NULL;
+                goto fail;
         }

         set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
@@ -3789,7 +3817,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
         return area->addr;

 fail:
-        vfree(area->addr);
+        defer_vm_area_cleanup(area);
         return NULL;
 }
--
2.39.5

This patch makes __vmalloc_area_node() correctly handle non-blocking
allocation requests, such as GFP_ATOMIC and GFP_NOWAIT.

Main changes:

- Add __GFP_HIGHMEM to gfp_mask only for blocking requests, if there are no
  DMA constraints.

- Wrap vmap_page_range() in memalloc_noreclaim_save/restore() to avoid
  memory-reclaim-related operations that could sleep during page table setup
  or while mapping pages. This is particularly important for page table
  allocations that internally use GFP_PGTABLE_KERNEL, which may sleep unless
  such scope restrictions are applied. For example:

    __pte_alloc_kernel()
      pte_alloc_one_kernel(&init_mm);
        pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0);

Note: in most cases, PTE entries are established only up to the level required
by current vmap space usage, meaning the page tables are typically already
fully populated during the mapping process.
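Taken together, the scope selection can be read as the pair of hypothetical
helpers sketched below; the actual patch open-codes this choice directly in
__vmalloc_area_node():

  #include <linux/gfp.h>
  #include <linux/sched/mm.h>

  /*
   * Hypothetical wrappers: non-blocking masks suppress reclaim entirely,
   * masks that allow IO but not FS enter a NOFS scope, and masks that
   * allow neither enter a NOIO scope.
   */
  static unsigned int vmalloc_scope_save(gfp_t gfp_mask, bool noblock)
  {
          if (noblock)
                  return memalloc_noreclaim_save();
          if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
                  return memalloc_nofs_save();
          if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
                  return memalloc_noio_save();
          return 0;
  }

  static void vmalloc_scope_restore(gfp_t gfp_mask, bool noblock, unsigned int flags)
  {
          if (noblock)
                  memalloc_noreclaim_restore(flags);
          else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
                  memalloc_nofs_restore(flags);
          else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
                  memalloc_noio_restore(flags);
  }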
Signed-off-by: Uladzislau Rezki (Sony)
---
 mm/vmalloc.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2424f80d524a..8a7eab810561 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3721,12 +3721,20 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
         unsigned int nr_small_pages = size >> PAGE_SHIFT;
         unsigned int page_order;
         unsigned int flags;
+        bool noblock;
         int ret;

         array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
+        noblock = !gfpflags_allow_blocking(gfp_mask);

-        if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
-                gfp_mask |= __GFP_HIGHMEM;
+        if (noblock) {
+                /* __GFP_NOFAIL and "noblock" flags are mutually exclusive. */
+                nofail = false;
+        } else {
+                /* Allow highmem allocations if there are no DMA constraints. */
+                if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
+                        gfp_mask |= __GFP_HIGHMEM;
+        }

         /* Please note that the recursion is strictly bounded. */
         if (array_size > PAGE_SIZE) {
@@ -3790,7 +3798,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
          * page tables allocations ignore external gfp mask, enforce it
          * by the scope API
          */
-        if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+        if (noblock)
+                flags = memalloc_noreclaim_save();
+        else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
                 flags = memalloc_nofs_save();
         else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
                 flags = memalloc_noio_save();
@@ -3802,7 +3812,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                 schedule_timeout_uninterruptible(1);
         } while (nofail && (ret < 0));

-        if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+        if (noblock)
+                memalloc_noreclaim_restore(flags);
+        else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
                 memalloc_nofs_restore(flags);
         else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
                 memalloc_noio_restore(flags);
--
2.39.5

The memory allocator already avoids reclaim when PF_MEMALLOC is set. Clear
__GFP_DIRECT_RECLAIM explicitly in current_gfp_context() as well, so that the
effective gfp mask reflects this and might_alloc() warnings are suppressed.

Signed-off-by: Uladzislau Rezki (Sony)
---
 include/linux/sched/mm.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2201da0afecc..8332fc09f8ac 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -246,12 +246,14 @@ static inline bool in_vfork(struct task_struct *tsk)
  * PF_MEMALLOC_NOIO implies GFP_NOIO
  * PF_MEMALLOC_NOFS implies GFP_NOFS
  * PF_MEMALLOC_PIN  implies !GFP_MOVABLE
+ * PF_MEMALLOC      implies !__GFP_DIRECT_RECLAIM
  */
 static inline gfp_t current_gfp_context(gfp_t flags)
 {
         unsigned int pflags = READ_ONCE(current->flags);

-        if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) {
+        if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS |
+                               PF_MEMALLOC_PIN | PF_MEMALLOC))) {
                 /*
                  * NOIO implies both NOIO and NOFS and it is a weaker context
                  * so always make sure it makes precedence
@@ -263,6 +265,9 @@ static inline gfp_t current_gfp_context(gfp_t flags)

                 if (pflags & PF_MEMALLOC_PIN)
                         flags &= ~__GFP_MOVABLE;
+
+                if (pflags & PF_MEMALLOC)
+                        flags &= ~__GFP_DIRECT_RECLAIM;
         }
         return flags;
 }
--
2.39.5
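For illustration, the effect of the last change can be seen in a hypothetical
PF_MEMALLOC scope such as the one below (sketch only; memalloc_noreclaim_save()
sets PF_MEMALLOC):

  #include <linux/bug.h>
  #include <linux/gfp.h>
  #include <linux/sched/mm.h>

  /*
   * Illustration only: inside a PF_MEMALLOC scope, current_gfp_context()
   * now strips __GFP_DIRECT_RECLAIM, so might_alloc() does not warn even
   * if the caller passed a blocking mask such as GFP_KERNEL.
   */
  static void noreclaim_scope_example(void)
  {
          unsigned int flags = memalloc_noreclaim_save();
          gfp_t effective = current_gfp_context(GFP_KERNEL);

          WARN_ON(effective & __GFP_DIRECT_RECLAIM); /* cleared by the scope */

          memalloc_noreclaim_restore(flags);
  }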