Some architectures invoke pagetable_alloc() with preemption disabled (e.g., arm64’s linear_map_split_to_ptes()). Under PREEMPT_RT, calling pagetable_alloc() with preemption disabled is not allowed, because it may acquire a spin lock that becomes sleepable on RT, potentially causing a sleep during page allocation. To address this, introduce a pagetable_alloc_nolock() API and permit two additional GFP flags for alloc_pages_nolock() — __GFP_HIGH and __GFP_ZERO. Signed-off-by: Yeoreum Yun --- include/linux/mm.h | 18 ++++++++++++++++++ kernel/bpf/stream.c | 2 +- kernel/bpf/syscall.c | 2 +- mm/page_alloc.c | 10 +++------- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c79b3369b82..11a27f60838b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2990,6 +2990,24 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde } #define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) +/** + * pagetable_alloc_nolock - opportunistic reentrant page table allocation + * from any context + * @gfp: GFP flags. Only __GFP_ZERO, __GFP_HIGH, __GFP_ACCOUNT allowed. + * @order: desired pagetable order + * + * Opportunistic reentrant version of pagetable_alloc(). + * + * Return: The ptdesc describing the allocated page tables. + */ +static inline struct ptdesc *pagetable_alloc_nolock_noprof(gfp_t gfp, unsigned int order) +{ + struct page *page = alloc_pages_nolock_noprof(gfp, NUMA_NO_NODE, order); + + return page_ptdesc(page); +} +#define pagetable_alloc_nolock(...) \
alloc_hooks(pagetable_alloc_nolock_noprof(__VA_ARGS__)) + /** * pagetable_free - Free pagetables * @pt: The page table descriptor diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index ff16c631951b..3c80c8007d91 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -83,7 +83,7 @@ static struct bpf_stream_page *bpf_stream_page_replace(void) struct bpf_stream_page *stream_page, *old_stream_page; struct page *page; - page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0); + page = alloc_pages_nolock(/* Don't account */ __GFP_ZERO, NUMA_NO_NODE, 0); if (!page) return NULL; stream_page = page_address(page); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8a129746bd6c..cbc0f8d0c18b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -598,7 +598,7 @@ static bool can_alloc_pages(void) static struct page *__bpf_alloc_page(int nid) { if (!can_alloc_pages()) - return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0); + return alloc_pages_nolock(__GFP_ZERO | __GFP_ACCOUNT, nid, 0); return alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ed82ee55e66a..88a920dc1e9a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7542,21 +7542,17 @@ struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned * various contexts. We cannot use printk_deferred_enter() to mitigate, * since the running context is unknown. * - * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below - * is safe in any context. Also zeroing the page is mandatory for - * BPF use cases. - * * Though __GFP_NOMEMALLOC is not checked in the code path below, * specify it here to highlight that alloc_pages_nolock() * doesn't want to deplete reserves. 
*/ - gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | __GFP_COMP | gfp_flags; unsigned int alloc_flags = ALLOC_TRYLOCK; struct alloc_context ac = { }; struct page *page; - VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT); + VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_HIGH | __GFP_ZERO | __GFP_ACCOUNT)); /* * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is * unsafe in NMI. If spin_trylock() is called from hard IRQ the current @@ -7602,7 +7598,7 @@ struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned } /** * alloc_pages_nolock - opportunistic reentrant allocation from any context - * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed. + * @gfp_flags: GFP flags. Only __GFP_ZERO, __GFP_HIGH, __GFP_ACCOUNT allowed. * @nid: node to allocate from * @order: allocation order size * -- LEVI:{C3F47F37-75D8-414A-A8BA-3980EC8A46D7}