Store the task pointer in the ptes of the unpopulated pages of dynamic stacks, so that the owning task (and, through it, the stack's vm_struct pointer) can be retrieved from a stack address without relying on any locks or on current. This relies on being able to pack the struct task_struct pointer into a pte. Since the struct is 64-byte aligned, the low bits of the pointer are always zero and can be shifted out; that gives 5 bits of leeway, which should be viable on most architectures. Any architecture which enables dynamic thread stacks must provide make_data_kpte() and unpack_data_kpte(), which pack/unpack a right-shifted pointer value into/from a pte. Signed-off-by: David Stevens --- include/linux/sched/task_stack.h | 1 + kernel/fork.c | 74 +++++++++++++++++++++++++++++--- mm/vmalloc.c | 2 +- 3 files changed, 69 insertions(+), 8 deletions(-) diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index 7dcff2836d7e..7cf00ce97f7c 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -105,6 +105,7 @@ void exit_task_stack_account(struct task_struct *tsk); void dynamic_stack_refill_pages(void); unsigned long dynamic_stack_accounting(struct task_struct *tsk, bool finalize); bool dynamic_stack_fault(struct task_struct *tsk, unsigned long address, bool *on_stack); +struct task_struct *task_from_stack_address(unsigned long address); /* * Refill and charge for the used pages. 
diff --git a/kernel/fork.c b/kernel/fork.c index 9ac9d23f5f4b..733fc1f58b8b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -296,16 +296,40 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) static DEFINE_PER_CPU(struct page *, dynamic_stack_pages[DYNSTK_PAGE_POOL_NR]); +#define TASK_PTR_SHIFT (ilog2(__alignof__(struct task_struct))) + static void link_vmap_stack_to_task(struct task_struct *tsk, struct vm_struct *vm_area) { + int i; + unsigned long addr; + pte_t *ptep, pte; + + pte = make_data_kpte(((unsigned long)tsk) >> TASK_PTR_SHIFT); + tsk->stack_vm_area = vm_area; tsk->packed_stack = (unsigned long)kasan_reset_tag(vm_area->addr); + + addr = (unsigned long)vm_area->addr; + ptep = virt_to_kpte(addr); + for (i = vm_area->nr_pages; i < THREAD_SIZE >> PAGE_SHIFT; + i++, addr += PAGE_SIZE, ptep++) + set_pte_at(&init_mm, addr, ptep, pte); } -static void free_vmap_stack(struct vm_struct *vm_area) +static void free_vmap_stack(struct vm_struct *vm_area, bool was_mapped) { int i; + /* Clear data kptes since vunmap expects present or none. 
*/ + if (was_mapped) { + unsigned long addr = (unsigned long)vm_area->addr; + pte_t *ptep = virt_to_kpte(addr); + unsigned int nr_to_clear = (THREAD_SIZE >> PAGE_SHIFT) - vm_area->nr_pages; + + if (nr_to_clear) + clear_ptes(&init_mm, addr, ptep, nr_to_clear); + } + remove_vm_area(vm_area->addr); for (i = 0; i < vm_area->nr_pages; i++) @@ -354,7 +378,7 @@ static struct vm_struct *alloc_vmap_stack(int node) return vm_area; cleanup_err: - free_vmap_stack(vm_area); + free_vmap_stack(vm_area, false); return NULL; } @@ -477,6 +501,42 @@ unsigned long dynamic_stack_accounting(struct task_struct *tsk, bool finalize) return i; } +noinstr struct task_struct *task_from_stack_address(unsigned long address) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + BUILD_BUG_ON((BITS_PER_LONG - TASK_PTR_SHIFT) > KPTE_AVAILABLE_DATA_BITS); + + if (!is_vmalloc_addr((void *)address)) + return NULL; + + pgd = pgd_offset_k(address); + if (pgd_none(*pgd) || pgd_leaf(*pgd)) + return NULL; + + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || p4d_leaf(*p4d)) + return NULL; + + pud = pud_offset(p4d, address); + if (pud_none(*pud) || pud_leaf(*pud)) + return NULL; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || pmd_leaf(*pmd)) + return NULL; + + pte = pte_offset_kernel(pmd, address); + if (pte_present(*pte) || pte_none(*pte)) + return NULL; + + return (struct task_struct *)(unpack_data_kpte(*pte) << TASK_PTR_SHIFT); +} + bool noinstr dynamic_stack_fault(struct task_struct *tsk, unsigned long address, bool *on_stack) { unsigned long stack, hole_end, addr; @@ -570,7 +630,7 @@ static inline struct vm_struct *alloc_vmap_stack(int node) return stack ? 
find_vm_area(stack) : NULL; } -static inline void free_vmap_stack(struct vm_struct *vm_area) +static inline void free_vmap_stack(struct vm_struct *vm_area, bool was_mapped) { vfree(vm_area->addr); } @@ -590,7 +650,7 @@ static void thread_stack_free_work(struct work_struct *work) if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; - free_vmap_stack(vm_area); + free_vmap_stack(vm_area, true); } static void thread_stack_delayed_free(struct task_struct *tsk) @@ -618,7 +678,7 @@ static int free_vm_stack_cache(unsigned int cpu) if (!vm_area) continue; - free_vmap_stack(vm_area); + free_vmap_stack(vm_area, true); cached_vm_stack_areas[i] = NULL; } @@ -653,7 +713,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) unsigned long memset_offset = 0; if (memcg_charge_kernel_stack(vm_area)) { - free_vmap_stack(vm_area); + free_vmap_stack(vm_area, true); return -ENOMEM; } @@ -674,7 +734,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) return -ENOMEM; if (memcg_charge_kernel_stack(vm_area)) { - free_vmap_stack(vm_area); + free_vmap_stack(vm_area, true); return -ENOMEM; } link_vmap_stack_to_task(tsk, vm_area); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 39b7e118cbce..76955c101180 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -76,7 +76,7 @@ early_param("nohugevmalloc", set_nohugevmalloc); static const bool vmap_allow_huge = false; #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ -bool is_vmalloc_addr(const void *x) +noinstr bool is_vmalloc_addr(const void *x) { unsigned long addr = (unsigned long)kasan_reset_tag(x); -- 2.54.0.rc2.544.gc7ae2d5bb8-goog