Introduce struct task_exec_state, a per-task RCU-protected structure that holds the dumpable mode and stays attached to the task for its full lifetime. task_exec_state_rcu() is the canonical reader: asserts RCU or task_lock is held, WARNs on a NULL state, returns the rcu_dereference()'d pointer. Reviewed-by: Jann Horn Signed-off-by: Christian Brauner (Amutable) --- include/linux/sched.h | 2 + include/linux/sched/exec_state.h | 31 +++++++++++ kernel/Makefile | 2 +- kernel/exec_state.c | 116 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 150 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ee06cba5c6f5..6674dbf960b5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -962,6 +962,8 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; + struct task_exec_state __rcu *exec_state; + int exit_state; int exit_code; int exit_signal; diff --git a/include/linux/sched/exec_state.h b/include/linux/sched/exec_state.h new file mode 100644 index 000000000000..dc5a795cbfe2 --- /dev/null +++ b/include/linux/sched/exec_state.h @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Christian Brauner */ +#ifndef _LINUX_SCHED_EXEC_STATE_H +#define _LINUX_SCHED_EXEC_STATE_H + +#include +#include +#include +#include +#include + +struct task_exec_state { + refcount_t count; + enum task_dumpable dumpable; + struct user_namespace *user_ns; + struct rcu_head rcu; +}; + +struct task_exec_state *alloc_task_exec_state(struct user_namespace *user_ns); +void put_task_exec_state(struct task_exec_state *exec_state); +struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk); +struct task_exec_state *task_exec_state_replace(struct task_struct *tsk, + struct task_exec_state *exec_state); +void task_exec_state_set_dumpable(enum task_dumpable value); +enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task); +int task_exec_state_copy(struct task_struct *tsk); +void __init exec_state_init(void); + +DEFINE_FREE(put_task_exec_state, struct task_exec_state *, put_task_exec_state(_T)) + +#endif /* _LINUX_SCHED_EXEC_STATE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 6785982013dc..1e1a31673577 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the linux kernel. # -obj-y = fork.o exec_domain.o panic.o \ +obj-y = fork.o exec_domain.o exec_state.o panic.o \ cpu.o exit.o softirq.o resource.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ diff --git a/kernel/exec_state.c b/kernel/exec_state.c new file mode 100644 index 000000000000..a0ca5d913900 --- /dev/null +++ b/kernel/exec_state.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Christian Brauner */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct kmem_cache *task_exec_state_cachep; + +static void __free_task_exec_state(struct rcu_head *rcu) +{ + struct task_exec_state *exec_state = container_of(rcu, struct task_exec_state, rcu); + + put_user_ns(exec_state->user_ns); + kmem_cache_free(task_exec_state_cachep, exec_state); +} + +void put_task_exec_state(struct task_exec_state *exec_state) +{ + if (exec_state && refcount_dec_and_test(&exec_state->count)) + call_rcu(&exec_state->rcu, __free_task_exec_state); +} + +struct task_exec_state *alloc_task_exec_state(struct user_namespace *user_ns) +{ + struct task_exec_state *exec_state; + + exec_state = kmem_cache_alloc(task_exec_state_cachep, GFP_KERNEL); + if (!exec_state) + return NULL; + refcount_set(&exec_state->count, 1); + exec_state->dumpable = TASK_DUMPABLE_OFF; + exec_state->user_ns = get_user_ns(user_ns); + return exec_state; +} + +struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk) +{ + struct task_exec_state *exec_state; + + exec_state = rcu_dereference_check(tsk->exec_state, + lockdep_is_held(&tsk->alloc_lock)); + WARN_ON_ONCE(!exec_state); + return exec_state; +} + +struct task_exec_state *task_exec_state_replace(struct task_struct *tsk, + struct task_exec_state *exec_state) +{ + /* + * Updates must hold both locks so callers needing a consistent + * snapshot of mm + dumpability are covered. + */ + lockdep_assert_held(&tsk->alloc_lock); + lockdep_assert_held_write(&tsk->signal->exec_update_lock); + + return rcu_replace_pointer(tsk->exec_state, exec_state, true); +} + +/* + * The non-CLONE_VM clone path: allocate a fresh exec_state and + * inherit the parent's dumpable mode and user_ns reference. CLONE_VM + * siblings refcount-share via copy_exec_state() in fork.c; only this + * path and execve() ever allocate. + */ +int task_exec_state_copy(struct task_struct *tsk) +{ + struct task_exec_state *src, *dst; + + src = rcu_dereference_protected(current->exec_state, true); + dst = alloc_task_exec_state(src->user_ns); + if (!dst) + return -ENOMEM; + dst->dumpable = src->dumpable; + rcu_assign_pointer(tsk->exec_state, dst); + return 0; +} + +/* + * Store TASK_DUMPABLE_* on current->exec_state. All callers + * (commit_creds, begin_new_exec, prctl(PR_SET_DUMPABLE)) act on the + * running task, which guarantees ->exec_state is allocated and cannot + * be replaced under us. + */ +void task_exec_state_set_dumpable(enum task_dumpable value) +{ + struct task_exec_state *exec_state; + + if (WARN_ON(value > TASK_DUMPABLE_ROOT)) + value = TASK_DUMPABLE_OFF; + + exec_state = rcu_dereference_protected(current->exec_state, true); + WRITE_ONCE(exec_state->dumpable, value); +} + +enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task) +{ + struct task_exec_state *exec_state; + + guard(rcu)(); + exec_state = rcu_dereference(task->exec_state); + return READ_ONCE(exec_state->dumpable); +} + +void __init exec_state_init(void) +{ + task_exec_state_cachep = kmem_cache_create("task_exec_state", + sizeof(struct task_exec_state), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, + NULL); +} -- 2.47.3