Introduce struct task_exec_state, a per-task RCU-protected structure that
holds the dumpable mode and stays attached to the task for its full
lifetime.

task_exec_state_rcu() is the canonical reader: asserts RCU or task_lock
is held, WARNs on a NULL state, returns the rcu_dereference()'d pointer.

Signed-off-by: Christian Brauner (Amutable)
Signed-off-by: Christian Brauner
---
 include/linux/sched.h            |   3 ++
 include/linux/sched/exec_state.h |  31 ++++++++++++
 kernel/Makefile                  |   2 +-
 kernel/exec_state.c              | 105 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 140 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ee06cba5c6f5..d895c3ff2154 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -962,6 +962,9 @@ struct task_struct {
 	struct mm_struct		*mm;
 	struct mm_struct		*active_mm;
 
+	/* Exec-time state outliving exit_mm(); see <linux/sched/exec_state.h>. */
+	struct task_exec_state __rcu	*exec_state;
+
 	int				exit_state;
 	int				exit_code;
 	int				exit_signal;
diff --git a/include/linux/sched/exec_state.h b/include/linux/sched/exec_state.h
new file mode 100644
index 000000000000..7a267efc34d3
--- /dev/null
+++ b/include/linux/sched/exec_state.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_EXEC_STATE_H
+#define _LINUX_SCHED_EXEC_STATE_H
+
+#include
+#include
+#include
+#include
+#include
+
+struct task_exec_state {
+	refcount_t count;		/* set to 1 at alloc; bumped by copy_exec_state() */
+	enum task_dumpable dumpable;	/* TASK_DUMPABLE_* mode, accessed with READ/WRITE_ONCE */
+	struct user_namespace *user_ns;	/* NOTE(review): no user in this patch; lifetime/refcounting TBD */
+	struct rcu_head rcu;		/* used by put_task_exec_state() for the call_rcu() free */
+};
+
+struct task_exec_state *alloc_task_exec_state(void);
+void put_task_exec_state(struct task_exec_state *es);
+struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk);
+struct task_exec_state *task_exec_state_replace(struct task_struct *tsk,
+						struct task_exec_state *exec_state);
+void task_exec_state_set_dumpable(enum task_dumpable value);
+enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task);
+void copy_exec_state(struct task_struct *tsk);
+void __init
exec_state_init(void);
+
+DEFINE_FREE(put_task_exec_state, struct task_exec_state *,
+	    if (_T) put_task_exec_state(_T))
+
+#endif /* _LINUX_SCHED_EXEC_STATE_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index 6785982013dc..1e1a31673577 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,7 +3,7 @@
 # Makefile for the linux kernel.
 #
 
-obj-y     = fork.o exec_domain.o panic.o \
+obj-y     = fork.o exec_domain.o exec_state.o panic.o \
 	    cpu.o exit.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o user.o \
 	    signal.o sys.o umh.o workqueue.o pid.o task_work.o \
diff --git a/kernel/exec_state.c b/kernel/exec_state.c
new file mode 100644
index 000000000000..85178b1d2c57
--- /dev/null
+++ b/kernel/exec_state.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static struct kmem_cache *task_exec_state_cachep;
+
+/* RCU callback: hand the exec_state embedding @rcu back to the slab cache. */
+static void __free_task_exec_state(struct rcu_head *rcu)
+{
+	struct task_exec_state *es = container_of(rcu, struct task_exec_state, rcu);
+
+	kmem_cache_free(task_exec_state_cachep, es);
+}
+
+/*
+ * Drop one reference on @es; a NULL @es is ignored. The final put frees
+ * the object through call_rcu() instead of immediately.
+ */
+void put_task_exec_state(struct task_exec_state *es)
+{
+	if (es && refcount_dec_and_test(&es->count))
+		call_rcu(&es->rcu, __free_task_exec_state);
+}
+
+/*
+ * Allocate a zeroed exec_state holding one reference, with dumpable set
+ * to TASK_DUMPABLE_OFF. Returns NULL if the allocation fails.
+ */
+struct task_exec_state *alloc_task_exec_state(void)
+{
+	struct task_exec_state *es;
+
+	/* Zeroed so ->user_ns is never handed out as uninitialized memory. */
+	es = kmem_cache_zalloc(task_exec_state_cachep, GFP_KERNEL);
+	if (!es)
+		return NULL;
+	refcount_set(&es->count, 1);
+	es->dumpable = TASK_DUMPABLE_OFF;
+	return es;
+}
+
+/*
+ * Canonical reader for tsk->exec_state. The caller must be in an RCU
+ * read-side critical section or hold task_lock(tsk). WARNs once when the
+ * pointer is NULL and returns whatever was read.
+ */
+struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk)
+{
+	struct task_exec_state *es;
+
+	/*
+	 * Plain rcu_dereference() would trip lockdep for callers holding
+	 * only task_lock, so name that lock as an accepted protection.
+	 */
+	es = rcu_dereference_check(tsk->exec_state,
+				   lockdep_is_held(&tsk->alloc_lock));
+	WARN_ON_ONCE(!es);
+	return es;
+}
+
+/*
+ * Install @exec_state as tsk->exec_state and return the previous pointer.
+ * No reference count is adjusted here; the old state is not put.
+ */
+struct task_exec_state *task_exec_state_replace(struct task_struct *tsk,
+						struct task_exec_state *exec_state)
+{
+	/*
+	 * Updates must hold both locks so callers needing a consistent
+	 * snapshot of mm + dumpability are covered.
+	 */
+	lockdep_assert_held(&tsk->alloc_lock);
+	lockdep_assert_held_write(&tsk->signal->exec_update_lock);
+
+	return rcu_replace_pointer(tsk->exec_state, exec_state, true);
+}
+
+/*
+ * exec_state is anchored to the execve() that established the current
+ * privilege domain. All clone() variants refcount-share it; only a
+ * subsequent execve() in the child swaps in a fresh one.
+ */
+void copy_exec_state(struct task_struct *tsk)
+{
+	struct task_exec_state *es;
+
+	/* current's own ->exec_state cannot be replaced under the running task. */
+	es = rcu_dereference_protected(current->exec_state, true);
+	refcount_inc(&es->count);
+	rcu_assign_pointer(tsk->exec_state, es);
+}
+
+/*
+ * Store TASK_DUMPABLE_* on current->exec_state. All callers
+ * (commit_creds, begin_new_exec, prctl(PR_SET_DUMPABLE)) act on the
+ * running task, which guarantees ->exec_state is allocated and cannot
+ * be replaced under us.
+ */
+void task_exec_state_set_dumpable(enum task_dumpable value)
+{
+	struct task_exec_state *es;
+
+	if (WARN_ON(value > TASK_DUMPABLE_ROOT))
+		value = TASK_DUMPABLE_OFF;
+
+	es = rcu_dereference_protected(current->exec_state, true);
+	WRITE_ONCE(es->dumpable, value);
+}
+
+/* Return @task's dumpable mode, read under an RCU read-side guard. */
+enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task)
+{
+	struct task_exec_state *es;
+
+	guard(rcu)();
+	es = rcu_dereference(task->exec_state);
+	return READ_ONCE(es->dumpable);
+}
+
+/* Boot-time setup of the slab cache backing struct task_exec_state. */
+void __init exec_state_init(void)
+{
+	task_exec_state_cachep = kmem_cache_create("task_exec_state",
+					sizeof(struct task_exec_state), 0,
+					SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+					NULL);
+}
-- 
2.47.3