Preparation. After the next change setup_new_exec() can fail.

Signed-off-by: Oleg Nesterov
---
 fs/binfmt_elf.c         | 4 +++-
 fs/binfmt_elf_fdpic.c   | 4 +++-
 fs/binfmt_flat.c        | 4 +++-
 fs/exec.c               | 4 +++-
 include/linux/binfmts.h | 2 +-
 5 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e4653bb99946..8a38689ae4b0 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1021,7 +1021,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	if (!(current->personality & ADDR_NO_RANDOMIZE) &&
 	    snapshot_randomize_va_space)
 		current->flags |= PF_RANDOMIZE;
 
-	setup_new_exec(bprm);
+	retval = setup_new_exec(bprm);
+	if (retval)
+		goto out_free_dentry;
 
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 48fd2de3bca0..7ad98b8132fc 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -351,7 +351,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 	if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
 		current->personality |= READ_IMPLIES_EXEC;
 
-	setup_new_exec(bprm);
+	retval = setup_new_exec(bprm);
+	if (retval)
+		goto error;
 
 	set_binfmt(&elf_fdpic_format);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b5b5ca1a44f7..78e9ca128ea7 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -512,7 +512,9 @@ static int load_flat_file(struct linux_binprm *bprm,
 	/* OK, This is the point of no return */
 	set_personality(PER_LINUX_32BIT);
 
-	setup_new_exec(bprm);
+	ret = setup_new_exec(bprm);
+	if (ret)
+		goto err;
 
 	/*
 	 * calculate the extra space we need to map in
diff --git a/fs/exec.c b/fs/exec.c
index 6b70c6726d31..136a7ab5d91c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1321,7 +1321,7 @@ void would_dump(struct linux_binprm *bprm, struct file *file)
 }
 EXPORT_SYMBOL(would_dump);
 
-void setup_new_exec(struct linux_binprm * bprm)
+int setup_new_exec(struct linux_binprm * bprm)
 {
 	/* Setup things that can depend upon the personality */
 	struct task_struct *me = current;
@@ -1337,6 +1337,8 @@ void setup_new_exec(struct linux_binprm * bprm)
 	me->mm->task_size = TASK_SIZE;
 	up_write(&me->signal->exec_update_lock);
 	mutex_unlock(&me->signal->cred_guard_mutex);
+
+	return 0;
 }
 EXPORT_SYMBOL(setup_new_exec);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 65abd5ab8836..678b7525ac5a 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -123,7 +123,7 @@ extern void unregister_binfmt(struct linux_binfmt *);
 extern int __must_check remove_arg_zero(struct linux_binprm *);
 extern int begin_new_exec(struct linux_binprm * bprm);
-extern void setup_new_exec(struct linux_binprm * bprm);
+extern int setup_new_exec(struct linux_binprm * bprm);
 extern void finalize_exec(struct linux_binprm *bprm);
 extern void would_dump(struct linux_binprm *, struct file *);
--
2.25.1.362.g51ebf55

This simple program:

	#include <pthread.h>
	#include <signal.h>
	#include <sys/ptrace.h>
	#include <unistd.h>

	void *thread(void *arg)
	{
		ptrace(PTRACE_TRACEME, 0,0,0);
		return NULL;
	}

	int main(void)
	{
		int pid = fork();

		if (!pid) {
			pthread_t pt;
			pthread_create(&pt, NULL, thread, NULL);
			pthread_join(pt, NULL);
			execlp("echo", "echo", "passed", NULL);
		}

		sleep(1);
		ptrace(PTRACE_ATTACH, pid, 0,0);
		kill(pid, SIGCONT);
		return 0;
	}

hangs because de_thread() sleeps with cred_guard_mutex held, waiting for the debugger to release the killed thread, while the debugger sleeps waiting for the same mutex. Not really that bad, the tracer can be killed, but still this is a bug and people hit it in practice.
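For local testing, the reproducer above can be turned into a self-terminating check. This is only a sketch, not part of the original report: the 5-second timeout, the SIGALRM watchdog and the diagnostic message are all illustrative choices. It relies on the fact that the attach side sleeps interruptibly on cred_guard_mutex, so on an unfixed kernel the blocked PTRACE_ATTACH can be interrupted by the alarm and fails with EINTR instead of hanging forever:

	#include <errno.h>
	#include <pthread.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <unistd.h>

	static void *thread(void *arg)
	{
		ptrace(PTRACE_TRACEME, 0, 0, 0);
		return NULL;
	}

	static void on_alarm(int sig)
	{
		/* empty; only interrupts the blocked ptrace() below */
	}

	int main(void)
	{
		/* no SA_RESTART, so a blocked ptrace() fails with EINTR */
		struct sigaction sa = { .sa_handler = on_alarm };
		int pid = fork();

		if (!pid) {
			pthread_t pt;
			pthread_create(&pt, NULL, thread, NULL);
			pthread_join(pt, NULL);
			execlp("echo", "echo", "passed", NULL);
			_exit(1);
		}

		sigaction(SIGALRM, &sa, NULL);
		sleep(1);
		alarm(5);
		if (ptrace(PTRACE_ATTACH, pid, 0, 0) && errno == EINTR)
			fprintf(stderr, "hang: attach blocked on cred_guard_mutex\n");
		alarm(0);
		kill(pid, SIGKILL);	/* clean up in either case */
		return 0;
	}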
With this patch:

	- de_thread() waits until all the sub-threads pass exit_notify()
	  and become zombies.

	- setup_new_exec() waits until all the sub-threads are reaped,
	  without cred_guard_mutex held.

	- unshare_sighand() and flush_signal_handlers() are moved from
	  begin_new_exec() to setup_new_exec(); we can't call them until
	  all the sub-threads have gone away.

Signed-off-by: Oleg Nesterov
---
 fs/exec.c       | 140 +++++++++++++++++++++++------------------------
 kernel/exit.c   |   9 ++--
 kernel/signal.c |   2 +-
 3 files changed, 71 insertions(+), 80 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 136a7ab5d91c..2bac7deb9a98 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -905,42 +905,56 @@ static int exec_mmap(struct mm_struct *mm)
 	return 0;
 }
 
-static int de_thread(struct task_struct *tsk)
+static int kill_sub_threads(struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
-	struct sighand_struct *oldsighand = tsk->sighand;
-	spinlock_t *lock = &oldsighand->siglock;
-
-	if (thread_group_empty(tsk))
-		goto no_thread_group;
+	int err = -EINTR;
 
-	/*
-	 * Kill all other threads in the thread group.
-	 */
-	spin_lock_irq(lock);
-	if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) {
-		/*
-		 * Another group action in progress, just
-		 * return so that the signal is processed.
-		 */
-		spin_unlock_irq(lock);
-		return -EAGAIN;
+	read_lock(&tasklist_lock);
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (!((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task)) {
+		sig->group_exec_task = tsk;
+		sig->notify_count = -zap_other_threads(tsk);
+		err = 0;
 	}
+	spin_unlock_irq(&tsk->sighand->siglock);
+	read_unlock(&tasklist_lock);
 
-	sig->group_exec_task = tsk;
-	sig->notify_count = zap_other_threads(tsk);
-	if (!thread_group_leader(tsk))
-		sig->notify_count--;
+	return err;
+}
 
-	while (sig->notify_count) {
-		__set_current_state(TASK_KILLABLE);
-		spin_unlock_irq(lock);
-		schedule();
+static int wait_for_notify_count(struct task_struct *tsk)
+{
+	for (;;) {
 		if (__fatal_signal_pending(tsk))
-			goto killed;
-		spin_lock_irq(lock);
+			return -EINTR;
+		set_current_state(TASK_KILLABLE);
+		if (!tsk->signal->notify_count)
+			break;
+		schedule();
 	}
-	spin_unlock_irq(lock);
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static void clear_group_exec_task(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+
+	/* protects against exit_notify() and __exit_signal() */
+	read_lock(&tasklist_lock);
+	sig->group_exec_task = NULL;
+	sig->notify_count = 0;
+	read_unlock(&tasklist_lock);
+}
+
+static int de_thread(struct task_struct *tsk)
+{
+	if (thread_group_empty(tsk))
+		goto no_thread_group;
+
+	if (kill_sub_threads(tsk) || wait_for_notify_count(tsk))
+		return -EINTR;
 
 	/*
 	 * At this point all other threads have exited, all we have to
@@ -948,26 +962,10 @@ static int de_thread(struct task_struct *tsk)
 	 * and to assume its PID:
 	 */
 	if (!thread_group_leader(tsk)) {
-		struct task_struct *leader = tsk->group_leader;
-
-		for (;;) {
-			cgroup_threadgroup_change_begin(tsk);
-			write_lock_irq(&tasklist_lock);
-			/*
-			 * Do this under tasklist_lock to ensure that
-			 * exit_notify() can't miss ->group_exec_task
-			 */
-			sig->notify_count = -1;
-			if (likely(leader->exit_state))
-				break;
-			__set_current_state(TASK_KILLABLE);
-			write_unlock_irq(&tasklist_lock);
-			cgroup_threadgroup_change_end(tsk);
-			schedule();
-			if (__fatal_signal_pending(tsk))
-				goto killed;
-		}
+		struct task_struct *leader = tsk->group_leader, *t;
+		cgroup_threadgroup_change_begin(tsk);
+		write_lock_irq(&tasklist_lock);
 
 		/*
 		 * The only record we have of the real-time age of a
		 * process, regardless of execs it's done, is start_time.
@@ -1000,8 +998,8 @@ static int de_thread(struct task_struct *tsk)
 		list_replace_rcu(&leader->tasks, &tsk->tasks);
 		list_replace_init(&leader->sibling, &tsk->sibling);
 
-		tsk->group_leader = tsk;
-		leader->group_leader = tsk;
+		for_each_thread(tsk, t)
+			t->group_leader = tsk;
 
 		tsk->exit_signal = SIGCHLD;
 		leader->exit_signal = -1;
@@ -1021,23 +1019,11 @@ static int de_thread(struct task_struct *tsk)
 		release_task(leader);
 	}
 
-	sig->group_exec_task = NULL;
-	sig->notify_count = 0;
-
 no_thread_group:
 	/* we have changed execution domain */
 	tsk->exit_signal = SIGCHLD;
 
-	BUG_ON(!thread_group_leader(tsk));
 	return 0;
-
-killed:
-	/* protects against exit_notify() and __exit_signal() */
-	read_lock(&tasklist_lock);
-	sig->group_exec_task = NULL;
-	sig->notify_count = 0;
-	read_unlock(&tasklist_lock);
-	return -EAGAIN;
 }
@@ -1171,13 +1157,6 @@ int begin_new_exec(struct linux_binprm * bprm)
 	flush_itimer_signals();
 #endif
 
-	/*
-	 * Make the signal table private.
-	 */
-	retval = unshare_sighand(me);
-	if (retval)
-		goto out_unlock;
-
 	me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC |
 					PF_NOFREEZE | PF_NO_SETAFFINITY);
 	flush_thread();
@@ -1249,7 +1228,6 @@ int begin_new_exec(struct linux_binprm * bprm)
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 	WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
-	flush_signal_handlers(me, 0);
 
 	retval = set_cred_ucounts(bprm->cred);
 	if (retval < 0)
@@ -1293,8 +1271,9 @@ int begin_new_exec(struct linux_binprm * bprm)
 	up_write(&me->signal->exec_update_lock);
 	if (!bprm->cred)
 		mutex_unlock(&me->signal->cred_guard_mutex);
-
 out:
+	if (me->signal->group_exec_task == me)
+		clear_group_exec_task(me);
 	return retval;
 }
 EXPORT_SYMBOL(begin_new_exec);
@@ -1325,6 +1304,8 @@ int setup_new_exec(struct linux_binprm * bprm)
 {
 	/* Setup things that can depend upon the personality */
 	struct task_struct *me = current;
+	struct signal_struct *sig = me->signal;
+	int err = 0;
 
 	arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);
@@ -1335,10 +1316,23 @@ int setup_new_exec(struct linux_binprm * bprm)
 	 * some architectures like powerpc
 	 */
 	me->mm->task_size = TASK_SIZE;
-	up_write(&me->signal->exec_update_lock);
-	mutex_unlock(&me->signal->cred_guard_mutex);
+	up_write(&sig->exec_update_lock);
+	mutex_unlock(&sig->cred_guard_mutex);
 
-	return 0;
+	if (sig->group_exec_task) {
+		spin_lock_irq(&me->sighand->siglock);
+		sig->notify_count = sig->nr_threads - 1;
+		spin_unlock_irq(&me->sighand->siglock);
+
+		err = wait_for_notify_count(me);
+		clear_group_exec_task(me);
+	}
+
+	if (!err)
+		err = unshare_sighand(me);
+	if (!err)
+		flush_signal_handlers(me, 0);
+	return err;
 }
 EXPORT_SYMBOL(setup_new_exec);
diff --git a/kernel/exit.c b/kernel/exit.c
index f041f0c05ebb..bcde78c97253 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -178,10 +178,7 @@ static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
 		tty = sig->tty;
 		sig->tty = NULL;
 	} else {
-		/*
-		 * If there is any task waiting for the group exit
-		 * then notify it:
-		 */
+		/* mt-exec, setup_new_exec() -> wait_for_notify_count() */
 		if (sig->notify_count > 0 && !--sig->notify_count)
 			wake_up_process(sig->group_exec_task);
 
@@ -766,8 +763,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		list_add(&tsk->ptrace_entry, &dead);
 	}
 
-	/* mt-exec, de_thread() is waiting for group leader */
-	if (unlikely(tsk->signal->notify_count < 0))
+	/* mt-exec, de_thread() -> wait_for_notify_count() */
+	if (tsk->signal->notify_count < 0 &&
+	    !++tsk->signal->notify_count)
 		wake_up_process(tsk->signal->group_exec_task);
 
 	write_unlock_irq(&tasklist_lock);
diff --git a/kernel/signal.c b/kernel/signal.c
index fe9190d84f28..334212044940 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1343,13 +1343,13 @@ int zap_other_threads(struct task_struct *p)
 
 	for_other_threads(p, t) {
 		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
-		count++;
 
 		/* Don't bother with already dead threads */
 		if (t->exit_state)
 			continue;
 		sigaddset(&t->pending.signal, SIGKILL);
 		signal_wake_up(t, 1);
+		count++;
 	}
 
 	return count;
--
2.25.1.362.g51ebf55

The previous patch fixed the deadlock where mt-exec waits for a debugger that should reap a zombie thread, but we can hit the same problem if the killed sub-thread stops in ptrace_event(PTRACE_EVENT_EXIT). Change ptrace_stop() to check signal->group_exec_task.

This is a user-visible change. But hopefully it can't break anything. Note that the semantics of PTRACE_EVENT_EXIT were never really defined; they depend on /dev/random. Just for example, currently a sub-thread killed by exec will stop, but if it exits on its own and races with exec it will not stop, so nobody can rely on PTRACE_EVENT_EXIT anyway.

We really need to finally define what PTRACE_EVENT_EXIT should actually do, but that needs other changes.

Signed-off-by: Oleg Nesterov
---
 kernel/signal.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/signal.c b/kernel/signal.c
index 334212044940..59f61e07905b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2376,6 +2376,10 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
 	if (!current->ptrace || __fatal_signal_pending(current))
 		return exit_code;
 
+	/* de_thread() -> wait_for_notify_count() waits for us */
+	if (current->signal->group_exec_task)
+		return exit_code;
+
 	set_special_state(TASK_TRACED);
 	current->jobctl |= JOBCTL_TRACED;
--
2.25.1.362.g51ebf55
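To make the user-visible part of this last change concrete, here is a hypothetical test sketch; the pipe-based tid handoff, the 2-second sleep and all names are illustrative choices, not taken from the patch. A sub-thread is traced with PTRACE_O_TRACEEXIT and then killed by the main thread's exec. Before this change the tracer observes a PTRACE_EVENT_EXIT stop for that thread; with it, the thread no longer stops and waitpid() reports its death by SIGKILL directly:

	#define _GNU_SOURCE
	#include <pthread.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <sys/syscall.h>
	#include <sys/wait.h>
	#include <unistd.h>

	static int pipefd[2];

	static void *thread(void *arg)
	{
		pid_t tid = syscall(SYS_gettid);

		ptrace(PTRACE_TRACEME, 0, 0, 0);
		write(pipefd[1], &tid, sizeof(tid));	/* hand our tid to the tracer */
		raise(SIGSTOP);				/* let the tracer set options */
		pause();				/* stay alive until exec kills us */
		return NULL;
	}

	int main(void)
	{
		pid_t pid, tid;
		int status;

		pipe(pipefd);
		pid = fork();
		if (!pid) {
			pthread_t pt;
			pthread_create(&pt, NULL, thread, NULL);
			sleep(2);	/* crude: give the tracer time to attach */
			execlp("echo", "echo", "passed", NULL);
			_exit(1);
		}

		read(pipefd[0], &tid, sizeof(tid));
		waitpid(tid, &status, __WALL);		/* SIGSTOP stop */
		ptrace(PTRACE_SETOPTIONS, tid, 0, PTRACE_O_TRACEEXIT);
		ptrace(PTRACE_CONT, tid, 0, 0);		/* suppress the SIGSTOP */

		waitpid(tid, &status, __WALL);		/* exec kills the sub-thread */
		if (WIFSTOPPED(status) &&
		    status >> 8 == (SIGTRAP | (PTRACE_EVENT_EXIT << 8))) {
			printf("old behaviour: PTRACE_EVENT_EXIT stop\n");
			ptrace(PTRACE_CONT, tid, 0, 0);
			waitpid(tid, &status, __WALL);	/* now reap it */
		} else if (WIFSIGNALED(status)) {
			printf("new behaviour: killed by signal %d, no stop\n",
			       WTERMSIG(status));
		}
		return 0;
	}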