Preparation. After the next change setup_new_exec() can fail.

Signed-off-by: Oleg Nesterov
---
 fs/binfmt_elf.c         | 4 +++-
 fs/binfmt_elf_fdpic.c   | 4 +++-
 fs/binfmt_flat.c        | 4 +++-
 fs/exec.c               | 4 +++-
 include/linux/binfmts.h | 2 +-
 5 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e4653bb99946..8a38689ae4b0 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1021,7 +1021,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	if (!(current->personality & ADDR_NO_RANDOMIZE) &&
 	    snapshot_randomize_va_space)
 		current->flags |= PF_RANDOMIZE;
 
-	setup_new_exec(bprm);
+	retval = setup_new_exec(bprm);
+	if (retval)
+		goto out_free_dentry;
 
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 48fd2de3bca0..7ad98b8132fc 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -351,7 +351,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 	if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
 		current->personality |= READ_IMPLIES_EXEC;
 
-	setup_new_exec(bprm);
+	retval = setup_new_exec(bprm);
+	if (retval)
+		goto error;
 
 	set_binfmt(&elf_fdpic_format);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b5b5ca1a44f7..78e9ca128ea7 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -512,7 +512,9 @@ static int load_flat_file(struct linux_binprm *bprm,
 	/* OK, This is the point of no return */
 	set_personality(PER_LINUX_32BIT);
 
-	setup_new_exec(bprm);
+	ret = setup_new_exec(bprm);
+	if (ret)
+		goto err;
 
 	/*
 	 * calculate the extra space we need to map in
diff --git a/fs/exec.c b/fs/exec.c
index 6b70c6726d31..136a7ab5d91c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1321,7 +1321,7 @@ void would_dump(struct linux_binprm *bprm, struct file *file)
 }
 EXPORT_SYMBOL(would_dump);
 
-void setup_new_exec(struct linux_binprm * bprm)
+int setup_new_exec(struct linux_binprm * bprm)
 {
 	/* Setup things that can depend upon the personality */
 	struct task_struct *me = current;
@@ -1337,6 +1337,8 @@ void setup_new_exec(struct linux_binprm * bprm)
 	me->mm->task_size = TASK_SIZE;
 	up_write(&me->signal->exec_update_lock);
 	mutex_unlock(&me->signal->cred_guard_mutex);
+
+	return 0;
 }
 EXPORT_SYMBOL(setup_new_exec);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 65abd5ab8836..678b7525ac5a 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -123,7 +123,7 @@ extern void unregister_binfmt(struct linux_binfmt *);
 extern int __must_check remove_arg_zero(struct linux_binprm *);
 extern int begin_new_exec(struct linux_binprm * bprm);
-extern void setup_new_exec(struct linux_binprm * bprm);
+extern int setup_new_exec(struct linux_binprm * bprm);
 extern void finalize_exec(struct linux_binprm *bprm);
 extern void would_dump(struct linux_binprm *, struct file *);
--
2.25.1.362.g51ebf55

This simple program:

	#include <pthread.h>
	#include <signal.h>
	#include <sys/ptrace.h>
	#include <unistd.h>

	void *thread(void *arg)
	{
		ptrace(PTRACE_TRACEME, 0,0,0);
		return NULL;
	}

	int main(void)
	{
		int pid = fork();

		if (!pid) {
			pthread_t pt;
			pthread_create(&pt, NULL, thread, NULL);
			pthread_join(pt, NULL);
			execlp("echo", "echo", "passed", NULL);
		}

		sleep(1);
		ptrace(PTRACE_ATTACH, pid, 0,0);
		kill(pid, SIGCONT);
		return 0;
	}

hangs because de_thread() sleeps with cred_guard_mutex held, waiting for the debugger to release the killed thread, while the debugger sleeps waiting for the same mutex. Not really that bad, the tracer can be killed, but still this is a bug and people hit it in practice.
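For local testing, the reproducer above can be turned into a self-terminating check. This is only a sketch, not part of the original report: the 5-second timeout, the SIGALRM watchdog and the diagnostic message are all illustrative choices. It relies on the fact that the attach side sleeps interruptibly on cred_guard_mutex, so on an unfixed kernel the blocked PTRACE_ATTACH can be interrupted by the alarm and fails with EINTR instead of hanging forever:

	#include <errno.h>
	#include <pthread.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <unistd.h>

	static void *thread(void *arg)
	{
		ptrace(PTRACE_TRACEME, 0, 0, 0);
		return NULL;
	}

	static void on_alarm(int sig)
	{
		/* empty; only interrupts the blocked ptrace() below */
	}

	int main(void)
	{
		/* no SA_RESTART, so a blocked ptrace() fails with EINTR */
		struct sigaction sa = { .sa_handler = on_alarm };
		int pid = fork();

		if (!pid) {
			pthread_t pt;
			pthread_create(&pt, NULL, thread, NULL);
			pthread_join(pt, NULL);
			execlp("echo", "echo", "passed", NULL);
			_exit(1);
		}

		sigaction(SIGALRM, &sa, NULL);
		sleep(1);
		alarm(5);
		if (ptrace(PTRACE_ATTACH, pid, 0, 0) && errno == EINTR)
			fprintf(stderr, "hang: attach blocked on cred_guard_mutex\n");
		alarm(0);
		kill(pid, SIGKILL);	/* clean up in either case */
		return 0;
	}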
With this patch:

	- de_thread() waits until all the sub-threads pass exit_notify()
	  and become zombies.

	- setup_new_exec() waits until all the sub-threads are reaped,
	  without cred_guard_mutex held.

	- unshare_sighand() and flush_signal_handlers() are moved from
	  begin_new_exec() to setup_new_exec(); we can't call them until
	  all the sub-threads have gone away.

Signed-off-by: Oleg Nesterov
---
 fs/exec.c       | 140 +++++++++++++++++++++++------------------------
 kernel/exit.c   |   9 ++--
 kernel/signal.c |   2 +-
 3 files changed, 71 insertions(+), 80 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 136a7ab5d91c..2bac7deb9a98 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -905,42 +905,56 @@ static int exec_mmap(struct mm_struct *mm)
 	return 0;
 }
 
-static int de_thread(struct task_struct *tsk)
+static int kill_sub_threads(struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
-	struct sighand_struct *oldsighand = tsk->sighand;
-	spinlock_t *lock = &oldsighand->siglock;
-
-	if (thread_group_empty(tsk))
-		goto no_thread_group;
+	int err = -EINTR;
 
-	/*
-	 * Kill all other threads in the thread group.
-	 */
-	spin_lock_irq(lock);
-	if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) {
-		/*
-		 * Another group action in progress, just
-		 * return so that the signal is processed.
-		 */
-		spin_unlock_irq(lock);
-		return -EAGAIN;
+	read_lock(&tasklist_lock);
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (!((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task)) {
+		sig->group_exec_task = tsk;
+		sig->notify_count = -zap_other_threads(tsk);
+		err = 0;
 	}
+	spin_unlock_irq(&tsk->sighand->siglock);
+	read_unlock(&tasklist_lock);
 
-	sig->group_exec_task = tsk;
-	sig->notify_count = zap_other_threads(tsk);
-	if (!thread_group_leader(tsk))
-		sig->notify_count--;
+	return err;
+}
 
-	while (sig->notify_count) {
-		__set_current_state(TASK_KILLABLE);
-		spin_unlock_irq(lock);
-		schedule();
+static int wait_for_notify_count(struct task_struct *tsk)
+{
+	for (;;) {
 		if (__fatal_signal_pending(tsk))
-			goto killed;
-		spin_lock_irq(lock);
+			return -EINTR;
+		set_current_state(TASK_KILLABLE);
+		if (!tsk->signal->notify_count)
+			break;
+		schedule();
 	}
-	spin_unlock_irq(lock);
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static void clear_group_exec_task(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+
+	/* protects against exit_notify() and __exit_signal() */
+	read_lock(&tasklist_lock);
+	sig->group_exec_task = NULL;
+	sig->notify_count = 0;
+	read_unlock(&tasklist_lock);
+}
+
+static int de_thread(struct task_struct *tsk)
+{
+	if (thread_group_empty(tsk))
+		goto no_thread_group;
+
+	if (kill_sub_threads(tsk) || wait_for_notify_count(tsk))
+		return -EINTR;
 
 	/*
 	 * At this point all other threads have exited, all we have to
@@ -948,26 +962,10 @@ static int de_thread(struct task_struct *tsk)
 	 * and to assume its PID:
 	 */
 	if (!thread_group_leader(tsk)) {
-		struct task_struct *leader = tsk->group_leader;
-
-		for (;;) {
-			cgroup_threadgroup_change_begin(tsk);
-			write_lock_irq(&tasklist_lock);
-			/*
-			 * Do this under tasklist_lock to ensure that
-			 * exit_notify() can't miss ->group_exec_task
-			 */
-			sig->notify_count = -1;
-			if (likely(leader->exit_state))
-				break;
-			__set_current_state(TASK_KILLABLE);
-			write_unlock_irq(&tasklist_lock);
-			cgroup_threadgroup_change_end(tsk);
-			schedule();
-			if (__fatal_signal_pending(tsk))
-				goto killed;
-		}
+		struct task_struct *leader = tsk->group_leader, *t;
+		cgroup_threadgroup_change_begin(tsk);
+		write_lock_irq(&tasklist_lock);
 
 		/*
 		 * The only record we have of the real-time age of a
		 * process, regardless of execs it's done, is start_time.
@@ -1000,8 +998,8 @@ static int de_thread(struct task_struct *tsk)
 		list_replace_rcu(&leader->tasks, &tsk->tasks);
 		list_replace_init(&leader->sibling, &tsk->sibling);
 
-		tsk->group_leader = tsk;
-		leader->group_leader = tsk;
+		for_each_thread(tsk, t)
+			t->group_leader = tsk;
 
 		tsk->exit_signal = SIGCHLD;
 		leader->exit_signal = -1;
@@ -1021,23 +1019,11 @@ static int de_thread(struct task_struct *tsk)
 		release_task(leader);
 	}
 
-	sig->group_exec_task = NULL;
-	sig->notify_count = 0;
-
 no_thread_group:
 	/* we have changed execution domain */
 	tsk->exit_signal = SIGCHLD;
 
-	BUG_ON(!thread_group_leader(tsk));
 	return 0;
-
-killed:
-	/* protects against exit_notify() and __exit_signal() */
-	read_lock(&tasklist_lock);
-	sig->group_exec_task = NULL;
-	sig->notify_count = 0;
-	read_unlock(&tasklist_lock);
-	return -EAGAIN;
 }
@@ -1171,13 +1157,6 @@ int begin_new_exec(struct linux_binprm * bprm)
 	flush_itimer_signals();
 #endif
 
-	/*
-	 * Make the signal table private.
-	 */
-	retval = unshare_sighand(me);
-	if (retval)
-		goto out_unlock;
-
 	me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC |
 					PF_NOFREEZE | PF_NO_SETAFFINITY);
 	flush_thread();
@@ -1249,7 +1228,6 @@ int begin_new_exec(struct linux_binprm * bprm)
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 	WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
-	flush_signal_handlers(me, 0);
 
 	retval = set_cred_ucounts(bprm->cred);
 	if (retval < 0)
@@ -1293,8 +1271,9 @@ int begin_new_exec(struct linux_binprm * bprm)
 	up_write(&me->signal->exec_update_lock);
 	if (!bprm->cred)
 		mutex_unlock(&me->signal->cred_guard_mutex);
-
 out:
+	if (me->signal->group_exec_task == me)
+		clear_group_exec_task(me);
 	return retval;
 }
 EXPORT_SYMBOL(begin_new_exec);
@@ -1325,6 +1304,8 @@ int setup_new_exec(struct linux_binprm * bprm)
 {
 	/* Setup things that can depend upon the personality */
 	struct task_struct *me = current;
+	struct signal_struct *sig = me->signal;
+	int err = 0;
 
 	arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);
@@ -1335,10 +1316,23 @@ int setup_new_exec(struct linux_binprm * bprm)
 	 * some architectures like powerpc
 	 */
 	me->mm->task_size = TASK_SIZE;
-	up_write(&me->signal->exec_update_lock);
-	mutex_unlock(&me->signal->cred_guard_mutex);
+	up_write(&sig->exec_update_lock);
+	mutex_unlock(&sig->cred_guard_mutex);
 
-	return 0;
+	if (sig->group_exec_task) {
+		spin_lock_irq(&me->sighand->siglock);
+		sig->notify_count = sig->nr_threads - 1;
+		spin_unlock_irq(&me->sighand->siglock);
+
+		err = wait_for_notify_count(me);
+		clear_group_exec_task(me);
+	}
+
+	if (!err)
+		err = unshare_sighand(me);
+	if (!err)
+		flush_signal_handlers(me, 0);
+	return err;
 }
 EXPORT_SYMBOL(setup_new_exec);
diff --git a/kernel/exit.c b/kernel/exit.c
index f041f0c05ebb..bcde78c97253 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -178,10 +178,7 @@ static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
 		tty = sig->tty;
 		sig->tty = NULL;
 	} else {
-		/*
-		 * If there is any task waiting for the group exit
-		 * then notify it:
-		 */
+		/* mt-exec, setup_new_exec() -> wait_for_notify_count() */
 		if (sig->notify_count > 0 && !--sig->notify_count)
 			wake_up_process(sig->group_exec_task);
 
@@ -766,8 +763,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		list_add(&tsk->ptrace_entry, &dead);
 	}
 
-	/* mt-exec, de_thread() is waiting for group leader */
-	if (unlikely(tsk->signal->notify_count < 0))
+	/* mt-exec, de_thread() -> wait_for_notify_count() */
+	if (tsk->signal->notify_count < 0 &&
+	    !++tsk->signal->notify_count)
 		wake_up_process(tsk->signal->group_exec_task);
 
 	write_unlock_irq(&tasklist_lock);
diff --git a/kernel/signal.c b/kernel/signal.c
index fe9190d84f28..334212044940 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1343,13 +1343,13 @@ int zap_other_threads(struct task_struct *p)
 
 	for_other_threads(p, t) {
 		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
-		count++;
 
 		/* Don't bother with already dead threads */
 		if (t->exit_state)
 			continue;
 		sigaddset(&t->pending.signal, SIGKILL);
 		signal_wake_up(t, 1);
+		count++;
 	}
 
 	return count;
--
2.25.1.362.g51ebf55

The previous patch fixed the deadlock where mt-exec waits for a debugger that should reap a zombie thread, but we can hit the same problem if the killed sub-thread stops in ptrace_event(PTRACE_EVENT_EXIT). Change ptrace_stop() to check signal->group_exec_task.

This is a user-visible change. But hopefully it can't break anything. Note that the semantics of PTRACE_EVENT_EXIT were never really defined; they depend on /dev/random. Just for example, currently a sub-thread killed by exec will stop, but if it exits on its own and races with exec it will not stop, so nobody can rely on PTRACE_EVENT_EXIT anyway.

We really need to finally define what PTRACE_EVENT_EXIT should actually do, but that needs other changes.

Signed-off-by: Oleg Nesterov
---
 kernel/signal.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/signal.c b/kernel/signal.c
index 334212044940..59f61e07905b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2376,6 +2376,10 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
 	if (!current->ptrace || __fatal_signal_pending(current))
 		return exit_code;
 
+	/* de_thread() -> wait_for_notify_count() waits for us */
+	if (current->signal->group_exec_task)
+		return exit_code;
+
 	set_special_state(TASK_TRACED);
 	current->jobctl |= JOBCTL_TRACED;
--
2.25.1.362.g51ebf55
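To make the user-visible part of this last change concrete, here is a hypothetical test sketch; the pipe-based tid handoff, the 2-second sleep and all names are illustrative choices, not taken from the patch. A sub-thread is traced with PTRACE_O_TRACEEXIT and then killed by the main thread's exec. Before this change the tracer observes a PTRACE_EVENT_EXIT stop for that thread; with it, the thread no longer stops and waitpid() reports its death by SIGKILL directly:

	#define _GNU_SOURCE
	#include <pthread.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <sys/syscall.h>
	#include <sys/wait.h>
	#include <unistd.h>

	static int pipefd[2];

	static void *thread(void *arg)
	{
		pid_t tid = syscall(SYS_gettid);

		ptrace(PTRACE_TRACEME, 0, 0, 0);
		write(pipefd[1], &tid, sizeof(tid));	/* hand our tid to the tracer */
		raise(SIGSTOP);				/* let the tracer set options */
		pause();				/* stay alive until exec kills us */
		return NULL;
	}

	int main(void)
	{
		pid_t pid, tid;
		int status;

		pipe(pipefd);
		pid = fork();
		if (!pid) {
			pthread_t pt;
			pthread_create(&pt, NULL, thread, NULL);
			sleep(2);	/* crude: give the tracer time to attach */
			execlp("echo", "echo", "passed", NULL);
			_exit(1);
		}

		read(pipefd[0], &tid, sizeof(tid));
		waitpid(tid, &status, __WALL);		/* SIGSTOP stop */
		ptrace(PTRACE_SETOPTIONS, tid, 0, PTRACE_O_TRACEEXIT);
		ptrace(PTRACE_CONT, tid, 0, 0);		/* suppress the SIGSTOP */

		waitpid(tid, &status, __WALL);		/* exec kills the sub-thread */
		if (WIFSTOPPED(status) &&
		    status >> 8 == (SIGTRAP | (PTRACE_EVENT_EXIT << 8))) {
			printf("old behaviour: PTRACE_EVENT_EXIT stop\n");
			ptrace(PTRACE_CONT, tid, 0, 0);
			waitpid(tid, &status, __WALL);	/* now reap it */
		} else if (WIFSIGNALED(status)) {
			printf("new behaviour: killed by signal %d, no stop\n",
			       WTERMSIG(status));
		}
		return 0;
	}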