unix_stream_data_wait() does skb_peek_tail(&sk->sk_receive_queue) without
holding any lock that prevents SKBs on that queue from being dequeued and
freed.
This has been the case since commit 79f632c71bea ("unix/stream: fix
peeking with an offset larger than data in queue").
The first consequence of this is that the pointer comparison
`tail != last` can be false even if `last` semantically refers to an
already-freed SKB while `tail` is a new SKB allocated at the same address;
which can cause unix_stream_data_wait() to wrongly keep blocking after new
data has arrived, but only in a weird scenario where a peeking recv() and
a normal recv() on the same socket are racing, which is probably not a
real problem.

But since commit 2b514574f7e8 ("net: af_unix: implement splice for stream
af_unix sockets"), `tail` is actually dereferenced, which can cause UAF in
the following race scenario (where test_setup() runs single-threaded,
and afterwards, test_thread1() and test_thread2() run concurrently in
two threads:
```
static int socks[2];
void test_setup(void) {
  socketpair(AF_UNIX, SOCK_STREAM, 0, socks);
  send(socks[1], "A", 1, 0);
  int peekoff = 1;
  setsockopt(socks[0], SOL_SOCKET, SO_PEEK_OFF, &peekoff, sizeof(peekoff));
}
void test_thread1(void) {
  char dummy;
  recv(socks[0], &dummy, 1, MSG_PEEK);
}
void test_thread2(void) {
  char dummy;
  recv(socks[0], &dummy, 1, 0);
  shutdown(socks[1], SHUT_WR);
}
```

when racing like this:
```
thread1                       thread2
unix_stream_read_generic
  mutex_lock(&u->iolock)
  skb_peek(&sk->sk_receive_queue)
  skb_peek_next(skb, &sk->sk_receive_queue)
  mutex_unlock(&u->iolock)
                              unix_stream_read_generic
                                unix_state_lock(sk)
                                skb_peek(&sk->sk_receive_queue)
                                unix_state_unlock(sk)
  unix_stream_data_wait
    unix_state_lock(sk)
    tail = skb_peek_tail(&sk->sk_receive_queue)
                                spin_lock(&sk->sk_receive_queue.lock)
                                __skb_unlink(skb, &sk->sk_receive_queue)
                                spin_unlock(&sk->sk_receive_queue.lock)
                                consume_skb(skb) [frees the SKB]
    `tail != last`: false
    `tail`: true
    `tail->len != last_len` ***UAF***
```

Fix the UAF by removing the read of tail->len; checking tail->len would
only make sense if SKBs in the receive queue of a UNIX socket could grow,
which AFAIK is not supposed to happen.

Fixes: 2b514574f7e8 ("net: af_unix: implement splice for stream af_unix sockets")
Cc: stable@vger.kernel.org
Signed-off-by: Jann Horn <jannh@google.com>
---
 net/unix/af_unix.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 1cbf36ea043b..dc71ed79be4a 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2711,8 +2711,7 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
  *	Sleep until more data has arrived. But check for races..
  */
 static long unix_stream_data_wait(struct sock *sk, long timeo,
-				  struct sk_buff *last, unsigned int last_len,
-				  bool freezable)
+				  struct sk_buff *last, bool freezable)
 {
 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
 	struct sk_buff *tail;
@@ -2725,7 +2724,6 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
 
 		tail = skb_peek_tail(&sk->sk_receive_queue);
 		if (tail != last ||
-		    (tail && tail->len != last_len) ||
 		    sk->sk_err ||
 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
 		    signal_pending(current) ||
@@ -2921,7 +2919,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 	int flags = state->flags;
 	bool check_creds = false;
 	struct scm_cookie scm;
-	unsigned int last_len;
 	struct unix_sock *u;
 	int copied = 0;
 	int err = 0;
@@ -2967,7 +2964,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 			goto unlock;
 		}
 		last = skb = skb_peek(&sk->sk_receive_queue);
-		last_len = last ? last->len : 0;
 
 again:
 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
@@ -3001,8 +2997,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 
 			mutex_unlock(&u->iolock);
 
-			timeo = unix_stream_data_wait(sk, timeo, last,
-						      last_len, freezable);
+			timeo = unix_stream_data_wait(sk, timeo, last, freezable);
 
 			if (signal_pending(current)) {
 				err = sock_intr_errno(timeo);
@@ -3019,7 +3014,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 		while (skip >= unix_skb_len(skb)) {
 			skip -= unix_skb_len(skb);
 			last = skb;
-			last_len = skb->len;
 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
 			if (!skb)
 				goto again;
@@ -3094,7 +3088,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 
 			skip = 0;
 			last = skb;
-			last_len = skb->len;
 			unix_state_lock(sk);
 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
 			if (skb)

-- 
2.54.0.563.g4f69b47b94-goog

The current implementation of unix_stream_data_wait() works like this:

 - unix_stream_read_generic() grabs locks
 - unix_stream_read_generic() determines that the read must block
 - unix_stream_read_generic() drops locks
 - unix_stream_data_wait() sets up a wait_queue_entry
 - unix_stream_data_wait() rechecks that the read must still block, with
   less locking protection than unix_stream_read_generic()
 - unix_stream_data_wait() waits, then loops back to recheck again

That seems needlessly complicated; and it also involves an ugly comparison
between a potentially-dangling `last` pointer and another
potentially-dangling `tail` pointer.

Instead, let's set up a wait_queue_entry
while the locks grabbed by unix_stream_read_generic() are still held, and
after the wait, directly retry the read.

Signed-off-by: Jann Horn <jannh@google.com>
---
 net/unix/af_unix.c | 49 ++++++++++++++-----------------------------------
 1 file changed, 14 insertions(+), 35 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index dc71ed79be4a..b38804e2c5ac 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2710,39 +2710,22 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 /*
  *	Sleep until more data has arrived. But check for races..
  */
-static long unix_stream_data_wait(struct sock *sk, long timeo,
-				  struct sk_buff *last, bool freezable)
+static long unix_stream_data_wait(struct sock *sk, long timeo, bool freezable)
+__releases(&unix_sk(sk)->iolock)
+__releases(&unix_sk(sk)->lock)
 {
 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
-	struct sk_buff *tail;
 	DEFINE_WAIT(wait);
 
-	unix_state_lock(sk);
-
-	for (;;) {
-		prepare_to_wait(sk_sleep(sk), &wait, state);
-
-		tail = skb_peek_tail(&sk->sk_receive_queue);
-		if (tail != last ||
-		    sk->sk_err ||
-		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
-		    signal_pending(current) ||
-		    !timeo)
-			break;
-
-		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-		unix_state_unlock(sk);
-		timeo = schedule_timeout(timeo);
-		unix_state_lock(sk);
-
-		if (sock_flag(sk, SOCK_DEAD))
-			break;
+	prepare_to_wait(sk_sleep(sk), &wait, state);
+	unix_state_unlock(sk);
+	mutex_unlock(&unix_sk(sk)->iolock);
 
-		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-	}
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	timeo = schedule_timeout(timeo);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 
 	finish_wait(sk_sleep(sk), &wait);
-	unix_state_unlock(sk);
 	return timeo;
 }
 
@@ -2955,7 +2938,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 	skip = max(sk_peek_offset(sk, flags), 0);
 
 	do {
-		struct sk_buff *skb, *last;
+		struct sk_buff *skb;
 		int chunk;
 
 		unix_state_lock(sk);
@@ -2963,7 +2946,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 			err = -ECONNRESET;
 			goto unlock;
 		}
-		last = skb = skb_peek(&sk->sk_receive_queue);
+		skb = skb_peek(&sk->sk_receive_queue);
 
 again:
 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
@@ -2989,15 +2972,13 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 			if (sk->sk_shutdown & RCV_SHUTDOWN)
 				goto unlock;
 
-			unix_state_unlock(sk);
 			if (!timeo) {
 				err = -EAGAIN;
-				break;
+				goto unlock;
 			}
 
-			mutex_unlock(&u->iolock);
-
-			timeo = unix_stream_data_wait(sk, timeo, last, freezable);
+			/* does unix_state_unlock() and drops u->iolock */
+			timeo = unix_stream_data_wait(sk, timeo, freezable);
 
 			if (signal_pending(current)) {
 				err = sock_intr_errno(timeo);
@@ -3013,7 +2994,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 
 		while (skip >= unix_skb_len(skb)) {
 			skip -= unix_skb_len(skb);
-			last = skb;
 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
 			if (!skb)
 				goto again;
@@ -3087,7 +3067,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 				break;
 
 			skip = 0;
-			last = skb;
 			unix_state_lock(sk);
 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
 			if (skb)

-- 
2.54.0.563.g4f69b47b94-goog

Currently, a blocking recv() on a unix stream socket will wake up every
time an SKB that was sent from the socket is consumed by the peer.
So, in a synchronous communication scenario where the client sends a
message to the server, and then the client blocks in recv() until it
receives a reply from the server, the client will get a spurious wakeup
when the server receives the client's message.

Similar to other receive paths such as __skb_wait_for_more_packets(), use
DEFINE_WAIT_FUNC() to filter wakeups.

Signed-off-by: Jann Horn <jannh@google.com>
---
 net/unix/af_unix.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index b38804e2c5ac..f36ab3e5a5bb 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2707,6 +2707,14 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 	return recv_actor(sk, skb);
 }
 
+static int unix_recv_wake(wait_queue_entry_t *wait, unsigned int mode,
+			  int sync, void *key)
+{
+	if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR)))
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, key);
+}
+
 /*
  *	Sleep until more data has arrived. But check for races..
  */
@@ -2715,7 +2723,7 @@ __releases(&unix_sk(sk)->iolock)
 __releases(&unix_sk(sk)->lock)
 {
 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
-	DEFINE_WAIT(wait);
+	DEFINE_WAIT_FUNC(wait, unix_recv_wake);
 
 	prepare_to_wait(sk_sleep(sk), &wait, state);
 	unix_state_unlock(sk);

-- 
2.54.0.563.g4f69b47b94-goog