unix_stream_data_wait() does skb_peek_tail(&sk->sk_receive_queue) without holding any lock that prevents SKBs on that queue from being dequeued and freed. This has been the case since commit 79f632c71bea ("unix/stream: fix peeking with an offset larger than data in queue"). The first consequence of this is that the pointer comparison `tail != last` can be false even if `last` semantically refers to an already-freed SKB while `tail` is a new SKB allocated at the same address; which can cause unix_stream_data_wait() to wrongly keep blocking after new data has arrived, but only in a weird scenario where a peeking recv() and a normal recv() on the same socket are racing, which is probably not a real problem. But since commit 2b514574f7e8 ("net: af_unix: implement splice for stream af_unix sockets"), `tail` is actually dereferenced, which can cause UAF in the following race scenario (where test_setup() runs single-threaded, and afterwards, test_thread1() and test_thread2() run concurrently in two threads: ``` static int socks[2]; void test_setup(void) { socketpair(AF_UNIX, SOCK_STREAM, 0, socks); send(socks[1], "A", 1, 0); int peekoff = 1; setsockopt(socks[0], SOL_SOCKET, SO_PEEK_OFF, &peekoff, sizeof(peekoff)); } void test_thread1(void) { char dummy; recv(socks[0], &dummy, 1, MSG_PEEK); } void test_thread2(void) { char dummy; recv(socks[0], &dummy, 1, 0); shutdown(socks[1], SHUT_WR); } ``` when racing like this: ``` thread1 thread2 unix_stream_read_generic mutex_lock(&u->iolock) skb_peek(&sk->sk_receive_queue) skb_peek_next(skb, &sk->sk_receive_queue) mutex_unlock(&u->iolock) unix_stream_read_generic unix_state_lock(sk) skb_peek(&sk->sk_receive_queue) unix_state_unlock(sk) unix_stream_data_wait unix_state_lock(sk) tail = skb_peek_tail(&sk->sk_receive_queue) spin_lock(&sk->sk_receive_queue.lock) __skb_unlink(skb, &sk->sk_receive_queue) spin_unlock(&sk->sk_receive_queue.lock) consume_skb(skb) [frees the SKB] `tail != last`: false `tail`: true `tail->len != last_len` ***UAF*** ``` Fix the UAF by removing the read of tail->len; checking tail->len would only make sense if SKBs in the receive queue of a UNIX socket could grow, which AFAIK is not supposed to happen. Fixes: 2b514574f7e8 ("net: af_unix: implement splice for stream af_unix sockets") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn --- net/unix/af_unix.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1cbf36ea043b..dc71ed79be4a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2711,8 +2711,7 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) * Sleep until more data has arrived. But check for races.. */ static long unix_stream_data_wait(struct sock *sk, long timeo, - struct sk_buff *last, unsigned int last_len, - bool freezable) + struct sk_buff *last, bool freezable) { unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; struct sk_buff *tail; @@ -2725,7 +2724,6 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, tail = skb_peek_tail(&sk->sk_receive_queue); if (tail != last || - (tail && tail->len != last_len) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || @@ -2921,7 +2919,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, int flags = state->flags; bool check_creds = false; struct scm_cookie scm; - unsigned int last_len; struct unix_sock *u; int copied = 0; int err = 0; @@ -2967,7 +2964,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, goto unlock; } last = skb = skb_peek(&sk->sk_receive_queue); - last_len = last ? last->len : 0; again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) @@ -3001,8 +2997,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, mutex_unlock(&u->iolock); - timeo = unix_stream_data_wait(sk, timeo, last, - last_len, freezable); + timeo = unix_stream_data_wait(sk, timeo, last, freezable); if (signal_pending(current)) { err = sock_intr_errno(timeo); @@ -3019,7 +3014,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); last = skb; - last_len = skb->len; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; @@ -3094,7 +3088,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, skip = 0; last = skb; - last_len = skb->len; unix_state_lock(sk); skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb) -- 2.54.0.563.g4f69b47b94-goog The current implementation of unix_stream_data_wait() works like this: - unix_stream_read_generic() grabs locks - unix_stream_read_generic() determines that the read must block - unix_stream_read_generic() drops locks - unix_stream_data_wait() sets up a wait_queue_entry - unix_stream_data_wait() rechecks that the read must still block, with less locking protection than unix_stream_read_generic() - unix_stream_data_wait() waits, then loops back to recheck again That seems needlessly complicated; and it also involves an ugly comparison between a potentially-dangling `last` pointer and another potentially-dangling `tail` pointer. Instead, let's set up a wait_queue_entry while the locks grabbed by unix_stream_read_generic() are still held, and after the wait, directly retry the read. Signed-off-by: Jann Horn --- net/unix/af_unix.c | 49 ++++++++++++++----------------------------------- 1 file changed, 14 insertions(+), 35 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index dc71ed79be4a..b38804e2c5ac 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2710,39 +2710,22 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) /* * Sleep until more data has arrived. But check for races.. */ -static long unix_stream_data_wait(struct sock *sk, long timeo, - struct sk_buff *last, bool freezable) +static long unix_stream_data_wait(struct sock *sk, long timeo, bool freezable) +__releases(&unix_sk(sk)->iolock) +__releases(&unix_sk(sk)->lock) { unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; - struct sk_buff *tail; DEFINE_WAIT(wait); - unix_state_lock(sk); - - for (;;) { - prepare_to_wait(sk_sleep(sk), &wait, state); - - tail = skb_peek_tail(&sk->sk_receive_queue); - if (tail != last || - sk->sk_err || - (sk->sk_shutdown & RCV_SHUTDOWN) || - signal_pending(current) || - !timeo) - break; - - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - unix_state_unlock(sk); - timeo = schedule_timeout(timeo); - unix_state_lock(sk); - - if (sock_flag(sk, SOCK_DEAD)) - break; + prepare_to_wait(sk_sleep(sk), &wait, state); + unix_state_unlock(sk); + mutex_unlock(&unix_sk(sk)->iolock); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - } + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + timeo = schedule_timeout(timeo); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); - unix_state_unlock(sk); return timeo; } @@ -2955,7 +2938,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, skip = max(sk_peek_offset(sk, flags), 0); do { - struct sk_buff *skb, *last; + struct sk_buff *skb; int chunk; unix_state_lock(sk); @@ -2963,7 +2946,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, err = -ECONNRESET; goto unlock; } - last = skb = skb_peek(&sk->sk_receive_queue); + skb = skb_peek(&sk->sk_receive_queue); again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) @@ -2989,15 +2972,13 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; - unix_state_unlock(sk); if (!timeo) { err = -EAGAIN; - break; + goto unlock; } - mutex_unlock(&u->iolock); - - timeo = unix_stream_data_wait(sk, timeo, last, freezable); + /* does unix_state_unlock() and drops u->iolock */ + timeo = unix_stream_data_wait(sk, timeo, freezable); if (signal_pending(current)) { err = sock_intr_errno(timeo); @@ -3013,7 +2994,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); - last = skb; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; @@ -3087,7 +3067,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, break; skip = 0; - last = skb; unix_state_lock(sk); skb = skb_peek_next(skb, &sk->sk_receive_queue); if (skb) -- 2.54.0.563.g4f69b47b94-goog Currently, a blocking recv() on a unix stream socket will wake up every time an SKB that was sent from the socket is consumed by the peer. So, in a synchronous communication scenario where the client sends a message to the server, and then the client blocks in recv() until it receives a reply from the server, the client will get a spurious wakeup when the server receives the client's message. Similar to other receive paths such as __skb_wait_for_more_packets(), use DEFINE_WAIT_FUNC() to filter wakeups. Signed-off-by: Jann Horn --- net/unix/af_unix.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index b38804e2c5ac..f36ab3e5a5bb 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2707,6 +2707,14 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) return recv_actor(sk, skb); } +static int unix_recv_wake(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR))) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + /* * Sleep until more data has arrived. But check for races.. */ @@ -2715,7 +2723,7 @@ __releases(&unix_sk(sk)->iolock) __releases(&unix_sk(sk)->lock) { unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, unix_recv_wake); prepare_to_wait(sk_sleep(sk), &wait, state); unix_state_unlock(sk); -- 2.54.0.563.g4f69b47b94-goog