Hold a local reference to new_sock->sk before installing callbacks in
rds_tcp_accept_one().

After rds_tcp_set_callbacks() or rds_tcp_reset_callbacks(), tc->t_sock
points to new_sock, making the socket visible to the shutdown path. A
concurrent rds_tcp_conn_path_shutdown() may then call sock_release(),
which sets new_sock->sk to NULL and frees sk. Subsequent accesses to
new_sock->sk->sk_state dereference a NULL pointer.

Take a local sock reference with sock_hold() before installing the
callbacks to prevent the race.

Fixes: 826c1004d4ae ("net/rds: rds_tcp_conn_path_shutdown must not discard messages")
Reported-by: syzbot+96046021045ffe6d7709@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=96046021045ffe6d7709
Signed-off-by: Allison Henderson
---
 net/rds/tcp_listen.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 6fb5c928b8fd..cdc86473a1ba 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -177,6 +177,7 @@ int rds_tcp_accept_one(struct rds_tcp_net *rtn)
 	struct rds_tcp_connection *rs_tcp = NULL;
 	int conn_state;
 	struct rds_conn_path *cp;
+	struct sock *sk;
 	struct in6_addr *my_addr, *peer_addr;
 #if !IS_ENABLED(CONFIG_IPV6)
 	struct in6_addr saddr, daddr;
@@ -298,6 +299,14 @@ int rds_tcp_accept_one(struct rds_tcp_net *rtn)
 		rds_conn_path_drop(cp, 0);
 		goto rst_nsk;
 	}
+	/* Hold a local reference to sk before setting callbacks. Once callbacks
+	 * are set, it is possible for a concurrent rds_tcp_conn_path_shutdown
+	 * call to release new_sock->sk and set it to NULL. So we use
+	 * a local sk here to avoid racing with the callbacks.
+	 */
+	sk = new_sock->sk;
+	sock_hold(sk);
+
 	if (rs_tcp->t_sock) {
 		/* Duelling SYN has been handled in rds_tcp_accept_one() */
 		rds_tcp_reset_callbacks(new_sock, cp);
@@ -316,13 +325,15 @@ int rds_tcp_accept_one(struct rds_tcp_net *rtn)
 	 * knowing that "rds_tcp_conn_path_shutdown" will
 	 * dequeue pending messages.
 	 */
-	if (new_sock->sk->sk_state == TCP_CLOSE_WAIT ||
-	    new_sock->sk->sk_state == TCP_LAST_ACK ||
-	    new_sock->sk->sk_state == TCP_CLOSE)
+	if (READ_ONCE(sk->sk_state) == TCP_CLOSE_WAIT ||
+	    READ_ONCE(sk->sk_state) == TCP_LAST_ACK ||
+	    READ_ONCE(sk->sk_state) == TCP_CLOSE)
 		rds_conn_path_drop(cp, 0);
 	else
 		queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0);

+	sock_put(sk);
+
 	new_sock = NULL;
 	ret = 0;

 	if (conn->c_npaths == 0)
-- 
2.43.0
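For review, the fix above is the standard hold-before-use refcounting idiom. The following is a minimal userspace sketch of the same pattern with pthreads and C11 atomics; every name in it (struct obj, obj_hold(), obj_put(), shutdown_thread()) is hypothetical and merely stands in for sk, sock_hold(), sock_put(), and the shutdown path. It is an illustration of the idiom, not the kernel code.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int refcnt;
	int state;
};

static struct obj *shared;	/* plays the role of new_sock->sk */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void obj_hold(struct obj *o)	/* sock_hold() analogue */
{
	atomic_fetch_add(&o->refcnt, 1);
}

static void obj_put(struct obj *o)	/* sock_put() analogue */
{
	if (atomic_fetch_sub(&o->refcnt, 1) == 1)
		free(o);
}

/* rds_tcp_conn_path_shutdown() analogue: drops the shared pointer. */
static void *shutdown_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	if (shared) {
		obj_put(shared);	/* sock_release() drops sk ... */
		shared = NULL;		/* ... and clears the pointer */
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct obj *local;

	shared = calloc(1, sizeof(*shared));
	atomic_init(&shared->refcnt, 1);

	/* Take a local reference before the racing path can run. */
	pthread_mutex_lock(&lock);
	local = shared;
	obj_hold(local);
	pthread_mutex_unlock(&lock);

	pthread_create(&t, NULL, shutdown_thread, NULL);

	/* Safe: we own a reference, even if 'shared' is already NULL. */
	printf("state = %d\n", local->state);

	pthread_join(t, NULL);
	obj_put(local);
	return 0;
}

Build with "cc -pthread". The point the patch makes is the same: read through the privately held reference (local/sk), never through the shared pointer (shared/new_sock->sk) that another thread may clear.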
The next patch will delegate fan-out operations to a background worker,
which requires cancel_work_sync() during connection cleanup. However,
the error path of __rds_conn_create() currently calls
trans->conn_free() while holding rds_conn_lock (a spinlock) and
rcu_read_lock, i.e. in atomic context, where cancel_work_sync() must
not be called because it can sleep.

To avoid this, refactor the error/race paths to defer the
trans->conn_free() calls until after the locks are released. This
allows transport cleanup functions to perform blocking operations
safely. The cp_transport_data cleanup moves to the 'out:' label, where
it runs outside the critical section, after the connection has been
freed back to the slab and can no longer be reached by racing threads.

Link: https://ci.syzbot.org/series/1a5ef180-c02c-401d-9df7-670b18570a55
Signed-off-by: Allison Henderson
---
 net/rds/connection.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 185f73b01694..695ab7446178 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -170,6 +170,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 	struct hlist_head *head = rds_conn_bucket(laddr, faddr);
 	struct rds_transport *loop_trans;
 	struct rds_conn_path *free_cp = NULL;
+	struct rds_transport *free_trans = NULL;
 	unsigned long flags;
 	int ret, i;
 	int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
@@ -305,7 +306,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 	if (parent) {
 		/* Creating passive conn */
 		if (parent->c_passive) {
-			trans->conn_free(conn->c_path[0].cp_transport_data);
+			free_trans = trans;
 			free_cp = conn->c_path;
 			kmem_cache_free(rds_conn_slab, conn);
 			conn = parent->c_passive;
@@ -321,18 +322,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 		found = rds_conn_lookup(net, head, laddr, faddr, trans, tos,
 					dev_if);
 		if (found) {
-			struct rds_conn_path *cp;
-			int i;
-
-			for (i = 0; i < npaths; i++) {
-				cp = &conn->c_path[i];
-				/* The ->conn_alloc invocation may have
-				 * allocated resource for all paths, so all
-				 * of them may have to be freed here.
-				 */
-				if (cp->cp_transport_data)
-					trans->conn_free(cp->cp_transport_data);
-			}
+			free_trans = trans;
 			free_cp = conn->c_path;
 			kmem_cache_free(rds_conn_slab, conn);
 			conn = found;
@@ -349,9 +339,23 @@ static struct rds_connection *__rds_conn_create(struct net *net,

 out:
 	if (free_cp) {
-		for (i = 0; i < npaths; i++)
+		for (i = 0; i < npaths; i++) {
+			/*
+			 * The trans->conn_alloc call may have allocated
+			 * resources for the cp paths, which will need to
+			 * be freed before freeing cp itself. We do this
+			 * here so it runs outside the rds_conn_lock
+			 * spinlock and rcu_read_lock section, because
+			 * conn_free() may call cancel_work_sync(), which
+			 * can sleep. free_trans is only set in the
+			 * race-loss paths where conn_alloc() succeeded.
+			 */
+			if (free_trans && free_cp[i].cp_transport_data)
+				free_trans->conn_free
+					(free_cp[i].cp_transport_data);
 			if (free_cp[i].cp_wq != rds_wq)
 				destroy_workqueue(free_cp[i].cp_wq);
+		}
 		kfree(free_cp);
 	}

-- 
2.43.0
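The refactor above is an instance of the "collect under the lock, free after the unlock" pattern. Below is a minimal userspace sketch of that pattern under stated assumptions: struct resource, blocking_free(), install_or_lose(), and table_slot are all hypothetical names; blocking_free() stands in for trans->conn_free(), and the mutex stands in for rds_conn_lock. This is a sketch of the design, not the RDS code.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct resource {
	int id;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct resource *table_slot;	/* the entry that won the race */

/* Stands in for trans->conn_free(): it may block (think
 * cancel_work_sync()), so it must never run under table_lock.
 */
static void blocking_free(struct resource *r)
{
	usleep(1000);		/* pretend to wait for a worker */
	free(r);
}

/* Try to install r; on losing the race, hand r back so the caller can
 * free it after the lock is dropped, as the 'out:' label now does.
 */
static struct resource *install_or_lose(struct resource *r)
{
	struct resource *loser = NULL;

	pthread_mutex_lock(&table_lock);
	if (table_slot)
		loser = r;	/* defer the blocking cleanup */
	else
		table_slot = r;
	pthread_mutex_unlock(&table_lock);

	return loser;
}

int main(void)
{
	struct resource *a = calloc(1, sizeof(*a));
	struct resource *b = calloc(1, sizeof(*b));
	struct resource *loser;

	a->id = 1;
	b->id = 2;

	install_or_lose(a);		/* a wins the race */
	loser = install_or_lose(b);	/* b loses the race */
	if (loser)
		blocking_free(loser);	/* safe: lock already released */

	printf("installed id=%d\n", table_slot->id);
	blocking_free(table_slot);
	return 0;
}

The key property, mirrored in the patch, is that the critical section only records what must be freed (free_trans/free_cp); the blocking cleanup itself happens strictly after the lock is released.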
From: Gerd Rausch

Delegate fan-out to a background worker in order to allow
kernel_getpeername() to acquire a lock on the socket. This has become
necessary since commit 9dfc685e0262d ("inet: remove races in
inet{6}_getname()").

The socket is already locked in the context from which
kernel_getpeername() used to be called, by either "rds_tcp_recv_path"
or "tcp_v{4,6}_rcv", therefore causing a deadlock.

Luckily, the fan-out need not happen in-context nor fast, so we can
simply do the same work from a background worker.

Also, while we are at it, get rid of the unused struct members
"t_conn_w", "t_send_w", "t_down_w" and "t_recv_w".

The fan-out work and the shutdown worker (cp_down_w) are both queued
on the same ordered workqueue (cp0->cp_wq), so they cannot execute
concurrently. We only need cancel_work_sync() in rds_tcp_conn_free()
and rds_tcp_conn_path_connect() because those run from outside the
ordered workqueue.

Signed-off-by: Gerd Rausch
Signed-off-by: Allison Henderson
---
 net/rds/tcp.c         |  3 +++
 net/rds/tcp.h         |  7 ++----
 net/rds/tcp_connect.c |  2 ++
 net/rds/tcp_listen.c  | 54 +++++++++++++++++++++++++++++++------------
 4 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 45484a93d75f..02f8f928c20b 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -358,6 +358,8 @@ static void rds_tcp_conn_free(void *arg)

 	rdsdebug("freeing tc %p\n", tc);

+	cancel_work_sync(&tc->t_fan_out_w);
+
 	spin_lock_irqsave(&rds_tcp_conn_lock, flags);
 	if (!tc->t_tcp_node_detached)
 		list_del(&tc->t_tcp_node);
@@ -384,6 +386,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 		tc->t_tinc = NULL;
 		tc->t_tinc_hdr_rem = sizeof(struct rds_header);
 		tc->t_tinc_data_rem = 0;
+		INIT_WORK(&tc->t_fan_out_w, rds_tcp_fan_out_w);
 		init_waitqueue_head(&tc->t_recv_done_waitq);

 		conn->c_path[i].cp_transport_data = tc;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 39c86347188c..9ecb0b6b658a 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -44,11 +44,7 @@ struct rds_tcp_connection {
 	size_t			t_tinc_hdr_rem;
 	size_t			t_tinc_data_rem;

-	/* XXX error report? */
-	struct work_struct	t_conn_w;
-	struct work_struct	t_send_w;
-	struct work_struct	t_down_w;
-	struct work_struct	t_recv_w;
+	struct work_struct	t_fan_out_w;

 	/* for info exporting only */
 	struct list_head	t_list_item;
@@ -90,6 +86,7 @@ void rds_tcp_state_change(struct sock *sk);
 struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
 void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
 void rds_tcp_listen_data_ready(struct sock *sk);
+void rds_tcp_fan_out_w(struct work_struct *work);
 void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out);
 int rds_tcp_accept_one(struct rds_tcp_net *rtn);
 void rds_tcp_keepalive(struct socket *sock);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index b77c88ffb199..6954b8c479f1 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -115,6 +115,8 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 	if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2)
 		return -EAGAIN;

+	cancel_work_sync(&tc->t_fan_out_w);
+
 	mutex_lock(&tc->t_conn_path_lock);

 	if (rds_conn_path_up(cp)) {
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index cdc86473a1ba..f2c4778be0b3 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -123,27 +123,20 @@ rds_tcp_accept_one_path(struct rds_connection *conn, struct socket *sock)
 	return NULL;
 }

-void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out)
+void rds_tcp_fan_out_w(struct work_struct *work)
 {
-	struct rds_tcp_connection *tc;
-	struct rds_tcp_net *rtn;
-	struct socket *sock;
+	struct rds_tcp_connection *tc = container_of(work,
+						     struct rds_tcp_connection,
+						     t_fan_out_w);
+	struct rds_connection *conn = tc->t_cpath->cp_conn;
+	struct rds_tcp_net *rtn = tc->t_rtn;
+	struct socket *sock = tc->t_sock;
 	int sport, npaths;

-	if (rds_destroy_pending(conn))
-		return;
-
-	tc = conn->c_path->cp_transport_data;
-	rtn = tc->t_rtn;
-	if (!rtn)
-		return;
-
-	sock = tc->t_sock;
-
 	/* During fan-out, check that the connection we already
 	 * accepted in slot#0 carried the proper source port modulo.
 	 */
-	if (fan_out && conn->c_with_sport_idx && sock &&
+	if (conn->c_with_sport_idx && sock &&
 	    rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) > 0) {
 		/* cp->cp_index is encoded in lowest bits of source-port */
 		sport = rds_tcp_get_peer_sport(sock);
@@ -167,6 +160,37 @@ void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out)
 	rds_tcp_accept_work(rtn);
 }

+void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out)
+{
+	struct rds_conn_path *cp0;
+	struct rds_tcp_connection *tc;
+	struct rds_tcp_net *rtn;
+
+	if (rds_destroy_pending(conn))
+		return;
+
+	cp0 = conn->c_path;
+	tc = cp0->cp_transport_data;
+	rtn = tc->t_rtn;
+	if (!rtn)
+		return;
+
+	if (fan_out)
+		/* Delegate fan-out to a background worker in order
+		 * to allow "kernel_getpeername" to acquire a lock
+		 * on the socket.
+		 * The socket is already locked in this context
+		 * by either "rds_tcp_recv_path" or "tcp_v{4,6}_rcv",
+		 * depending on the origin of the dequeue-request.
+		 */
+		queue_work(cp0->cp_wq, &tc->t_fan_out_w);
+	else
+		/* Fan-out either already happened or is unnecessary.
+		 * Just go ahead and attempt to accept more connections.
+		 */
+		rds_tcp_accept_work(rtn);
+}
+
 int rds_tcp_accept_one(struct rds_tcp_net *rtn)
 {
 	struct socket *listen_sock = rtn->rds_tcp_listen_sock;
-- 
2.43.0
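Two idioms carry this patch: recovering the enclosing object from an embedded work item via container_of(), and relying on an ordered (single-slot) workqueue so queued items never overlap. The userspace sketch below models both with a single consumer thread; struct work, struct connection, queue_work_item(), fan_out_worker(), and workqueue_thread() are hypothetical names and only mirror t_fan_out_w, rds_tcp_fan_out_w(), and cp0->cp_wq.

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work {
	void (*fn)(struct work *w);
	struct work *next;
};

struct connection {
	int id;
	struct work fan_out_w;	/* embedded, like t_fan_out_w */
};

static struct work *head;
static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;

static void queue_work_item(struct work *w)	/* queue_work() analogue */
{
	pthread_mutex_lock(&qlock);
	w->next = head;
	head = w;
	pthread_mutex_unlock(&qlock);
}

static void fan_out_worker(struct work *w)
{
	/* Recover the enclosing object, as rds_tcp_fan_out_w() does. */
	struct connection *conn = container_of(w, struct connection,
					       fan_out_w);

	printf("fan-out for connection %d\n", conn->id);
}

/* One consumer thread == ordered workqueue: items cannot overlap. */
static void *workqueue_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&qlock);
	while (head) {
		struct work *w = head;

		head = w->next;
		pthread_mutex_unlock(&qlock);
		w->fn(w);
		pthread_mutex_lock(&qlock);
	}
	pthread_mutex_unlock(&qlock);
	return NULL;
}

int main(void)
{
	struct connection conn = {
		.id = 1,
		.fan_out_w = { .fn = fan_out_worker },
	};
	pthread_t t;

	queue_work_item(&conn.fan_out_w);
	pthread_create(&t, NULL, workqueue_thread, NULL);
	pthread_join(t, NULL);
	return 0;
}

The single consumer is why the commit message can claim the fan-out work and cp_down_w never run concurrently, and why cancel_work_sync() is only needed on paths that execute outside that queue.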
From: Greg Jumper

rds_tcp_get_peer_sport() should return the peer port of a socket even
when the socket is not currently connected, so that RDS can reliably
determine the MPRDS "lane" corresponding to the port.

rds_tcp_get_peer_sport() calls kernel_getpeername() to obtain the port
number; however, when paths between endpoints frequently drop and
reconnect, kernel_getpeername() can return -ENOTCONN, causing
rds_tcp_get_peer_sport() to return an error and ultimately causing RDS
to use the wrong lane when reconnecting to a peer.

Modify rds_tcp_get_peer_sport() to directly call the socket-specific
get-name function (inet_getname() in this case) that
kernel_getpeername() also calls. The socket-specific function takes an
additional argument which, when set to a value greater than 1, makes
it return the socket's peer name even when the socket is not
connected, which in turn allows rds_tcp_get_peer_sport() to return the
correct port number.

Signed-off-by: Greg Jumper
Signed-off-by: Allison Henderson
---
 net/rds/tcp_listen.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index f2c4778be0b3..ba283c8ffa64 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -67,7 +67,14 @@ rds_tcp_get_peer_sport(struct socket *sock)
 	} saddr;
 	int sport;

-	if (kernel_getpeername(sock, &saddr.addr) >= 0) {
+	/* Call the socket's getname() function (inet_getname() in this
+	 * case) with a final argument greater than 1 to get the peer's
+	 * port regardless of whether the socket is currently connected.
+	 * Using peer=2 gets the peer port even during the reconnection
+	 * states (TCPF_CLOSE, TCPF_SYN_SENT), avoiding -ENOTCONN while
+	 * inet_dport still contains the correct peer port.
+	 */
+	if (sock->ops->getname(sock, &saddr.addr, 2) >= 0) {
 		switch (saddr.addr.sa_family) {
 		case AF_INET:
 			sport = ntohs(saddr.sin.sin_port);
@@ -177,7 +184,7 @@ void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out)

 	if (fan_out)
 		/* Delegate fan-out to a background worker in order
-		 * to allow "kernel_getpeername" to acquire a lock
+		 * to allow "sock->ops->getname()" to acquire a lock
 		 * on the socket.
 		 * The socket is already locked in this context
 		 * by either "rds_tcp_recv_path" or "tcp_v{4,6}_rcv",
-- 
2.43.0
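To make the peer-argument semantics the last patch depends on concrete, here is a small self-contained model of the decision it describes: kernel_getpeername() passes peer=1, which fails with -ENOTCONN in TCP_CLOSE or TCP_SYN_SENT, while peer=2 still reports the stored destination port. The function reports_peer_port() and its arguments are hypothetical; the state numbering follows include/net/tcp_states.h, and the check is a simplified paraphrase of inet_getname(), not the kernel source.

#include <stdbool.h>
#include <stdio.h>

/* TCP states as numbered in include/net/tcp_states.h */
enum { TCP_SYN_SENT = 2, TCP_CLOSE = 7 };

#define TCPF_SYN_SENT	(1 << TCP_SYN_SENT)
#define TCPF_CLOSE	(1 << TCP_CLOSE)

/* True when a getname(..., peer) call would report the peer port. */
static bool reports_peer_port(int sk_state, int inet_dport, int peer)
{
	if (!inet_dport)
		return false;	/* never connected: always -ENOTCONN */
	if (((1 << sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && peer == 1)
		return false;	/* what kernel_getpeername() runs into */
	return true;		/* peer == 2 still reports inet_dport */
}

int main(void)
{
	printf("peer=1, TCP_CLOSE:    %d\n",
	       reports_peer_port(TCP_CLOSE, 4096, 1));
	printf("peer=2, TCP_CLOSE:    %d\n",
	       reports_peer_port(TCP_CLOSE, 4096, 2));
	printf("peer=2, TCP_SYN_SENT: %d\n",
	       reports_peer_port(TCP_SYN_SENT, 4096, 2));
	return 0;
}

Running it prints 0/1/1: with peer=2 the port remains recoverable in exactly the reconnection states where kernel_getpeername() would have failed, which is why the patch can keep mapping ports to MPRDS lanes across reconnects.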