From: Gerd Rausch

Upon "sendmsg", RDS/TCP selects a backend connection based
on a hash calculated from the source-port ("RDS_MPATH_HASH").

However, "rds_tcp_accept_one" accepts connections
in the order they arrive, which is non-deterministic.

Therefore the mapping of the sender's "cp->cp_index"
to that of the receiver changes if the backend
connections are dropped and reconnected.

However, connection state that's preserved across reconnects
(e.g. "cp_next_rx_seq") relies on that sender<->receiver
mapping to never change.

So we make sure that client and server of the TCP connection
have the exact same "cp->cp_index" across reconnects by
encoding "cp->cp_index" in the lower three bits of the
client's TCP source port.

A new extension "RDS_EXTHDR_SPORT_IDX" is introduced
that allows the server to tell the difference between
clients that do the "cp->cp_index" encoding and
legacy clients that pick source ports randomly.

Signed-off-by: Gerd Rausch
Signed-off-by: Allison Henderson
---
 net/rds/message.c     |  1 +
 net/rds/rds.h         |  3 +++
 net/rds/recv.c        |  7 +++++++
 net/rds/send.c        |  4 ++++
 net/rds/tcp.h         |  1 +
 net/rds/tcp_connect.c | 22 ++++++++++++++++++++-
 net/rds/tcp_listen.c  | 45 +++++++++++++++++++++++++++++++++++++------
 7 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/net/rds/message.c b/net/rds/message.c
index 591a27c9c62f7..54fd000806eab 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -47,6 +47,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
 [RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
 [RDS_EXTHDR_NPATHS] = sizeof(__be16),
 [RDS_EXTHDR_GEN_NUM] = sizeof(__be32),
+[RDS_EXTHDR_SPORT_IDX] = 1,
 };
 
 void rds_message_addref(struct rds_message *rm)
diff --git a/net/rds/rds.h b/net/rds/rds.h
index cadfd7ec0ba92..d942057b91ee4 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -147,6 +147,7 @@ struct rds_connection {
 				c_ping_triggered:1,
 				c_pad_to_32:29;
 	int c_npaths;
+	bool c_with_sport_idx;
 	struct rds_connection *c_passive;
 	struct rds_transport *c_trans;
 
@@ -277,8 +278,10 @@ struct rds_ext_header_rdma_bytes {
  */
 #define RDS_EXTHDR_NPATHS 5
 #define RDS_EXTHDR_GEN_NUM 6
+#define RDS_EXTHDR_SPORT_IDX 8
 
 #define __RDS_EXTHDR_MAX 16 /* for now */
+
 #define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
 #define RDS_MSG_RX_HDR 0
 #define RDS_MSG_RX_START 1
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 66680f652e74a..ddf128a023470 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -204,7 +204,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
 		struct rds_ext_header_version version;
 		__be16 rds_npaths;
 		__be32 rds_gen_num;
+		u8 dummy;
 	} buffer;
+	bool new_with_sport_idx = false;
 	u32 new_peer_gen_num = 0;
 
 	while (1) {
@@ -221,11 +223,16 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
 		case RDS_EXTHDR_GEN_NUM:
 			new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
 			break;
+		case RDS_EXTHDR_SPORT_IDX:
+			new_with_sport_idx = true;
+			break;
 		default:
 			pr_warn_ratelimited("ignoring unknown exthdr type "
 					     "0x%x\n", type);
 		}
 	}
+
+	conn->c_with_sport_idx = new_with_sport_idx;
 	/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
 	conn->c_npaths = max_t(int, conn->c_npaths, 1);
 	conn->c_ping_triggered = 0;
diff --git a/net/rds/send.c b/net/rds/send.c
index 306785fa7065e..85e1c5352ad80 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1457,12 +1457,16 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
 	    cp->cp_conn->c_trans->t_mp_capable) {
 		__be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
 		__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
+		u8 dummy = 0;
 
 		rds_message_add_extension(&rm->m_inc.i_hdr,
 					  RDS_EXTHDR_NPATHS, &npaths);
 		rds_message_add_extension(&rm->m_inc.i_hdr,
 					  RDS_EXTHDR_GEN_NUM,
 					  &my_gen_num);
+		rds_message_add_extension(&rm->m_inc.i_hdr,
+					  RDS_EXTHDR_SPORT_IDX,
+					  &dummy);
 	}
 	spin_unlock_irqrestore(&cp->cp_lock, flags);
 
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 7d07128593b71..7c91974fcde79 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -34,6 +34,7 @@ struct rds_tcp_connection {
 	 */
 	struct mutex t_conn_path_lock;
 	struct socket *t_sock;
+	u32 t_client_port_group;
 	struct rds_tcp_net *t_rtn;
 	void *t_orig_write_space;
 	void *t_orig_data_ready;
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 92891b0d224d3..a55a27c05934d 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -93,6 +93,8 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 	struct sockaddr_in6 sin6;
 	struct sockaddr_in sin;
 	struct sockaddr *addr;
+	int port_low, port_high, port;
+	int port_groups, groups_left;
 	int addrlen;
 	bool isv6;
 	int ret;
@@ -145,7 +147,25 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 		addrlen = sizeof(sin);
 	}
 
-	ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, addrlen);
+	/* encode cp->cp_index in lowest bits of source-port */
+	inet_get_local_port_range(rds_conn_net(conn), &port_low, &port_high);
+	port_low = ALIGN(port_low, RDS_MPATH_WORKERS);
+	port_groups = (port_high - port_low + 1) / RDS_MPATH_WORKERS;
+	ret = -EADDRINUSE;
+	groups_left = port_groups;
+	while (groups_left-- > 0 && ret) {
+		if (++tc->t_client_port_group >= port_groups)
+			tc->t_client_port_group = 0;
+		port = port_low +
+			tc->t_client_port_group * RDS_MPATH_WORKERS +
+			cp->cp_index;
+
+		if (isv6)
+			sin6.sin6_port = htons(port);
+		else
+			sin.sin_port = htons(port);
+		ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, addrlen);
+	}
 	if (ret) {
 		rdsdebug("bind failed with %d at address %pI6c\n",
 			 ret, &conn->c_laddr);
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 551c847f2890a..900d059010a41 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -62,19 +62,52 @@ void rds_tcp_keepalive(struct socket *sock)
  * we special case cp_index 0 is to allow the rds probe ping itself to itself
  * get through efficiently.
  */
-static
-struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
+static struct rds_tcp_connection *
+rds_tcp_accept_one_path(struct rds_connection *conn, struct socket *sock)
 {
-	int i;
-	int npaths = max_t(int, 1, conn->c_npaths);
+	union {
+		struct sockaddr_storage storage;
+		struct sockaddr addr;
+		struct sockaddr_in sin;
+		struct sockaddr_in6 sin6;
+	} saddr;
+	int sport, npaths, i_min, i_max, i;
+
+	if (conn->c_with_sport_idx &&
+	    kernel_getpeername(sock, &saddr.addr) == 0) {
+		/* cp->cp_index is encoded in lowest bits of source-port */
+		switch (saddr.addr.sa_family) {
+		case AF_INET:
+			sport = ntohs(saddr.sin.sin_port);
+			break;
+		case AF_INET6:
+			sport = ntohs(saddr.sin6.sin6_port);
+			break;
+		default:
+			sport = -1;
+		}
+	} else {
+		sport = -1;
+	}
+
+	npaths = max_t(int, 1, conn->c_npaths);
 
-	for (i = 0; i < npaths; i++) {
+	if (sport >= 0) {
+		i_min = sport % npaths;
+		i_max = i_min;
+	} else {
+		i_min = 0;
+		i_max = npaths - 1;
+	}
+
+	for (i = i_min; i <= i_max; i++) {
 		struct rds_conn_path *cp = &conn->c_path[i];
 
 		if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
 					     RDS_CONN_CONNECTING))
 			return cp->cp_transport_data;
 	}
+
 	return NULL;
 }
 
@@ -199,7 +232,7 @@ int rds_tcp_accept_one(struct rds_tcp_net *rtn)
 		 * to and discarded by the sender.
 		 * We must not throw those away!
 		 */
-		rs_tcp = rds_tcp_accept_one_path(conn);
+		rs_tcp = rds_tcp_accept_one_path(conn, new_sock);
 		if (!rs_tcp) {
 			/* It's okay to stash "new_sock", since
 			 * "rds_tcp_conn_slots_available" triggers
-- 
2.43.0