Currently sk_rethink_txhash() re-rolls the socket's txhash on RTO, PLB, and spurious-retransmission events, but the cached route is reused and the new hash is not propagated into the ECMP path selection logic. Two changes are needed to make rehash select a different local ECMP path: 1. Add sk_dst_reset() alongside sk_rethink_txhash() in tcp_write_timeout(), tcp_rcv_spurious_retrans(), and tcp_plb_check_rehash() so the cached dst is invalidated and the next transmit triggers a fresh route lookup. 2. Set fl6->mp_hash from sk_txhash (or tcp_rsk(req)->txhash for SYN/ACK retransmits) in inet6_sk_rebuild_header(), inet6_csk_route_req(), and inet6_csk_route_socket() so fib6_select_path() picks a path based on the new hash. It is necessary to update mp_hash explicitly because the default ECMP hash derives from fl6->flowlabel via np->flow_label, which is not updated from sk_txhash (REPFLOW is off by default). ip6_make_flowlabel() cannot help either, as it runs after the route lookup. Signed-off-by: Neil Spring --- net/ipv4/tcp_input.c | 4 +++- net/ipv4/tcp_plb.c | 1 + net/ipv4/tcp_timer.c | 1 + net/ipv6/af_inet6.c | 3 +++ net/ipv6/inet6_connection_sock.c | 6 ++++++ 5 files changed, 14 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7995a89bafc9..126dffd675c9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5020,8 +5020,10 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, skb->protocol == htons(ETH_P_IPV6) && (tcp_sk(sk)->inet_conn.icsk_ack.lrcv_flowlabel != ntohl(ip6_flowlabel(ipv6_hdr(skb)))) && - sk_rethink_txhash(sk)) + sk_rethink_txhash(sk)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); + sk_dst_reset(sk); + } /* Save last flowlabel after a spurious retrans. */ tcp_save_lrcv_flowlabel(sk, skb); diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c index c11a0cd3f8fe..0c067a54c57a 100644 --- a/net/ipv4/tcp_plb.c +++ b/net/ipv4/tcp_plb.c @@ -79,6 +79,7 @@ void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) return; sk_rethink_txhash(sk); + sk_dst_reset(sk); plb->consec_cong_rounds = 0; WRITE_ONCE(tcp_sk(sk)->plb_rehash, tcp_sk(sk)->plb_rehash + 1); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 322db13333c7..d92f3ba9de81 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -300,6 +300,7 @@ static int tcp_write_timeout(struct sock *sk) if (sk_rethink_txhash(sk)) { WRITE_ONCE(tp->timeout_rehash, tp->timeout_rehash + 1); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH); + sk_dst_reset(sk); } return 0; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 0a88b376141d..90ff4448aa56 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -823,6 +823,9 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6->flowi6_uid = sk_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); + /* >> 1 for 31-bit mp_hash range matching nhc_upper_bound. */ + fl6->mp_hash = sk->sk_txhash >> 1; + rcu_read_lock(); final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &np->final); rcu_read_unlock(); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 37534e116899..fc4b75de6af8 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -48,6 +48,9 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->flowi6_uid = sk_uid(sk); security_req_classify_flow(req, flowi6_to_flowi_common(fl6)); + /* >> 1 for 31-bit mp_hash range matching nhc_upper_bound. */ + fl6->mp_hash = tcp_rsk(req)->txhash >> 1; + if (!dst) { dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(dst)) @@ -70,6 +73,9 @@ struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->saddr = np->saddr; fl6->flowlabel = np->flow_label; IP6_ECN_flow_xmit(sk, fl6->flowlabel); + + /* >> 1 for 31-bit mp_hash range matching nhc_upper_bound. */ + fl6->mp_hash = sk->sk_txhash >> 1; fl6->flowi6_oif = sk->sk_bound_dev_if; fl6->flowi6_mark = sk->sk_mark; fl6->fl6_sport = inet->inet_sport; -- 2.52.0