Add ecmp_rehash.sh to exercise TCP ECMP path re-selection on
retransmission timeout. Three tests cover client SYN rehash, server
SYN/ACK rehash, and midstream RTO rehash of an established connection
over a two-path ECMP topology with one or both legs blocked by tc.
The SYN test retries 26 times, so it has a false negative probability
of ~(1/2)^25 ≈ 3e-8.

Signed-off-by: Neil Spring
---
 tools/testing/selftests/net/Makefile       |   1 +
 tools/testing/selftests/net/ecmp_rehash.sh | 354 +++++++++++++++++++++
 2 files changed, 355 insertions(+)
 create mode 100755 tools/testing/selftests/net/ecmp_rehash.sh

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 6bced3ed798b..acc61a51d7e2 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -25,6 +25,7 @@ TEST_PROGS := \
 	cmsg_time.sh \
 	double_udp_encap.sh \
 	drop_monitor_tests.sh \
+	ecmp_rehash.sh \
 	fcnal-ipv4.sh \
 	fcnal-ipv6.sh \
 	fcnal-other.sh \
diff --git a/tools/testing/selftests/net/ecmp_rehash.sh b/tools/testing/selftests/net/ecmp_rehash.sh
new file mode 100755
index 000000000000..a062c0b51fd6
--- /dev/null
+++ b/tools/testing/selftests/net/ecmp_rehash.sh
@@ -0,0 +1,354 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test ECMP path re-selection on TCP retransmission timeout.
+#
+# Two namespaces connected by two parallel veth pairs with a 2-way ECMP
+# route. When a TCP path is blocked (via tc drop), RTO triggers
+# sk_rethink_txhash() + sk_dst_reset(), causing the next route lookup
+# to select the other ECMP path.
+#
+# False negative: ~(1/2)^25 ≈ 3e-8. With tcp_syn_retries=6 (~127 s
+# timeout) and tcp_syn_linear_timeouts=20 there are roughly 26
+# independent rehash attempts, each choosing one of 2 paths uniformly.
+
+source lib.sh
+
+# Subnet tags of the two parallel links: link i uses fd00:${SUBNETS[i]}::/64.
+SUBNETS=(a b)
+# Base TCP port; the tests listen on PORT, PORT + 1 and PORT + 2.
+PORT=9900
+
+ALL_TESTS="
+	test_ecmp_rto_rehash
+	test_ecmp_synack_rehash
+	test_ecmp_midstream_rehash
+"
+
+# Print the tx_packets statistic of a device.
+#   $1 - network namespace name
+#   $2 - device name
+link_tx_packets_get()
+{
+	local ns=$1; shift
+	local dev=$1; shift
+
+	ip netns exec "$ns" cat "/sys/class/net/$dev/statistics/tx_packets"
+}
+
+# Return the number of packets matched by the tc filter action on a device.
+# When tc drops packets via "action drop", the device's tx_packets is not
+# incremented (packet never reaches veth_xmit), but the tc action maintains
+# its own counter.
+#   $1 - network namespace name
+#   $2 - device name
+# Prints nothing when no "Sent ... pkt" line is found (for example when no
+# filter is installed); callers must treat empty output as 0.
+tc_filter_pkt_count()
+{
+	local ns=$1; shift
+	local dev=$1; shift
+
+	# Print the field immediately preceding the first "pkt" token.
+	ip netns exec "$ns" tc -s filter show dev "$dev" parent 1: 2>/dev/null |
+		awk '/Sent .* pkt/ { for (i=1;i<=NF;i++) if ($i=="pkt") { print $(i-1); exit } }'
+}
+
+# Read TcpTimeoutRehash counter from /proc/net/netstat in a namespace.
+# This counter increments in tcp_write_timeout() on every RTO that triggers
+# sk_rethink_txhash().
+#   $1 - network namespace name
+# Always prints a number: 0 is printed when the counter is not listed, so
+# that numeric comparisons in callers never see an empty string (which
+# would make "[ ... -le ... ]" error out and skip the check).
+get_timeout_rehash_count()
+{
+	local ns=$1; shift
+
+	# The first "TcpExt:" line holds the counter names, the second the
+	# values in the same column order.
+	ip netns exec "$ns" awk '
+		/^TcpExt:/ {
+			if (!have_names) { split($0, names); have_names = 1; next }
+			split($0, vals)
+			for (i in names)
+				if (names[i] == "TcpTimeoutRehash")
+					count = vals[i]
+		}
+		END { print (count == "" ? 0 : count) }
+	' /proc/net/netstat
+}
+
+# Block TCP (IPv6 next-header = 6) egress, allowing ICMPv6 through.
+#   $1 - network namespace name
+#   $2 - device name
+# Installs a prio root qdisc (handle 1:) and a u32 filter that drops packets
+# whose byte at offset 6 (the IPv6 next-header field) equals 6.
+block_tcp()
+{
+	local ns=$1; shift
+	local dev=$1; shift
+
+	ip netns exec "$ns" tc qdisc add dev "$dev" root handle 1: prio
+	ip netns exec "$ns" tc filter add dev "$dev" parent 1: \
+		protocol ipv6 prio 1 u32 match u8 0x06 0xff at 6 action drop
+}
+
+# Undo block_tcp() by deleting the root qdisc (and with it the filter).
+#   $1 - network namespace name
+#   $2 - device name
+# Errors are discarded: the tests unblock explicitly and again from a
+# deferred cleanup, so the qdisc may already be gone.
+unblock_tcp()
+{
+	local ns=$1; shift
+	local dev=$1; shift
+
+	ip netns exec "$ns" tc qdisc del dev "$dev" root 2>/dev/null
+}
+
+# Return success when both devices have dropped at least one TCP packet.
+both_devs_attempted()
+{
+	local ns=$1; shift
+	local dev0=$1; shift
+	local dev1=$1; shift
+
+	# An empty count (no filter statistics found) is treated as 0.
+	local c0 c1
+	c0=$(tc_filter_pkt_count "$ns" "$dev0")
+	c1=$(tc_filter_pkt_count "$ns" "$dev1")
+	[ "${c0:-0}" -ge 1 ] && [ "${c1:-0}" -ge 1 ]
+}
+
+# Build the topology: two namespaces joined by two veth pairs
+# (veth0a/veth0b on fd00:a::/64, veth1a/veth1b on fd00:b::/64), a /128
+# address on each loopback, and a 2-way ECMP route to the peer's loopback
+# address in each direction.
+# Returns $ksft_skip when the basic connectivity check fails.
+setup()
+{
+	setup_ns NS1 NS2
+
+	local ns
+	for ns in "$NS1" "$NS2"; do
+		ip netns exec "$ns" sysctl -qw net.ipv6.conf.all.accept_dad=0
+		ip netns exec "$ns" sysctl -qw net.ipv6.conf.default.accept_dad=0
+		ip netns exec "$ns" sysctl -qw net.ipv6.conf.all.forwarding=1
+		ip netns exec "$ns" sysctl -qw net.core.txrehash=1
+	done
+
+	local i sub
+	for i in 0 1; do
+		sub=${SUBNETS[$i]}
+		# Create each veth pair directly inside the namespaces so the
+		# names cannot clash with devices in the root namespace.
+		ip -n "$NS1" link add "veth${i}a" type veth \
+			peer name "veth${i}b" netns "$NS2"
+		ip -n "$NS1" addr add "fd00:${sub}::1/64" dev "veth${i}a"
+		ip -n "$NS2" addr add "fd00:${sub}::2/64" dev "veth${i}b"
+		ip -n "$NS1" link set "veth${i}a" up
+		ip -n "$NS2" link set "veth${i}b" up
+	done
+
+	ip -n "$NS1" addr add fd00:ff::1/128 dev lo
+	ip -n "$NS2" addr add fd00:ff::2/128 dev lo
+
+	# Allow many SYN retries at 1-second intervals (linear, no
+	# exponential backoff) so the rehash test has enough attempts
+	# to exercise both ECMP paths with high probability.
+	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_syn_retries=6
+	ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_syn_linear_timeouts=20
+
+	ip -n "$NS1" -6 route add fd00:ff::2/128 \
+		nexthop via fd00:a::2 dev veth0a \
+		nexthop via fd00:b::2 dev veth1a
+
+	ip -n "$NS2" -6 route add fd00:ff::1/128 \
+		nexthop via fd00:a::1 dev veth0b \
+		nexthop via fd00:b::1 dev veth1b
+
+	# Ping across each link in both directions before the tests start;
+	# the results are ignored.
+	for i in 0 1; do
+		sub=${SUBNETS[$i]}
+		ip netns exec "$NS1" \
+			ping -6 -c1 -W5 "fd00:${sub}::2" &>/dev/null
+		ip netns exec "$NS2" \
+			ping -6 -c1 -W5 "fd00:${sub}::1" &>/dev/null
+	done
+
+	if ! ip netns exec "$NS1" ping -6 -c1 -W5 fd00:ff::2 &>/dev/null; then
+		echo "Basic connectivity check failed"
+		return $ksft_skip
+	fi
+}
+
+# Block ALL paths, start a connection, wait until SYNs have been dropped
+# on both interfaces (proving rehash steered the SYN to a new path), then
+# unblock so the connection completes.
+test_ecmp_rto_rehash()
+{
+	RET=0
+
+	block_tcp "$NS1" veth0a
+	defer unblock_tcp "$NS1" veth0a
+	block_tcp "$NS1" veth1a
+	defer unblock_tcp "$NS1" veth1a
+
+	ip netns exec "$NS2" socat \
+		"TCP6-LISTEN:$PORT,bind=[fd00:ff::2],reuseaddr,fork" \
+		EXEC:"echo ESTABLISH_OK" &
+	defer kill_process $!
+
+	wait_local_port_listen "$NS2" $PORT tcp
+
+	local rehash_before
+	rehash_before=$(get_timeout_rehash_count "$NS1")
+
+	# Capture the client output in a private temp file. Removal is
+	# deferred so the file is also cleaned up on the early-return path.
+	local out
+	out=$(mktemp)
+	defer rm -f "$out"
+
+	# Start the connection in the background; it will retry SYNs at
+	# 1-second intervals until an unblocked path is found.
+	ip netns exec "$NS1" bash -c \
+		"echo test | socat - \
+		'TCP6:[fd00:ff::2]:$PORT,bind=[fd00:ff::1],connect-timeout=60'" \
+		>"$out" 2>&1 &
+	local client_pid=$!
+	defer kill_process $client_pid
+
+	# Wait until both paths have seen at least one dropped SYN.
+	# This proves sk_rethink_txhash() rehashed the connection from
+	# one ECMP path to the other.
+	slowwait 30 both_devs_attempted "$NS1" veth0a veth1a
+	check_err $? "SYNs did not appear on both paths (rehash not working)"
+	if [ $RET -ne 0 ]; then
+		log_test "ECMP RTO rehash: establish with blocked paths"
+		return
+	fi
+
+	# Unblock both paths and let the next SYN retransmit succeed.
+	unblock_tcp "$NS1" veth0a
+	unblock_tcp "$NS1" veth1a
+
+	local rc=0
+	wait "$client_pid" || rc=$?
+
+	local result
+	result=$(cat "$out" 2>/dev/null)
+
+	if [ $rc -ne 0 ] || [[ "$result" != *"ESTABLISH_OK"* ]]; then
+		check_err 1 "connection failed after unblocking: $result"
+	fi
+
+	# Default empty readings to 0 so a missing counter fails the check
+	# instead of making the numeric comparison error out and be skipped.
+	local rehash_after
+	rehash_after=$(get_timeout_rehash_count "$NS1")
+	if [ "${rehash_after:-0}" -le "${rehash_before:-0}" ]; then
+		check_err 1 "TcpTimeoutRehash counter did not increment"
+	fi
+
+	log_test "ECMP RTO rehash: establish with blocked paths"
+}
+
+# Block the server's return paths so SYN/ACKs are dropped. The client
+# retransmits SYNs at 1-second intervals; each duplicate SYN arriving at
+# the server updates ir_iif to match the new arrival interface, so the
+# retransmitted SYN/ACK routes back via the interface the SYN arrived on.
+test_ecmp_synack_rehash()
+{
+	RET=0
+	local port=$((PORT + 2))
+
+	block_tcp "$NS2" veth0b
+	defer unblock_tcp "$NS2" veth0b
+	block_tcp "$NS2" veth1b
+	defer unblock_tcp "$NS2" veth1b
+
+	ip netns exec "$NS2" socat \
+		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr,fork" \
+		EXEC:"echo SYNACK_OK" &
+	defer kill_process $!
+
+	wait_local_port_listen "$NS2" $port tcp
+
+	# Capture the client output in a private temp file. Removal is
+	# deferred so the file is also cleaned up on the early-return path.
+	local out
+	out=$(mktemp)
+	defer rm -f "$out"
+
+	# Start the connection; SYNs reach the server (client egress is
+	# open) but SYN/ACKs are dropped on the server's return path.
+	ip netns exec "$NS1" bash -c \
+		"echo test | socat - \
+		'TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],connect-timeout=60'" \
+		>"$out" 2>&1 &
+	local client_pid=$!
+	defer kill_process $client_pid
+
+	# Wait until both server-side interfaces have dropped at least
+	# one SYN/ACK, proving the server rehashed its return path.
+	slowwait 30 both_devs_attempted "$NS2" veth0b veth1b
+	check_err $? "SYN/ACKs did not appear on both return paths"
+	if [ $RET -ne 0 ]; then
+		log_test "ECMP SYN/ACK rehash: blocked return path"
+		return
+	fi
+
+	# Unblock and let the connection complete.
+	unblock_tcp "$NS2" veth0b
+	unblock_tcp "$NS2" veth1b
+
+	local rc=0
+	wait "$client_pid" || rc=$?
+
+	local result
+	result=$(cat "$out" 2>/dev/null)
+
+	if [ $rc -ne 0 ] || [[ "$result" != *"SYNACK_OK"* ]]; then
+		check_err 1 "connection failed after unblocking: $result"
+	fi
+
+	log_test "ECMP SYN/ACK rehash: blocked return path"
+}
+
+# Establish a data transfer with both paths open, then block the
+# active path. Verify the transfer continues via rehash and that
+# TcpTimeoutRehash incremented.
+test_ecmp_midstream_rehash()
+{
+	RET=0
+	local port=$((PORT + 1))
+
+	ip netns exec "$NS2" socat -u \
+		"TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" - >/dev/null &
+	defer kill_process $!
+
+	wait_local_port_listen "$NS2" $port tcp
+
+	local base_tx0 base_tx1
+	base_tx0=$(link_tx_packets_get "$NS1" veth0a)
+	base_tx1=$(link_tx_packets_get "$NS1" veth1a)
+
+	# Send 40 chunks of 10 kB, 0.25 s apart, so the transfer is still
+	# in progress when the active path gets blocked below.
+	ip netns exec "$NS1" bash -c "
+		for i in \$(seq 1 40); do
+			dd if=/dev/zero bs=10k count=1 2>/dev/null
+			sleep 0.25
+		done | timeout 60 socat - 'TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1]'
+	" &>/dev/null &
+	local client_pid=$!
+	defer kill_process $client_pid
+
+	# Wait for at least 20 packets in total across both links.
+	busywait $BUSYWAIT_TIMEOUT until_counter_is \
+		">= $((base_tx0 + base_tx1 + 20))" \
+		link_tx_packets_total "$NS1"
+	check_err $? "no TX activity detected"
+	if [ $RET -ne 0 ]; then
+		log_test "ECMP midstream rehash: block active path"
+		return
+	fi
+
+	# Find the active path and block it. The link whose tx_packets grew
+	# the most since the baseline is taken as active (veth0a on a tie).
+	local cur0 cur1 active_idx
+	cur0=$(link_tx_packets_get "$NS1" veth0a)
+	cur1=$(link_tx_packets_get "$NS1" veth1a)
+	if [ $((cur0 - base_tx0)) -ge $((cur1 - base_tx1)) ]; then
+		active_idx=0
+	else
+		active_idx=1
+	fi
+
+	local rehash_before
+	rehash_before=$(get_timeout_rehash_count "$NS1")
+
+	block_tcp "$NS1" "veth${active_idx}a"
+	defer unblock_tcp "$NS1" "veth${active_idx}a"
+
+	local rc=0
+	wait "$client_pid" || rc=$?
+
+	check_err $rc "data transfer failed after blocking veth${active_idx}a"
+
+	# Default empty readings to 0 so a missing counter fails the check
+	# instead of making the numeric comparison error out and be skipped.
+	local rehash_after
+	rehash_after=$(get_timeout_rehash_count "$NS1")
+	if [ "${rehash_after:-0}" -le "${rehash_before:-0}" ]; then
+		check_err 1 "TcpTimeoutRehash counter did not increment"
+	fi
+
+	log_test "ECMP midstream rehash: block active path"
+}
+
+# Print the sum of tx_packets over veth0a and veth1a in namespace $1.
+link_tx_packets_total()
+{
+	local ns=$1; shift
+
+	echo $(( $(link_tx_packets_get "$ns" veth0a) +
+		 $(link_tx_packets_get "$ns" veth1a) ))
+}
+
+require_command socat
+# block_tcp()/unblock_tcp() and the drop counters all depend on tc.
+require_command tc
+
+trap cleanup_all_ns EXIT
+setup || exit $?
+tests_run
+exit $EXIT_STATUS
-- 
2.52.0