Add a selftest for the netem network emulator qdisc covering basic functionality, multi-netem qdisc trees, inner qdisc combinations, and crash resistance under adversarial configurations. The test validates that HTB and HFSC parents work correctly with netem children including duplication, which exercises the code paths involved in CVE-2024-45016, CVE-2025-37890, CVE-2025-38001, and CVE-2025-38553. Signed-off-by: Stephen Hemminger --- MAINTAINERS | 1 + tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/config | 3 + tools/testing/selftests/net/netem.sh | 802 +++++++++++++++++++++++++++ 4 files changed, 807 insertions(+) create mode 100755 tools/testing/selftests/net/netem.sh diff --git a/MAINTAINERS b/MAINTAINERS index ff6f17458f19..60be4001c0cb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18225,6 +18225,7 @@ M: Stephen Hemminger L: netdev@vger.kernel.org S: Maintained F: net/sched/sch_netem.c +F: tools/testing/selftests/net/netem* NETFILTER M: Pablo Neira Ayuso diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index a24ea64e2ae8..605566b4db21 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -62,6 +62,7 @@ TEST_PROGS := \ ndisc_unsolicited_na_test.sh \ netdev-l2addr.sh \ netdevice.sh \ + netem.sh \ netns-name.sh \ netns-sysctl.sh \ nl_netdev.py \ diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index cd49b7dfe216..8678f4a346bc 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -96,6 +96,9 @@ CONFIG_NET_SCH_ETF=m CONFIG_NET_SCH_FQ=m CONFIG_NET_SCH_FQ_CODEL=m CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TBF=m CONFIG_NET_SCH_INGRESS=m CONFIG_NET_SCH_NETEM=y CONFIG_NET_SCH_PRIO=m diff --git a/tools/testing/selftests/net/netem.sh b/tools/testing/selftests/net/netem.sh new file mode 100755 index 000000000000..7da18429c4ff --- /dev/null +++ 
b/tools/testing/selftests/net/netem.sh @@ -0,0 +1,802 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test netem qdisc focusing on: +# - valid multi-netem trees (HTB, HFSC parents) +# - packet duplication under various topologies +# - robustness against misconfigurations that previously caused +# soft lockups, OOM, or UAF (syzkaller-class inputs) +# +# These tests do NOT validate statistical accuracy of delay/loss +# distributions; they verify the kernel doesn't crash and that +# packets actually flow (or are cleanly dropped) under each config. +# +# Author: Stephen Hemminger + +source lib.sh + +NSPREFIX="netem-test" +NS_SRC="${NSPREFIX}-src" +NS_DST="${NSPREFIX}-dst" + +VETH_SRC="veth-src" +VETH_DST="veth-dst" + +SRC_IP="192.168.99.1" +DST_IP="192.168.99.2" + +# How long to wait for a potentially-stuck kernel (seconds) +LOCKUP_TIMEOUT=15 + +PASS=0 +FAIL=0 +SKIP=0 + +cleanup() +{ + ip netns del "$NS_SRC" 2>/dev/null + ip netns del "$NS_DST" 2>/dev/null +} + +setup() +{ + cleanup + + ip netns add "$NS_SRC" || return $ksft_skip + ip netns add "$NS_DST" || return $ksft_skip + + ip -n "$NS_SRC" link add "$VETH_SRC" type veth \ + peer name "$VETH_DST" netns "$NS_DST" || return $ksft_skip + + ip -n "$NS_SRC" addr add "${SRC_IP}/24" dev "$VETH_SRC" + ip -n "$NS_DST" addr add "${DST_IP}/24" dev "$VETH_DST" + + ip -n "$NS_SRC" link set "$VETH_SRC" up + ip -n "$NS_DST" link set "$VETH_DST" up + + ip -n "$NS_SRC" link set lo up + ip -n "$NS_DST" link set lo up + + # Sanity: can we ping at all? + ip netns exec "$NS_SRC" ping -c 1 -W 2 "$DST_IP" >/dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "# SKIP: basic connectivity failed" + return $ksft_skip + fi + + return 0 +} + +# Reset qdisc on src veth to default (remove any previous test config) +reset_qdisc() +{ + # Remove any existing root qdisc. Must use 'tc' not 'ip'. + run_tc qdisc del dev "$VETH_SRC" root 2>/dev/null + return 0 +} + +# Send ICMP ping traffic and require at least one reply. 
+# Uses generous intervals to accommodate delay/loss configs. +# Usage: send_traffic [timeout] +send_traffic() +{ + local count=${1:-20} + local timeout=${2:-10} + local received + + received=$(ip netns exec "$NS_SRC" \ + timeout "$timeout" \ + ping -c "$count" -i 0.1 -W 2 "$DST_IP" 2>/dev/null | + awk '/packets transmitted/ { print $4 }') + + [ "${received:-0}" -gt 0 ] 2>/dev/null +} + +# Send traffic and verify kernel didn't lock up. +# Returns 0 if kernel survived (regardless of whether traffic arrived). +# Usage: survive_traffic [timeout] +survive_traffic() +{ + local count=${1:-20} + local timeout=${2:-$LOCKUP_TIMEOUT} + + # Use ping as the traffic source — simpler and more reliable + # for crash testing. We don't care about replies. + ip netns exec "$NS_SRC" \ + timeout "$timeout" ping -c "$count" -i 0.05 -W 1 "$DST_IP" \ + >/dev/null 2>&1 + local rc=$? + + # timeout returns 124 if it had to kill the child + if [ $rc -eq 124 ]; then + return 1 # kernel likely stuck + fi + + return 0 +} + +log_result() +{ + local result=$1 + local name=$2 + + case $result in + 0) + printf " PASS: %s\n" "$name" + PASS=$((PASS + 1)) + ;; + $ksft_skip) + printf " SKIP: %s\n" "$name" + SKIP=$((SKIP + 1)) + ;; + *) + printf " FAIL: %s\n" "$name" + FAIL=$((FAIL + 1)) + ret=1 + ;; + esac +} + +run_cmd() +{ + ip netns exec "$NS_SRC" "$@" +} + +run_tc() +{ + ip netns exec "$NS_SRC" tc "$@" +} + +# ===================================================================== +# TEST CASES +# ===================================================================== + +# --- Group 1: Basic sanity --- + +test_basic_delay() +{ + local name="basic netem delay" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root netem delay 10ms || { + log_result 1 "$name" + return + } + + if send_traffic 20; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_basic_duplicate() +{ + local name="basic netem duplicate 50%" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root netem delay 
10ms duplicate 50% || { + log_result 1 "$name" + return + } + + if send_traffic 20; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_basic_loss() +{ + local name="basic netem loss 30%" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root netem delay 5ms loss 30% || { + log_result 1 "$name" + return + } + + # With 30% loss, 20 packets should still get some through + if send_traffic 40; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_basic_corrupt() +{ + local name="basic netem corrupt 10%" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root netem delay 5ms corrupt 10% || { + log_result 1 "$name" + return + } + + if send_traffic 20; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_basic_reorder() +{ + local name="basic netem reorder 25% gap 5" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root netem delay 50ms \ + reorder 25% 50% gap 5 || { + log_result 1 "$name" + return + } + + if send_traffic 30; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_pfifo_child() +{ + local name="netem with pfifo child qdisc" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root handle 1: \ + netem delay 10ms 50ms || { + log_result 1 "$name" + return + } + run_tc qdisc add dev "$VETH_SRC" parent 1:1 \ + pfifo limit 1000 || { + log_result 1 "$name" + return + } + + if send_traffic 20; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +# --- Group 2: Multi-netem trees (the configs that check_netem_in_tree blocked) --- + +test_htb_two_netem_leaves() +{ + local name="HTB root, two netem leaves (no dup)" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root handle 1: htb default 10 || { + log_result $ksft_skip "$name" + return + } + run_tc class add dev "$VETH_SRC" parent 1: classid 1:10 \ + htb rate 10mbit || { + log_result 1 "$name" + return + } + run_tc class add dev "$VETH_SRC" parent 1: classid 1:20 \ + htb rate 10mbit || { + 
log_result 1 "$name" + return + } + run_tc qdisc add dev "$VETH_SRC" parent 1:10 handle 10: \ + netem delay 10ms || { + log_result 1 "$name" + return + } + run_tc qdisc add dev "$VETH_SRC" parent 1:20 handle 20: \ + netem delay 20ms || { + log_result 1 "$name" + return + } + + if send_traffic 20; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_htb_netem_dup_one_leaf() +{ + local name="HTB root, one netem with dup, one without" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root handle 1: htb default 10 || { + log_result $ksft_skip "$name" + return + } + run_tc class add dev "$VETH_SRC" parent 1: classid 1:10 \ + htb rate 10mbit + run_tc class add dev "$VETH_SRC" parent 1: classid 1:20 \ + htb rate 10mbit + + # First leaf: netem with duplication + run_tc qdisc add dev "$VETH_SRC" parent 1:10 handle 10: \ + netem delay 10ms duplicate 25% 2>/dev/null + local rc1=$? + + # Second leaf: netem without duplication + run_tc qdisc add dev "$VETH_SRC" parent 1:20 handle 20: \ + netem delay 20ms 2>/dev/null + local rc2=$? + + # check_netem_in_tree rejects this valid config — that's a bug. 
+ if [ $rc1 -ne 0 ] || [ $rc2 -ne 0 ]; then + echo "# tc rejected multi-netem tree (check_netem_in_tree bug)" + log_result 1 "$name" + return + fi + + if send_traffic 20; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_hfsc_netem_child() +{ + local name="HFSC root, netem child (CVE-2025-37890 topology)" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root handle 1: hfsc default 1 || { + log_result $ksft_skip "$name" + return + } + run_tc class add dev "$VETH_SRC" parent 1: classid 1:1 \ + hfsc ls rate 50mbit ul rate 50mbit || { + log_result 1 "$name" + return + } + run_tc qdisc add dev "$VETH_SRC" parent 1:1 handle 10: \ + netem delay 10ms || { + log_result 1 "$name" + return + } + + if send_traffic 30; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_hfsc_netem_dup() +{ + local name="HFSC root, netem child with duplicate" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root handle 1: hfsc default 1 || { + log_result $ksft_skip "$name" + return + } + run_tc class add dev "$VETH_SRC" parent 1: classid 1:1 \ + hfsc ls rate 50mbit ul rate 50mbit + run_tc qdisc add dev "$VETH_SRC" parent 1:1 handle 10: \ + netem delay 10ms duplicate 30% + + if [ $? -ne 0 ]; then + echo "# tc rejected HFSC + netem duplicate" + log_result $ksft_skip "$name" + return + fi + + # This is the topology that triggered CVE-2025-37890. + # We just need to survive it. 
+ if survive_traffic 50; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_hfsc_two_netem_classes() +{ + local name="HFSC root, two classes each with netem" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root handle 1: hfsc default 1 || { + log_result $ksft_skip "$name" + return + } + run_tc class add dev "$VETH_SRC" parent 1: classid 1:1 \ + hfsc ls rate 25mbit ul rate 25mbit + run_tc class add dev "$VETH_SRC" parent 1: classid 1:2 \ + hfsc ls rate 25mbit ul rate 25mbit + + run_tc qdisc add dev "$VETH_SRC" parent 1:1 handle 10: \ + netem delay 10ms + run_tc qdisc add dev "$VETH_SRC" parent 1:2 handle 20: \ + netem delay 20ms + + if [ $? -ne 0 ]; then + log_result $ksft_skip "$name" + return + fi + + if send_traffic 30; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +# --- Group 3: Inner qdisc combinations --- + +test_netem_tbf_child() +{ + local name="netem with TBF child (non-work-conserving)" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root handle 1: \ + netem delay 10ms || { + log_result $ksft_skip "$name" + return + } + + # TBF as inner qdisc — previously caused stalls with + # some parent qdiscs. As a direct child of netem, the + # expectation is packets flow (possibly rate-limited). + run_tc qdisc add dev "$VETH_SRC" parent 1:1 handle 10: \ + tbf rate 1mbit burst 10kb latency 50ms || { + log_result $ksft_skip "$name" + return + } + + if survive_traffic 20; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_htb_netem_tbf_chain() +{ + local name="HTB -> netem -> TBF chain" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root handle 1: htb default 10 || { + log_result $ksft_skip "$name" + return + } + run_tc class add dev "$VETH_SRC" parent 1: classid 1:10 \ + htb rate 5mbit + run_tc qdisc add dev "$VETH_SRC" parent 1:10 handle 10: \ + netem delay 10ms + run_tc qdisc add dev "$VETH_SRC" parent 10:1 handle 100: \ + tbf rate 1mbit burst 10kb latency 50ms + + if [ $? 
-ne 0 ]; then + log_result $ksft_skip "$name" + return + fi + + # This is a historically problematic config. Just surviving is success. + if survive_traffic 30; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_netem_sfq_child() +{ + local name="netem with SFQ child qdisc" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root handle 1: \ + netem delay 10ms || { + log_result 1 "$name" + return + } + run_tc qdisc add dev "$VETH_SRC" parent 1:1 handle 10: \ + sfq perturb 10 || { + log_result $ksft_skip "$name" + return + } + + if send_traffic 20; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +# --- Group 4: Abuse / crash resistance --- + +test_dup_100_percent() +{ + local name="netem duplicate 100% (stress)" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root netem delay 5ms duplicate 100% || { + log_result 1 "$name" + return + } + + # 100% dup means every packet cloned. Must not explode. + if survive_traffic 30; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_dup_100_no_delay() +{ + local name="netem duplicate 100% with zero delay" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root netem duplicate 100% || { + log_result 1 "$name" + return + } + + # Zero delay + 100% dup is a degenerate case + if survive_traffic 30; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_all_impairments() +{ + local name="netem all impairments simultaneously" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root netem \ + delay 10ms 5ms 25% \ + loss 10% \ + duplicate 10% \ + corrupt 5% \ + reorder 5% 50% || { + log_result 1 "$name" + return + } + + if survive_traffic 50; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_rapid_reconfig() +{ + local name="rapid qdisc add/change/delete cycling" + + reset_qdisc + + # Rapidly add, change, and delete netem config while traffic runs. + # This is the kind of thing syzkaller does. 
+ ip netns exec "$NS_SRC" ping -c 100 -i 0.02 -W 1 "$DST_IP" \ + >/dev/null 2>&1 & + local traffic_pid=$! + + local i + for i in $(seq 1 20); do + run_tc qdisc replace dev "$VETH_SRC" root netem delay 5ms \ + duplicate 10% 2>/dev/null + run_tc qdisc change dev "$VETH_SRC" root netem delay 10ms \ + loss 5% 2>/dev/null + run_tc qdisc change dev "$VETH_SRC" root netem delay 1ms \ + duplicate 50% 2>/dev/null + run_tc qdisc del dev "$VETH_SRC" root 2>/dev/null + done + + wait $traffic_pid 2>/dev/null + + # If we got here, the kernel didn't lock up + log_result 0 "$name" +} + +test_limit_1() +{ + local name="netem limit 1 with duplication" + + reset_qdisc + # Minimal queue limit with duplication — exercises the + # t_len >= sch->limit drop path in enqueue + run_tc qdisc replace dev "$VETH_SRC" root netem \ + limit 1 delay 10ms duplicate 50% || { + log_result 1 "$name" + return + } + + # Most packets will be dropped, but kernel must survive + if survive_traffic 30; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_slot_config() +{ + local name="netem slot configuration" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root netem \ + delay 10ms slot 20ms 40ms packets 5 bytes 1500 || { + # slot may not be supported on old kernels + log_result $ksft_skip "$name" + return + } + + if send_traffic 30; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_rate_limiting() +{ + local name="netem rate limiting" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root netem \ + delay 10ms rate 1mbit || { + log_result 1 "$name" + return + } + + if send_traffic 20; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +# --- Group 5: Multi-netem dup trees (the CVE-2025-38553 triggers) --- + +test_htb_two_netem_both_dup() +{ + local name="HTB root, two netem leaves both duplicating" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root handle 1: htb default 10 || { + log_result $ksft_skip "$name" + return + } + 
run_tc class add dev "$VETH_SRC" parent 1: classid 1:10 \ + htb rate 10mbit + run_tc class add dev "$VETH_SRC" parent 1: classid 1:20 \ + htb rate 10mbit + + run_tc qdisc add dev "$VETH_SRC" parent 1:10 handle 10: \ + netem delay 10ms duplicate 25% 2>/dev/null + local rc1=$? + run_tc qdisc add dev "$VETH_SRC" parent 1:20 handle 20: \ + netem delay 10ms duplicate 25% 2>/dev/null + local rc2=$? + + if [ $rc1 -ne 0 ] || [ $rc2 -ne 0 ]; then + echo "# tc rejected multi-netem dup tree (check_netem_in_tree bug)" + log_result 1 "$name" + return + fi + + # This was the CVE-2025-38553 scenario. + # With recursion guard: should survive. + # Without: may lock up — that's what the timeout catches. + if survive_traffic 50 "$LOCKUP_TIMEOUT"; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +test_nested_netem() +{ + local name="nested netem (netem inside netem child)" + + reset_qdisc + run_tc qdisc replace dev "$VETH_SRC" root handle 1: \ + netem delay 10ms || { + log_result 1 "$name" + return + } + # Try to add netem as child of netem. This is unusual + # but shouldn't crash. + run_tc qdisc add dev "$VETH_SRC" parent 1:1 handle 10: \ + netem delay 5ms duplicate 10% 2>/dev/null + + if [ $? -ne 0 ]; then + echo "# nested netem rejected by kernel (check_netem_in_tree bug)" + log_result 1 "$name" + return + fi + + if survive_traffic 30; then + log_result 0 "$name" + else + log_result 1 "$name" + fi +} + +# ===================================================================== +# MAIN +# ===================================================================== + +trap cleanup EXIT + +ret=0 + +# Check prerequisites +if [ "$(id -u)" -ne 0 ]; then + echo "SKIP: must be run as root" + exit $ksft_skip +fi + +for tool in tc ip ping; do + if ! command -v "$tool" >/dev/null 2>&1; then + echo "SKIP: $tool not found" + exit $ksft_skip + fi +done + +# Check kernel has netem +if ! 
ip netns add "${NSPREFIX}-probe" 2>/dev/null; then + echo "SKIP: network namespaces not available" + exit $ksft_skip +fi +ip netns exec "${NSPREFIX}-probe" \ + tc qdisc add dev lo root netem delay 1ms 2>/dev/null +if [ $? -ne 0 ]; then + ip netns del "${NSPREFIX}-probe" + echo "SKIP: netem qdisc not available (CONFIG_NET_SCH_NETEM)" + exit $ksft_skip +fi +ip netns del "${NSPREFIX}-probe" + +setup +if [ $? -eq $ksft_skip ]; then + echo "SKIP: setup failed" + exit $ksft_skip +fi + +# Group 1: Basic sanity +test_basic_delay +test_basic_duplicate +test_basic_loss +test_basic_corrupt +test_basic_reorder +test_pfifo_child + +# Group 2: Multi-netem trees +test_htb_two_netem_leaves +test_htb_netem_dup_one_leaf +test_hfsc_netem_child +test_hfsc_netem_dup +test_hfsc_two_netem_classes + +# Group 3: Inner qdisc combos +test_netem_tbf_child +test_htb_netem_tbf_chain +test_netem_sfq_child + +# Group 4: Abuse / crash resistance +test_dup_100_percent +test_dup_100_no_delay +test_all_impairments +test_rapid_reconfig +test_limit_1 +test_slot_config +test_rate_limiting + +# Group 5: Multi-netem dup (CVE triggers) +test_htb_two_netem_both_dup +test_nested_netem + +echo +echo "Summary: $PASS pass, $FAIL fail, $SKIP skip" + +exit $ret -- 2.51.0 This reverts commit ec8e0e3d7adef940cdf9475e2352c0680189d14e. The restriction breaks valid uses of netem such as using different netem values on different branches of HTB. This even broke some of the examples in the netem documentation. The intent of blocking recursion is handled in next patch. 
Fixes: ec8e0e3d7adef ("net/sched: Restrict conditions for adding duplicating netems to qdisc tree") Reported-by: Ji-Soo Chung Reported-by: Gerlinde Link: https://bugzilla.kernel.org/show_bug.cgi?id=220774 Cc: stable@vger.kernel.org Originally-by: Cong Wang Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 40 ---------------------------------------- 1 file changed, 40 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 5de1c932944a..0ccf74a9cb82 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -974,41 +974,6 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, return 0; } -static const struct Qdisc_class_ops netem_class_ops; - -static int check_netem_in_tree(struct Qdisc *sch, bool duplicates, - struct netlink_ext_ack *extack) -{ - struct Qdisc *root, *q; - unsigned int i; - - root = qdisc_root_sleeping(sch); - - if (sch != root && root->ops->cl_ops == &netem_class_ops) { - if (duplicates || - ((struct netem_sched_data *)qdisc_priv(root))->duplicate) - goto err; - } - - if (!qdisc_dev(root)) - return 0; - - hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) { - if (sch != q && q->ops->cl_ops == &netem_class_ops) { - if (duplicates || - ((struct netem_sched_data *)qdisc_priv(q))->duplicate) - goto err; - } - } - - return 0; - -err: - NL_SET_ERR_MSG(extack, - "netem: cannot mix duplicating netems with other netems in tree"); - return -EINVAL; -} - /* Parse netlink message to set options */ static int netem_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) @@ -1067,11 +1032,6 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, q->gap = qopt->gap; q->counter = 0; q->loss = qopt->loss; - - ret = check_netem_in_tree(sch, qopt->duplicate, extack); - if (ret) - goto unlock; - q->duplicate = qopt->duplicate; /* for compatibility with earlier versions. -- 2.51.0 Add a per-CPU recursion depth counter to netem_enqueue(). 
When netem duplicates a packet, the clone is re-enqueued at the root qdisc. If the tree contains other netem instances, this can recurse without bound, causing soft lockups and OOM. This approach was previously considered but rejected on the grounds that netem_dequeue calling enqueue on a child netem could bypass the depth check. That concern does not apply: the child netem's netem_enqueue() increments the same per-CPU counter, so the total nesting depth across all netem instances in the call chain is tracked correctly. A depth limit of 4 is generous for any legitimate configuration. Fixes: 0afb51e72855 ("[PKT_SCHED]: netem: reinsert for duplication") Link: https://bugzilla.kernel.org/show_bug.cgi?id=220774 Cc: stable@vger.kernel.org Reported-by: William Liu Reported-by: Savino Dicanosa Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 0ccf74a9cb82..085fa3ad6f83 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -29,6 +30,15 @@ #define VERSION "1.3" +/* + * Limit for recursion from duplication. + * Duplicated packets are re-enqueued at the root qdisc, which may + * reach this or another netem instance, causing nested calls to + * netem_enqueue(). This per-CPU counter limits the total depth. + */ +static DEFINE_PER_CPU(unsigned int, netem_enqueue_depth); +#define NETEM_RECURSION_LIMIT 4 + /* Network Emulation Queuing algorithm. ==================================== @@ -460,6 +470,14 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Do not fool qdisc_drop_all() */ skb->prev = NULL; + /* Guard against recursion from duplication re-injection. 
*/ + if (unlikely(this_cpu_inc_return(netem_enqueue_depth) > + NETEM_RECURSION_LIMIT)) { + this_cpu_dec(netem_enqueue_depth); + qdisc_drop(skb, sch, to_free); + return NET_XMIT_DROP; + } + /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng)) ++count; @@ -474,6 +492,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (count == 0) { qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); + this_cpu_dec(netem_enqueue_depth); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } @@ -529,6 +548,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, qdisc_drop_all(skb, sch, to_free); if (skb2) __qdisc_drop(skb2, to_free); + this_cpu_dec(netem_enqueue_depth); return NET_XMIT_DROP; } @@ -643,8 +663,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Parent qdiscs accounted for 1 skb of size @prev_len */ qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len)); } else if (!skb) { + this_cpu_dec(netem_enqueue_depth); return NET_XMIT_DROP; } + this_cpu_dec(netem_enqueue_depth); return NET_XMIT_SUCCESS; } -- 2.51.0 netem_dequeue() currently enqueues time-ready packets into the child qdisc during the dequeue call path. This creates several problems: 1. Parent qdiscs like HFSC track class active/inactive state based on qlen transitions. The child enqueue during netem's dequeue can cause qlen to increase while the parent is mid-dequeue, leading to double-insertion in HFSC's eltree (CVE-2025-37890, CVE-2025-38001). 2. If the child qdisc is non-work-conserving (e.g., TBF), it may refuse to release packets during its dequeue even though they were just enqueued. The parent then sees netem returning NULL despite having backlog, violating the work-conserving contract and causing stalls with parents like DRR that deactivate classes in this case. 
Restructure netem_dequeue so that when a child qdisc is present, all time-ready packets are transferred from the tfifo to the child in a batch before asking the child for output. This ensures the child only receives packets whose delay has already elapsed. The no-child path (tfifo direct dequeue) is unchanged. Fixes: 50612537e9ab ("netem: fix classful handling") Cc: stable@vger.kernel.org Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 82 +++++++++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 085fa3ad6f83..08006a60849e 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -726,7 +726,6 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; -tfifo_dequeue: skb = __qdisc_dequeue_head(&sch->q); if (skb) { deliver: @@ -734,24 +733,28 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) qdisc_bstats_update(sch, skb); return skb; } - skb = netem_peek(q); - if (skb) { - u64 time_to_send; + + /* If we have a child qdisc, transfer all time-ready packets + * from the tfifo into the child, then dequeue from the child. + * This avoids enqueueing into the child during the parent's + * dequeue callback, which can confuse parents that track + * active/inactive state based on qlen transitions (HFSC). + */ + if (q->qdisc) { u64 now = ktime_get_ns(); - /* if more time remaining? 
*/ - time_to_send = netem_skb_cb(skb)->time_to_send; - if (q->slot.slot_next && q->slot.slot_next < time_to_send) - get_slot_next(q, now); + while ((skb = netem_peek(q)) != NULL) { + u64 t = netem_skb_cb(skb)->time_to_send; + + if (t > now) + break; + if (q->slot.slot_next && q->slot.slot_next > now) + break; - if (time_to_send <= now && q->slot.slot_next <= now) { netem_erase_head(q, skb); q->t_len--; skb->next = NULL; skb->prev = NULL; - /* skb->dev shares skb->rbnode area, - * we need to restore its value. - */ skb->dev = qdisc_dev(sch); if (q->slot.slot_next) { @@ -762,7 +765,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) get_slot_next(q, now); } - if (q->qdisc) { + { unsigned int pkt_len = qdisc_pkt_len(skb); struct sk_buff *to_free = NULL; int err; @@ -774,34 +777,61 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) qdisc_qstats_drop(sch); sch->qstats.backlog -= pkt_len; sch->q.qlen--; - qdisc_tree_reduce_backlog(sch, 1, pkt_len); + qdisc_tree_reduce_backlog(sch, + 1, pkt_len); } - goto tfifo_dequeue; } + } + + skb = q->qdisc->ops->dequeue(q->qdisc); + if (skb) { sch->q.qlen--; goto deliver; } - - if (q->qdisc) { - skb = q->qdisc->ops->dequeue(q->qdisc); - if (skb) { + } else { + /* No child qdisc: dequeue directly from tfifo */ + skb = netem_peek(q); + if (skb) { + u64 time_to_send; + u64 now = ktime_get_ns(); + + time_to_send = netem_skb_cb(skb)->time_to_send; + if (q->slot.slot_next && + q->slot.slot_next < time_to_send) + get_slot_next(q, now); + + if (time_to_send <= now && + q->slot.slot_next <= now) { + netem_erase_head(q, skb); + q->t_len--; + skb->next = NULL; + skb->prev = NULL; + skb->dev = qdisc_dev(sch); + + if (q->slot.slot_next) { + q->slot.packets_left--; + q->slot.bytes_left -= + qdisc_pkt_len(skb); + if (q->slot.packets_left <= 0 || + q->slot.bytes_left <= 0) + get_slot_next(q, now); + } sch->q.qlen--; goto deliver; } } + } + + /* Schedule watchdog for next time-ready packet */ + skb = netem_peek(q); + if (skb) { + u64 
time_to_send = netem_skb_cb(skb)->time_to_send; qdisc_watchdog_schedule_ns(&q->watchdog, max(time_to_send, q->slot.slot_next)); } - if (q->qdisc) { - skb = q->qdisc->ops->dequeue(q->qdisc); - if (skb) { - sch->q.qlen--; - goto deliver; - } - } return NULL; } -- 2.51.0 The 4-state Markov chain in loss_4state() has gaps at the boundaries between transition probability ranges. The comparisons use: if (rnd < a4) else if (a4 < rnd && rnd < a1 + a4) When rnd equals a boundary value exactly, neither branch matches and no state transition occurs. The redundant lower-bound check (a4 < rnd) is already implied by being in the else branch. Remove the unnecessary lower-bound comparisons so the ranges are contiguous and every random value produces a transition, matching the GI (General and Intuitive) loss model specification. This bug goes back to original implementation of this model. Fixes: 661b79725fea ("netem: revised correlated loss generator") Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 08006a60849e..00ece854280a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -237,7 +237,7 @@ static bool loss_4state(struct netem_sched_data *q) if (rnd < clg->a4) { clg->state = LOST_IN_GAP_PERIOD; return true; - } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { + } else if (rnd < clg->a1 + clg->a4) { clg->state = LOST_IN_BURST_PERIOD; return true; } else if (clg->a1 + clg->a4 < rnd) { @@ -257,7 +257,7 @@ static bool loss_4state(struct netem_sched_data *q) case LOST_IN_BURST_PERIOD: if (rnd < clg->a3) clg->state = TX_IN_BURST_PERIOD; - else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { + else if (rnd < clg->a2 + clg->a3) { clg->state = TX_IN_GAP_PERIOD; } else if (clg->a2 + clg->a3 < rnd) { clg->state = LOST_IN_BURST_PERIOD; -- 2.51.0 get_slot_next() computes a random delay between min_delay and max_delay using: get_random_u32() * 
(max_delay - min_delay) >> 32 This overflows signed 64-bit arithmetic when the delay range exceeds approximately 2.1 seconds (2^31 nanoseconds), producing a negative result that effectively disables slot-based pacing. This is a realistic configuration for WAN emulation (e.g., slot 1s 5s). Use mul_u64_u32_shr() which handles the widening multiply without overflow. Fixes: 0a9fe5c375b5 ("netem: slotting with non-uniform distribution") Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 00ece854280a..afb89d318f41 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -680,9 +680,8 @@ static void get_slot_next(struct netem_sched_data *q, u64 now) if (!q->slot_dist) next_delay = q->slot_config.min_delay + - (get_random_u32() * - (q->slot_config.max_delay - - q->slot_config.min_delay) >> 32); + mul_u64_u32_shr(q->slot_config.max_delay - q->slot_config.min_delay, + get_random_u32(), 32); else next_delay = tabledist(q->slot_config.dist_delay, (s32)(q->slot_config.dist_jitter), -- 2.51.0 The queue limit check in netem_enqueue() uses q->t_len which only counts packets in the internal tfifo. Packets placed in sch->q by the reorder path (__qdisc_enqueue_head) are not counted, allowing the total queue occupancy to exceed sch->limit under reordering. Include sch->q.qlen in the limit check. 
Fixes: 50612537e9ab ("netem: fix classful handling") Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index afb89d318f41..df9049816623 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -542,7 +542,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, - if (unlikely(q->t_len >= sch->limit)) { + if (unlikely(sch->q.qlen >= sch->limit)) { /* re-link segs, so that qdisc_drop_all() frees them all */ skb->next = segs; qdisc_drop_all(skb, sch, to_free); -- 2.51.0 When tfifo_enqueue() appends a packet to the linear queue tail, nskb->next is never set to NULL. The list terminates correctly only by accident if the skb arrived with next already NULL. Explicitly null-terminate the tail to prevent list corruption. Fixes: d66280b12bd7 ("net: netem: use a list in addition to rbtree") Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index df9049816623..0f6817a472e6 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -408,6 +408,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) q->t_tail->next = nskb; else q->t_head = nskb; + nskb->next = NULL; q->t_tail = nskb; } else { struct rb_node **p = &q->t_root.rb_node, *parent = NULL; -- 2.51.0 netem_change() unconditionally reseeds the PRNG on every tc change command. If TCA_NETEM_PRNG_SEED is not specified, a new random seed is generated, destroying reproducibility for users who set a deterministic seed on a previous change. Move the initial random seed generation to netem_init() and only reseed in netem_change() when TCA_NETEM_PRNG_SEED is explicitly provided by the user.
Fixes: 4072d97ddc44 ("netem: add prng attribute to netem_sched_data") Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 0f6817a472e6..dd98778ebbec 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -1123,11 +1123,10 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, /* capping jitter to the range acceptable by tabledist() */ q->jitter = min_t(s64, abs(q->jitter), INT_MAX); - if (tb[TCA_NETEM_PRNG_SEED]) + if (tb[TCA_NETEM_PRNG_SEED]) { q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]); - else - q->prng.seed = get_random_u64(); - prandom_seed_state(&q->prng.prng_state, q->prng.seed); + prandom_seed_state(&q->prng.prng_state, q->prng.seed); + } unlock: sch_tree_unlock(sch); @@ -1150,6 +1149,9 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; q->loss_model = CLG_RANDOM; + q->prng.seed = get_random_u64(); + prandom_seed_state(&q->prng.prng_state, q->prng.seed); + ret = netem_change(sch, opt, extack); if (ret) pr_info("netem: change failed\n"); -- 2.51.0 The _4_state_model and GE_state_model enum definitions are declared as struct members but are never read or written. Only the enum constants they define (TX_IN_GAP_PERIOD, GOOD_STATE, etc.) are used. Move them to file scope as anonymous enums and remove the unused struct fields, saving 8 bytes per netem instance. 
Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index dd98778ebbec..1957a15f6d1a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -81,6 +81,18 @@ struct disttable { s16 table[] __counted_by(size); }; +enum GE_state_model { + GOOD_STATE = 1, + BAD_STATE, +}; + +enum _4_state_model { + TX_IN_GAP_PERIOD = 1, + TX_IN_BURST_PERIOD, + LOST_IN_GAP_PERIOD, + LOST_IN_BURST_PERIOD, +}; + struct netem_sched_data { /* internal t(ime)fifo qdisc uses t_root and sch->limit */ struct rb_root t_root; @@ -131,18 +143,6 @@ struct netem_sched_data { CLG_GILB_ELL, } loss_model; - enum { - TX_IN_GAP_PERIOD = 1, - TX_IN_BURST_PERIOD, - LOST_IN_GAP_PERIOD, - LOST_IN_BURST_PERIOD, - } _4_state_model; - - enum { - GOOD_STATE = 1, - BAD_STATE, - } GE_state_model; - /* Correlated Loss Generation models */ struct clgstate { /* state of the Markov chain */ -- 2.51.0 The version printed was never updated, and the kernel version is a better indication of what is fixed or not. Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 1957a15f6d1a..9ae09e781166 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -28,8 +28,6 @@ #include <net/pkt_sched.h> #include <net/inet_ecn.h> -#define VERSION "1.3" - /* * Limit for recursion from duplication.
* Duplicated packets are re-enqueued at the root qdisc, which may @@ -1371,16 +1369,15 @@ static struct Qdisc_ops netem_qdisc_ops __read_mostly = { }; MODULE_ALIAS_NET_SCH("netem"); - static int __init netem_module_init(void) { - pr_info("netem: version " VERSION "\n"); return register_qdisc(&netem_qdisc_ops); } static void __exit netem_module_exit(void) { unregister_qdisc(&netem_qdisc_ops); } + module_init(netem_module_init) module_exit(netem_module_exit) MODULE_LICENSE("GPL"); -- 2.51.0 netem predates the netlink extended ack mechanism and uses pr_info() to report configuration errors. These messages go to the kernel log where the user running tc may never see them, and in unprivileged user namespace contexts they can be used for log spam. Replace pr_info() with NL_SET_ERR_MSG() and NL_SET_ERR_MSG_FMT() which return error details to the caller via netlink. Thread the extack pointer through parse_attr() and get_loss_clg(). Remove the uninformative "netem: change failed" message from netem_init() since the error is already propagated. 
Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 9ae09e781166..0b273e44e561 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -937,7 +937,8 @@ static void get_rate(struct netem_sched_data *q, const struct nlattr *attr) q->cell_size_reciprocal = (struct reciprocal_value) { 0 }; } -static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr) +static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr, + struct netlink_ext_ack *extack) { const struct nlattr *la; int rem; @@ -950,7 +951,8 @@ static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr) const struct tc_netem_gimodel *gi = nla_data(la); if (nla_len(la) < sizeof(struct tc_netem_gimodel)) { - pr_info("netem: incorrect gi model size\n"); + NL_SET_ERR_MSG(extack, + "netem: incorrect gi model size"); return -EINVAL; } @@ -969,7 +971,8 @@ static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr) const struct tc_netem_gemodel *ge = nla_data(la); if (nla_len(la) < sizeof(struct tc_netem_gemodel)) { - pr_info("netem: incorrect ge model size\n"); + NL_SET_ERR_MSG(extack, + "netem: incorrect ge model size"); return -EINVAL; } @@ -983,7 +986,7 @@ static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr) } default: - pr_info("netem: unknown loss type %u\n", type); + NL_SET_ERR_MSG_FMT(extack, "netem: unknown loss type %u", type); return -EINVAL; } } @@ -1006,12 +1009,14 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, - const struct nla_policy *policy, int len) + const struct nla_policy *policy, size_t len, + struct netlink_ext_ack *extack) { int nested_len = nla_len(nla) - NLA_ALIGN(len); if (nested_len < 0) { - pr_info("netem: invalid attributes len 
%d\n", nested_len); + NL_SET_ERR_MSG_FMT(extack, "netem: invalid attributes len %u < %zu", + nla_len(nla), NLA_ALIGN(len)); return -EINVAL; } @@ -1025,8 +1030,7 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, } /* Parse netlink message to set options */ -static int netem_change(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) +static int netem_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct netem_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_NETEM_MAX + 1]; @@ -1038,7 +1042,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, int ret; qopt = nla_data(opt); - ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt)); + ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt), extack); if (ret < 0) return ret; @@ -1060,7 +1064,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, old_loss_model = q->loss_model; if (tb[TCA_NETEM_LOSS]) { - ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); + ret = get_loss_clg(q, tb[TCA_NETEM_LOSS], extack); if (ret) { q->loss_model = old_loss_model; q->clg = old_clg; @@ -1151,8 +1155,6 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt, prandom_seed_state(&q->prng.prng_state, q->prng.seed); ret = netem_change(sch, opt, extack); - if (ret) - pr_info("netem: change failed\n"); return ret; } -- 2.51.0