The 4-state Markov chain in loss_4state() has gaps at the boundaries between transition probability ranges. The comparisons use: if (rnd < a4) else if (a4 < rnd && rnd < a1 + a4) When rnd equals a boundary value exactly, neither branch matches and no state transition occurs. The redundant lower-bound check (a4 < rnd) is already implied by being in the else branch. Remove the unnecessary lower-bound comparisons so the ranges are contiguous and every random value produces a transition, matching the GI (General and Intuitive) loss model specification. This bug goes back to original implementation of this model. Fixes: 661b79725fea ("netem: revised correlated loss generator") Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 20df1c08b1e9..8ee72cac1faf 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -227,10 +227,10 @@ static bool loss_4state(struct netem_sched_data *q) if (rnd < clg->a4) { clg->state = LOST_IN_GAP_PERIOD; return true; - } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { + } else if (rnd < clg->a1 + clg->a4) { clg->state = LOST_IN_BURST_PERIOD; return true; - } else if (clg->a1 + clg->a4 < rnd) { + } else { clg->state = TX_IN_GAP_PERIOD; } @@ -247,9 +247,9 @@ static bool loss_4state(struct netem_sched_data *q) case LOST_IN_BURST_PERIOD: if (rnd < clg->a3) clg->state = TX_IN_BURST_PERIOD; - else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { + else if (rnd < clg->a2 + clg->a3) { clg->state = TX_IN_GAP_PERIOD; - } else if (clg->a2 + clg->a3 < rnd) { + } else { clg->state = LOST_IN_BURST_PERIOD; return true; } -- 2.53.0 The queue limit check in netem_enqueue() uses q->t_len which only counts packets in the internal tfifo. Packets placed in sch->q by the reorder path (__qdisc_enqueue_head) are not counted, allowing the total queue occupancy to exceed sch->limit under reordering. Include sch->q.qlen in the limit check. Fixes: 50612537e9ab ("netem: fix classful handling") Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 8ee72cac1faf..d400a730eadd 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -524,7 +524,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, 1 << get_random_u32_below(8); } - if (unlikely(q->t_len >= sch->limit)) { + if (unlikely(sch->q.qlen >= sch->limit)) { /* re-link segs, so that qdisc_drop_all() frees them all */ skb->next = segs; qdisc_drop_all(skb, sch, to_free); -- 2.53.0 netem_change() unconditionally reseeds the PRNG on every tc change command. If TCA_NETEM_PRNG_SEED is not specified, a new random seed is generated, destroying reproducibility for users who set a deterministic seed on a previous change. Move the initial random seed generation to netem_init() and only reseed in netem_change() when TCA_NETEM_PRNG_SEED is explicitly provided by the user. Fixes: 4072d97ddc44 ("netem: add prng attribute to netem_sched_data") Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index d400a730eadd..556f9747f0e7 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -1112,11 +1112,10 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, /* capping jitter to the range acceptable by tabledist() */ q->jitter = min_t(s64, abs(q->jitter), INT_MAX); - if (tb[TCA_NETEM_PRNG_SEED]) + if (tb[TCA_NETEM_PRNG_SEED]) { q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]); - else - q->prng.seed = get_random_u64(); - prandom_seed_state(&q->prng.prng_state, q->prng.seed); + prandom_seed_state(&q->prng.prng_state, q->prng.seed); + } unlock: sch_tree_unlock(sch); @@ -1139,6 +1138,9 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; q->loss_model = CLG_RANDOM; + q->prng.seed = get_random_u64(); + prandom_seed_state(&q->prng.prng_state, q->prng.seed); + ret = netem_change(sch, opt, extack); if (ret) pr_info("netem: change failed\n"); -- 2.53.0 netem_dequeue() enqueues packets into its child qdisc while being called from the parent's dequeue path. This causes two problems: - HFSC tracks class active/inactive state on qlen transitions. A child enqueue during dequeue causes double-insertion into the eltree (CVE-2025-37890, CVE-2025-38001). - Non-work-conserving children like TBF may refuse to dequeue packets just enqueued, causing netem to return NULL despite having backlog. Parents like DRR then incorrectly deactivate the class. Split the dequeue into helpers: netem_pull_tfifo() - remove head packet from tfifo netem_slot_account() - update slot pacing counters netem_dequeue_child() - batch-transfer ready packets to the child, then dequeue from the child netem_dequeue_direct()- dequeue from tfifo when no child When a child qdisc is present, all time-ready packets are moved into the child before calling its dequeue. This separates the enqueue and dequeue phases so the parent sees consistent qlen transitions. Fixes: 50612537e9ab ("netem: fix classful handling") Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 201 +++++++++++++++++++++++++++--------------- 1 file changed, 128 insertions(+), 73 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 556f9747f0e7..b93f0e886a2b 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -689,99 +689,154 @@ static struct sk_buff *netem_peek(struct netem_sched_data *q) return q->t_head; } -static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb) +/* + * Pop the head packet from the tfifo and prepare it for delivery. + * skb->dev shares the rbnode area and must be restored after removal. + */ +static struct sk_buff *netem_pull_tfifo(struct netem_sched_data *q, + struct Qdisc *sch) { - if (skb == q->t_head) { + struct sk_buff *skb; + + if (q->t_head) { + skb = q->t_head; q->t_head = skb->next; if (!q->t_head) q->t_tail = NULL; } else { - rb_erase(&skb->rbnode, &q->t_root); + struct rb_node *p = rb_first(&q->t_root); + + if (!p) + return NULL; + skb = rb_to_skb(p); + rb_erase(p, &q->t_root); } + + q->t_len--; + skb->next = NULL; + skb->prev = NULL; + skb->dev = qdisc_dev(sch); + + return skb; } -static struct sk_buff *netem_dequeue(struct Qdisc *sch) +/* Update slot pacing counters after releasing a packet */ +static void netem_slot_account(struct netem_sched_data *q, + const struct sk_buff *skb, u64 now) +{ + if (!q->slot.slot_next) + return; + + q->slot.packets_left--; + q->slot.bytes_left -= qdisc_pkt_len(skb); + if (q->slot.packets_left <= 0 || q->slot.bytes_left <= 0) + get_slot_next(q, now); +} + +/* + * Transfer all time-ready packets from the tfifo into the child qdisc, + * then dequeue from the child. Batching the transfers avoids calling + * qdisc_enqueue() inside the parent's dequeue path, which confuses + * parents that track active/inactive state on qlen transitions (HFSC). + */ +static struct sk_buff *netem_dequeue_child(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); + u64 now = ktime_get_ns(); struct sk_buff *skb; -tfifo_dequeue: - skb = __qdisc_dequeue_head(&sch->q); - if (skb) { -deliver: - qdisc_qstats_backlog_dec(sch, skb); - qdisc_bstats_update(sch, skb); - return skb; - } - skb = netem_peek(q); - if (skb) { - u64 time_to_send; - u64 now = ktime_get_ns(); - - /* if more time remaining? */ - time_to_send = netem_skb_cb(skb)->time_to_send; - if (q->slot.slot_next && q->slot.slot_next < time_to_send) - get_slot_next(q, now); - - if (time_to_send <= now && q->slot.slot_next <= now) { - netem_erase_head(q, skb); - q->t_len--; - skb->next = NULL; - skb->prev = NULL; - /* skb->dev shares skb->rbnode area, - * we need to restore its value. - */ - skb->dev = qdisc_dev(sch); - - if (q->slot.slot_next) { - q->slot.packets_left--; - q->slot.bytes_left -= qdisc_pkt_len(skb); - if (q->slot.packets_left <= 0 || - q->slot.bytes_left <= 0) - get_slot_next(q, now); - } + while ((skb = netem_peek(q)) != NULL) { + struct sk_buff *to_free = NULL; + unsigned int pkt_len; + int err; - if (q->qdisc) { - unsigned int pkt_len = qdisc_pkt_len(skb); - struct sk_buff *to_free = NULL; - int err; - - err = qdisc_enqueue(skb, q->qdisc, &to_free); - kfree_skb_list(to_free); - if (err != NET_XMIT_SUCCESS) { - if (net_xmit_drop_count(err)) - qdisc_qstats_drop(sch); - sch->qstats.backlog -= pkt_len; - sch->q.qlen--; - qdisc_tree_reduce_backlog(sch, 1, pkt_len); - } - goto tfifo_dequeue; - } + if (netem_skb_cb(skb)->time_to_send > now) + break; + if (q->slot.slot_next && q->slot.slot_next > now) + break; + + skb = netem_pull_tfifo(q, sch); + netem_slot_account(q, skb, now); + + pkt_len = qdisc_pkt_len(skb); + err = qdisc_enqueue(skb, q->qdisc, &to_free); + kfree_skb_list(to_free); + if (unlikely(err != NET_XMIT_SUCCESS)) { + if (net_xmit_drop_count(err)) + qdisc_qstats_drop(sch); + sch->qstats.backlog -= pkt_len; sch->q.qlen--; - goto deliver; + qdisc_tree_reduce_backlog(sch, 1, pkt_len); } + } - if (q->qdisc) { - skb = q->qdisc->ops->dequeue(q->qdisc); - if (skb) { - sch->q.qlen--; - goto deliver; - } - } + skb = q->qdisc->ops->dequeue(q->qdisc); + if (skb) + sch->q.qlen--; - qdisc_watchdog_schedule_ns(&q->watchdog, - max(time_to_send, - q->slot.slot_next)); - } + return skb; +} - if (q->qdisc) { - skb = q->qdisc->ops->dequeue(q->qdisc); - if (skb) { - sch->q.qlen--; - goto deliver; - } +/* Dequeue directly from the tfifo when no child qdisc is configured. */ +static struct sk_buff *netem_dequeue_direct(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + u64 time_to_send; + u64 now; + + skb = netem_peek(q); + if (!skb) + return NULL; + + now = ktime_get_ns(); + time_to_send = netem_skb_cb(skb)->time_to_send; + + if (q->slot.slot_next && q->slot.slot_next < time_to_send) + get_slot_next(q, now); + + if (time_to_send > now || q->slot.slot_next > now) + return NULL; + + skb = netem_pull_tfifo(q, sch); + netem_slot_account(q, skb, now); + sch->q.qlen--; + + return skb; +} + +static struct sk_buff *netem_dequeue(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + /* First check the reorder queue */ + skb = __qdisc_dequeue_head(&sch->q); + if (skb) + goto deliver; + + if (q->qdisc) + skb = netem_dequeue_child(sch); + else + skb = netem_dequeue_direct(sch); + + if (skb) + goto deliver; + + /* Nothing ready — schedule watchdog for next packet */ + skb = netem_peek(q); + if (skb) { + u64 time_to_send = netem_skb_cb(skb)->time_to_send; + + qdisc_watchdog_schedule_ns(&q->watchdog, + max(time_to_send, q->slot.slot_next)); } return NULL; + +deliver: + qdisc_qstats_backlog_dec(sch, skb); + qdisc_bstats_update(sch, skb); + return skb; } static void netem_reset(struct Qdisc *sch) -- 2.53.0 When tfifo_enqueue() appends a packet to the linear queue tail, nskb->next is never set to NULL. The list terminates correctly only by accident if the skb arrived with next already NULL. Explicitly null-terminate the tail to prevent list corruption. Fixes: d66280b12bd7 ("net: netem: use a list in addition to rbtree") Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b93f0e886a2b..e405bf862163 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -398,6 +398,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) q->t_tail->next = nskb; else q->t_head = nskb; + nskb->next = NULL; q->t_tail = nskb; } else { struct rb_node **p = &q->t_root.rb_node, *parent = NULL; -- 2.53.0 Reject slot configuration where min_delay exceeds max_delay. The delay range computation in get_slot_next() underflows in this case, producing bogus results. Fixes: 0a9fe5c375b5 ("netem: slotting with non-uniform distribution") Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index e405bf862163..c82f76af41aa 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -883,6 +883,18 @@ static int get_dist_table(struct disttable **tbl, const struct nlattr *attr) return 0; } +static int validate_slot(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const struct tc_netem_slot *c = nla_data(attr); + + if (c->min_delay > c->max_delay) { + NL_SET_ERR_MSG(extack, "slot min delay greater than max delay"); + return -EINVAL; + } + return 0; +} + static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_slot *c = nla_data(attr); @@ -1096,6 +1108,12 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, goto table_free; } + if (tb[TCA_NETEM_SLOT]) { + ret = validate_slot(tb[TCA_NETEM_SLOT], extack); + if (ret) + goto table_free; + } + sch_tree_lock(sch); /* backup q->clg and q->loss_model */ old_clg = q->clg; -- 2.53.0 get_slot_next() computes a random delay between min_delay and max_delay using: get_random_u32() * (max_delay - min_delay) >> 32 This overflows signed 64-bit arithmetic when the delay range exceeds approximately 2.1 seconds (2^31 nanoseconds), producing a negative result that effectively disables slot-based pacing. This is a realistic configuration for WAN emulation (e.g., slot 1s 5s). Use mul_u64_u32_shr() which handles the widening multiply without overflow. Fixes: 0a9fe5c375b5 ("netem: slotting with non-uniform distribution") Signed-off-by: Stephen Hemminger --- net/sched/sch_netem.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index c82f76af41aa..d47e1b0d7942 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -660,9 +660,8 @@ static void get_slot_next(struct netem_sched_data *q, u64 now) if (!q->slot_dist) next_delay = q->slot_config.min_delay + - (get_random_u32() * - (q->slot_config.max_delay - - q->slot_config.min_delay) >> 32); + mul_u64_u32_shr(q->slot_config.max_delay - q->slot_config.min_delay, + get_random_u32(), 32); else next_delay = tabledist(q->slot_config.dist_delay, (s32)(q->slot_config.dist_jitter), -- 2.53.0