codel_dump_stats() only runs with RTNL held, reading fields that can be changed in qdisc fast path. Add READ_ONCE()/WRITE_ONCE() annotations. Alternative would be to acquire the qdisc spinlock, but our long-term goal is to make qdisc dump operations lockless as much as we can. tc_codel_xstats fields don't need to be latched atomically, otherwise this bug would have been caught earlier. No change in kernel size: $ scripts/bloat-o-meter -t vmlinux.0 vmlinux add/remove: 0/0 grow/shrink: 1/1 up/down: 3/-1 (2) Function old new delta codel_qdisc_dequeue 2462 2465 +3 codel_dump_stats 250 249 -1 Total: Before=29739919, After=29739921, chg +0.00% Fixes: 76e3cc126bb2 ("codel: Controlled Delay AQM") Signed-off-by: Eric Dumazet --- net-next tree has been chosen to avoid a merge conflict. I will handle stable submission, this bug is minor. include/net/codel_impl.h | 45 +++++++++++++++++++++------------------- net/sched/sch_codel.c | 22 ++++++++++---------- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/include/net/codel_impl.h b/include/net/codel_impl.h index b2c359c6dd1b84e3aeb77834272641bac73f8e6a..2c1f0ec309e9fe02e79d377129da6c5c0cbce19b 100644 --- a/include/net/codel_impl.h +++ b/include/net/codel_impl.h @@ -120,10 +120,10 @@ static bool codel_should_drop(const struct sk_buff *skb, } skb_len = skb_len_func(skb); - vars->ldelay = now - skb_time_func(skb); + WRITE_ONCE(vars->ldelay, now - skb_time_func(skb)); if (unlikely(skb_len > stats->maxpacket)) - stats->maxpacket = skb_len; + WRITE_ONCE(stats->maxpacket, skb_len); if (codel_time_before(vars->ldelay, params->target) || *backlog <= params->mtu) { @@ -159,7 +159,7 @@ static struct sk_buff *codel_dequeue(void *ctx, if (!skb) { vars->first_above_time = 0; - vars->dropping = false; + WRITE_ONCE(vars->dropping, false); return skb; } now = codel_get_time(); @@ -168,7 +168,7 @@ static struct sk_buff *codel_dequeue(void *ctx, if (vars->dropping) { if (!drop) { /* sojourn time below target - leave dropping state */ - vars->dropping = false; + WRITE_ONCE(vars->dropping, false); } else if (codel_time_after_eq(now, vars->drop_next)) { /* It's time for the next drop. Drop the current * packet and dequeue the next. The dequeue might @@ -180,16 +180,18 @@ static struct sk_buff *codel_dequeue(void *ctx, */ while (vars->dropping && codel_time_after_eq(now, vars->drop_next)) { - vars->count++; /* dont care of possible wrap - * since there is no more divide - */ + /* dont care of possible wrap + * since there is no more divide. + */ + WRITE_ONCE(vars->count, vars->count + 1); codel_Newton_step(vars); if (params->ecn && INET_ECN_set_ce(skb)) { - stats->ecn_mark++; - vars->drop_next = + WRITE_ONCE(stats->ecn_mark, + stats->ecn_mark + 1); + WRITE_ONCE(vars->drop_next, codel_control_law(vars->drop_next, params->interval, - vars->rec_inv_sqrt); + vars->rec_inv_sqrt)); goto end; } stats->drop_len += skb_len_func(skb); @@ -202,13 +204,13 @@ static struct sk_buff *codel_dequeue(void *ctx, skb_time_func, backlog, now)) { /* leave dropping state */ - vars->dropping = false; + WRITE_ONCE(vars->dropping, false); } else { /* and schedule the next drop */ - vars->drop_next = + WRITE_ONCE(vars->drop_next, codel_control_law(vars->drop_next, params->interval, - vars->rec_inv_sqrt); + vars->rec_inv_sqrt)); } } } @@ -216,7 +218,7 @@ static struct sk_buff *codel_dequeue(void *ctx, u32 delta; if (params->ecn && INET_ECN_set_ce(skb)) { - stats->ecn_mark++; + WRITE_ONCE(stats->ecn_mark, stats->ecn_mark + 1); } else { stats->drop_len += skb_len_func(skb); drop_func(skb, ctx); @@ -227,7 +229,7 @@ static struct sk_buff *codel_dequeue(void *ctx, stats, skb_len_func, skb_time_func, backlog, now); } - vars->dropping = true; + WRITE_ONCE(vars->dropping, true); /* if min went above target close to when we last went below it * assume that the drop rate that controlled the queue on the * last cycle is a good starting point to control it now. @@ -236,19 +238,20 @@ static struct sk_buff *codel_dequeue(void *ctx, if (delta > 1 && codel_time_before(now - vars->drop_next, 16 * params->interval)) { - vars->count = delta; + WRITE_ONCE(vars->count, delta); /* we dont care if rec_inv_sqrt approximation * is not very precise : * Next Newton steps will correct it quadratically. */ codel_Newton_step(vars); } else { - vars->count = 1; + WRITE_ONCE(vars->count, 1); vars->rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT; } - vars->lastcount = vars->count; - vars->drop_next = codel_control_law(now, params->interval, - vars->rec_inv_sqrt); + WRITE_ONCE(vars->lastcount, vars->count); + WRITE_ONCE(vars->drop_next, + codel_control_law(now, params->interval, + vars->rec_inv_sqrt)); } end: if (skb && codel_time_after(vars->ldelay, params->ce_threshold)) { @@ -262,7 +265,7 @@ static struct sk_buff *codel_dequeue(void *ctx, params->ce_threshold_selector)); } if (set_ce && INET_ECN_set_ce(skb)) - stats->ce_mark++; + WRITE_ONCE(stats->ce_mark, stats->ce_mark + 1); } return skb; } diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index dc2be90666ffbd715550800790a0091acd28701d..317aae0ec7bd6aedb4bae09b18423c981fed16e7 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -85,7 +85,7 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return qdisc_enqueue_tail(skb, sch); } q = qdisc_priv(sch); - q->drop_overlimit++; + WRITE_ONCE(q->drop_overlimit, q->drop_overlimit + 1); return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT); } @@ -221,18 +221,18 @@ static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { const struct codel_sched_data *q = qdisc_priv(sch); struct tc_codel_xstats st = { - .maxpacket = q->stats.maxpacket, - .count = q->vars.count, - .lastcount = q->vars.lastcount, - .drop_overlimit = q->drop_overlimit, - .ldelay = codel_time_to_us(q->vars.ldelay), - .dropping = q->vars.dropping, - .ecn_mark = q->stats.ecn_mark, - .ce_mark = q->stats.ce_mark, + .maxpacket = READ_ONCE(q->stats.maxpacket), + .count = READ_ONCE(q->vars.count), + .lastcount = READ_ONCE(q->vars.lastcount), + .drop_overlimit = READ_ONCE(q->drop_overlimit), + .ldelay = codel_time_to_us(READ_ONCE(q->vars.ldelay)), + .dropping = READ_ONCE(q->vars.dropping), + .ecn_mark = READ_ONCE(q->stats.ecn_mark), + .ce_mark = READ_ONCE(q->stats.ce_mark), }; - if (q->vars.dropping) { - codel_tdiff_t delta = q->vars.drop_next - codel_get_time(); + if (st.dropping) { + codel_tdiff_t delta = READ_ONCE(q->vars.drop_next) - codel_get_time(); if (delta >= 0) st.drop_next = codel_time_to_us(delta); -- 2.53.0.1213.gd9a14994de-goog