Add a new u16 field next to pkt_len: pkt_segs. This will cache shinfo->gso_segs to speed up qdisc dequeue(). Move slave_dev_queue_mapping to the end of qdisc_skb_cb, and move three bits from tc_skb_cb: - post_ct - post_ct_snat - post_ct_dnat Signed-off-by: Eric Dumazet --- include/net/sch_generic.h | 18 +++++++++--------- net/core/dev.c | 2 +- net/sched/act_ct.c | 8 ++++---- net/sched/cls_api.c | 6 +++--- net/sched/cls_flower.c | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 94966692ccdf51db085c236319705aecba8c30cf..9cd8b5d4b23698fd8959ef40c303468e31c1d4af 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -429,13 +429,16 @@ struct tcf_proto { }; struct qdisc_skb_cb { - struct { - unsigned int pkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; - }; + unsigned int pkt_len; + u16 pkt_segs; + u16 tc_classid; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; + + u16 slave_dev_queue_mapping; + u8 post_ct:1; + u8 post_ct_snat:1; + u8 post_ct_dnat:1; }; typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); @@ -1064,11 +1067,8 @@ struct tc_skb_cb { struct qdisc_skb_cb qdisc_cb; u32 drop_reason; - u16 zone; /* Only valid if post_ct = true */ + u16 zone; /* Only valid if qdisc_skb_cb(skb)->post_ct = true */ u16 mru; - u8 post_ct:1; - u8 post_ct_snat:1; - u8 post_ct_dnat:1; }; static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) diff --git a/net/core/dev.c b/net/core/dev.c index 69515edd17bc6a157046f31b3dd343a59ae192ab..46ce6c6107805132b1322128e86634eca91e3340 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4355,7 +4355,7 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, return ret; tc_skb_cb(skb)->mru = 0; - tc_skb_cb(skb)->post_ct = false; + qdisc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 6749a4a9a9cd0a43897fcd20d228721ce057cb88..2b6ac7069dc168da2c534bddc5d4398e5e7a18c4 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -948,9 +948,9 @@ static int tcf_ct_act_nat(struct sk_buff *skb, return err & NF_VERDICT_MASK; if (action & BIT(NF_NAT_MANIP_SRC)) - tc_skb_cb(skb)->post_ct_snat = 1; + qdisc_skb_cb(skb)->post_ct_snat = 1; if (action & BIT(NF_NAT_MANIP_DST)) - tc_skb_cb(skb)->post_ct_dnat = 1; + qdisc_skb_cb(skb)->post_ct_dnat = 1; return err; #else @@ -986,7 +986,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, tcf_action_update_bstats(&c->common, skb); if (clear) { - tc_skb_cb(skb)->post_ct = false; + qdisc_skb_cb(skb)->post_ct = false; ct = nf_ct_get(skb, &ctinfo); if (ct) { nf_ct_put(ct); @@ -1097,7 +1097,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, out_push: skb_push_rcsum(skb, nh_ofs); - tc_skb_cb(skb)->post_ct = true; + qdisc_skb_cb(skb)->post_ct = true; tc_skb_cb(skb)->zone = p->zone; out_clear: if (defrag) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f751cd5eeac8d72b4c4d138f45d25a8ba62fb1bd..ebca4b926dcf76daa3abb8ffe221503e33de30e3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1872,9 +1872,9 @@ int tcf_classify(struct sk_buff *skb, } ext->chain = last_executed_chain; ext->mru = cb->mru; - ext->post_ct = cb->post_ct; - ext->post_ct_snat = cb->post_ct_snat; - ext->post_ct_dnat = cb->post_ct_dnat; + ext->post_ct = qdisc_skb_cb(skb)->post_ct; + ext->post_ct_snat
= qdisc_skb_cb(skb)->post_ct_snat; + ext->post_ct_dnat = qdisc_skb_cb(skb)->post_ct_dnat; ext->zone = cb->zone; } } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 099ff6a3e1f516a50cfac578666f6d5f4fbe8f29..7669371c1354c27ede83c2c83aaea5c0402e6552 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -326,7 +326,7 @@ TC_INDIRECT_SCOPE int fl_classify(struct sk_buff *skb, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - bool post_ct = tc_skb_cb(skb)->post_ct; + bool post_ct = qdisc_skb_cb(skb)->post_ct; u16 zone = tc_skb_cb(skb)->zone; struct fl_flow_key skb_key; struct fl_flow_mask *mask; -- 2.52.0.rc1.455.g30608eb744-goog Qdiscs use shinfo->gso_segs for their packet stats in bstats_update(), but this field needs to be initialized for SKB_GSO_DODGY users. Signed-off-by: Eric Dumazet --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 46ce6c6107805132b1322128e86634eca91e3340..dba9eef8bd83dda89b5edd870b47373722264f48 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4071,7 +4071,7 @@ EXPORT_SYMBOL_GPL(validate_xmit_skb_list); static void qdisc_pkt_len_init(struct sk_buff *skb) { - const struct skb_shared_info *shinfo = skb_shinfo(skb); + struct skb_shared_info *shinfo = skb_shinfo(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; @@ -4112,6 +4112,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) if (payload <= 0) return; gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); + shinfo->gso_segs = gso_segs; } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } -- 2.52.0.rc1.455.g30608eb744-goog qdisc_pkt_len_init() currently initializes qdisc_skb_cb(skb)->pkt_len. Add qdisc_skb_cb(skb)->pkt_segs initialization and rename this function to qdisc_pkt_len_segs_init().
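For reference, the accounting this helper performs can be sketched in plain C. This is only an illustrative userspace model of what qdisc_pkt_len_segs_init() computes: the toy_* structures, the reduced field set and the sample sizes in main() are invented for the example, and DIV_ROUND_UP is open-coded.

#include <stdio.h>

/* Invented stand-ins for the kernel structures involved. */
struct toy_shinfo { unsigned short gso_size, gso_segs; };
struct toy_cb     { unsigned int pkt_len; unsigned short pkt_segs; };

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Mirrors the spirit of qdisc_pkt_len_segs_init(): estimate on-wire bytes
 * and cache the segment count, recomputing gso_segs only for "dodgy"
 * packets whose advertised value cannot be trusted.
 */
static void toy_pkt_len_segs_init(struct toy_cb *cb, struct toy_shinfo *sh,
				  unsigned int skb_len, unsigned int hdr_len,
				  int dodgy)
{
	unsigned int gso_segs;

	cb->pkt_len = skb_len;
	if (!sh->gso_size) {
		cb->pkt_segs = 1;
		return;
	}
	gso_segs = sh->gso_segs;
	if (dodgy) {
		unsigned int payload = skb_len - hdr_len;

		gso_segs = DIV_ROUND_UP(payload, sh->gso_size);
		sh->gso_segs = gso_segs;	/* what the previous patch adds */
	}
	cb->pkt_segs = gso_segs;
	/* each extra segment carries its own copy of the headers */
	cb->pkt_len += (gso_segs - 1) * hdr_len;
}

int main(void)
{
	struct toy_shinfo sh = { .gso_size = 1448, .gso_segs = 0 };
	struct toy_cb cb;

	toy_pkt_len_segs_init(&cb, &sh, 66 + 10 * 1448, 66, 1);
	printf("pkt_len=%u pkt_segs=%u\n", cb.pkt_len, (unsigned int)cb.pkt_segs);
	return 0;
}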
Signed-off-by: Eric Dumazet --- net/core/dev.c | 15 +++++++++++---- net/sched/sch_cake.c | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index dba9eef8bd83dda89b5edd870b47373722264f48..895c3e37e686f0f625bd5eec7079a43cbd33a7eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4069,17 +4069,23 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d } EXPORT_SYMBOL_GPL(validate_xmit_skb_list); -static void qdisc_pkt_len_init(struct sk_buff *skb) +static void qdisc_pkt_len_segs_init(struct sk_buff *skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); + u16 gso_segs; qdisc_skb_cb(skb)->pkt_len = skb->len; + if (!shinfo->gso_size) { + qdisc_skb_cb(skb)->pkt_segs = 1; + return; + } + + qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ - if (shinfo->gso_size && skb_transport_header_was_set(skb)) { - u16 gso_segs = shinfo->gso_segs; + if (skb_transport_header_was_set(skb)) { unsigned int hdr_len; /* mac layer + network layer */ @@ -4113,6 +4119,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) return; gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); shinfo->gso_segs = gso_segs; + qdisc_skb_cb(skb)->pkt_segs = gso_segs; } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } @@ -4738,7 +4745,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_update_prio(skb); - qdisc_pkt_len_init(skb); + qdisc_pkt_len_segs_init(skb); tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 32bacfc314c260dccf94178d309ccb2be22d69e4..9213129f0de10bc67ce418f77c36fed2581f3781 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1406,7 +1406,7 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) if (!shinfo->gso_size) return cake_calc_overhead(q, len, off); - /* borrowed from qdisc_pkt_len_init() */ + /* borrowed from qdisc_pkt_len_segs_init() */ if (!skb->encapsulation) hdr_len = skb_transport_offset(skb); else -- 2.52.0.rc1.455.g30608eb744-goog sch_handle_ingress() sets qdisc_skb_cb(skb)->pkt_len. We also need to initialize qdisc_skb_cb(skb)->pkt_segs. Signed-off-by: Eric Dumazet --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 895c3e37e686f0f625bd5eec7079a43cbd33a7eb..e19eb4e9d77c27535ab2a0ce14299281e3ef9397 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4434,7 +4434,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, *pt_prev = NULL; } - qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_pkt_len_segs_init(skb); tcx_set_ingress(skb, true); if (static_branch_unlikely(&tcx_needed_key)) { -- 2.52.0.rc1.455.g30608eb744-goog Avoid up to two cache line misses in qdisc dequeue() to fetch skb_shinfo(skb)->gso_segs/gso_size while qdisc spinlock is held. This gives a 5 % improvement in a TX intensive workload. 
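A rough, hypothetical model of where the win comes from: the stats update now only needs pkt_len and pkt_segs, both of which live in the qdisc control block that enqueue()/dequeue() touch anyway, so skb_shinfo() stays cold. The toy_* types below are invented stand-ins, not the kernel structures.

#include <stdio.h>

struct toy_bstats { unsigned long long bytes, packets; };
struct toy_cb     { unsigned int pkt_len; unsigned short pkt_segs; };

/* Equivalent of bstats_update(), reading only the cached cb fields. */
static void toy_bstats_update(struct toy_bstats *b, const struct toy_cb *cb)
{
	b->bytes   += cb->pkt_len;
	b->packets += cb->pkt_segs;	/* previously skb_shinfo(skb)->gso_segs */
}

int main(void)
{
	struct toy_bstats b = { 0, 0 };
	struct toy_cb cb = { .pkt_len = 15140, .pkt_segs = 10 };

	toy_bstats_update(&b, &cb);
	printf("bytes=%llu packets=%llu\n", b.bytes, b.packets);
	return 0;
}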
Signed-off-by: Eric Dumazet --- include/net/sch_generic.h | 13 ++++++++++--- net/sched/sch_cake.c | 1 + net/sched/sch_dualpi2.c | 1 + net/sched/sch_netem.c | 1 + net/sched/sch_qfq.c | 2 +- net/sched/sch_taprio.c | 1 + net/sched/sch_tbf.c | 1 + 7 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9cd8b5d4b23698fd8959ef40c303468e31c1d4af..cdf7a58ebcf5ef2b5f8b76eb6fbe92d5f0e07899 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -829,6 +829,15 @@ static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb) return qdisc_skb_cb(skb)->pkt_len; } +static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb) +{ + u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs; + + DEBUG_NET_WARN_ON_ONCE(pkt_segs != + (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1)); + return pkt_segs; +} + /* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */ enum net_xmit_qdisc_t { __NET_XMIT_STOLEN = 0x00010000, @@ -870,9 +879,7 @@ static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, static inline void bstats_update(struct gnet_stats_basic_sync *bstats, const struct sk_buff *skb) { - _bstats_update(bstats, - qdisc_pkt_len(skb), - skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1); + _bstats_update(bstats, qdisc_pkt_len(skb), qdisc_pkt_segs(skb)); } static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 9213129f0de10bc67ce418f77c36fed2581f3781..a20880034aa5eacec0c25977406104448b336397 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1800,6 +1800,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; + qdisc_skb_cb(segs)->pkt_segs = 1; cobalt_set_enqueue_time(segs, now); get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, segs); diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 4b975feb52b1f3d3b37b31713d1477de5f5806d9..6d7e6389758dc8e645b1116efe4e11fb7290ac86 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -475,6 +475,7 @@ static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb */ qdisc_skb_cb(nskb)->pkt_len = nskb->len; + qdisc_skb_cb(nskb)->pkt_segs = 1; dualpi2_skb_cb(nskb)->classified = dualpi2_skb_cb(skb)->classified; dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index eafc316ae319e3f8c23b0cb0c58fdf54be102213..32a5f33040461f3be952055c097b5f2fe760a858 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -429,6 +429,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *segs; netdev_features_t features = netif_skb_features(skb); + qdisc_skb_cb(skb)->pkt_segs = 1; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 2255355e51d350eded4549c1584b60d4d9b00fff..d920f57dc6d7659c510a98956c6dd2ed9e5ee5b8 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1250,7 +1250,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } } - gso_segs = skb_is_gso(skb) ? 
skb_shinfo(skb)->gso_segs : 1; + gso_segs = qdisc_pkt_segs(skb); err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 39b735386996eb59712a1fc28f7bb903ec1b2220..300d577b328699eb42d2b829ecfc76464fd7b186 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -595,6 +595,7 @@ static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; + qdisc_skb_cb(segs)->pkt_segs = 1; slen += segs->len; /* FIXME: we should be segmenting to a smaller size diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 4c977f049670a600eafd219c898e5f29597be2c1..f2340164f579a25431979e12ec3d23ab828edd16 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -221,6 +221,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, skb_mark_not_on_list(segs); seg_len = segs->len; qdisc_skb_cb(segs)->pkt_len = seg_len; + qdisc_skb_cb(segs)->pkt_segs = 1; ret = qdisc_enqueue(segs, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) -- 2.52.0.rc1.455.g30608eb744-goog Use new qdisc_pkt_segs() to avoid a cache line miss in cake_enqueue() for non GSO packets. cake_overhead() does not have to recompute it. Signed-off-by: Eric Dumazet --- net/sched/sch_cake.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index a20880034aa5eacec0c25977406104448b336397..5948a149129c6de041ba949e2e2b5b6b4eb54166 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1398,12 +1398,12 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) const struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int hdr_len, last_len = 0; u32 off = skb_network_offset(skb); + u16 segs = qdisc_pkt_segs(skb); u32 len = qdisc_pkt_len(skb); - u16 segs = 1; q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); - if (!shinfo->gso_size) + if (segs == 1) return cake_calc_overhead(q, len, off); /* borrowed from qdisc_pkt_len_segs_init() */ @@ -1430,12 +1430,6 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) hdr_len += sizeof(struct udphdr); } - if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) - segs = DIV_ROUND_UP(skb->len - hdr_len, - shinfo->gso_size); - else - segs = shinfo->gso_segs; - len = shinfo->gso_size + hdr_len; last_len = skb->len - shinfo->gso_size * (segs - 1); @@ -1788,7 +1782,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(len > b->max_skblen)) b->max_skblen = len; - if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { + if (qdisc_pkt_segs(skb) > 1 && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); unsigned int slen = 0, numsegs = 0; -- 2.52.0.rc1.455.g30608eb744-goog It is possible to reorg Qdisc to avoid always dirtying 2 cache lines in fast path by reducing this to a single dirtied cache line. 
In current layout, we change only four/six fields in the first cache line: - q.spinlock - q.qlen - bstats.bytes - bstats.packets - some Qdisc also change q.next/q.prev In the second cache line we change in the fast path: - running - state - qstats.backlog /* --- cacheline 2 boundary (128 bytes) --- */ struct sk_buff_head gso_skb __attribute__((__aligned__(64))); /* 0x80 0x18 */ struct qdisc_skb_head q; /* 0x98 0x18 */ struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xb0 0x10 */ /* --- cacheline 3 boundary (192 bytes) --- */ struct gnet_stats_queue qstats; /* 0xc0 0x14 */ bool running; /* 0xd4 0x1 */ /* XXX 3 bytes hole, try to pack */ unsigned long state; /* 0xd8 0x8 */ struct Qdisc * next_sched; /* 0xe0 0x8 */ struct sk_buff_head skb_bad_txq; /* 0xe8 0x18 */ /* --- cacheline 4 boundary (256 bytes) --- */ Reorganize things to have a first cache line mostly read, then a mostly written one. This gives a ~3% increase of performance under tx stress. Note that there is an additional hole because @qstats now spans over a third cache line. /* --- cacheline 2 boundary (128 bytes) --- */ __u8 __cacheline_group_begin__Qdisc_read_mostly[0] __attribute__((__aligned__(64))); /* 0x80 0 */ struct sk_buff_head gso_skb; /* 0x80 0x18 */ struct Qdisc * next_sched; /* 0x98 0x8 */ struct sk_buff_head skb_bad_txq; /* 0xa0 0x18 */ __u8 __cacheline_group_end__Qdisc_read_mostly[0]; /* 0xb8 0 */ /* XXX 8 bytes hole, try to pack */ /* --- cacheline 3 boundary (192 bytes) --- */ __u8 __cacheline_group_begin__Qdisc_write[0] __attribute__((__aligned__(64))); /* 0xc0 0 */ struct qdisc_skb_head q; /* 0xc0 0x18 */ unsigned long state; /* 0xd8 0x8 */ struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xe0 0x10 */ bool running; /* 0xf0 0x1 */ /* XXX 3 bytes hole, try to pack */ struct gnet_stats_queue qstats; /* 0xf4 0x14 */ /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */ __u8 __cacheline_group_end__Qdisc_write[0]; /* 0x108 0 */ /* XXX 56 bytes hole, try to pack */ Signed-off-by: Eric Dumazet --- include/net/sch_generic.h | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cdf7a58ebcf5ef2b5f8b76eb6fbe92d5f0e07899..79501499dafba56271b9ebd97a8f379ffdc83cac 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -103,17 +103,24 @@ struct Qdisc { int pad; refcount_t refcnt; - /* - * For performance sake on SMP, we put highly modified fields at the end - */ - struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; - struct qdisc_skb_head q; - struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - bool running; /* must be written under qdisc spinlock */ - unsigned long state; - struct Qdisc *next_sched; - struct sk_buff_head skb_bad_txq; + /* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */ + __cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned; + struct sk_buff_head gso_skb; + struct Qdisc *next_sched; + struct sk_buff_head skb_bad_txq; + __cacheline_group_end(Qdisc_read_mostly); + + /* Fields dirtied in dequeue() fast path. */ + __cacheline_group_begin(Qdisc_write) ____cacheline_aligned; + struct qdisc_skb_head q; + unsigned long state; + struct gnet_stats_basic_sync bstats; + bool running; /* must be written under qdisc spinlock */ + + /* Note : we only change qstats.backlog in fast path. 
*/ + struct gnet_stats_queue qstats; + __cacheline_group_end(Qdisc_write); + atomic_long_t defer_count ____cacheline_aligned_in_smp; struct llist_head defer_list; -- 2.52.0.rc1.455.g30608eb744-goog Group together changes to qdisc fields to reduce chances of false sharing if another cpu attempts to acquire the qdisc spinlock. qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; qdisc_bstats_update(sch, skb); Signed-off-by: Eric Dumazet --- net/sched/sch_fq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index fee922da2f99c0c7ac6d86569cf3bbce47898951..0b0ca1aa9251f959e87dd5dc504fbe0f4cbc75eb 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -497,6 +497,7 @@ static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, skb_mark_not_on_list(skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; + qdisc_bstats_update(sch, skb); } static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) @@ -776,7 +777,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) f->time_next_packet = now + len; } out: - qdisc_bstats_update(sch, skb); return skb; } -- 2.52.0.rc1.455.g30608eb744-goog prefetch the skb that we are likely to dequeue at the next dequeue(). Also call fq_dequeue_skb() a bit sooner in fq_dequeue(). This reduces the window between read of q.qlen and changes of fields in the cache line that could be dirtied by another cpu trying to queue a packet. Signed-off-by: Eric Dumazet --- net/sched/sch_fq.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 0b0ca1aa9251f959e87dd5dc504fbe0f4cbc75eb..6e5f2f4f241546605f8ba37f96275446c8836eee 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -480,7 +480,10 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, struct sk_buff *skb) { if (skb == flow->head) { - flow->head = skb->next; + struct sk_buff *next = skb->next; + + prefetch(next); + flow->head = next; } else { rb_erase(&skb->rbnode, &flow->t_root); skb->dev = qdisc_dev(sch); @@ -712,6 +715,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) goto begin; } prefetch(&skb->end); + fq_dequeue_skb(sch, f, skb); if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { INET_ECN_set_ce(skb); q->stat_ce_mark++; @@ -719,7 +723,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) if (--f->qlen == 0) q->inactive_flows++; q->band_pkt_count[fq_skb_cb(skb)->band]--; - fq_dequeue_skb(sch, f, skb); } else { head->first = f->next; /* force a pass through old_flows to prevent starvation */ -- 2.52.0.rc1.455.g30608eb744-goog Most qdiscs need to read skb->priority at enqueue time(). In commit 100dfa74cad9 ("net: dev_queue_xmit() llist adoption") I added a prefetch(next), lets add another one for the second half of skb. Note that skb->priority and skb->hash share a common cache line, so this patch helps qdiscs needing both fields. 
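The pattern being extended can be shown with a self-contained sketch. The toy_skb layout and the 120-byte pad are made up purely to push the field into a later cache line; __builtin_prefetch() is the compiler primitive behind the kernel's prefetch().

#include <stdio.h>

struct toy_skb {
	struct toy_skb *next;
	char pad[120];		/* pushes priority into a later cache line */
	unsigned int priority;
};

static unsigned long sum_priorities(struct toy_skb *skb)
{
	unsigned long sum = 0;

	while (skb) {
		struct toy_skb *next = skb->next;

		if (next) {
			__builtin_prefetch(next);		/* first cache line (list linkage) */
			__builtin_prefetch(&next->priority);	/* line the next iteration will read */
		}
		sum += skb->priority;
		skb = next;
	}
	return sum;
}

int main(void)
{
	struct toy_skb b = { .next = NULL, .priority = 2 };
	struct toy_skb a = { .next = &b, .priority = 1 };

	printf("%lu\n", sum_priorities(&a));
	return 0;
}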
Signed-off-by: Eric Dumazet --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev.c b/net/core/dev.c index e19eb4e9d77c27535ab2a0ce14299281e3ef9397..53e2496dc4292284072946fd9131d3f9a0c0af44 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4246,6 +4246,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, llist_for_each_entry_safe(skb, next, ll_list, ll_node) { prefetch(next); + prefetch(&next->priority); skb_mark_not_on_list(skb); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); count++; -- 2.52.0.rc1.455.g30608eb744-goog q->limit is read locklessly, add a READ_ONCE(). Fixes: 100dfa74cad9 ("net: dev_queue_xmit() llist adoption") Signed-off-by: Eric Dumazet --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 53e2496dc4292284072946fd9131d3f9a0c0af44..10042139dbb054b9a93dfb019477a80263feb029 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4194,7 +4194,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, do { if (first_n && !defer_count) { defer_count = atomic_long_inc_return(&q->defer_count); - if (unlikely(defer_count > q->limit)) { + if (unlikely(defer_count > READ_ONCE(q->limit))) { kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); return NET_XMIT_DROP; } -- 2.52.0.rc1.455.g30608eb744-goog Using kfree_skb_list_reason() to free list of skbs from qdisc operations seems wrong as each skb might have a different drop reason. Cleanup __dev_xmit_skb() to call tcf_kfree_skb_list() once in preparation of the following patch. Signed-off-by: Eric Dumazet --- include/net/sch_generic.h | 11 +++++++++++ net/core/dev.c | 15 +++++---------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 79501499dafba56271b9ebd97a8f379ffdc83cac..b8092d0378a0cafa290123d17c1b0ba787cd0680 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1105,6 +1105,17 @@ static inline void tcf_set_drop_reason(const struct sk_buff *skb, tc_skb_cb(skb)->drop_reason = reason; } +static inline void tcf_kfree_skb_list(struct sk_buff *skb) +{ + while (unlikely(skb)) { + struct sk_buff *next = skb->next; + + prefetch(next); + kfree_skb_reason(skb, tcf_get_drop_reason(skb)); + skb = next; + } +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ diff --git a/net/core/dev.c b/net/core/dev.c index 10042139dbb054b9a93dfb019477a80263feb029..e865cdb9b6966225072dc44a86610b9c7828bd8c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4162,7 +4162,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, __qdisc_run(q); qdisc_run_end(q); - goto no_lock_out; + goto free_skbs; } qdisc_bstats_cpu_update(q, skb); @@ -4176,12 +4176,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = dev_qdisc_enqueue(skb, q, &to_free, txq); qdisc_run(q); - -no_lock_out: - if (unlikely(to_free)) - kfree_skb_list_reason(to_free, - tcf_get_drop_reason(to_free)); - return rc; + goto free_skbs; } /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit. 
@@ -4257,9 +4252,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, } unlock: spin_unlock(root_lock); - if (unlikely(to_free)) - kfree_skb_list_reason(to_free, - tcf_get_drop_reason(to_free)); + +free_skbs: + tcf_kfree_skb_list(to_free); return rc; } -- 2.52.0.rc1.455.g30608eb744-goog Some qdiscs, like cake, codel and fq_codel, might drop packets in their dequeue() method. This is currently problematic because dequeue() runs with the qdisc spinlock held. Freeing skbs can be extremely expensive. Add a qdisc_dequeue_drop() method and a new TCQ_F_DEQUEUE_DROPS flag so that these qdiscs can opt in to deferring the skb frees until after the qdisc spinlock is released. TCQ_F_DEQUEUE_DROPS is an attempt to not penalize other qdiscs with an extra cache line miss. Signed-off-by: Eric Dumazet --- include/net/pkt_sched.h | 5 +++-- include/net/sch_generic.h | 30 +++++++++++++++++++++++++++--- net/core/dev.c | 22 +++++++++++++--------- 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 4678db45832a1e3bf7b8a07756fb89ab868bd5d2..e703c507d0daa97ae7c3bf131e322b1eafcc5664 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -114,12 +114,13 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, void __qdisc_run(struct Qdisc *q); -static inline void qdisc_run(struct Qdisc *q) +static inline struct sk_buff *qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { __qdisc_run(q); - qdisc_run_end(q); + return qdisc_run_end(q); } + return NULL; } extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index b8092d0378a0cafa290123d17c1b0ba787cd0680..c3a7268b567e0abf3f38290cd4e3fa7cd0601e36 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -88,6 +88,8 @@ struct Qdisc { #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ #define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */ #define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ +#define TCQ_F_DEQUEUE_DROPS 0x400 /* ->dequeue() can drop packets in q->to_free */ + u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; @@ -119,6 +121,8 @@ struct Qdisc { /* Note : we only change qstats.backlog in fast path.
*/ struct gnet_stats_queue qstats; + + struct sk_buff *to_free; __cacheline_group_end(Qdisc_write); @@ -218,8 +222,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) return true; } -static inline void qdisc_run_end(struct Qdisc *qdisc) +static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc) { + struct sk_buff *to_free = NULL; + if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); @@ -232,9 +238,16 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) if (unlikely(test_bit(__QDISC_STATE_MISSED, &qdisc->state))) __netif_schedule(qdisc); - } else { - WRITE_ONCE(qdisc->running, false); + return NULL; + } + + if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) { + to_free = qdisc->to_free; + if (to_free) + qdisc->to_free = NULL; } + WRITE_ONCE(qdisc->running, false); + return to_free; } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) @@ -1116,6 +1129,17 @@ static inline void tcf_kfree_skb_list(struct sk_buff *skb) } } +static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb, + enum skb_drop_reason reason) +{ + DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS)); + DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK); + + tcf_set_drop_reason(skb, reason); + skb->next = q->to_free; + q->to_free = skb; +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ diff --git a/net/core/dev.c b/net/core/dev.c index e865cdb9b6966225072dc44a86610b9c7828bd8c..9094c0fb8c689cd5252274d839638839bfb7642e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4141,7 +4141,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { - struct sk_buff *next, *to_free = NULL; + struct sk_buff *next, *to_free = NULL, *to_free2 = NULL; spinlock_t *root_lock = qdisc_lock(q); struct llist_node *ll_list, *first_n; unsigned long defer_count = 0; @@ -4160,7 +4160,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, if (unlikely(!nolock_qdisc_is_empty(q))) { rc = dev_qdisc_enqueue(skb, q, &to_free, txq); __qdisc_run(q); - qdisc_run_end(q); + to_free2 = qdisc_run_end(q); goto free_skbs; } @@ -4170,12 +4170,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, !nolock_qdisc_is_empty(q)) __qdisc_run(q); - qdisc_run_end(q); - return NET_XMIT_SUCCESS; + to_free2 = qdisc_run_end(q); + rc = NET_XMIT_SUCCESS; + goto free_skbs; } rc = dev_qdisc_enqueue(skb, q, &to_free, txq); - qdisc_run(q); + to_free2 = qdisc_run(q); goto free_skbs; } @@ -4234,7 +4235,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_bstats_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) __qdisc_run(q); - qdisc_run_end(q); + to_free2 = qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { int count = 0; @@ -4246,7 +4247,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = dev_qdisc_enqueue(skb, q, &to_free, txq); count++; } - qdisc_run(q); + to_free2 = qdisc_run(q); if (count != 1) rc = NET_XMIT_SUCCESS; } @@ -4255,6 +4256,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, free_skbs: tcf_kfree_skb_list(to_free); + tcf_kfree_skb_list(to_free2); return rc; } @@ -5747,8 +5749,9 @@ static __latent_entropy void net_tx_action(void) rcu_read_lock(); while (head) { - struct Qdisc *q = head; spinlock_t *root_lock = NULL; + struct sk_buff *to_free; + struct Qdisc *q = head; head = head->next_sched; @@ -5775,9 +5778,10 @@ static 
__latent_entropy void net_tx_action(void) } clear_bit(__QDISC_STATE_SCHED, &q->state); - qdisc_run(q); + to_free = qdisc_run(q); if (root_lock) spin_unlock(root_lock); + tcf_kfree_skb_list(to_free); } rcu_read_unlock(); -- 2.52.0.rc1.455.g30608eb744-goog cake, codel and fq_codel can drop many packets from dequeue(). Use qdisc_dequeue_drop() so that the freeing can happen outside of the qdisc spinlock scope. Add TCQ_F_DEQUEUE_DROPS to sch->flags. Signed-off-by: Eric Dumazet --- net/sched/sch_cake.c | 4 +++- net/sched/sch_codel.c | 4 +++- net/sched/sch_fq_codel.c | 5 ++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 5948a149129c6de041ba949e2e2b5b6b4eb54166..0ea9440f68c60ab69e9dd889b225c1a171199787 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -2183,7 +2183,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); - kfree_skb_reason(skb, reason); + qdisc_dequeue_drop(sch, skb, reason); if (q->rate_flags & CAKE_FLAG_INGRESS) goto retry; } @@ -2724,6 +2724,8 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, int i, j, err; sch->limit = 10240; + sch->flags |= TCQ_F_DEQUEUE_DROPS; + q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; q->flow_mode = CAKE_FLOW_TRIPLE; diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index fa0314679e434a4f84a128e8330bb92743c3d66a..c6551578f1cf8d332ca20ea062e858ffb437966a 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -52,7 +52,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -182,6 +182,8 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt, else sch->flags &= ~TCQ_F_CAN_BYPASS; + sch->flags |= TCQ_F_DEQUEUE_DROPS; + return 0; } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index a141423929394d7ebe127aa328dcf13ae67b3d56..dc187c7f06b10d8fd4191ead82e6a60133fff09d 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -275,7 +275,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -519,6 +519,9 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, sch->flags |= TCQ_F_CAN_BYPASS; else sch->flags &= ~TCQ_F_CAN_BYPASS; + + sch->flags |= TCQ_F_DEQUEUE_DROPS; + return 0; alloc_failure: -- 2.52.0.rc1.455.g30608eb744-goog
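As a closing illustration, here is a minimal standalone sketch of the defer-then-free pattern the series converges on. A pthread mutex stands in for the qdisc spinlock and all toy_* names are invented; the point is only that the expensive freeing work moves outside the locked section.

#include <pthread.h>
#include <stdlib.h>

struct toy_pkt { struct toy_pkt *next; };

static pthread_mutex_t qdisc_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called with qdisc_lock held: just chain the packet, do not free it. */
static void toy_dequeue_drop(struct toy_pkt **to_free, struct toy_pkt *pkt)
{
	pkt->next = *to_free;
	*to_free = pkt;
}

static void toy_run(void)
{
	struct toy_pkt *to_free = NULL;

	pthread_mutex_lock(&qdisc_lock);
	/* ... a dequeue loop would call toy_dequeue_drop() for each drop ... */
	toy_dequeue_drop(&to_free, malloc(sizeof(struct toy_pkt)));
	pthread_mutex_unlock(&qdisc_lock);

	/* The expensive part happens after the lock is released. */
	while (to_free) {
		struct toy_pkt *next = to_free->next;

		free(to_free);
		to_free = next;
	}
}

int main(void) { toy_run(); return 0; }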