Add a new u16 field next to pkt_len: pkt_segs. It will cache
shinfo->gso_segs to speed up qdisc dequeue().

Move slave_dev_queue_mapping to the end of qdisc_skb_cb, and move
three bits from tc_skb_cb:
- post_ct
- post_ct_snat
- post_ct_dnat

Signed-off-by: Eric Dumazet
---
 include/net/sch_generic.h | 18 +++++++++---------
 net/core/dev.c            |  2 +-
 net/sched/act_ct.c        |  8 ++++----
 net/sched/cls_api.c       |  6 +++---
 net/sched/cls_flower.c    |  2 +-
 5 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 94966692ccdf51db085c236319705aecba8c30cf..9cd8b5d4b23698fd8959ef40c303468e31c1d4af 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -429,13 +429,16 @@ struct tcf_proto {
 };
 
 struct qdisc_skb_cb {
-	struct {
-		unsigned int	pkt_len;
-		u16		slave_dev_queue_mapping;
-		u16		tc_classid;
-	};
+	unsigned int	pkt_len;
+	u16		pkt_segs;
+	u16		tc_classid;
 #define QDISC_CB_PRIV_LEN 20
 	unsigned char	data[QDISC_CB_PRIV_LEN];
+
+	u16		slave_dev_queue_mapping;
+	u8		post_ct:1;
+	u8		post_ct_snat:1;
+	u8		post_ct_dnat:1;
 };
 
 typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);
@@ -1064,11 +1067,8 @@ struct tc_skb_cb {
 	struct qdisc_skb_cb qdisc_cb;
 	u32 drop_reason;
 
-	u16 zone; /* Only valid if post_ct = true */
+	u16 zone; /* Only valid if qdisc_skb_cb(skb)->post_ct = true */
 	u16 mru;
-	u8 post_ct:1;
-	u8 post_ct_snat:1;
-	u8 post_ct_dnat:1;
 };
 
 static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb)
diff --git a/net/core/dev.c b/net/core/dev.c
index 69515edd17bc6a157046f31b3dd343a59ae192ab..46ce6c6107805132b1322128e86634eca91e3340 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4355,7 +4355,7 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
 		return ret;
 
 	tc_skb_cb(skb)->mru = 0;
-	tc_skb_cb(skb)->post_ct = false;
+	qdisc_skb_cb(skb)->post_ct = false;
 	tcf_set_drop_reason(skb, *drop_reason);
 
 	mini_qdisc_bstats_cpu_update(miniq, skb);
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 6749a4a9a9cd0a43897fcd20d228721ce057cb88..2b6ac7069dc168da2c534bddc5d4398e5e7a18c4 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -948,9 +948,9 @@ static int tcf_ct_act_nat(struct sk_buff *skb,
 		return err & NF_VERDICT_MASK;
 
 	if (action & BIT(NF_NAT_MANIP_SRC))
-		tc_skb_cb(skb)->post_ct_snat = 1;
+		qdisc_skb_cb(skb)->post_ct_snat = 1;
 	if (action & BIT(NF_NAT_MANIP_DST))
-		tc_skb_cb(skb)->post_ct_dnat = 1;
+		qdisc_skb_cb(skb)->post_ct_dnat = 1;
 
 	return err;
 #else
@@ -986,7 +986,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 	tcf_action_update_bstats(&c->common, skb);
 
 	if (clear) {
-		tc_skb_cb(skb)->post_ct = false;
+		qdisc_skb_cb(skb)->post_ct = false;
 		ct = nf_ct_get(skb, &ctinfo);
 		if (ct) {
 			nf_ct_put(ct);
@@ -1097,7 +1097,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 
 out_push:
 	skb_push_rcsum(skb, nh_ofs);
-	tc_skb_cb(skb)->post_ct = true;
+	qdisc_skb_cb(skb)->post_ct = true;
 	tc_skb_cb(skb)->zone = p->zone;
 
 out_clear:
 	if (defrag)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f751cd5eeac8d72b4c4d138f45d25a8ba62fb1bd..ebca4b926dcf76daa3abb8ffe221503e33de30e3 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1872,9 +1872,9 @@ int tcf_classify(struct sk_buff *skb,
 		}
 		ext->chain = last_executed_chain;
 		ext->mru = cb->mru;
-		ext->post_ct = cb->post_ct;
-		ext->post_ct_snat = cb->post_ct_snat;
-		ext->post_ct_dnat = cb->post_ct_dnat;
+		ext->post_ct = qdisc_skb_cb(skb)->post_ct;
+		ext->post_ct_snat = qdisc_skb_cb(skb)->post_ct_snat;
+		ext->post_ct_dnat = qdisc_skb_cb(skb)->post_ct_dnat;
 		ext->zone = cb->zone;
 	}
 }
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 099ff6a3e1f516a50cfac578666f6d5f4fbe8f29..7669371c1354c27ede83c2c83aaea5c0402e6552 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -326,7 +326,7 @@ TC_INDIRECT_SCOPE int fl_classify(struct sk_buff *skb,
 				  struct tcf_result *res)
 {
 	struct cls_fl_head *head = rcu_dereference_bh(tp->root);
-	bool post_ct = tc_skb_cb(skb)->post_ct;
+	bool post_ct = qdisc_skb_cb(skb)->post_ct;
 	u16 zone = tc_skb_cb(skb)->zone;
 	struct fl_flow_key skb_key;
 	struct fl_flow_mask *mask;
--
2.51.2.1041.gc1ab5b90ca-goog
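Note: qdisc_skb_cb and tc_skb_cb still have to fit in skb->cb (48 bytes). A quick, standalone way to sanity-check the reworked layout is to mirror the fields with plain stdint types in a user-space program; the structures below are a hand-copied approximation for illustration, not the kernel definitions:

/* Standalone sketch: mirrors the reworked qdisc_skb_cb/tc_skb_cb layout
 * with stdint types to check it still fits in skb->cb (48 bytes).
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define QDISC_CB_PRIV_LEN 20

struct qdisc_skb_cb_mirror {
	uint32_t pkt_len;
	uint16_t pkt_segs;			/* new field, caches gso_segs */
	uint16_t tc_classid;
	unsigned char data[QDISC_CB_PRIV_LEN];
	uint16_t slave_dev_queue_mapping;	/* moved after data[] */
	uint8_t  post_ct:1;			/* bits moved from tc_skb_cb */
	uint8_t  post_ct_snat:1;
	uint8_t  post_ct_dnat:1;
};

struct tc_skb_cb_mirror {
	struct qdisc_skb_cb_mirror qdisc_cb;
	uint32_t drop_reason;
	uint16_t zone;				/* only valid if post_ct = true */
	uint16_t mru;
};

int main(void)
{
	/* skb->cb is 48 bytes; both control blocks must fit. */
	static_assert(sizeof(struct qdisc_skb_cb_mirror) <= 48, "qdisc_skb_cb too big");
	static_assert(sizeof(struct tc_skb_cb_mirror) <= 48, "tc_skb_cb too big");
	printf("qdisc_skb_cb: %zu bytes, tc_skb_cb: %zu bytes (skb->cb is 48)\n",
	       sizeof(struct qdisc_skb_cb_mirror), sizeof(struct tc_skb_cb_mirror));
	return 0;
}

The kernel enforces the same constraint at build time with BUILD_BUG_ON() checks in tc_skb_cb() and qdisc_cb_private_validate().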
Qdiscs use shinfo->gso_segs for their packet stats in bstats_update(),
but this field needs to be initialized for SKB_GSO_DODGY users.

Signed-off-by: Eric Dumazet
---
 net/core/dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 46ce6c6107805132b1322128e86634eca91e3340..dba9eef8bd83dda89b5edd870b47373722264f48 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4071,7 +4071,7 @@ EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 
 static void qdisc_pkt_len_init(struct sk_buff *skb)
 {
-	const struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
 
 	qdisc_skb_cb(skb)->pkt_len = skb->len;
 
@@ -4112,6 +4112,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
 		if (payload <= 0)
 			return;
 		gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
+		shinfo->gso_segs = gso_segs;
 	}
 	qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 }
--
2.51.2.1041.gc1ab5b90ca-goog
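For untrusted (SKB_GSO_DODGY) senders, the hunk above derives the segment count from the payload length instead of trusting shinfo->gso_segs, and now stores the result back into shinfo. The arithmetic is a simple ceiling division; a minimal standalone illustration follows (the helper name and values are mine, not from the kernel):

/* Standalone illustration of the segment estimate used for SKB_GSO_DODGY:
 * segs = ceil(payload / gso_size).
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned int estimate_gso_segs(unsigned int skb_len,
				      unsigned int hdr_len,
				      unsigned int gso_size)
{
	int payload = (int)(skb_len - hdr_len);

	if (payload <= 0)	/* malicious or degenerate packet */
		return 0;
	return DIV_ROUND_UP((unsigned int)payload, gso_size);
}

int main(void)
{
	/* 65536-byte TSO packet, 54 bytes of headers, 1448-byte MSS -> 46 segments */
	printf("%u\n", estimate_gso_segs(65536, 54, 1448));
	return 0;
}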
qdisc_pkt_len_init() currently initializes qdisc_skb_cb(skb)->pkt_len.

Add qdisc_skb_cb(skb)->pkt_segs initialization and rename this function
to qdisc_pkt_len_segs_init().

Signed-off-by: Eric Dumazet
---
 net/core/dev.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index dba9eef8bd83dda89b5edd870b47373722264f48..895c3e37e686f0f625bd5eec7079a43cbd33a7eb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4069,17 +4069,23 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
 }
 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 
-static void qdisc_pkt_len_init(struct sk_buff *skb)
+static void qdisc_pkt_len_segs_init(struct sk_buff *skb)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	u16 gso_segs;
 
 	qdisc_skb_cb(skb)->pkt_len = skb->len;
 
+	if (!shinfo->gso_size) {
+		qdisc_skb_cb(skb)->pkt_segs = 1;
+		return;
+	}
+
+	qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs;
 	/* To get more precise estimation of bytes sent on wire,
 	 * we add to pkt_len the headers size of all segments
 	 */
-	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
-		u16 gso_segs = shinfo->gso_segs;
+	if (skb_transport_header_was_set(skb)) {
 		unsigned int hdr_len;
 
 		/* mac layer + network layer */
@@ -4113,6 +4119,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
 			return;
 		gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
 		shinfo->gso_segs = gso_segs;
+		qdisc_skb_cb(skb)->pkt_segs = gso_segs;
 	}
 	qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 }
@@ -4738,7 +4745,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
 
 	skb_update_prio(skb);
 
-	qdisc_pkt_len_init(skb);
+	qdisc_pkt_len_segs_init(skb);
 	tcx_set_ingress(skb, false);
 #ifdef CONFIG_NET_EGRESS
 	if (static_branch_unlikely(&egress_needed_key)) {
--
2.51.2.1041.gc1ab5b90ca-goog
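For readers who want to see the whole helper rather than the hunks, applying the changes above should give roughly the sketch below. Context lines the patch does not touch are reproduced from the current net/core/dev.c and may differ slightly in detail:

static void qdisc_pkt_len_segs_init(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	u16 gso_segs;

	qdisc_skb_cb(skb)->pkt_len = skb->len;

	/* Non-GSO packets are a single segment. */
	if (!shinfo->gso_size) {
		qdisc_skb_cb(skb)->pkt_segs = 1;
		return;
	}

	qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs;
	/* To get more precise estimation of bytes sent on wire,
	 * we add to pkt_len the headers size of all segments
	 */
	if (skb_transport_header_was_set(skb)) {
		unsigned int hdr_len;

		/* mac layer + network layer */
		hdr_len = skb_transport_offset(skb);

		/* + transport layer */
		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
			const struct tcphdr *th;
			struct tcphdr _tcphdr;

			th = skb_header_pointer(skb, hdr_len,
						sizeof(_tcphdr), &_tcphdr);
			if (likely(th))
				hdr_len += __tcp_hdrlen(th);
		} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
			struct udphdr _udphdr;

			if (skb_header_pointer(skb, hdr_len,
					       sizeof(_udphdr), &_udphdr))
				hdr_len += sizeof(struct udphdr);
		}

		if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) {
			int payload = skb->len - hdr_len;

			/* Malicious packet. */
			if (payload <= 0)
				return;
			gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
			shinfo->gso_segs = gso_segs;
			qdisc_skb_cb(skb)->pkt_segs = gso_segs;
		}
		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
	}
}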
Avoid up to two cache line misses in qdisc dequeue() to fetch
skb_shinfo(skb)->gso_segs/gso_size while the qdisc spinlock is held.

This gives a 5% improvement in a TX-intensive workload.

Signed-off-by: Eric Dumazet
---
 include/net/sch_generic.h | 13 ++++++++++---
 net/sched/sch_cake.c      |  1 +
 net/sched/sch_dualpi2.c   |  1 +
 net/sched/sch_netem.c     |  1 +
 net/sched/sch_qfq.c       |  2 +-
 net/sched/sch_taprio.c    |  1 +
 net/sched/sch_tbf.c       |  1 +
 7 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9cd8b5d4b23698fd8959ef40c303468e31c1d4af..ae037e56088208ad17f664c43689aee895569cab 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -829,6 +829,15 @@ static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb)
 	return qdisc_skb_cb(skb)->pkt_len;
 }
 
+static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb)
+{
+	u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs;
+
+	DEBUG_NET_WARN_ON_ONCE(pkt_segs !=
+			       (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1));
+	return pkt_segs;
+}
+
 /* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */
 enum net_xmit_qdisc_t {
 	__NET_XMIT_STOLEN = 0x00010000,
@@ -870,9 +879,7 @@ static inline void _bstats_update(struct gnet_stats_basic_sync *bstats,
 static inline void bstats_update(struct gnet_stats_basic_sync *bstats,
 				 const struct sk_buff *skb)
 {
-	_bstats_update(bstats,
-		       qdisc_pkt_len(skb),
-		       skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
+	_bstats_update(bstats, qdisc_pkt_len(skb), qdisc_pkt_segs(skb));
 }
 
 static inline void qdisc_bstats_cpu_update(struct Qdisc *sch,
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 32bacfc314c260dccf94178d309ccb2be22d69e4..993ce808230fb7d4769c926f6c8368d927f5a45f 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1800,6 +1800,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		skb_list_walk_safe(segs, segs, nskb) {
 			skb_mark_not_on_list(segs);
 			qdisc_skb_cb(segs)->pkt_len = segs->len;
+			qdisc_skb_cb(segs)->pkt_segs = 1;
 			cobalt_set_enqueue_time(segs, now);
 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
 									  segs);
diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
index 4b975feb52b1f3d3b37b31713d1477de5f5806d9..6d7e6389758dc8e645b1116efe4e11fb7290ac86 100644
--- a/net/sched/sch_dualpi2.c
+++ b/net/sched/sch_dualpi2.c
@@ -475,6 +475,7 @@ static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 			 * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb
 			 */
 			qdisc_skb_cb(nskb)->pkt_len = nskb->len;
+			qdisc_skb_cb(nskb)->pkt_segs = 1;
 			dualpi2_skb_cb(nskb)->classified =
 				dualpi2_skb_cb(skb)->classified;
 			dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index eafc316ae319e3f8c23b0cb0c58fdf54be102213..32a5f33040461f3be952055c097b5f2fe760a858 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -429,6 +429,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
 	struct sk_buff *segs;
 	netdev_features_t features = netif_skb_features(skb);
 
+	qdisc_skb_cb(skb)->pkt_segs = 1;
 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 
 	if (IS_ERR_OR_NULL(segs)) {
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 2255355e51d350eded4549c1584b60d4d9b00fff..d920f57dc6d7659c510a98956c6dd2ed9e5ee5b8 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1250,7 +1250,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		}
 	}
 
-	gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
+	gso_segs = qdisc_pkt_segs(skb);
 	err = qdisc_enqueue(skb, cl->qdisc, to_free);
 	if (unlikely(err != NET_XMIT_SUCCESS)) {
 		pr_debug("qfq_enqueue: enqueue failed %d\n", err);
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 39b735386996eb59712a1fc28f7bb903ec1b2220..300d577b328699eb42d2b829ecfc76464fd7b186 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -595,6 +595,7 @@ static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch,
 	skb_list_walk_safe(segs, segs, nskb) {
 		skb_mark_not_on_list(segs);
 		qdisc_skb_cb(segs)->pkt_len = segs->len;
+		qdisc_skb_cb(segs)->pkt_segs = 1;
 		slen += segs->len;
 
 		/* FIXME: we should be segmenting to a smaller size
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 4c977f049670a600eafd219c898e5f29597be2c1..f2340164f579a25431979e12ec3d23ab828edd16 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -221,6 +221,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
 		skb_mark_not_on_list(segs);
 		seg_len = segs->len;
 		qdisc_skb_cb(segs)->pkt_len = seg_len;
+		qdisc_skb_cb(segs)->pkt_segs = 1;
 		ret = qdisc_enqueue(segs, q->qdisc, to_free);
 		if (ret != NET_XMIT_SUCCESS) {
 			if (net_xmit_drop_count(ret))
--
2.51.2.1041.gc1ab5b90ca-goog
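The per-qdisc hunks above all follow the same pattern: whenever a qdisc segments a GSO skb itself, every resulting segment must get pkt_len and pkt_segs refreshed, otherwise bstats_update() (now driven by qdisc_pkt_segs()) would account stale values. A condensed sketch of that pattern is shown below; the function and the child qdisc parameter are hypothetical placeholders, not code from this series:

/* Hedged sketch of the pattern used by cake/tbf/taprio above: after
 * software segmentation, each segment is one packet on the wire, so
 * refresh both pkt_len and the new pkt_segs before re-enqueueing.
 */
static int myqdisc_segment_and_enqueue(struct sk_buff *skb, struct Qdisc *sch,
					struct Qdisc *child,
					struct sk_buff **to_free)
{
	netdev_features_t features = netif_skb_features(skb);
	struct sk_buff *segs, *nskb;
	int ret;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs))
		return qdisc_drop(skb, sch, to_free);

	skb_list_walk_safe(segs, segs, nskb) {
		skb_mark_not_on_list(segs);

		/* Each segment is a single packet on the wire. */
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		qdisc_skb_cb(segs)->pkt_segs = 1;

		ret = qdisc_enqueue(segs, child, to_free);
		if (ret != NET_XMIT_SUCCESS && net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
	}

	consume_skb(skb);
	return NET_XMIT_SUCCESS;
}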
Use the new qdisc_pkt_segs() to avoid a cache line miss in
cake_enqueue() for non-GSO packets.

cake_overhead() no longer has to recompute the segment count.

Signed-off-by: Eric Dumazet
---
 net/sched/sch_cake.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 993ce808230fb7d4769c926f6c8368d927f5a45f..312f5b000ffb67d74faf70f26d808e26315b4ab8 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1398,12 +1398,12 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 	unsigned int hdr_len, last_len = 0;
 	u32 off = skb_network_offset(skb);
+	u16 segs = qdisc_pkt_segs(skb);
 	u32 len = qdisc_pkt_len(skb);
-	u16 segs = 1;
 
 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
 
-	if (!shinfo->gso_size)
+	if (segs == 1)
 		return cake_calc_overhead(q, len, off);
 
 	/* borrowed from qdisc_pkt_len_init() */
@@ -1430,12 +1430,6 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
 			hdr_len += sizeof(struct udphdr);
 	}
 
-	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
-		segs = DIV_ROUND_UP(skb->len - hdr_len,
-				    shinfo->gso_size);
-	else
-		segs = shinfo->gso_segs;
-
 	len = shinfo->gso_size + hdr_len;
 	last_len = skb->len - shinfo->gso_size * (segs - 1);
 
@@ -1788,7 +1782,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (unlikely(len > b->max_skblen))
 		b->max_skblen = len;
 
-	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
+	if (qdisc_pkt_segs(skb) > 1 && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
 		struct sk_buff *segs, *nskb;
 		netdev_features_t features = netif_skb_features(skb);
 		unsigned int slen = 0, numsegs = 0;
--
2.51.2.1041.gc1ab5b90ca-goog
It is possible to reorganize struct Qdisc so that the fast path dirties
a single cache line instead of two.

In the current layout, we change only four fields (six for some Qdiscs)
in the first cache line:
- q.spinlock
- q.qlen
- bstats.bytes
- bstats.packets
- some Qdiscs also change q.next/q.prev

In the second cache line we change, in the fast path:
- running
- state
- qstats.backlog

/* --- cacheline 2 boundary (128 bytes) --- */
	struct sk_buff_head        gso_skb __attribute__((__aligned__(64))); /*  0x80  0x18 */
	struct qdisc_skb_head      q;                                        /*  0x98  0x18 */
	struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /*  0xb0  0x10 */
	/* --- cacheline 3 boundary (192 bytes) --- */
	struct gnet_stats_queue    qstats;                                   /*  0xc0  0x14 */
	bool                       running;                                  /*  0xd4   0x1 */

	/* XXX 3 bytes hole, try to pack */

	unsigned long              state;                                    /*  0xd8   0x8 */
	struct Qdisc *             next_sched;                               /*  0xe0   0x8 */
	struct sk_buff_head        skb_bad_txq;                              /*  0xe8  0x18 */
	/* --- cacheline 4 boundary (256 bytes) --- */

Reorganize things to have a first cache line that is mostly read,
followed by a mostly written one.

This gives a ~3% performance increase under TX stress.

Note that there is an additional hole because @qstats now spans over a
third cache line:

/* --- cacheline 2 boundary (128 bytes) --- */
	__u8                       __cacheline_group_begin__Qdisc_read_mostly[0] __attribute__((__aligned__(64))); /*  0x80     0 */
	struct sk_buff_head        gso_skb;                                  /*  0x80  0x18 */
	struct Qdisc *             next_sched;                               /*  0x98   0x8 */
	struct sk_buff_head        skb_bad_txq;                              /*  0xa0  0x18 */
	__u8                       __cacheline_group_end__Qdisc_read_mostly[0]; /*  0xb8     0 */

	/* XXX 8 bytes hole, try to pack */

	/* --- cacheline 3 boundary (192 bytes) --- */
	__u8                       __cacheline_group_begin__Qdisc_write[0] __attribute__((__aligned__(64))); /*  0xc0     0 */
	struct qdisc_skb_head      q;                                        /*  0xc0  0x18 */
	unsigned long              state;                                    /*  0xd8   0x8 */
	struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /*  0xe0  0x10 */
	bool                       running;                                  /*  0xf0   0x1 */

	/* XXX 3 bytes hole, try to pack */

	struct gnet_stats_queue    qstats;                                   /*  0xf4  0x14 */
	/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
	__u8                       __cacheline_group_end__Qdisc_write[0];    /* 0x108     0 */

	/* XXX 56 bytes hole, try to pack */

Signed-off-by: Eric Dumazet
---
 include/net/sch_generic.h | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ae037e56088208ad17f664c43689aee895569cab..b76436ec3f4aa412bac1be3371f5c7c6245cc362 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -103,17 +103,24 @@ struct Qdisc {
 	int			pad;
 	refcount_t		refcnt;
 
-	/*
-	 * For performance sake on SMP, we put highly modified fields at the end
-	 */
-	struct sk_buff_head	gso_skb ____cacheline_aligned_in_smp;
-	struct qdisc_skb_head	q;
-	struct gnet_stats_basic_sync bstats;
-	struct gnet_stats_queue	qstats;
-	bool			running; /* must be written under qdisc spinlock */
-	unsigned long		state;
-	struct Qdisc		*next_sched;
-	struct sk_buff_head	skb_bad_txq;
+	/* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */
+	__cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned;
+	struct sk_buff_head	gso_skb;
+	struct Qdisc		*next_sched;
+	struct sk_buff_head	skb_bad_txq;
+	__cacheline_group_end(Qdisc_read_mostly);
+
+	/* Fields dirtied in dequeue() fast path. */
+	__cacheline_group_begin(Qdisc_write) ____cacheline_aligned;
+	struct qdisc_skb_head	q;
+	unsigned long		state;
+	struct gnet_stats_basic_sync bstats;
+	bool			running; /* must be written under qdisc spinlock */
+
+	/* Note : we only change qstats.backlog in fast path. */
+	struct gnet_stats_queue	qstats;
+	__cacheline_group_end(Qdisc_write);
+
 	atomic_long_t		defer_count ____cacheline_aligned_in_smp;
 	struct llist_head	defer_list;
--
2.51.2.1041.gc1ab5b90ca-goog
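The __cacheline_group_begin()/__cacheline_group_end() markers also make it possible to assert the intended grouping at boot, the way net/core/dev.c does for struct net_device with CACHELINE_ASSERT_GROUP_MEMBER(). The patch itself does not add such checks; a hedged sketch of what they could look like for this layout (helper name is hypothetical):

/* Illustrative only: not part of this series. */
static void __init qdisc_struct_check(void)
{
	/* Read-mostly group: rarely dirtied in the fast path. */
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_read_mostly, gso_skb);
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_read_mostly, next_sched);
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_read_mostly, skb_bad_txq);

	/* Write group: fields dirtied by every dequeue(). */
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_write, q);
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_write, state);
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_write, bstats);
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_write, running);
	CACHELINE_ASSERT_GROUP_MEMBER(struct Qdisc, Qdisc_write, qstats);
}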
Group together the changes to qdisc fields to reduce the chance of
false sharing when another cpu attempts to acquire the qdisc spinlock:

	qdisc_qstats_backlog_dec(sch, skb);
	sch->q.qlen--;
	qdisc_bstats_update(sch, skb);

Signed-off-by: Eric Dumazet
---
 net/sched/sch_fq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index fee922da2f99c0c7ac6d86569cf3bbce47898951..0b0ca1aa9251f959e87dd5dc504fbe0f4cbc75eb 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -497,6 +497,7 @@ static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
 	skb_mark_not_on_list(skb);
 	qdisc_qstats_backlog_dec(sch, skb);
 	sch->q.qlen--;
+	qdisc_bstats_update(sch, skb);
 }
 
 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
@@ -776,7 +777,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 		f->time_next_packet = now + len;
 	}
 out:
-	qdisc_bstats_update(sch, skb);
 	return skb;
 }
--
2.51.2.1041.gc1ab5b90ca-goog
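With this change all of the per-packet statistics updates sit next to each other, so fq_dequeue_skb() should now read roughly as below (reconstructed from the hunk and the lines quoted in the changelog; untouched context is assumed unchanged):

/* Remove one skb from flow queue. */
static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
			   struct sk_buff *skb)
{
	fq_erase_head(sch, flow, skb);
	skb_mark_not_on_list(skb);
	qdisc_qstats_backlog_dec(sch, skb);
	sch->q.qlen--;
	qdisc_bstats_update(sch, skb);
}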
Prefetch the skb that we are likely to dequeue at the next dequeue()
call.

Also call fq_dequeue_skb() a bit sooner in fq_dequeue(). This reduces
the window between the read of q.qlen and the changes to fields in the
cache line, which could be dirtied by another cpu trying to queue a
packet.

Signed-off-by: Eric Dumazet
---
 net/sched/sch_fq.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 0b0ca1aa9251f959e87dd5dc504fbe0f4cbc75eb..6e5f2f4f241546605f8ba37f96275446c8836eee 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -480,7 +480,10 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
 			  struct sk_buff *skb)
 {
 	if (skb == flow->head) {
-		flow->head = skb->next;
+		struct sk_buff *next = skb->next;
+
+		prefetch(next);
+		flow->head = next;
 	} else {
 		rb_erase(&skb->rbnode, &flow->t_root);
 		skb->dev = qdisc_dev(sch);
@@ -712,6 +715,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 			goto begin;
 		}
 		prefetch(&skb->end);
+		fq_dequeue_skb(sch, f, skb);
 		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
 			INET_ECN_set_ce(skb);
 			q->stat_ce_mark++;
@@ -719,7 +723,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 		if (--f->qlen == 0)
 			q->inactive_flows++;
 		q->band_pkt_count[fq_skb_cb(skb)->band]--;
-		fq_dequeue_skb(sch, f, skb);
 	} else {
 		head->first = f->next;
 		/* force a pass through old_flows to prevent starvation */
--
2.51.2.1041.gc1ab5b90ca-goog

Most qdiscs need to read skb->priority at enqueue time.

In commit 100dfa74cad9 ("net: dev_queue_xmit() llist adoption") I added
a prefetch(next) in __dev_xmit_skb(); let's add another one for the
second half of the skb.

Note that skb->priority and skb->hash share a common cache line, so
this patch helps qdiscs needing both fields.

Signed-off-by: Eric Dumazet
---
 net/core/dev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 895c3e37e686f0f625bd5eec7079a43cbd33a7eb..44022fdec655e40e70ff5e1894f55fc76235b00c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4246,6 +4246,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 
 		llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
 			prefetch(next);
+			prefetch(&next->priority);
 			skb_mark_not_on_list(skb);
 			rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 			count++;
--
2.51.2.1041.gc1ab5b90ca-goog

q->limit is read locklessly, add a READ_ONCE().

Fixes: 100dfa74cad9 ("net: dev_queue_xmit() llist adoption")
Signed-off-by: Eric Dumazet
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 44022fdec655e40e70ff5e1894f55fc76235b00c..ac994974e2a81889fcc0a2e664edcdb7cfd0496d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4194,7 +4194,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	do {
 		if (first_n && !defer_count) {
 			defer_count = atomic_long_inc_return(&q->defer_count);
-			if (unlikely(defer_count > q->limit)) {
+			if (unlikely(defer_count > READ_ONCE(q->limit))) {
 				kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
 				return NET_XMIT_DROP;
 			}
--
2.51.2.1041.gc1ab5b90ca-goog
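The READ_ONCE() only annotates the reader side of a lockless access. As a generic illustration of the pattern it belongs to (not code from the tree: the struct, helpers and locking here are made up), a field written under a lock but read without it is typically paired like this:

/* Generic illustration: a field updated under a lock but read locklessly
 * uses WRITE_ONCE() on the writer side and READ_ONCE() on the reader side,
 * so the compiler cannot tear or refetch the access and KCSAN stays quiet.
 */
struct foo {
	spinlock_t lock;
	u32 limit;
};

static void foo_set_limit(struct foo *f, u32 limit)
{
	spin_lock(&f->lock);
	WRITE_ONCE(f->limit, limit);	/* paired with lockless readers */
	spin_unlock(&f->lock);
}

static bool foo_over_limit(const struct foo *f, u32 count)
{
	/* Lockless reader: a plain load could be torn or re-read. */
	return count > READ_ONCE(f->limit);
}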