Add a new u16 field next to pkt_len: pkt_segs. This will cache shinfo->gso_segs to speed up qdisc dequeue(). Move slave_dev_queue_mapping to the end of qdisc_skb_cb, and move three bits from tc_skb_cb: - post_ct - post_ct_snat - post_ct_dnat Signed-off-by: Eric Dumazet --- include/net/sch_generic.h | 18 +++++++++--------- net/core/dev.c | 2 +- net/sched/act_ct.c | 8 ++++---- net/sched/cls_api.c | 6 +++--- net/sched/cls_flower.c | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 94966692ccdf51db085c236319705aecba8c30cf..9cd8b5d4b23698fd8959ef40c303468e31c1d4af 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -429,13 +429,16 @@ struct tcf_proto { }; struct qdisc_skb_cb { - struct { - unsigned int pkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; - }; + unsigned int pkt_len; + u16 pkt_segs; + u16 tc_classid; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; + + u16 slave_dev_queue_mapping; + u8 post_ct:1; + u8 post_ct_snat:1; + u8 post_ct_dnat:1; }; typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); @@ -1064,11 +1067,8 @@ struct tc_skb_cb { struct qdisc_skb_cb qdisc_cb; u32 drop_reason; - u16 zone; /* Only valid if post_ct = true */ + u16 zone; /* Only valid if qdisc_skb_cb(skb)->post_ct = true */ u16 mru; - u8 post_ct:1; - u8 post_ct_snat:1; - u8 post_ct_dnat:1; }; static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) diff --git a/net/core/dev.c b/net/core/dev.c index 69515edd17bc6a157046f31b3dd343a59ae192ab..46ce6c6107805132b1322128e86634eca91e3340 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4355,7 +4355,7 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, return ret; tc_skb_cb(skb)->mru = 0; - tc_skb_cb(skb)->post_ct = false; + qdisc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 6749a4a9a9cd0a43897fcd20d228721ce057cb88..2b6ac7069dc168da2c534bddc5d4398e5e7a18c4 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -948,9 +948,9 @@ static int tcf_ct_act_nat(struct sk_buff *skb, return err & NF_VERDICT_MASK; if (action & BIT(NF_NAT_MANIP_SRC)) - tc_skb_cb(skb)->post_ct_snat = 1; + qdisc_skb_cb(skb)->post_ct_snat = 1; if (action & BIT(NF_NAT_MANIP_DST)) - tc_skb_cb(skb)->post_ct_dnat = 1; + qdisc_skb_cb(skb)->post_ct_dnat = 1; return err; #else @@ -986,7 +986,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, tcf_action_update_bstats(&c->common, skb); if (clear) { - tc_skb_cb(skb)->post_ct = false; + qdisc_skb_cb(skb)->post_ct = false; ct = nf_ct_get(skb, &ctinfo); if (ct) { nf_ct_put(ct); @@ -1097,7 +1097,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, out_push: skb_push_rcsum(skb, nh_ofs); - tc_skb_cb(skb)->post_ct = true; + qdisc_skb_cb(skb)->post_ct = true; tc_skb_cb(skb)->zone = p->zone; out_clear: if (defrag) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f751cd5eeac8d72b4c4d138f45d25a8ba62fb1bd..ebca4b926dcf76daa3abb8ffe221503e33de30e3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1872,9 +1872,9 @@ int tcf_classify(struct sk_buff *skb, } ext->chain = last_executed_chain; ext->mru = cb->mru; - ext->post_ct = cb->post_ct; - ext->post_ct_snat = cb->post_ct_snat; - ext->post_ct_dnat = cb->post_ct_dnat; + ext->post_ct = qdisc_skb_cb(skb)->post_ct; + ext->post_ct_snat
= qdisc_skb_cb(skb)->post_ct_snat; + ext->post_ct_dnat = qdisc_skb_cb(skb)->post_ct_dnat; ext->zone = cb->zone; } } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 099ff6a3e1f516a50cfac578666f6d5f4fbe8f29..7669371c1354c27ede83c2c83aaea5c0402e6552 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -326,7 +326,7 @@ TC_INDIRECT_SCOPE int fl_classify(struct sk_buff *skb, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - bool post_ct = tc_skb_cb(skb)->post_ct; + bool post_ct = qdisc_skb_cb(skb)->post_ct; u16 zone = tc_skb_cb(skb)->zone; struct fl_flow_key skb_key; struct fl_flow_mask *mask; -- 2.52.0.rc1.455.g30608eb744-goog Qdiscs use shinfo->gso_segs for their packet stats in bstats_update(), but this field needs to be initialized for SKB_GSO_DODGY users. Signed-off-by: Eric Dumazet --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 46ce6c6107805132b1322128e86634eca91e3340..dba9eef8bd83dda89b5edd870b47373722264f48 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4071,7 +4071,7 @@ EXPORT_SYMBOL_GPL(validate_xmit_skb_list); static void qdisc_pkt_len_init(struct sk_buff *skb) { - const struct skb_shared_info *shinfo = skb_shinfo(skb); + struct skb_shared_info *shinfo = skb_shinfo(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; @@ -4112,6 +4112,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) if (payload <= 0) return; gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); + shinfo->gso_segs = gso_segs; } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } -- 2.52.0.rc1.455.g30608eb744-goog qdisc_pkt_len_init() currently initializes qdisc_skb_cb(skb)->pkt_len. Add qdisc_skb_cb(skb)->pkt_segs initialization and rename this function to qdisc_pkt_len_segs_init().
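For reference, the accounting this helper performs can be sketched in plain C. This is only an illustrative userspace model of what qdisc_pkt_len_segs_init() computes: the toy_* structures, the reduced field set and the sample sizes in main() are invented for the example, and DIV_ROUND_UP is open-coded.

#include <stdio.h>

/* Invented stand-ins for the kernel structures involved. */
struct toy_shinfo { unsigned short gso_size, gso_segs; };
struct toy_cb     { unsigned int pkt_len; unsigned short pkt_segs; };

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Mirrors the spirit of qdisc_pkt_len_segs_init(): estimate on-wire bytes
 * and cache the segment count, recomputing gso_segs only for "dodgy"
 * packets whose advertised value cannot be trusted.
 */
static void toy_pkt_len_segs_init(struct toy_cb *cb, struct toy_shinfo *sh,
				  unsigned int skb_len, unsigned int hdr_len,
				  int dodgy)
{
	unsigned int gso_segs;

	cb->pkt_len = skb_len;
	if (!sh->gso_size) {
		cb->pkt_segs = 1;
		return;
	}
	gso_segs = sh->gso_segs;
	if (dodgy) {
		unsigned int payload = skb_len - hdr_len;

		gso_segs = DIV_ROUND_UP(payload, sh->gso_size);
		sh->gso_segs = gso_segs;	/* what the previous patch adds */
	}
	cb->pkt_segs = gso_segs;
	/* each extra segment carries its own copy of the headers */
	cb->pkt_len += (gso_segs - 1) * hdr_len;
}

int main(void)
{
	struct toy_shinfo sh = { .gso_size = 1448, .gso_segs = 0 };
	struct toy_cb cb;

	toy_pkt_len_segs_init(&cb, &sh, 66 + 10 * 1448, 66, 1);
	printf("pkt_len=%u pkt_segs=%u\n", cb.pkt_len, (unsigned int)cb.pkt_segs);
	return 0;
}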
Signed-off-by: Eric Dumazet --- net/core/dev.c | 15 +++++++++++---- net/sched/sch_cake.c | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index dba9eef8bd83dda89b5edd870b47373722264f48..895c3e37e686f0f625bd5eec7079a43cbd33a7eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4069,17 +4069,23 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d } EXPORT_SYMBOL_GPL(validate_xmit_skb_list); -static void qdisc_pkt_len_init(struct sk_buff *skb) +static void qdisc_pkt_len_segs_init(struct sk_buff *skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); + u16 gso_segs; qdisc_skb_cb(skb)->pkt_len = skb->len; + if (!shinfo->gso_size) { + qdisc_skb_cb(skb)->pkt_segs = 1; + return; + } + + qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ - if (shinfo->gso_size && skb_transport_header_was_set(skb)) { - u16 gso_segs = shinfo->gso_segs; + if (skb_transport_header_was_set(skb)) { unsigned int hdr_len; /* mac layer + network layer */ @@ -4113,6 +4119,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) return; gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); shinfo->gso_segs = gso_segs; + qdisc_skb_cb(skb)->pkt_segs = gso_segs; } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } @@ -4738,7 +4745,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_update_prio(skb); - qdisc_pkt_len_init(skb); + qdisc_pkt_len_segs_init(skb); tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 32bacfc314c260dccf94178d309ccb2be22d69e4..9213129f0de10bc67ce418f77c36fed2581f3781 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1406,7 +1406,7 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) if (!shinfo->gso_size) return cake_calc_overhead(q, len, off); - /* borrowed from qdisc_pkt_len_init() */ + /* borrowed from qdisc_pkt_len_segs_init() */ if (!skb->encapsulation) hdr_len = skb_transport_offset(skb); else -- 2.52.0.rc1.455.g30608eb744-goog sch_handle_ingress() sets qdisc_skb_cb(skb)->pkt_len. We also need to initialize qdisc_skb_cb(skb)->pkt_segs. Signed-off-by: Eric Dumazet --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 895c3e37e686f0f625bd5eec7079a43cbd33a7eb..e19eb4e9d77c27535ab2a0ce14299281e3ef9397 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4434,7 +4434,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, *pt_prev = NULL; } - qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_pkt_len_segs_init(skb); tcx_set_ingress(skb, true); if (static_branch_unlikely(&tcx_needed_key)) { -- 2.52.0.rc1.455.g30608eb744-goog Avoid up to two cache line misses in qdisc dequeue() to fetch skb_shinfo(skb)->gso_segs/gso_size while qdisc spinlock is held. This gives a 5 % improvement in a TX intensive workload. 
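A rough, hypothetical model of where the win comes from: the stats update now only needs pkt_len and pkt_segs, both of which live in the qdisc control block that enqueue()/dequeue() touch anyway, so skb_shinfo() stays cold. The toy_* types below are invented stand-ins, not the kernel structures.

#include <stdio.h>

struct toy_bstats { unsigned long long bytes, packets; };
struct toy_cb     { unsigned int pkt_len; unsigned short pkt_segs; };

/* Equivalent of bstats_update(), reading only the cached cb fields. */
static void toy_bstats_update(struct toy_bstats *b, const struct toy_cb *cb)
{
	b->bytes   += cb->pkt_len;
	b->packets += cb->pkt_segs;	/* previously skb_shinfo(skb)->gso_segs */
}

int main(void)
{
	struct toy_bstats b = { 0, 0 };
	struct toy_cb cb = { .pkt_len = 15140, .pkt_segs = 10 };

	toy_bstats_update(&b, &cb);
	printf("bytes=%llu packets=%llu\n", b.bytes, b.packets);
	return 0;
}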
Signed-off-by: Eric Dumazet --- include/net/sch_generic.h | 13 ++++++++++--- net/sched/sch_cake.c | 1 + net/sched/sch_dualpi2.c | 1 + net/sched/sch_netem.c | 1 + net/sched/sch_qfq.c | 2 +- net/sched/sch_taprio.c | 1 + net/sched/sch_tbf.c | 1 + 7 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9cd8b5d4b23698fd8959ef40c303468e31c1d4af..cdf7a58ebcf5ef2b5f8b76eb6fbe92d5f0e07899 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -829,6 +829,15 @@ static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb) return qdisc_skb_cb(skb)->pkt_len; } +static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb) +{ + u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs; + + DEBUG_NET_WARN_ON_ONCE(pkt_segs != + (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1)); + return pkt_segs; +} + /* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */ enum net_xmit_qdisc_t { __NET_XMIT_STOLEN = 0x00010000, @@ -870,9 +879,7 @@ static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, static inline void bstats_update(struct gnet_stats_basic_sync *bstats, const struct sk_buff *skb) { - _bstats_update(bstats, - qdisc_pkt_len(skb), - skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1); + _bstats_update(bstats, qdisc_pkt_len(skb), qdisc_pkt_segs(skb)); } static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 9213129f0de10bc67ce418f77c36fed2581f3781..a20880034aa5eacec0c25977406104448b336397 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1800,6 +1800,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; + qdisc_skb_cb(segs)->pkt_segs = 1; cobalt_set_enqueue_time(segs, now); get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, segs); diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 4b975feb52b1f3d3b37b31713d1477de5f5806d9..6d7e6389758dc8e645b1116efe4e11fb7290ac86 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -475,6 +475,7 @@ static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb */ qdisc_skb_cb(nskb)->pkt_len = nskb->len; + qdisc_skb_cb(nskb)->pkt_segs = 1; dualpi2_skb_cb(nskb)->classified = dualpi2_skb_cb(skb)->classified; dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index eafc316ae319e3f8c23b0cb0c58fdf54be102213..32a5f33040461f3be952055c097b5f2fe760a858 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -429,6 +429,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *segs; netdev_features_t features = netif_skb_features(skb); + qdisc_skb_cb(skb)->pkt_segs = 1; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 2255355e51d350eded4549c1584b60d4d9b00fff..d920f57dc6d7659c510a98956c6dd2ed9e5ee5b8 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1250,7 +1250,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } } - gso_segs = skb_is_gso(skb) ? 
skb_shinfo(skb)->gso_segs : 1; + gso_segs = qdisc_pkt_segs(skb); err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 39b735386996eb59712a1fc28f7bb903ec1b2220..300d577b328699eb42d2b829ecfc76464fd7b186 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -595,6 +595,7 @@ static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; + qdisc_skb_cb(segs)->pkt_segs = 1; slen += segs->len; /* FIXME: we should be segmenting to a smaller size diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 4c977f049670a600eafd219c898e5f29597be2c1..f2340164f579a25431979e12ec3d23ab828edd16 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -221,6 +221,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, skb_mark_not_on_list(segs); seg_len = segs->len; qdisc_skb_cb(segs)->pkt_len = seg_len; + qdisc_skb_cb(segs)->pkt_segs = 1; ret = qdisc_enqueue(segs, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) -- 2.52.0.rc1.455.g30608eb744-goog Use new qdisc_pkt_segs() to avoid a cache line miss in cake_enqueue() for non GSO packets. cake_overhead() does not have to recompute it. Signed-off-by: Eric Dumazet --- net/sched/sch_cake.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index a20880034aa5eacec0c25977406104448b336397..5948a149129c6de041ba949e2e2b5b6b4eb54166 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1398,12 +1398,12 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) const struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int hdr_len, last_len = 0; u32 off = skb_network_offset(skb); + u16 segs = qdisc_pkt_segs(skb); u32 len = qdisc_pkt_len(skb); - u16 segs = 1; q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); - if (!shinfo->gso_size) + if (segs == 1) return cake_calc_overhead(q, len, off); /* borrowed from qdisc_pkt_len_segs_init() */ @@ -1430,12 +1430,6 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) hdr_len += sizeof(struct udphdr); } - if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) - segs = DIV_ROUND_UP(skb->len - hdr_len, - shinfo->gso_size); - else - segs = shinfo->gso_segs; - len = shinfo->gso_size + hdr_len; last_len = skb->len - shinfo->gso_size * (segs - 1); @@ -1788,7 +1782,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(len > b->max_skblen)) b->max_skblen = len; - if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { + if (qdisc_pkt_segs(skb) > 1 && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); unsigned int slen = 0, numsegs = 0; -- 2.52.0.rc1.455.g30608eb744-goog It is possible to reorg Qdisc to avoid always dirtying 2 cache lines in fast path by reducing this to a single dirtied cache line. 
In current layout, we change only four/six fields in the first cache line: - q.spinlock - q.qlen - bstats.bytes - bstats.packets - some Qdisc also change q.next/q.prev In the second cache line we change in the fast path: - running - state - qstats.backlog /* --- cacheline 2 boundary (128 bytes) --- */ struct sk_buff_head gso_skb __attribute__((__aligned__(64))); /* 0x80 0x18 */ struct qdisc_skb_head q; /* 0x98 0x18 */ struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xb0 0x10 */ /* --- cacheline 3 boundary (192 bytes) --- */ struct gnet_stats_queue qstats; /* 0xc0 0x14 */ bool running; /* 0xd4 0x1 */ /* XXX 3 bytes hole, try to pack */ unsigned long state; /* 0xd8 0x8 */ struct Qdisc * next_sched; /* 0xe0 0x8 */ struct sk_buff_head skb_bad_txq; /* 0xe8 0x18 */ /* --- cacheline 4 boundary (256 bytes) --- */ Reorganize things to have a first cache line mostly read, then a mostly written one. This gives a ~3% increase of performance under tx stress. Note that there is an additional hole because @qstats now spans over a third cache line. /* --- cacheline 2 boundary (128 bytes) --- */ __u8 __cacheline_group_begin__Qdisc_read_mostly[0] __attribute__((__aligned__(64))); /* 0x80 0 */ struct sk_buff_head gso_skb; /* 0x80 0x18 */ struct Qdisc * next_sched; /* 0x98 0x8 */ struct sk_buff_head skb_bad_txq; /* 0xa0 0x18 */ __u8 __cacheline_group_end__Qdisc_read_mostly[0]; /* 0xb8 0 */ /* XXX 8 bytes hole, try to pack */ /* --- cacheline 3 boundary (192 bytes) --- */ __u8 __cacheline_group_begin__Qdisc_write[0] __attribute__((__aligned__(64))); /* 0xc0 0 */ struct qdisc_skb_head q; /* 0xc0 0x18 */ unsigned long state; /* 0xd8 0x8 */ struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xe0 0x10 */ bool running; /* 0xf0 0x1 */ /* XXX 3 bytes hole, try to pack */ struct gnet_stats_queue qstats; /* 0xf4 0x14 */ /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */ __u8 __cacheline_group_end__Qdisc_write[0]; /* 0x108 0 */ /* XXX 56 bytes hole, try to pack */ Signed-off-by: Eric Dumazet --- include/net/sch_generic.h | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cdf7a58ebcf5ef2b5f8b76eb6fbe92d5f0e07899..79501499dafba56271b9ebd97a8f379ffdc83cac 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -103,17 +103,24 @@ struct Qdisc { int pad; refcount_t refcnt; - /* - * For performance sake on SMP, we put highly modified fields at the end - */ - struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; - struct qdisc_skb_head q; - struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - bool running; /* must be written under qdisc spinlock */ - unsigned long state; - struct Qdisc *next_sched; - struct sk_buff_head skb_bad_txq; + /* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */ + __cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned; + struct sk_buff_head gso_skb; + struct Qdisc *next_sched; + struct sk_buff_head skb_bad_txq; + __cacheline_group_end(Qdisc_read_mostly); + + /* Fields dirtied in dequeue() fast path. */ + __cacheline_group_begin(Qdisc_write) ____cacheline_aligned; + struct qdisc_skb_head q; + unsigned long state; + struct gnet_stats_basic_sync bstats; + bool running; /* must be written under qdisc spinlock */ + + /* Note : we only change qstats.backlog in fast path. 
*/ + struct gnet_stats_queue qstats; + __cacheline_group_end(Qdisc_write); + atomic_long_t defer_count ____cacheline_aligned_in_smp; struct llist_head defer_list; -- 2.52.0.rc1.455.g30608eb744-goog Group together changes to qdisc fields to reduce chances of false sharing if another cpu attempts to acquire the qdisc spinlock. qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; qdisc_bstats_update(sch, skb); Signed-off-by: Eric Dumazet --- net/sched/sch_fq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index fee922da2f99c0c7ac6d86569cf3bbce47898951..0b0ca1aa9251f959e87dd5dc504fbe0f4cbc75eb 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -497,6 +497,7 @@ static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, skb_mark_not_on_list(skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; + qdisc_bstats_update(sch, skb); } static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) @@ -776,7 +777,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) f->time_next_packet = now + len; } out: - qdisc_bstats_update(sch, skb); return skb; } -- 2.52.0.rc1.455.g30608eb744-goog prefetch the skb that we are likely to dequeue at the next dequeue(). Also call fq_dequeue_skb() a bit sooner in fq_dequeue(). This reduces the window between read of q.qlen and changes of fields in the cache line that could be dirtied by another cpu trying to queue a packet. Signed-off-by: Eric Dumazet --- net/sched/sch_fq.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 0b0ca1aa9251f959e87dd5dc504fbe0f4cbc75eb..6e5f2f4f241546605f8ba37f96275446c8836eee 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -480,7 +480,10 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, struct sk_buff *skb) { if (skb == flow->head) { - flow->head = skb->next; + struct sk_buff *next = skb->next; + + prefetch(next); + flow->head = next; } else { rb_erase(&skb->rbnode, &flow->t_root); skb->dev = qdisc_dev(sch); @@ -712,6 +715,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) goto begin; } prefetch(&skb->end); + fq_dequeue_skb(sch, f, skb); if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { INET_ECN_set_ce(skb); q->stat_ce_mark++; @@ -719,7 +723,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) if (--f->qlen == 0) q->inactive_flows++; q->band_pkt_count[fq_skb_cb(skb)->band]--; - fq_dequeue_skb(sch, f, skb); } else { head->first = f->next; /* force a pass through old_flows to prevent starvation */ -- 2.52.0.rc1.455.g30608eb744-goog Most qdiscs need to read skb->priority at enqueue time(). In commit 100dfa74cad9 ("net: dev_queue_xmit() llist adoption") I added a prefetch(next), lets add another one for the second half of skb. Note that skb->priority and skb->hash share a common cache line, so this patch helps qdiscs needing both fields. 
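The pattern being extended can be shown with a self-contained sketch. The toy_skb layout and the 120-byte pad are made up purely to push the field into a later cache line; __builtin_prefetch() is the compiler primitive behind the kernel's prefetch().

#include <stdio.h>

struct toy_skb {
	struct toy_skb *next;
	char pad[120];		/* pushes priority into a later cache line */
	unsigned int priority;
};

static unsigned long sum_priorities(struct toy_skb *skb)
{
	unsigned long sum = 0;

	while (skb) {
		struct toy_skb *next = skb->next;

		if (next) {
			__builtin_prefetch(next);		/* first cache line (list linkage) */
			__builtin_prefetch(&next->priority);	/* line the next iteration will read */
		}
		sum += skb->priority;
		skb = next;
	}
	return sum;
}

int main(void)
{
	struct toy_skb b = { .next = NULL, .priority = 2 };
	struct toy_skb a = { .next = &b, .priority = 1 };

	printf("%lu\n", sum_priorities(&a));
	return 0;
}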
Signed-off-by: Eric Dumazet --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev.c b/net/core/dev.c index e19eb4e9d77c27535ab2a0ce14299281e3ef9397..53e2496dc4292284072946fd9131d3f9a0c0af44 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4246,6 +4246,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, llist_for_each_entry_safe(skb, next, ll_list, ll_node) { prefetch(next); + prefetch(&next->priority); skb_mark_not_on_list(skb); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); count++; -- 2.52.0.rc1.455.g30608eb744-goog q->limit is read locklessly, add a READ_ONCE(). Fixes: 100dfa74cad9 ("net: dev_queue_xmit() llist adoption") Signed-off-by: Eric Dumazet --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 53e2496dc4292284072946fd9131d3f9a0c0af44..10042139dbb054b9a93dfb019477a80263feb029 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4194,7 +4194,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, do { if (first_n && !defer_count) { defer_count = atomic_long_inc_return(&q->defer_count); - if (unlikely(defer_count > q->limit)) { + if (unlikely(defer_count > READ_ONCE(q->limit))) { kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); return NET_XMIT_DROP; } -- 2.52.0.rc1.455.g30608eb744-goog Using kfree_skb_list_reason() to free list of skbs from qdisc operations seems wrong as each skb might have a different drop reason. Cleanup __dev_xmit_skb() to call tcf_kfree_skb_list() once in preparation of the following patch. Signed-off-by: Eric Dumazet --- include/net/sch_generic.h | 11 +++++++++++ net/core/dev.c | 15 +++++---------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 79501499dafba56271b9ebd97a8f379ffdc83cac..b8092d0378a0cafa290123d17c1b0ba787cd0680 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1105,6 +1105,17 @@ static inline void tcf_set_drop_reason(const struct sk_buff *skb, tc_skb_cb(skb)->drop_reason = reason; } +static inline void tcf_kfree_skb_list(struct sk_buff *skb) +{ + while (unlikely(skb)) { + struct sk_buff *next = skb->next; + + prefetch(next); + kfree_skb_reason(skb, tcf_get_drop_reason(skb)); + skb = next; + } +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ diff --git a/net/core/dev.c b/net/core/dev.c index 10042139dbb054b9a93dfb019477a80263feb029..e865cdb9b6966225072dc44a86610b9c7828bd8c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4162,7 +4162,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, __qdisc_run(q); qdisc_run_end(q); - goto no_lock_out; + goto free_skbs; } qdisc_bstats_cpu_update(q, skb); @@ -4176,12 +4176,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = dev_qdisc_enqueue(skb, q, &to_free, txq); qdisc_run(q); - -no_lock_out: - if (unlikely(to_free)) - kfree_skb_list_reason(to_free, - tcf_get_drop_reason(to_free)); - return rc; + goto free_skbs; } /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit. 
@@ -4257,9 +4252,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, } unlock: spin_unlock(root_lock); - if (unlikely(to_free)) - kfree_skb_list_reason(to_free, - tcf_get_drop_reason(to_free)); + +free_skbs: + tcf_kfree_skb_list(to_free); return rc; } -- 2.52.0.rc1.455.g30608eb744-goog Some qdiscs, like cake, codel and fq_codel, might drop packets in their dequeue() method. This is currently problematic because dequeue() runs with the qdisc spinlock held. Freeing skbs can be extremely expensive. Add a qdisc_dequeue_drop() method and a new TCQ_F_DEQUEUE_DROPS flag so that these qdiscs can opt in to deferring the skb frees until after the qdisc spinlock is released. TCQ_F_DEQUEUE_DROPS is an attempt to not penalize other qdiscs with an extra cache line miss. Signed-off-by: Eric Dumazet --- include/net/pkt_sched.h | 5 +++-- include/net/sch_generic.h | 30 +++++++++++++++++++++++++++--- net/core/dev.c | 22 +++++++++++++--------- 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 4678db45832a1e3bf7b8a07756fb89ab868bd5d2..e703c507d0daa97ae7c3bf131e322b1eafcc5664 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -114,12 +114,13 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, void __qdisc_run(struct Qdisc *q); -static inline void qdisc_run(struct Qdisc *q) +static inline struct sk_buff *qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { __qdisc_run(q); - qdisc_run_end(q); + return qdisc_run_end(q); } + return NULL; } extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index b8092d0378a0cafa290123d17c1b0ba787cd0680..c3a7268b567e0abf3f38290cd4e3fa7cd0601e36 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -88,6 +88,8 @@ struct Qdisc { #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ #define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */ #define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ +#define TCQ_F_DEQUEUE_DROPS 0x400 /* ->dequeue() can drop packets in q->to_free */ + u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; @@ -119,6 +121,8 @@ struct Qdisc { /* Note : we only change qstats.backlog in fast path.
*/ struct gnet_stats_queue qstats; + + struct sk_buff *to_free; __cacheline_group_end(Qdisc_write); @@ -218,8 +222,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) return true; } -static inline void qdisc_run_end(struct Qdisc *qdisc) +static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc) { + struct sk_buff *to_free = NULL; + if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); @@ -232,9 +238,16 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) if (unlikely(test_bit(__QDISC_STATE_MISSED, &qdisc->state))) __netif_schedule(qdisc); - } else { - WRITE_ONCE(qdisc->running, false); + return NULL; + } + + if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) { + to_free = qdisc->to_free; + if (to_free) + qdisc->to_free = NULL; } + WRITE_ONCE(qdisc->running, false); + return to_free; } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) @@ -1116,6 +1129,17 @@ static inline void tcf_kfree_skb_list(struct sk_buff *skb) } } +static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb, + enum skb_drop_reason reason) +{ + DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS)); + DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK); + + tcf_set_drop_reason(skb, reason); + skb->next = q->to_free; + q->to_free = skb; +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ diff --git a/net/core/dev.c b/net/core/dev.c index e865cdb9b6966225072dc44a86610b9c7828bd8c..9094c0fb8c689cd5252274d839638839bfb7642e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4141,7 +4141,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { - struct sk_buff *next, *to_free = NULL; + struct sk_buff *next, *to_free = NULL, *to_free2 = NULL; spinlock_t *root_lock = qdisc_lock(q); struct llist_node *ll_list, *first_n; unsigned long defer_count = 0; @@ -4160,7 +4160,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, if (unlikely(!nolock_qdisc_is_empty(q))) { rc = dev_qdisc_enqueue(skb, q, &to_free, txq); __qdisc_run(q); - qdisc_run_end(q); + to_free2 = qdisc_run_end(q); goto free_skbs; } @@ -4170,12 +4170,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, !nolock_qdisc_is_empty(q)) __qdisc_run(q); - qdisc_run_end(q); - return NET_XMIT_SUCCESS; + to_free2 = qdisc_run_end(q); + rc = NET_XMIT_SUCCESS; + goto free_skbs; } rc = dev_qdisc_enqueue(skb, q, &to_free, txq); - qdisc_run(q); + to_free2 = qdisc_run(q); goto free_skbs; } @@ -4234,7 +4235,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_bstats_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) __qdisc_run(q); - qdisc_run_end(q); + to_free2 = qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { int count = 0; @@ -4246,7 +4247,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = dev_qdisc_enqueue(skb, q, &to_free, txq); count++; } - qdisc_run(q); + to_free2 = qdisc_run(q); if (count != 1) rc = NET_XMIT_SUCCESS; } @@ -4255,6 +4256,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, free_skbs: tcf_kfree_skb_list(to_free); + tcf_kfree_skb_list(to_free2); return rc; } @@ -5747,8 +5749,9 @@ static __latent_entropy void net_tx_action(void) rcu_read_lock(); while (head) { - struct Qdisc *q = head; spinlock_t *root_lock = NULL; + struct sk_buff *to_free; + struct Qdisc *q = head; head = head->next_sched; @@ -5775,9 +5778,10 @@ static 
__latent_entropy void net_tx_action(void) } clear_bit(__QDISC_STATE_SCHED, &q->state); - qdisc_run(q); + to_free = qdisc_run(q); if (root_lock) spin_unlock(root_lock); + tcf_kfree_skb_list(to_free); } rcu_read_unlock(); -- 2.52.0.rc1.455.g30608eb744-goog cake, codel and fq_codel can drop many packets from dequeue(). Use qdisc_dequeue_drop() so that the freeing can happen outside of the qdisc spinlock scope. Add TCQ_F_DEQUEUE_DROPS to sch->flags. Signed-off-by: Eric Dumazet --- net/sched/sch_cake.c | 4 +++- net/sched/sch_codel.c | 4 +++- net/sched/sch_fq_codel.c | 5 ++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 5948a149129c6de041ba949e2e2b5b6b4eb54166..0ea9440f68c60ab69e9dd889b225c1a171199787 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -2183,7 +2183,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); - kfree_skb_reason(skb, reason); + qdisc_dequeue_drop(sch, skb, reason); if (q->rate_flags & CAKE_FLAG_INGRESS) goto retry; } @@ -2724,6 +2724,8 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, int i, j, err; sch->limit = 10240; + sch->flags |= TCQ_F_DEQUEUE_DROPS; + q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; q->flow_mode = CAKE_FLOW_TRIPLE; diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index fa0314679e434a4f84a128e8330bb92743c3d66a..c6551578f1cf8d332ca20ea062e858ffb437966a 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -52,7 +52,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -182,6 +182,8 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt, else sch->flags &= ~TCQ_F_CAN_BYPASS; + sch->flags |= TCQ_F_DEQUEUE_DROPS; + return 0; } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index a141423929394d7ebe127aa328dcf13ae67b3d56..dc187c7f06b10d8fd4191ead82e6a60133fff09d 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -275,7 +275,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -519,6 +519,9 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, sch->flags |= TCQ_F_CAN_BYPASS; else sch->flags &= ~TCQ_F_CAN_BYPASS; + + sch->flags |= TCQ_F_DEQUEUE_DROPS; + return 0; alloc_failure: -- 2.52.0.rc1.455.g30608eb744-goog
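As a closing illustration, here is a minimal standalone sketch of the defer-then-free pattern the series converges on. A pthread mutex stands in for the qdisc spinlock and all toy_* names are invented; the point is only that the expensive freeing work moves outside the locked section.

#include <pthread.h>
#include <stdlib.h>

struct toy_pkt { struct toy_pkt *next; };

static pthread_mutex_t qdisc_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called with qdisc_lock held: just chain the packet, do not free it. */
static void toy_dequeue_drop(struct toy_pkt **to_free, struct toy_pkt *pkt)
{
	pkt->next = *to_free;
	*to_free = pkt;
}

static void toy_run(void)
{
	struct toy_pkt *to_free = NULL;

	pthread_mutex_lock(&qdisc_lock);
	/* ... a dequeue loop would call toy_dequeue_drop() for each drop ... */
	toy_dequeue_drop(&to_free, malloc(sizeof(struct toy_pkt)));
	pthread_mutex_unlock(&qdisc_lock);

	/* The expensive part happens after the lock is released. */
	while (to_free) {
		struct toy_pkt *next = to_free->next;

		free(to_free);
		to_free = next;
	}
}

int main(void) { toy_run(); return 0; }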