While stress testing UDP senders on a host with expensive indirect calls,
I found that CPUs processing TX completions were showing a very high cost
(20%) in sock_wfree() due to CONFIG_MITIGATION_RETPOLINE=y.

Take care of the TCP and UDP TX destructors and use the INDIRECT_CALL_3()
macro.

Signed-off-by: Eric Dumazet
---
 net/core/skbuff.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bc12790017b0b5c0be99f8fb9d362b3730fa4eb0..c9c06f9a8d6085f8d0907b412e050a60c835a6e8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1136,7 +1136,9 @@ void skb_release_head_state(struct sk_buff *skb)
 	skb_dst_drop(skb);
 	if (skb->destructor) {
 		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
-		skb->destructor(skb);
+		INDIRECT_CALL_3(skb->destructor,
+				tcp_wfree, __sock_wfree, sock_wfree,
+				skb);
 	}
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	nf_conntrack_put(skb_nfct(skb));
--
2.51.0.618.g983fd99d29-goog
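
For reference, INDIRECT_CALL_3() (include/linux/indirect_call_wrapper.h)
compares the function pointer against the listed candidates and turns the
common cases into direct calls, falling back to the indirect call otherwise;
without CONFIG_MITIGATION_RETPOLINE it collapses back to a plain indirect
call. The hunk above is therefore roughly equivalent to:

	if (likely(skb->destructor == tcp_wfree))
		tcp_wfree(skb);
	else if (likely(skb->destructor == __sock_wfree))
		__sock_wfree(skb);
	else if (likely(skb->destructor == sock_wfree))
		sock_wfree(skb);
	else
		skb->destructor(skb);	/* rare destructors still pay the retpoline */
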
We want to revert commit 0f022d32c3ec ("net/sched: Fix mirred deadlock on
device recursion") because it adds code in the fast path, even when
act_mirred is not used.

Use an additional array of device pointers in struct netdev_xmit and
implement loop detection in tcf_mirred_act().

Signed-off-by: Eric Dumazet
---
 include/linux/netdevice_xmit.h |  9 ++++-
 net/sched/act_mirred.c         | 62 +++++++++++++---------------------
 2 files changed, 31 insertions(+), 40 deletions(-)

diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h
index 813a19122ebbb2c6a04176330b1055b7c2b9c902..cc232508e695eefe95ea6e55a21978be11d5da83 100644
--- a/include/linux/netdevice_xmit.h
+++ b/include/linux/netdevice_xmit.h
@@ -2,6 +2,12 @@
 #ifndef _LINUX_NETDEVICE_XMIT_H
 #define _LINUX_NETDEVICE_XMIT_H
 
+#if IS_ENABLED(CONFIG_NET_ACT_MIRRED)
+#define MIRRED_NEST_LIMIT	4
+#endif
+
+struct net_device;
+
 struct netdev_xmit {
 	u16 recursion;
 	u8  more;
@@ -9,7 +15,8 @@ struct netdev_xmit {
 	u8 skip_txqueue;
 #endif
 #if IS_ENABLED(CONFIG_NET_ACT_MIRRED)
-	u8 sched_mirred_nest;
+	u8 sched_mirred_nest;
+	struct net_device *sched_mirred_dev[MIRRED_NEST_LIMIT];
 #endif
 #if IS_ENABLED(CONFIG_NF_DUP_NETDEV)
 	u8 nf_dup_skb_recursion;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 5f01f567c934d3669d9a3058cff861a8fe5f88b6..f27b583def78e4afecc7112854b93d59c2520201 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -29,31 +29,6 @@
 static LIST_HEAD(mirred_list);
 static DEFINE_SPINLOCK(mirred_list_lock);
 
-#define MIRRED_NEST_LIMIT	4
-
-#ifndef CONFIG_PREEMPT_RT
-static u8 tcf_mirred_nest_level_inc_return(void)
-{
-	return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest);
-}
-
-static void tcf_mirred_nest_level_dec(void)
-{
-	__this_cpu_dec(softnet_data.xmit.sched_mirred_nest);
-}
-
-#else
-static u8 tcf_mirred_nest_level_inc_return(void)
-{
-	return current->net_xmit.sched_mirred_nest++;
-}
-
-static void tcf_mirred_nest_level_dec(void)
-{
-	current->net_xmit.sched_mirred_nest--;
-}
-#endif
-
 static bool tcf_mirred_is_act_redirect(int action)
 {
 	return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
@@ -439,44 +414,53 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
 {
 	struct tcf_mirred *m = to_mirred(a);
 	int retval = READ_ONCE(m->tcf_action);
-	unsigned int nest_level;
+	struct netdev_xmit *xmit;
 	bool m_mac_header_xmit;
 	struct net_device *dev;
-	int m_eaction;
+	int i, m_eaction;
 	u32 blockid;
 
-	nest_level = tcf_mirred_nest_level_inc_return();
-	if (unlikely(nest_level > MIRRED_NEST_LIMIT)) {
+#ifdef CONFIG_PREEMPT_RT
+	xmit = &current->net_xmit;
+#else
+	xmit = this_cpu_ptr(&softnet_data.xmit);
+#endif
+	if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) {
 		net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
 				     netdev_name(skb->dev));
-		retval = TC_ACT_SHOT;
-		goto dec_nest_level;
+		return TC_ACT_SHOT;
 	}
 
 	tcf_lastuse_update(&m->tcf_tm);
 	tcf_action_update_bstats(&m->common, skb);
 
 	blockid = READ_ONCE(m->tcfm_blockid);
-	if (blockid) {
-		retval = tcf_blockcast(skb, m, blockid, res, retval);
-		goto dec_nest_level;
-	}
+	if (blockid)
+		return tcf_blockcast(skb, m, blockid, res, retval);
 
 	dev = rcu_dereference_bh(m->tcfm_dev);
 	if (unlikely(!dev)) {
 		pr_notice_once("tc mirred: target device is gone\n");
 		tcf_action_inc_overlimit_qstats(&m->common);
-		goto dec_nest_level;
+		return retval;
 	}
 
+	for (i = 0; i < xmit->sched_mirred_nest; i++) {
+		if (xmit->sched_mirred_dev[i] != dev)
+			continue;
+		pr_notice_once("tc mirred: loop on device %s\n",
+			       netdev_name(dev));
+		tcf_action_inc_overlimit_qstats(&m->common);
+		return retval;
+	}
+
+	xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev;
 	m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
 	m_eaction = READ_ONCE(m->tcfm_eaction);
 
 	retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction,
 				   retval);
-
-dec_nest_level:
-	tcf_mirred_nest_level_dec();
+	xmit->sched_mirred_nest--;
 
 	return retval;
 }
--
2.51.0.618.g983fd99d29-goog
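
The loop check added above is just a linear scan of at most MIRRED_NEST_LIMIT
(4) device pointers recorded along the current redirect path. As a
stand-alone sketch of the idea (mirred_dev_seen() is a hypothetical helper,
not part of the patch):

static bool mirred_dev_seen(const struct netdev_xmit *xmit,
			    const struct net_device *dev)
{
	int i;

	/* Return true if @dev was already visited on this per-cpu
	 * (or per-task on PREEMPT_RT) mirred nesting path.
	 */
	for (i = 0; i < xmit->sched_mirred_nest; i++)
		if (xmit->sched_mirred_dev[i] == dev)
			return true;
	return false;
}
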
This reverts commits 0f022d32c3eca477fbf79a205243a6123ed0fe11 and
44180feaccf266d9b0b28cc4ceaac019817deb5c.

The prior patch in this series implemented loop detection in act_mirred,
so we can remove q->owner to save some cycles in the fast path.

Signed-off-by: Eric Dumazet
---
 include/net/sch_generic.h | 1 -
 net/core/dev.c            | 6 ------
 net/sched/sch_generic.c   | 2 --
 3 files changed, 9 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 738cd5b13c62fb8501619625880b87991f3dc17c..32e9961570b467b6066f1bb2c00ff1a270e342bc 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -117,7 +117,6 @@ struct Qdisc {
 	struct qdisc_skb_head	q;
 	struct gnet_stats_basic_sync bstats;
 	struct gnet_stats_queue	qstats;
-	int			owner;
 	unsigned long		state;
 	unsigned long		state2; /* must be written under qdisc spinlock */
 	struct Qdisc            *next_sched;
diff --git a/net/core/dev.c b/net/core/dev.c
index a64cef2c537e98ee87776e6f8d3ca3d98f0711b3..0ff178399b2d28ca2754b3f06d69a97f5d6dcf71 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4167,10 +4167,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		return rc;
 	}
 
-	if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
-		kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
-		return NET_XMIT_DROP;
-	}
 	/*
 	 * Heuristic to force contended enqueues to serialize on a
 	 * separate lock before trying to get qdisc main lock.
@@ -4210,9 +4206,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
-		WRITE_ONCE(q->owner, smp_processor_id());
 		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
-		WRITE_ONCE(q->owner, -1);
 		if (qdisc_run_begin(q)) {
 			if (unlikely(contended)) {
 				spin_unlock(&q->busylock);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 1e008a228ebdf846d4ef7f83d655ac1142ec3596..dfa8e8e667d24a435b0c9cb3c1f05c8075f63e89 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -679,7 +679,6 @@ struct Qdisc noop_qdisc = {
 		.qlen = 0,
 		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
 	},
-	.owner = -1,
 };
 EXPORT_SYMBOL(noop_qdisc);
 
@@ -985,7 +984,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	sch->enqueue = ops->enqueue;
 	sch->dequeue = ops->dequeue;
 	sch->dev_queue = dev_queue;
-	sch->owner = -1;
 	netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL);
 	refcount_set(&sch->refcnt, 1);
--
2.51.0.618.g983fd99d29-goog

Replace the state2 field with a boolean. Move it to a hole between qstats
and state so that we shrink Qdisc by a full cache line.

Signed-off-by: Eric Dumazet
---
 include/net/sch_generic.h | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 32e9961570b467b6066f1bb2c00ff1a270e342bc..31561291bc92fd70d4d3ca8f5f7dbc4c94c895a0 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -41,13 +41,6 @@ enum qdisc_state_t {
 	__QDISC_STATE_DRAINING,
 };
 
-enum qdisc_state2_t {
-	/* Only for !TCQ_F_NOLOCK qdisc. Never access it directly.
-	 * Use qdisc_run_begin/end() or qdisc_is_running() instead.
-	 */
-	__QDISC_STATE2_RUNNING,
-};
-
 #define QDISC_STATE_MISSED	BIT(__QDISC_STATE_MISSED)
 #define QDISC_STATE_DRAINING	BIT(__QDISC_STATE_DRAINING)
 
@@ -117,8 +110,8 @@ struct Qdisc {
 	struct qdisc_skb_head	q;
 	struct gnet_stats_basic_sync bstats;
 	struct gnet_stats_queue	qstats;
+	bool			running; /* must be written under qdisc spinlock */
 	unsigned long		state;
-	unsigned long		state2; /* must be written under qdisc spinlock */
 	struct Qdisc            *next_sched;
 	struct sk_buff_head	skb_bad_txq;
 
@@ -167,7 +160,7 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc)
 {
 	if (qdisc->flags & TCQ_F_NOLOCK)
 		return spin_is_locked(&qdisc->seqlock);
-	return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+	return READ_ONCE(qdisc->running);
 }
 
 static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc)
@@ -210,7 +203,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 		 */
 		return spin_trylock(&qdisc->seqlock);
 	}
-	return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+	if (READ_ONCE(qdisc->running))
+		return false;
+	WRITE_ONCE(qdisc->running, true);
+	return true;
 }
 
 static inline void qdisc_run_end(struct Qdisc *qdisc)
@@ -228,7 +224,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
 			     &qdisc->state)))
 			__netif_schedule(qdisc);
 	} else {
-		__clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+		WRITE_ONCE(qdisc->running, false);
 	}
 }
--
2.51.0.618.g983fd99d29-goog
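
To illustrate the cache-line argument above with a toy layout (made-up field
sizes, not the real struct Qdisc; 64-byte cache lines and x86-64 alignment
assumed): a bool absorbed by an existing alignment hole costs no space,
whereas keeping a separate unsigned long can push the next
____cacheline_aligned member one full cache line further out.

	struct toy_before {
		char		head[48];	/* 0..47 */
		unsigned int	len;		/* 48..51, 4-byte hole follows */
		unsigned long	state;		/* 56..63 */
		unsigned long	state2;		/* 64..71: starts a 2nd cache line */
		unsigned long	hot __attribute__((__aligned__(64)));	/* offset 128 */
	};					/* sizeof() == 192 */

	struct toy_after {
		char		head[48];	/* 0..47 */
		unsigned int	len;		/* 48..51 */
		bool		running;	/* 52: tucked into the old hole */
		unsigned long	state;		/* 56..63 */
		unsigned long	hot __attribute__((__aligned__(64)));	/* offset 64 */
	};					/* sizeof() == 128, one cache line less */
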
Remove the busylock spinlock and use a lockless list (llist) to reduce
spinlock contention to a minimum.

The idea is that only one CPU might spin on the qdisc spinlock, while the
others simply add their skb to the llist.

After this patch, we get a 300 % improvement on heavy TX workloads:
- sending twice the number of packets per second,
- while consuming 50 % fewer cycles.

Tested:
- Dual Intel(R) Xeon(R) 6985P-C (480 hyper threads).
- 100Gbit NIC, 30 TX queues with FQ packet scheduler.
- echo 64 >/sys/kernel/slab/skbuff_small_head/cpu_partial (avoid contention in mm)
- 240 concurrent "netperf -t UDP_STREAM -- -m 120 -n"

Before: 16 Mpps (41 Mpps if each thread is pinned to a different cpu)

vmstat 2 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
  r  b swpd       free  buff   cache   si   so   bi   bo     in     cs us sy id wa st
243  0    0 2368988672 51036 1100852    0    0  146    1    242     60  0  9 91  0  0
244  0    0 2368988672 51036 1100852    0    0  536   10 487745  14718  0 52 48  0  0
244  0    0 2368988672 51036 1100852    0    0  512    0 503067  46033  0 52 48  0  0
244  0    0 2368988672 51036 1100852    0    0  512    0 494807  12107  0 52 48  0  0
244  0    0 2368988672 51036 1100852    0    0  702   26 492845  10110  0 52 48  0  0

Lock contention (1 second sample taken on 8 cores)

perf lock record -C0-7 sleep 1; perf lock contention
 contended   total wait     max wait     avg wait         type   caller
    442111       6.79 s    162.47 ms     15.35 us     spinlock   dev_hard_start_xmit+0xcd
      5961      9.57 ms      8.12 us      1.60 us     spinlock   __dev_queue_xmit+0x3a0
       244    560.63 us      7.63 us      2.30 us     spinlock   do_softirq+0x5b
        13     25.09 us      3.21 us      1.93 us     spinlock   net_tx_action+0xf8

If netperf threads are pinned, spinlock stress is very high.

perf lock record -C0-7 sleep 1; perf lock contention
 contended   total wait     max wait     avg wait         type   caller
    964508       7.10 s    147.25 ms      7.36 us     spinlock   dev_hard_start_xmit+0xcd
       201    268.05 us      4.65 us      1.33 us     spinlock   __dev_queue_xmit+0x3a0
        12     26.05 us      3.84 us      2.17 us     spinlock   do_softirq+0x5b

@__dev_queue_xmit_ns:
[256, 512)            21 |                                                    |
[512, 1K)            631 |                                                    |
[1K, 2K)           27328 |@                                                   |
[2K, 4K)          265392 |@@@@@@@@@@@@@@@@                                    |
[4K, 8K)          417543 |@@@@@@@@@@@@@@@@@@@@@@@@@@                          |
[8K, 16K)         826292 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[16K, 32K)        733822 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
[32K, 64K)         19055 |@                                                   |
[64K, 128K)        17240 |@                                                   |
[128K, 256K)       25633 |@                                                   |
[256K, 512K)           4 |                                                    |

After: 29 Mpps (57 Mpps if each thread is pinned to a different cpu)

vmstat 2 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
  r  b swpd       free  buff   cache   si   so   bi   bo     in     cs us sy id wa st
 78  0    0 2369573632 32896 1350988    0    0   22    0    331    254  0  8 92  0  0
 75  0    0 2369573632 32896 1350988    0    0   22   50 425713 280199  0 23 76  0  0
104  0    0 2369573632 32896 1350988    0    0  290    0 430238 298247  0 23 76  0  0
 86  0    0 2369573632 32896 1350988    0    0  132    0 428019 291865  0 24 76  0  0
 90  0    0 2369573632 32896 1350988    0    0  502    0 422498 278672  0 23 76  0  0

perf lock record -C0-7 sleep 1; perf lock contention
 contended   total wait     max wait     avg wait         type   caller
      2524    116.15 ms    486.61 us     46.02 us     spinlock   __dev_queue_xmit+0x55b
      5821    107.18 ms    371.67 us     18.41 us     spinlock   dev_hard_start_xmit+0xcd
      2377      9.73 ms     35.86 us      4.09 us     spinlock   ___slab_alloc+0x4e0
       923      5.74 ms     20.91 us      6.22 us     spinlock   ___slab_alloc+0x5c9
       121      3.42 ms    193.05 us     28.24 us     spinlock   net_tx_action+0xf8
         6    564.33 us    167.60 us     94.05 us     spinlock   do_softirq+0x5b

If netperf threads are pinned (~54 Mpps)

perf lock record -C0-7 sleep 1; perf lock contention
     32907    316.98 ms    195.98 us      9.63 us     spinlock   dev_hard_start_xmit+0xcd
      4507     61.83 ms    212.73 us     13.72 us     spinlock   __dev_queue_xmit+0x554
      2781     23.53 ms     40.03 us      8.46 us     spinlock   ___slab_alloc+0x5c9
      3554     18.94 ms     34.69 us      5.33 us     spinlock   ___slab_alloc+0x4e0
       233      9.09 ms    215.70 us     38.99 us     spinlock   do_softirq+0x5b
       153    930.66 us     48.67 us      6.08 us     spinlock   net_tx_action+0xfd
        84    331.10 us     14.22 us      3.94 us     spinlock   ___slab_alloc+0x5c9
       140    323.71 us      9.94 us      2.31 us     spinlock   ___slab_alloc+0x4e0

@__dev_queue_xmit_ns:
[128, 256)       1539830 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@                  |
[256, 512)       2299558 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[512, 1K)         483936 |@@@@@@@@@@                                          |
[1K, 2K)          265345 |@@@@@@                                              |
[2K, 4K)          145463 |@@@                                                 |
[4K, 8K)           54571 |@                                                   |
[8K, 16K)          10270 |                                                    |
[16K, 32K)          9385 |                                                    |
[32K, 64K)          7749 |                                                    |
[64K, 128K)        26799 |                                                    |
[128K, 256K)        2665 |                                                    |
[256K, 512K)         665 |                                                    |

Signed-off-by: Eric Dumazet
---
 include/net/sch_generic.h |  4 +-
 net/core/dev.c            | 85 ++++++++++++++++++++++-----------------
 net/sched/sch_generic.c   |  5 ---
 3 files changed, 52 insertions(+), 42 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 31561291bc92fd70d4d3ca8f5f7dbc4c94c895a0..94966692ccdf51db085c236319705aecba8c30cf 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -115,7 +115,9 @@ struct Qdisc {
 	struct Qdisc            *next_sched;
 	struct sk_buff_head	skb_bad_txq;
 
-	spinlock_t		busylock ____cacheline_aligned_in_smp;
+	atomic_long_t		defer_count ____cacheline_aligned_in_smp;
+	struct llist_head	defer_list;
+
 	spinlock_t		seqlock;
 
 	struct rcu_head		rcu;
diff --git a/net/core/dev.c b/net/core/dev.c
index 0ff178399b2d28ca2754b3f06d69a97f5d6dcf71..6094768bf3c028f0ad1e52b9b12b7258fa0ecff6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4125,9 +4125,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				 struct net_device *dev,
 				 struct netdev_queue *txq)
 {
+	struct sk_buff *next, *to_free = NULL;
 	spinlock_t *root_lock = qdisc_lock(q);
-	struct sk_buff *to_free = NULL;
-	bool contended;
+	struct llist_node *ll_list, *first_n;
+	unsigned long defer_count = 0;
 	int rc;
 
 	qdisc_calculate_pkt_len(skb, q);
@@ -4167,61 +4168,73 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		return rc;
 	}
 
-	/*
-	 * Heuristic to force contended enqueues to serialize on a
-	 * separate lock before trying to get qdisc main lock.
-	 * This permits qdisc->running owner to get the lock more
-	 * often and dequeue packets faster.
-	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
-	 * and then other tasks will only enqueue packets. The packets will be
-	 * sent after the qdisc owner is scheduled again. To prevent this
-	 * scenario the task always serialize on the lock.
+	/* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
+	 * In the try_cmpxchg() loop, we want to increment q->defer_count
+	 * at most once to limit the number of skbs in defer_list.
+	 * We perform the defer_count increment only if the list is not empty,
+	 * because some arches have slow atomic_long_inc_return().
+	 */
+	first_n = READ_ONCE(q->defer_list.first);
+	do {
+		if (first_n && !defer_count) {
+			defer_count = atomic_long_inc_return(&q->defer_count);
+			if (unlikely(defer_count > q->limit)) {
+				kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+				return NET_XMIT_DROP;
+			}
+		}
+		skb->ll_node.next = first_n;
+	} while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node));
+
+	/* If defer_list was not empty, we know the cpu which queued
+	 * the first skb will process the whole list for us.
 	 */
-	contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
-	if (unlikely(contended))
-		spin_lock(&q->busylock);
+	if (first_n)
+		return NET_XMIT_SUCCESS;
 
 	spin_lock(root_lock);
+
+	ll_list = llist_del_all(&q->defer_list);
+	/* There is a small race because we clear defer_count not atomically
+	 * with the prior llist_del_all(). This means defer_list could grow
+	 * over q->limit.
+	 */
+	atomic_long_set(&q->defer_count, 0);
+
+	ll_list = llist_reverse_order(ll_list);
+
 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
-		__qdisc_drop(skb, &to_free);
+		llist_for_each_entry_safe(skb, next, ll_list, ll_node)
+			__qdisc_drop(skb, &to_free);
 		rc = NET_XMIT_DROP;
-	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
-		   qdisc_run_begin(q)) {
+		goto unlock;
+	}
+	rc = NET_XMIT_SUCCESS;
+	if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+	    !llist_next(ll_list) && qdisc_run_begin(q)) {
 		/*
 		 * This is a work-conserving queue; there are no old skbs
 		 * waiting to be sent out; and the qdisc is not running -
 		 * xmit the skb directly.
 		 */
+		skb = llist_entry(ll_list, struct sk_buff, ll_node);
 		qdisc_bstats_update(q, skb);
-
-		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
-			if (unlikely(contended)) {
-				spin_unlock(&q->busylock);
-				contended = false;
-			}
+		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
 			__qdisc_run(q);
-		}
-
 		qdisc_run_end(q);
-		rc = NET_XMIT_SUCCESS;
 	} else {
-		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
-		if (qdisc_run_begin(q)) {
-			if (unlikely(contended)) {
-				spin_unlock(&q->busylock);
-				contended = false;
-			}
-			__qdisc_run(q);
-			qdisc_run_end(q);
+		llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+			prefetch(next);
+			dev_qdisc_enqueue(skb, q, &to_free, txq);
 		}
+		qdisc_run(q);
 	}
+unlock:
 	spin_unlock(root_lock);
 	if (unlikely(to_free))
 		kfree_skb_list_reason(to_free,
 				      tcf_get_drop_reason(to_free));
-	if (unlikely(contended))
-		spin_unlock(&q->busylock);
 	return rc;
 }
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index dfa8e8e667d24a435b0c9cb3c1f05c8075f63e89..d9a98d02a55fc361a223f3201e37b6a2b698bb5e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -666,7 +666,6 @@ struct Qdisc noop_qdisc = {
 	.ops		=	&noop_qdisc_ops,
 	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
 	.dev_queue	=	&noop_netdev_queue,
-	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
 	.gso_skb = {
 		.next = (struct sk_buff *)&noop_qdisc.gso_skb,
 		.prev = (struct sk_buff *)&noop_qdisc.gso_skb,
@@ -970,10 +969,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 		}
 	}
 
-	spin_lock_init(&sch->busylock);
-	lockdep_set_class(&sch->busylock,
-			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
-
 	/* seqlock has the same scope of busylock, for NOLOCK qdisc */
 	spin_lock_init(&sch->seqlock);
 	lockdep_set_class(&sch->seqlock,
--
2.51.0.618.g983fd99d29-goog
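
The core of the new enqueue path is a single-flusher hand-off. Stripped of
the q->limit accounting (which is the reason the real code open-codes
llist_add() with try_cmpxchg()), it can be modeled as follows;
defer_xmit_model() is a made-up name for illustration, not code from the
patch:

static int defer_xmit_model(struct sk_buff *skb, struct Qdisc *q,
			    struct sk_buff **to_free,
			    struct netdev_queue *txq)
{
	struct llist_node *ll_list;
	struct sk_buff *next;

	/* llist_add() returns true only if the list was empty before,
	 * i.e. for the first producer; every other cpu is done here.
	 */
	if (!llist_add(&skb->ll_node, &q->defer_list))
		return NET_XMIT_SUCCESS;

	/* The first producer becomes the single flusher: it takes the
	 * qdisc spinlock once and drains whatever accumulated, in
	 * arrival order.
	 */
	spin_lock(qdisc_lock(q));
	ll_list = llist_reverse_order(llist_del_all(&q->defer_list));
	llist_for_each_entry_safe(skb, next, ll_list, ll_node)
		dev_qdisc_enqueue(skb, q, to_free, txq);
	qdisc_run(q);
	spin_unlock(qdisc_lock(q));
	return NET_XMIT_SUCCESS;
}

This is why at most one cpu per qdisc contends on the root lock at any
time; all other senders only pay one try_cmpxchg() on defer_list.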