kernel: add patch that adds support for running threaded NAPI poll functions
This helps on workloads with CPU-intensive poll functions (e.g. 802.11) on multicore systems. Signed-off-by: Felix Fietkau <nbd@nbd.name>
This commit is contained in:
		@@ -15,7 +15,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 | 
			
		||||
--- a/include/linux/netdevice.h
 | 
			
		||||
+++ b/include/linux/netdevice.h
 | 
			
		||||
@@ -1412,6 +1412,7 @@ enum netdev_priv_flags {
 | 
			
		||||
@@ -1415,6 +1415,7 @@ enum netdev_priv_flags {
 | 
			
		||||
 	IFF_PHONY_HEADROOM		= 1<<26,
 | 
			
		||||
 	IFF_MACSEC			= 1<<27,
 | 
			
		||||
 	IFF_L3MDEV_RX_HANDLER		= 1<<28,
 | 
			
		||||
@@ -23,7 +23,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
 | 
			
		||||
@@ -1442,6 +1443,7 @@ enum netdev_priv_flags {
 | 
			
		||||
@@ -1445,6 +1446,7 @@ enum netdev_priv_flags {
 | 
			
		||||
 #define IFF_RXFH_CONFIGURED		IFF_RXFH_CONFIGURED
 | 
			
		||||
 #define IFF_MACSEC			IFF_MACSEC
 | 
			
		||||
 #define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER
 | 
			
		||||
@@ -31,7 +31,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 
 | 
			
		||||
 /**
 | 
			
		||||
  *	struct net_device - The DEVICE structure.
 | 
			
		||||
@@ -1728,6 +1730,11 @@ struct net_device {
 | 
			
		||||
@@ -1731,6 +1733,11 @@ struct net_device {
 | 
			
		||||
 	const struct xfrmdev_ops *xfrmdev_ops;
 | 
			
		||||
 #endif
 | 
			
		||||
 
 | 
			
		||||
@@ -43,7 +43,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 	const struct header_ops *header_ops;
 | 
			
		||||
 
 | 
			
		||||
 	unsigned int		flags;
 | 
			
		||||
@@ -1802,6 +1809,10 @@ struct net_device {
 | 
			
		||||
@@ -1805,6 +1812,10 @@ struct net_device {
 | 
			
		||||
 	struct mpls_dev __rcu	*mpls_ptr;
 | 
			
		||||
 #endif
 | 
			
		||||
 
 | 
			
		||||
@@ -101,7 +101,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 	help
 | 
			
		||||
--- a/net/core/dev.c
 | 
			
		||||
+++ b/net/core/dev.c
 | 
			
		||||
@@ -3000,10 +3000,20 @@ static int xmit_one(struct sk_buff *skb,
 | 
			
		||||
@@ -3001,10 +3001,20 @@ static int xmit_one(struct sk_buff *skb,
 | 
			
		||||
 	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
 | 
			
		||||
 		dev_queue_xmit_nit(skb, dev);
 | 
			
		||||
 
 | 
			
		||||
 
 | 
			
		||||
@@ -15,7 +15,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 | 
			
		||||
--- a/include/linux/netdevice.h
 | 
			
		||||
+++ b/include/linux/netdevice.h
 | 
			
		||||
@@ -1514,6 +1514,7 @@ enum netdev_priv_flags {
 | 
			
		||||
@@ -1517,6 +1517,7 @@ enum netdev_priv_flags {
 | 
			
		||||
 	IFF_FAILOVER_SLAVE		= 1<<28,
 | 
			
		||||
 	IFF_L3MDEV_RX_HANDLER		= 1<<29,
 | 
			
		||||
 	IFF_LIVE_RENAME_OK		= 1<<30,
 | 
			
		||||
@@ -23,7 +23,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
 | 
			
		||||
@@ -1546,6 +1547,7 @@ enum netdev_priv_flags {
 | 
			
		||||
@@ -1549,6 +1550,7 @@ enum netdev_priv_flags {
 | 
			
		||||
 #define IFF_FAILOVER_SLAVE		IFF_FAILOVER_SLAVE
 | 
			
		||||
 #define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER
 | 
			
		||||
 #define IFF_LIVE_RENAME_OK		IFF_LIVE_RENAME_OK
 | 
			
		||||
@@ -31,7 +31,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 
 | 
			
		||||
 /**
 | 
			
		||||
  *	struct net_device - The DEVICE structure.
 | 
			
		||||
@@ -1846,6 +1848,11 @@ struct net_device {
 | 
			
		||||
@@ -1849,6 +1851,11 @@ struct net_device {
 | 
			
		||||
 	const struct tlsdev_ops *tlsdev_ops;
 | 
			
		||||
 #endif
 | 
			
		||||
 
 | 
			
		||||
@@ -43,7 +43,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 	const struct header_ops *header_ops;
 | 
			
		||||
 
 | 
			
		||||
 	unsigned int		flags;
 | 
			
		||||
@@ -1928,6 +1935,10 @@ struct net_device {
 | 
			
		||||
@@ -1931,6 +1938,10 @@ struct net_device {
 | 
			
		||||
 	struct mpls_dev __rcu	*mpls_ptr;
 | 
			
		||||
 #endif
 | 
			
		||||
 
 | 
			
		||||
@@ -101,7 +101,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 	help
 | 
			
		||||
--- a/net/core/dev.c
 | 
			
		||||
+++ b/net/core/dev.c
 | 
			
		||||
@@ -3251,10 +3251,20 @@ static int xmit_one(struct sk_buff *skb,
 | 
			
		||||
@@ -3252,10 +3252,20 @@ static int xmit_one(struct sk_buff *skb,
 | 
			
		||||
 	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
 | 
			
		||||
 		dev_queue_xmit_nit(skb, dev);
 | 
			
		||||
 
 | 
			
		||||
 
 | 
			
		||||
@@ -15,7 +15,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 | 
			
		||||
--- a/include/linux/netdevice.h
 | 
			
		||||
+++ b/include/linux/netdevice.h
 | 
			
		||||
@@ -1546,6 +1546,7 @@ enum netdev_priv_flags {
 | 
			
		||||
@@ -1549,6 +1549,7 @@ enum netdev_priv_flags {
 | 
			
		||||
 	IFF_FAILOVER_SLAVE		= 1<<28,
 | 
			
		||||
 	IFF_L3MDEV_RX_HANDLER		= 1<<29,
 | 
			
		||||
 	IFF_LIVE_RENAME_OK		= 1<<30,
 | 
			
		||||
@@ -23,7 +23,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
 | 
			
		||||
@@ -1578,6 +1579,7 @@ enum netdev_priv_flags {
 | 
			
		||||
@@ -1581,6 +1582,7 @@ enum netdev_priv_flags {
 | 
			
		||||
 #define IFF_FAILOVER_SLAVE		IFF_FAILOVER_SLAVE
 | 
			
		||||
 #define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER
 | 
			
		||||
 #define IFF_LIVE_RENAME_OK		IFF_LIVE_RENAME_OK
 | 
			
		||||
@@ -31,7 +31,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 
 | 
			
		||||
 /**
 | 
			
		||||
  *	struct net_device - The DEVICE structure.
 | 
			
		||||
@@ -1879,6 +1881,11 @@ struct net_device {
 | 
			
		||||
@@ -1882,6 +1884,11 @@ struct net_device {
 | 
			
		||||
 	const struct tlsdev_ops *tlsdev_ops;
 | 
			
		||||
 #endif
 | 
			
		||||
 
 | 
			
		||||
@@ -43,7 +43,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 	const struct header_ops *header_ops;
 | 
			
		||||
 
 | 
			
		||||
 	unsigned int		flags;
 | 
			
		||||
@@ -1961,6 +1968,10 @@ struct net_device {
 | 
			
		||||
@@ -1964,6 +1971,10 @@ struct net_device {
 | 
			
		||||
 	struct mpls_dev __rcu	*mpls_ptr;
 | 
			
		||||
 #endif
 | 
			
		||||
 
 | 
			
		||||
@@ -101,7 +101,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 	help
 | 
			
		||||
--- a/net/core/dev.c
 | 
			
		||||
+++ b/net/core/dev.c
 | 
			
		||||
@@ -3191,10 +3191,20 @@ static int xmit_one(struct sk_buff *skb,
 | 
			
		||||
@@ -3192,10 +3192,20 @@ static int xmit_one(struct sk_buff *skb,
 | 
			
		||||
 	if (dev_nit_active(dev))
 | 
			
		||||
 		dev_queue_xmit_nit(skb, dev);
 | 
			
		||||
 
 | 
			
		||||
 
 | 
			
		||||
@@ -0,0 +1,339 @@
 | 
			
		||||
From: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
Date: Sun, 26 Jul 2020 14:03:21 +0200
 | 
			
		||||
Subject: [PATCH] net: add support for threaded NAPI polling
 | 
			
		||||
 | 
			
		||||
For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
 | 
			
		||||
poll function does not perform well. Since NAPI poll is bound to the CPU it
 | 
			
		||||
was scheduled from, we can easily end up with a few very busy CPUs spending
 | 
			
		||||
most of their time in softirq/ksoftirqd and some idle ones.
 | 
			
		||||
 | 
			
		||||
Introduce threaded NAPI for such drivers based on a workqueue. The API is the
 | 
			
		||||
same except for using netif_threaded_napi_add instead of netif_napi_add.
 | 
			
		||||
 | 
			
		||||
In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
 | 
			
		||||
improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
 | 
			
		||||
NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
 | 
			
		||||
thread.
 | 
			
		||||
 | 
			
		||||
With threaded NAPI it seems stable and consistent (and higher than the best
 | 
			
		||||
results I got without it).
 | 
			
		||||
 | 
			
		||||
Based on a patch by Hillf Danton
 | 
			
		||||
 | 
			
		||||
Cc: Hillf Danton <hdanton@sina.com>
 | 
			
		||||
Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
---
 | 
			
		||||
 | 
			
		||||
--- a/include/linux/netdevice.h
 | 
			
		||||
+++ b/include/linux/netdevice.h
 | 
			
		||||
@@ -326,6 +326,7 @@ struct napi_struct {
 | 
			
		||||
 	struct list_head	dev_list;
 | 
			
		||||
 	struct hlist_node	napi_hash_node;
 | 
			
		||||
 	unsigned int		napi_id;
 | 
			
		||||
+	struct work_struct	work;
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
 enum {
 | 
			
		||||
@@ -336,6 +337,7 @@ enum {
 | 
			
		||||
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
 | 
			
		||||
 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
 | 
			
		||||
 	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
 | 
			
		||||
+	NAPI_STATE_THREADED,	/* Use threaded NAPI */
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
 enum {
 | 
			
		||||
@@ -346,6 +348,7 @@ enum {
 | 
			
		||||
 	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
 | 
			
		||||
 	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
 | 
			
		||||
 	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
 | 
			
		||||
+	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED),
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
 enum gro_result {
 | 
			
		||||
@@ -2093,6 +2096,26 @@ void netif_napi_add(struct net_device *d
 | 
			
		||||
 		    int (*poll)(struct napi_struct *, int), int weight);
 | 
			
		||||
 
 | 
			
		||||
 /**
 | 
			
		||||
+ *	netif_threaded_napi_add - initialize a NAPI context
 | 
			
		||||
+ *	@dev:  network device
 | 
			
		||||
+ *	@napi: NAPI context
 | 
			
		||||
+ *	@poll: polling function
 | 
			
		||||
+ *	@weight: default weight
 | 
			
		||||
+ *
 | 
			
		||||
+ * This variant of netif_napi_add() should be used from drivers using NAPI
 | 
			
		||||
+ * with CPU intensive poll functions.
 | 
			
		||||
+ * This will schedule polling from a high priority workqueue
 | 
			
		||||
+ */
 | 
			
		||||
+static inline void netif_threaded_napi_add(struct net_device *dev,
 | 
			
		||||
+					   struct napi_struct *napi,
 | 
			
		||||
+					   int (*poll)(struct napi_struct *, int),
 | 
			
		||||
+					   int weight)
 | 
			
		||||
+{
 | 
			
		||||
+	set_bit(NAPI_STATE_THREADED, &napi->state);
 | 
			
		||||
+	netif_napi_add(dev, napi, poll, weight);
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+/**
 | 
			
		||||
  *	netif_tx_napi_add - initialize a NAPI context
 | 
			
		||||
  *	@dev:  network device
 | 
			
		||||
  *	@napi: NAPI context
 | 
			
		||||
--- a/net/core/dev.c
 | 
			
		||||
+++ b/net/core/dev.c
 | 
			
		||||
@@ -160,6 +160,7 @@ static DEFINE_SPINLOCK(offload_lock);
 | 
			
		||||
 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 | 
			
		||||
 struct list_head ptype_all __read_mostly;	/* Taps */
 | 
			
		||||
 static struct list_head offload_base __read_mostly;
 | 
			
		||||
+static struct workqueue_struct *napi_workq __read_mostly;
 | 
			
		||||
 
 | 
			
		||||
 static int netif_rx_internal(struct sk_buff *skb);
 | 
			
		||||
 static int call_netdevice_notifiers_info(unsigned long val,
 | 
			
		||||
@@ -5237,6 +5238,11 @@ void __napi_schedule(struct napi_struct
 | 
			
		||||
 {
 | 
			
		||||
 	unsigned long flags;
 | 
			
		||||
 
 | 
			
		||||
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
 | 
			
		||||
+		queue_work(napi_workq, &n->work);
 | 
			
		||||
+		return;
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
 	local_irq_save(flags);
 | 
			
		||||
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 | 
			
		||||
 	local_irq_restore(flags);
 | 
			
		||||
@@ -5284,6 +5290,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
 | 
			
		||||
  */
 | 
			
		||||
 void __napi_schedule_irqoff(struct napi_struct *n)
 | 
			
		||||
 {
 | 
			
		||||
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
 | 
			
		||||
+		queue_work(napi_workq, &n->work);
 | 
			
		||||
+		return;
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 | 
			
		||||
 }
 | 
			
		||||
 EXPORT_SYMBOL(__napi_schedule_irqoff);
 | 
			
		||||
@@ -5521,6 +5532,82 @@ static enum hrtimer_restart napi_watchdo
 | 
			
		||||
 	return HRTIMER_NORESTART;
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
+static int __napi_poll(struct napi_struct *n, bool *repoll)
 | 
			
		||||
+{
 | 
			
		||||
+	int work, weight;
 | 
			
		||||
+
 | 
			
		||||
+	weight = n->weight;
 | 
			
		||||
+
 | 
			
		||||
+	/* This NAPI_STATE_SCHED test is for avoiding a race
 | 
			
		||||
+	 * with netpoll's poll_napi().  Only the entity which
 | 
			
		||||
+	 * obtains the lock and sees NAPI_STATE_SCHED set will
 | 
			
		||||
+	 * actually make the ->poll() call.  Therefore we avoid
 | 
			
		||||
+	 * accidentally calling ->poll() when NAPI is not scheduled.
 | 
			
		||||
+	 */
 | 
			
		||||
+	work = 0;
 | 
			
		||||
+	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 | 
			
		||||
+		work = n->poll(n, weight);
 | 
			
		||||
+		trace_napi_poll(n, work, weight);
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	WARN_ON_ONCE(work > weight);
 | 
			
		||||
+
 | 
			
		||||
+	if (likely(work < weight))
 | 
			
		||||
+		return work;
 | 
			
		||||
+
 | 
			
		||||
+	/* Drivers must not modify the NAPI state if they
 | 
			
		||||
+	 * consume the entire weight.  In such cases this code
 | 
			
		||||
+	 * still "owns" the NAPI instance and therefore can
 | 
			
		||||
+	 * move the instance around on the list at-will.
 | 
			
		||||
+	 */
 | 
			
		||||
+	if (unlikely(napi_disable_pending(n))) {
 | 
			
		||||
+		napi_complete(n);
 | 
			
		||||
+		return work;
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	if (n->gro_list) {
 | 
			
		||||
+		/* flush too old packets
 | 
			
		||||
+		 * If HZ < 1000, flush all packets.
 | 
			
		||||
+		 */
 | 
			
		||||
+		napi_gro_flush(n, HZ >= 1000);
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	*repoll = true;
 | 
			
		||||
+
 | 
			
		||||
+	return work;
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+static void napi_workfn(struct work_struct *work)
 | 
			
		||||
+{
 | 
			
		||||
+	struct napi_struct *n = container_of(work, struct napi_struct, work);
 | 
			
		||||
+	void *have;
 | 
			
		||||
+
 | 
			
		||||
+	for (;;) {
 | 
			
		||||
+		bool repoll = false;
 | 
			
		||||
+
 | 
			
		||||
+		local_bh_disable();
 | 
			
		||||
+
 | 
			
		||||
+		have = netpoll_poll_lock(n);
 | 
			
		||||
+		__napi_poll(n, &repoll);
 | 
			
		||||
+		netpoll_poll_unlock(have);
 | 
			
		||||
+
 | 
			
		||||
+		local_bh_enable();
 | 
			
		||||
+
 | 
			
		||||
+		if (!repoll)
 | 
			
		||||
+			return;
 | 
			
		||||
+
 | 
			
		||||
+		if (!need_resched())
 | 
			
		||||
+			continue;
 | 
			
		||||
+
 | 
			
		||||
+		/*
 | 
			
		||||
+		 * have to pay for the latency of task switch even if
 | 
			
		||||
+		 * napi is scheduled
 | 
			
		||||
+		 */
 | 
			
		||||
+		queue_work(napi_workq, work);
 | 
			
		||||
+		return;
 | 
			
		||||
+	}
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 | 
			
		||||
 		    int (*poll)(struct napi_struct *, int), int weight)
 | 
			
		||||
 {
 | 
			
		||||
@@ -5540,6 +5627,7 @@ void netif_napi_add(struct net_device *d
 | 
			
		||||
 #ifdef CONFIG_NETPOLL
 | 
			
		||||
 	napi->poll_owner = -1;
 | 
			
		||||
 #endif
 | 
			
		||||
+	INIT_WORK(&napi->work, napi_workfn);
 | 
			
		||||
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 | 
			
		||||
 	napi_hash_add(napi);
 | 
			
		||||
 }
 | 
			
		||||
@@ -5565,6 +5653,7 @@ EXPORT_SYMBOL(napi_disable);
 | 
			
		||||
 void netif_napi_del(struct napi_struct *napi)
 | 
			
		||||
 {
 | 
			
		||||
 	might_sleep();
 | 
			
		||||
+	cancel_work_sync(&napi->work);
 | 
			
		||||
 	if (napi_hash_del(napi))
 | 
			
		||||
 		synchronize_net();
 | 
			
		||||
 	list_del_init(&napi->dev_list);
 | 
			
		||||
@@ -5578,48 +5667,18 @@ EXPORT_SYMBOL(netif_napi_del);
 | 
			
		||||
 
 | 
			
		||||
 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 | 
			
		||||
 {
 | 
			
		||||
+	bool do_repoll = false;
 | 
			
		||||
 	void *have;
 | 
			
		||||
-	int work, weight;
 | 
			
		||||
+	int work;
 | 
			
		||||
 
 | 
			
		||||
 	list_del_init(&n->poll_list);
 | 
			
		||||
 
 | 
			
		||||
 	have = netpoll_poll_lock(n);
 | 
			
		||||
 
 | 
			
		||||
-	weight = n->weight;
 | 
			
		||||
-
 | 
			
		||||
-	/* This NAPI_STATE_SCHED test is for avoiding a race
 | 
			
		||||
-	 * with netpoll's poll_napi().  Only the entity which
 | 
			
		||||
-	 * obtains the lock and sees NAPI_STATE_SCHED set will
 | 
			
		||||
-	 * actually make the ->poll() call.  Therefore we avoid
 | 
			
		||||
-	 * accidentally calling ->poll() when NAPI is not scheduled.
 | 
			
		||||
-	 */
 | 
			
		||||
-	work = 0;
 | 
			
		||||
-	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 | 
			
		||||
-		work = n->poll(n, weight);
 | 
			
		||||
-		trace_napi_poll(n, work, weight);
 | 
			
		||||
-	}
 | 
			
		||||
-
 | 
			
		||||
-	WARN_ON_ONCE(work > weight);
 | 
			
		||||
-
 | 
			
		||||
-	if (likely(work < weight))
 | 
			
		||||
-		goto out_unlock;
 | 
			
		||||
+	work = __napi_poll(n, &do_repoll);
 | 
			
		||||
 
 | 
			
		||||
-	/* Drivers must not modify the NAPI state if they
 | 
			
		||||
-	 * consume the entire weight.  In such cases this code
 | 
			
		||||
-	 * still "owns" the NAPI instance and therefore can
 | 
			
		||||
-	 * move the instance around on the list at-will.
 | 
			
		||||
-	 */
 | 
			
		||||
-	if (unlikely(napi_disable_pending(n))) {
 | 
			
		||||
-		napi_complete(n);
 | 
			
		||||
+	if (!do_repoll)
 | 
			
		||||
 		goto out_unlock;
 | 
			
		||||
-	}
 | 
			
		||||
-
 | 
			
		||||
-	if (n->gro_list) {
 | 
			
		||||
-		/* flush too old packets
 | 
			
		||||
-		 * If HZ < 1000, flush all packets.
 | 
			
		||||
-		 */
 | 
			
		||||
-		napi_gro_flush(n, HZ >= 1000);
 | 
			
		||||
-	}
 | 
			
		||||
 
 | 
			
		||||
 	/* Some drivers may have called napi_schedule
 | 
			
		||||
 	 * prior to exhausting their budget.
 | 
			
		||||
@@ -8855,6 +8914,10 @@ static int __init net_dev_init(void)
 | 
			
		||||
 		sd->backlog.weight = weight_p;
 | 
			
		||||
 	}
 | 
			
		||||
 
 | 
			
		||||
+	napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI,
 | 
			
		||||
+				     WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS);
 | 
			
		||||
+	BUG_ON(!napi_workq);
 | 
			
		||||
+
 | 
			
		||||
 	dev_boot_phase = 0;
 | 
			
		||||
 
 | 
			
		||||
 	/* The loopback device is special if any other network devices
 | 
			
		||||
--- a/net/core/net-sysfs.c
 | 
			
		||||
+++ b/net/core/net-sysfs.c
 | 
			
		||||
@@ -441,6 +441,52 @@ static ssize_t proto_down_store(struct d
 | 
			
		||||
 }
 | 
			
		||||
 NETDEVICE_SHOW_RW(proto_down, fmt_dec);
 | 
			
		||||
 
 | 
			
		||||
+static int change_napi_threaded(struct net_device *dev, unsigned long val)
 | 
			
		||||
+{
 | 
			
		||||
+	struct napi_struct *napi;
 | 
			
		||||
+
 | 
			
		||||
+	if (list_empty(&dev->napi_list))
 | 
			
		||||
+		return -EOPNOTSUPP;
 | 
			
		||||
+
 | 
			
		||||
+	list_for_each_entry(napi, &dev->napi_list, dev_list) {
 | 
			
		||||
+		if (val)
 | 
			
		||||
+			set_bit(NAPI_STATE_THREADED, &napi->state);
 | 
			
		||||
+		else
 | 
			
		||||
+			clear_bit(NAPI_STATE_THREADED, &napi->state);
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	return 0;
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+static ssize_t napi_threaded_store(struct device *dev,
 | 
			
		||||
+				struct device_attribute *attr,
 | 
			
		||||
+				const char *buf, size_t len)
 | 
			
		||||
+{
 | 
			
		||||
+	return netdev_store(dev, attr, buf, len, change_napi_threaded);
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+static ssize_t napi_threaded_show(struct device *dev,
 | 
			
		||||
+				  struct device_attribute *attr,
 | 
			
		||||
+				  char *buf)
 | 
			
		||||
+{
 | 
			
		||||
+	struct net_device *netdev = to_net_dev(dev);
 | 
			
		||||
+	struct napi_struct *napi;
 | 
			
		||||
+	bool enabled = false;
 | 
			
		||||
+
 | 
			
		||||
+	if (!rtnl_trylock())
 | 
			
		||||
+		return restart_syscall();
 | 
			
		||||
+
 | 
			
		||||
+	list_for_each_entry(napi, &netdev->napi_list, dev_list) {
 | 
			
		||||
+		if (test_bit(NAPI_STATE_THREADED, &napi->state))
 | 
			
		||||
+			enabled = true;
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	rtnl_unlock();
 | 
			
		||||
+
 | 
			
		||||
+	return sprintf(buf, fmt_dec, enabled);
 | 
			
		||||
+}
 | 
			
		||||
+static DEVICE_ATTR_RW(napi_threaded);
 | 
			
		||||
+
 | 
			
		||||
 static ssize_t phys_port_id_show(struct device *dev,
 | 
			
		||||
 				 struct device_attribute *attr, char *buf)
 | 
			
		||||
 {
 | 
			
		||||
@@ -536,6 +582,7 @@ static struct attribute *net_class_attrs
 | 
			
		||||
 	&dev_attr_flags.attr,
 | 
			
		||||
 	&dev_attr_tx_queue_len.attr,
 | 
			
		||||
 	&dev_attr_gro_flush_timeout.attr,
 | 
			
		||||
+	&dev_attr_napi_threaded.attr,
 | 
			
		||||
 	&dev_attr_phys_port_id.attr,
 | 
			
		||||
 	&dev_attr_phys_port_name.attr,
 | 
			
		||||
 	&dev_attr_phys_switch_id.attr,
 | 
			
		||||
@@ -0,0 +1,339 @@
 | 
			
		||||
From: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
Date: Sun, 26 Jul 2020 14:03:21 +0200
 | 
			
		||||
Subject: [PATCH] net: add support for threaded NAPI polling
 | 
			
		||||
 | 
			
		||||
For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
 | 
			
		||||
poll function does not perform well. Since NAPI poll is bound to the CPU it
 | 
			
		||||
was scheduled from, we can easily end up with a few very busy CPUs spending
 | 
			
		||||
most of their time in softirq/ksoftirqd and some idle ones.
 | 
			
		||||
 | 
			
		||||
Introduce threaded NAPI for such drivers based on a workqueue. The API is the
 | 
			
		||||
same except for using netif_threaded_napi_add instead of netif_napi_add.
 | 
			
		||||
 | 
			
		||||
In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
 | 
			
		||||
improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
 | 
			
		||||
NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
 | 
			
		||||
thread.
 | 
			
		||||
 | 
			
		||||
With threaded NAPI it seems stable and consistent (and higher than the best
 | 
			
		||||
results I got without it).
 | 
			
		||||
 | 
			
		||||
Based on a patch by Hillf Danton
 | 
			
		||||
 | 
			
		||||
Cc: Hillf Danton <hdanton@sina.com>
 | 
			
		||||
Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
---
 | 
			
		||||
 | 
			
		||||
--- a/include/linux/netdevice.h
 | 
			
		||||
+++ b/include/linux/netdevice.h
 | 
			
		||||
@@ -339,6 +339,7 @@ struct napi_struct {
 | 
			
		||||
 	struct list_head	dev_list;
 | 
			
		||||
 	struct hlist_node	napi_hash_node;
 | 
			
		||||
 	unsigned int		napi_id;
 | 
			
		||||
+	struct work_struct	work;
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
 enum {
 | 
			
		||||
@@ -349,6 +350,7 @@ enum {
 | 
			
		||||
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
 | 
			
		||||
 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
 | 
			
		||||
 	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
 | 
			
		||||
+	NAPI_STATE_THREADED,	/* Use threaded NAPI */
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
 enum {
 | 
			
		||||
@@ -359,6 +361,7 @@ enum {
 | 
			
		||||
 	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
 | 
			
		||||
 	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
 | 
			
		||||
 	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
 | 
			
		||||
+	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED),
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
 enum gro_result {
 | 
			
		||||
@@ -2230,6 +2233,26 @@ void netif_napi_add(struct net_device *d
 | 
			
		||||
 		    int (*poll)(struct napi_struct *, int), int weight);
 | 
			
		||||
 
 | 
			
		||||
 /**
 | 
			
		||||
+ *	netif_threaded_napi_add - initialize a NAPI context
 | 
			
		||||
+ *	@dev:  network device
 | 
			
		||||
+ *	@napi: NAPI context
 | 
			
		||||
+ *	@poll: polling function
 | 
			
		||||
+ *	@weight: default weight
 | 
			
		||||
+ *
 | 
			
		||||
+ * This variant of netif_napi_add() should be used from drivers using NAPI
 | 
			
		||||
+ * with CPU intensive poll functions.
 | 
			
		||||
+ * This will schedule polling from a high priority workqueue
 | 
			
		||||
+ */
 | 
			
		||||
+static inline void netif_threaded_napi_add(struct net_device *dev,
 | 
			
		||||
+					   struct napi_struct *napi,
 | 
			
		||||
+					   int (*poll)(struct napi_struct *, int),
 | 
			
		||||
+					   int weight)
 | 
			
		||||
+{
 | 
			
		||||
+	set_bit(NAPI_STATE_THREADED, &napi->state);
 | 
			
		||||
+	netif_napi_add(dev, napi, poll, weight);
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+/**
 | 
			
		||||
  *	netif_tx_napi_add - initialize a NAPI context
 | 
			
		||||
  *	@dev:  network device
 | 
			
		||||
  *	@napi: NAPI context
 | 
			
		||||
--- a/net/core/dev.c
 | 
			
		||||
+++ b/net/core/dev.c
 | 
			
		||||
@@ -160,6 +160,7 @@ static DEFINE_SPINLOCK(offload_lock);
 | 
			
		||||
 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 | 
			
		||||
 struct list_head ptype_all __read_mostly;	/* Taps */
 | 
			
		||||
 static struct list_head offload_base __read_mostly;
 | 
			
		||||
+static struct workqueue_struct *napi_workq __read_mostly;
 | 
			
		||||
 
 | 
			
		||||
 static int netif_rx_internal(struct sk_buff *skb);
 | 
			
		||||
 static int call_netdevice_notifiers_info(unsigned long val,
 | 
			
		||||
@@ -5891,6 +5892,11 @@ void __napi_schedule(struct napi_struct
 | 
			
		||||
 {
 | 
			
		||||
 	unsigned long flags;
 | 
			
		||||
 
 | 
			
		||||
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
 | 
			
		||||
+		queue_work(napi_workq, &n->work);
 | 
			
		||||
+		return;
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
 	local_irq_save(flags);
 | 
			
		||||
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 | 
			
		||||
 	local_irq_restore(flags);
 | 
			
		||||
@@ -5938,6 +5944,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
 | 
			
		||||
  */
 | 
			
		||||
 void __napi_schedule_irqoff(struct napi_struct *n)
 | 
			
		||||
 {
 | 
			
		||||
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
 | 
			
		||||
+		queue_work(napi_workq, &n->work);
 | 
			
		||||
+		return;
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 | 
			
		||||
 }
 | 
			
		||||
 EXPORT_SYMBOL(__napi_schedule_irqoff);
 | 
			
		||||
@@ -6186,6 +6197,82 @@ static void init_gro_hash(struct napi_st
 | 
			
		||||
 	napi->gro_bitmask = 0;
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
+static int __napi_poll(struct napi_struct *n, bool *repoll)
 | 
			
		||||
+{
 | 
			
		||||
+	int work, weight;
 | 
			
		||||
+
 | 
			
		||||
+	weight = n->weight;
 | 
			
		||||
+
 | 
			
		||||
+	/* This NAPI_STATE_SCHED test is for avoiding a race
 | 
			
		||||
+	 * with netpoll's poll_napi().  Only the entity which
 | 
			
		||||
+	 * obtains the lock and sees NAPI_STATE_SCHED set will
 | 
			
		||||
+	 * actually make the ->poll() call.  Therefore we avoid
 | 
			
		||||
+	 * accidentally calling ->poll() when NAPI is not scheduled.
 | 
			
		||||
+	 */
 | 
			
		||||
+	work = 0;
 | 
			
		||||
+	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 | 
			
		||||
+		work = n->poll(n, weight);
 | 
			
		||||
+		trace_napi_poll(n, work, weight);
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	WARN_ON_ONCE(work > weight);
 | 
			
		||||
+
 | 
			
		||||
+	if (likely(work < weight))
 | 
			
		||||
+		return work;
 | 
			
		||||
+
 | 
			
		||||
+	/* Drivers must not modify the NAPI state if they
 | 
			
		||||
+	 * consume the entire weight.  In such cases this code
 | 
			
		||||
+	 * still "owns" the NAPI instance and therefore can
 | 
			
		||||
+	 * move the instance around on the list at-will.
 | 
			
		||||
+	 */
 | 
			
		||||
+	if (unlikely(napi_disable_pending(n))) {
 | 
			
		||||
+		napi_complete(n);
 | 
			
		||||
+		return work;
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	if (n->gro_bitmask) {
 | 
			
		||||
+		/* flush too old packets
 | 
			
		||||
+		 * If HZ < 1000, flush all packets.
 | 
			
		||||
+		 */
 | 
			
		||||
+		napi_gro_flush(n, HZ >= 1000);
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	*repoll = true;
 | 
			
		||||
+
 | 
			
		||||
+	return work;
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+static void napi_workfn(struct work_struct *work)
 | 
			
		||||
+{
 | 
			
		||||
+	struct napi_struct *n = container_of(work, struct napi_struct, work);
 | 
			
		||||
+	void *have;
 | 
			
		||||
+
 | 
			
		||||
+	for (;;) {
 | 
			
		||||
+		bool repoll = false;
 | 
			
		||||
+
 | 
			
		||||
+		local_bh_disable();
 | 
			
		||||
+
 | 
			
		||||
+		have = netpoll_poll_lock(n);
 | 
			
		||||
+		__napi_poll(n, &repoll);
 | 
			
		||||
+		netpoll_poll_unlock(have);
 | 
			
		||||
+
 | 
			
		||||
+		local_bh_enable();
 | 
			
		||||
+
 | 
			
		||||
+		if (!repoll)
 | 
			
		||||
+			return;
 | 
			
		||||
+
 | 
			
		||||
+		if (!need_resched())
 | 
			
		||||
+			continue;
 | 
			
		||||
+
 | 
			
		||||
+		/*
 | 
			
		||||
+		 * have to pay for the latency of task switch even if
 | 
			
		||||
+		 * napi is scheduled
 | 
			
		||||
+		 */
 | 
			
		||||
+		queue_work(napi_workq, work);
 | 
			
		||||
+		return;
 | 
			
		||||
+	}
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 | 
			
		||||
 		    int (*poll)(struct napi_struct *, int), int weight)
 | 
			
		||||
 {
 | 
			
		||||
@@ -6204,6 +6291,7 @@ void netif_napi_add(struct net_device *d
 | 
			
		||||
 #ifdef CONFIG_NETPOLL
 | 
			
		||||
 	napi->poll_owner = -1;
 | 
			
		||||
 #endif
 | 
			
		||||
+	INIT_WORK(&napi->work, napi_workfn);
 | 
			
		||||
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 | 
			
		||||
 	napi_hash_add(napi);
 | 
			
		||||
 }
 | 
			
		||||
@@ -6242,6 +6330,7 @@ static void flush_gro_hash(struct napi_s
 | 
			
		||||
 void netif_napi_del(struct napi_struct *napi)
 | 
			
		||||
 {
 | 
			
		||||
 	might_sleep();
 | 
			
		||||
+	cancel_work_sync(&napi->work);
 | 
			
		||||
 	if (napi_hash_del(napi))
 | 
			
		||||
 		synchronize_net();
 | 
			
		||||
 	list_del_init(&napi->dev_list);
 | 
			
		||||
@@ -6254,48 +6343,18 @@ EXPORT_SYMBOL(netif_napi_del);
 | 
			
		||||
 
 | 
			
		||||
 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 | 
			
		||||
 {
 | 
			
		||||
+	bool do_repoll = false;
 | 
			
		||||
 	void *have;
 | 
			
		||||
-	int work, weight;
 | 
			
		||||
+	int work;
 | 
			
		||||
 
 | 
			
		||||
 	list_del_init(&n->poll_list);
 | 
			
		||||
 
 | 
			
		||||
 	have = netpoll_poll_lock(n);
 | 
			
		||||
 
 | 
			
		||||
-	weight = n->weight;
 | 
			
		||||
-
 | 
			
		||||
-	/* This NAPI_STATE_SCHED test is for avoiding a race
 | 
			
		||||
-	 * with netpoll's poll_napi().  Only the entity which
 | 
			
		||||
-	 * obtains the lock and sees NAPI_STATE_SCHED set will
 | 
			
		||||
-	 * actually make the ->poll() call.  Therefore we avoid
 | 
			
		||||
-	 * accidentally calling ->poll() when NAPI is not scheduled.
 | 
			
		||||
-	 */
 | 
			
		||||
-	work = 0;
 | 
			
		||||
-	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 | 
			
		||||
-		work = n->poll(n, weight);
 | 
			
		||||
-		trace_napi_poll(n, work, weight);
 | 
			
		||||
-	}
 | 
			
		||||
-
 | 
			
		||||
-	WARN_ON_ONCE(work > weight);
 | 
			
		||||
-
 | 
			
		||||
-	if (likely(work < weight))
 | 
			
		||||
-		goto out_unlock;
 | 
			
		||||
+	work = __napi_poll(n, &do_repoll);
 | 
			
		||||
 
 | 
			
		||||
-	/* Drivers must not modify the NAPI state if they
 | 
			
		||||
-	 * consume the entire weight.  In such cases this code
 | 
			
		||||
-	 * still "owns" the NAPI instance and therefore can
 | 
			
		||||
-	 * move the instance around on the list at-will.
 | 
			
		||||
-	 */
 | 
			
		||||
-	if (unlikely(napi_disable_pending(n))) {
 | 
			
		||||
-		napi_complete(n);
 | 
			
		||||
+	if (!do_repoll)
 | 
			
		||||
 		goto out_unlock;
 | 
			
		||||
-	}
 | 
			
		||||
-
 | 
			
		||||
-	if (n->gro_bitmask) {
 | 
			
		||||
-		/* flush too old packets
 | 
			
		||||
-		 * If HZ < 1000, flush all packets.
 | 
			
		||||
-		 */
 | 
			
		||||
-		napi_gro_flush(n, HZ >= 1000);
 | 
			
		||||
-	}
 | 
			
		||||
 
 | 
			
		||||
 	/* Some drivers may have called napi_schedule
 | 
			
		||||
 	 * prior to exhausting their budget.
 | 
			
		||||
@@ -9895,6 +9954,10 @@ static int __init net_dev_init(void)
 | 
			
		||||
 		sd->backlog.weight = weight_p;
 | 
			
		||||
 	}
 | 
			
		||||
 
 | 
			
		||||
+	napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI,
 | 
			
		||||
+				     WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS);
 | 
			
		||||
+	BUG_ON(!napi_workq);
 | 
			
		||||
+
 | 
			
		||||
 	dev_boot_phase = 0;
 | 
			
		||||
 
 | 
			
		||||
 	/* The loopback device is special if any other network devices
 | 
			
		||||
--- a/net/core/net-sysfs.c
 | 
			
		||||
+++ b/net/core/net-sysfs.c
 | 
			
		||||
@@ -447,6 +447,52 @@ static ssize_t proto_down_store(struct d
 | 
			
		||||
 }
 | 
			
		||||
 NETDEVICE_SHOW_RW(proto_down, fmt_dec);
 | 
			
		||||
 
 | 
			
		||||
+static int change_napi_threaded(struct net_device *dev, unsigned long val)
 | 
			
		||||
+{
 | 
			
		||||
+	struct napi_struct *napi;
 | 
			
		||||
+
 | 
			
		||||
+	if (list_empty(&dev->napi_list))
 | 
			
		||||
+		return -EOPNOTSUPP;
 | 
			
		||||
+
 | 
			
		||||
+	list_for_each_entry(napi, &dev->napi_list, dev_list) {
 | 
			
		||||
+		if (val)
 | 
			
		||||
+			set_bit(NAPI_STATE_THREADED, &napi->state);
 | 
			
		||||
+		else
 | 
			
		||||
+			clear_bit(NAPI_STATE_THREADED, &napi->state);
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	return 0;
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+static ssize_t napi_threaded_store(struct device *dev,
 | 
			
		||||
+				struct device_attribute *attr,
 | 
			
		||||
+				const char *buf, size_t len)
 | 
			
		||||
+{
 | 
			
		||||
+	return netdev_store(dev, attr, buf, len, change_napi_threaded);
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+static ssize_t napi_threaded_show(struct device *dev,
 | 
			
		||||
+				  struct device_attribute *attr,
 | 
			
		||||
+				  char *buf)
 | 
			
		||||
+{
 | 
			
		||||
+	struct net_device *netdev = to_net_dev(dev);
 | 
			
		||||
+	struct napi_struct *napi;
 | 
			
		||||
+	bool enabled = false;
 | 
			
		||||
+
 | 
			
		||||
+	if (!rtnl_trylock())
 | 
			
		||||
+		return restart_syscall();
 | 
			
		||||
+
 | 
			
		||||
+	list_for_each_entry(napi, &netdev->napi_list, dev_list) {
 | 
			
		||||
+		if (test_bit(NAPI_STATE_THREADED, &napi->state))
 | 
			
		||||
+			enabled = true;
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	rtnl_unlock();
 | 
			
		||||
+
 | 
			
		||||
+	return sprintf(buf, fmt_dec, enabled);
 | 
			
		||||
+}
 | 
			
		||||
+static DEVICE_ATTR_RW(napi_threaded);
 | 
			
		||||
+
 | 
			
		||||
 static ssize_t phys_port_id_show(struct device *dev,
 | 
			
		||||
 				 struct device_attribute *attr, char *buf)
 | 
			
		||||
 {
 | 
			
		||||
@@ -542,6 +588,7 @@ static struct attribute *net_class_attrs
 | 
			
		||||
 	&dev_attr_flags.attr,
 | 
			
		||||
 	&dev_attr_tx_queue_len.attr,
 | 
			
		||||
 	&dev_attr_gro_flush_timeout.attr,
 | 
			
		||||
+	&dev_attr_napi_threaded.attr,
 | 
			
		||||
 	&dev_attr_phys_port_id.attr,
 | 
			
		||||
 	&dev_attr_phys_port_name.attr,
 | 
			
		||||
 	&dev_attr_phys_switch_id.attr,
 | 
			
		||||
@@ -0,0 +1,343 @@
 | 
			
		||||
From: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
Date: Sun, 26 Jul 2020 14:03:21 +0200
 | 
			
		||||
Subject: [PATCH] net: add support for threaded NAPI polling
 | 
			
		||||
 | 
			
		||||
For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
 | 
			
		||||
poll function does not perform well. Since NAPI poll is bound to the CPU it
 | 
			
		||||
was scheduled from, we can easily end up with a few very busy CPUs spending
 | 
			
		||||
most of their time in softirq/ksoftirqd and some idle ones.
 | 
			
		||||
 | 
			
		||||
Introduce threaded NAPI for such drivers based on a workqueue. The API is the
 | 
			
		||||
same except for using netif_threaded_napi_add instead of netif_napi_add.
 | 
			
		||||
 | 
			
		||||
In my tests with mt76 on MT7621, using threaded NAPI + a thread for tx scheduling
 | 
			
		||||
improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
 | 
			
		||||
NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
 | 
			
		||||
thread.
 | 
			
		||||
 | 
			
		||||
With threaded NAPI it seems stable and consistent (and higher than the best
 | 
			
		||||
results I got without it).
 | 
			
		||||
 | 
			
		||||
Based on a patch by Hillf Danton
 | 
			
		||||
 | 
			
		||||
Cc: Hillf Danton <hdanton@sina.com>
 | 
			
		||||
Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
---
 | 
			
		||||
 | 
			
		||||
--- a/include/linux/netdevice.h
 | 
			
		||||
+++ b/include/linux/netdevice.h
 | 
			
		||||
@@ -340,6 +340,7 @@ struct napi_struct {
 | 
			
		||||
 	struct list_head	dev_list;
 | 
			
		||||
 	struct hlist_node	napi_hash_node;
 | 
			
		||||
 	unsigned int		napi_id;
 | 
			
		||||
+	struct work_struct	work;
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
 enum {
 | 
			
		||||
@@ -350,6 +351,7 @@ enum {
 | 
			
		||||
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
 | 
			
		||||
 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
 | 
			
		||||
 	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
 | 
			
		||||
+	NAPI_STATE_THREADED,	/* Use threaded NAPI */
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
 enum {
 | 
			
		||||
@@ -360,6 +362,7 @@ enum {
 | 
			
		||||
 	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
 | 
			
		||||
 	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
 | 
			
		||||
 	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
 | 
			
		||||
+	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED),
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
 enum gro_result {
 | 
			
		||||
@@ -2249,6 +2252,26 @@ void netif_napi_add(struct net_device *d
 | 
			
		||||
 		    int (*poll)(struct napi_struct *, int), int weight);
 | 
			
		||||
 
 | 
			
		||||
 /**
 | 
			
		||||
+ *	netif_threaded_napi_add - initialize a threaded NAPI context
 | 
			
		||||
+ *	@dev:  network device
 | 
			
		||||
+ *	@napi: NAPI context
 | 
			
		||||
+ *	@poll: polling function
 | 
			
		||||
+ *	@weight: default weight
 | 
			
		||||
+ *
 | 
			
		||||
+ * This variant of netif_napi_add() should be used from drivers using NAPI
 | 
			
		||||
+ * with CPU intensive poll functions.
 | 
			
		||||
+ * This will schedule polling from a high-priority workqueue.
 | 
			
		||||
+ */
 | 
			
		||||
+static inline void netif_threaded_napi_add(struct net_device *dev,
 | 
			
		||||
+					   struct napi_struct *napi,
 | 
			
		||||
+					   int (*poll)(struct napi_struct *, int),
 | 
			
		||||
+					   int weight)
 | 
			
		||||
+{
 | 
			
		||||
+	set_bit(NAPI_STATE_THREADED, &napi->state);
 | 
			
		||||
+	netif_napi_add(dev, napi, poll, weight);
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+/**
 | 
			
		||||
  *	netif_tx_napi_add - initialize a NAPI context
 | 
			
		||||
  *	@dev:  network device
 | 
			
		||||
  *	@napi: NAPI context
 | 
			
		||||
--- a/net/core/dev.c
 | 
			
		||||
+++ b/net/core/dev.c
 | 
			
		||||
@@ -156,6 +156,7 @@ static DEFINE_SPINLOCK(offload_lock);
 | 
			
		||||
 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 | 
			
		||||
 struct list_head ptype_all __read_mostly;	/* Taps */
 | 
			
		||||
 static struct list_head offload_base __read_mostly;
 | 
			
		||||
+static struct workqueue_struct *napi_workq __read_mostly;
 | 
			
		||||
 
 | 
			
		||||
 static int netif_rx_internal(struct sk_buff *skb);
 | 
			
		||||
 static int call_netdevice_notifiers_info(unsigned long val,
 | 
			
		||||
@@ -5910,6 +5911,11 @@ void __napi_schedule(struct napi_struct
 | 
			
		||||
 {
 | 
			
		||||
 	unsigned long flags;
 | 
			
		||||
 
 | 
			
		||||
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
 | 
			
		||||
+		queue_work(napi_workq, &n->work);
 | 
			
		||||
+		return;
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
 	local_irq_save(flags);
 | 
			
		||||
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 | 
			
		||||
 	local_irq_restore(flags);
 | 
			
		||||
@@ -5957,6 +5963,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
 | 
			
		||||
  */
 | 
			
		||||
 void __napi_schedule_irqoff(struct napi_struct *n)
 | 
			
		||||
 {
 | 
			
		||||
+	if (test_bit(NAPI_STATE_THREADED, &n->state)) {
 | 
			
		||||
+		queue_work(napi_workq, &n->work);
 | 
			
		||||
+		return;
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 | 
			
		||||
 }
 | 
			
		||||
 EXPORT_SYMBOL(__napi_schedule_irqoff);
 | 
			
		||||
@@ -6218,6 +6229,84 @@ static void init_gro_hash(struct napi_st
 | 
			
		||||
 	napi->gro_bitmask = 0;
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
+static int __napi_poll(struct napi_struct *n, bool *repoll)
 | 
			
		||||
+{
 | 
			
		||||
+	int work, weight;
 | 
			
		||||
+
 | 
			
		||||
+	weight = n->weight;
 | 
			
		||||
+
 | 
			
		||||
+	/* This NAPI_STATE_SCHED test is for avoiding a race
 | 
			
		||||
+	 * with netpoll's poll_napi().  Only the entity which
 | 
			
		||||
+	 * obtains the lock and sees NAPI_STATE_SCHED set will
 | 
			
		||||
+	 * actually make the ->poll() call.  Therefore we avoid
 | 
			
		||||
+	 * accidentally calling ->poll() when NAPI is not scheduled.
 | 
			
		||||
+	 */
 | 
			
		||||
+	work = 0;
 | 
			
		||||
+	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 | 
			
		||||
+		work = n->poll(n, weight);
 | 
			
		||||
+		trace_napi_poll(n, work, weight);
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	WARN_ON_ONCE(work > weight);
 | 
			
		||||
+
 | 
			
		||||
+	if (likely(work < weight))
 | 
			
		||||
+		return work;
 | 
			
		||||
+
 | 
			
		||||
+	/* Drivers must not modify the NAPI state if they
 | 
			
		||||
+	 * consume the entire weight.  In such cases this code
 | 
			
		||||
+	 * still "owns" the NAPI instance and therefore can
 | 
			
		||||
+	 * move the instance around on the list at-will.
 | 
			
		||||
+	 */
 | 
			
		||||
+	if (unlikely(napi_disable_pending(n))) {
 | 
			
		||||
+		napi_complete(n);
 | 
			
		||||
+		return work;
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	if (n->gro_bitmask) {
 | 
			
		||||
+		/* flush too old packets
 | 
			
		||||
+		 * If HZ < 1000, flush all packets.
 | 
			
		||||
+		 */
 | 
			
		||||
+		napi_gro_flush(n, HZ >= 1000);
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	gro_normal_list(n);
 | 
			
		||||
+
 | 
			
		||||
+	*repoll = true;
 | 
			
		||||
+
 | 
			
		||||
+	return work;
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+static void napi_workfn(struct work_struct *work)
 | 
			
		||||
+{
 | 
			
		||||
+	struct napi_struct *n = container_of(work, struct napi_struct, work);
 | 
			
		||||
+	void *have;
 | 
			
		||||
+
 | 
			
		||||
+	for (;;) {
 | 
			
		||||
+		bool repoll = false;
 | 
			
		||||
+
 | 
			
		||||
+		local_bh_disable();
 | 
			
		||||
+
 | 
			
		||||
+		have = netpoll_poll_lock(n);
 | 
			
		||||
+		__napi_poll(n, &repoll);
 | 
			
		||||
+		netpoll_poll_unlock(have);
 | 
			
		||||
+
 | 
			
		||||
+		local_bh_enable();
 | 
			
		||||
+
 | 
			
		||||
+		if (!repoll)
 | 
			
		||||
+			return;
 | 
			
		||||
+
 | 
			
		||||
+		if (!need_resched())
 | 
			
		||||
+			continue;
 | 
			
		||||
+
 | 
			
		||||
+		/*
 | 
			
		||||
+		 * have to pay for the latency of task switch even if
 | 
			
		||||
+		 * napi is scheduled
 | 
			
		||||
+		 */
 | 
			
		||||
+		queue_work(napi_workq, work);
 | 
			
		||||
+		return;
 | 
			
		||||
+	}
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 | 
			
		||||
 		    int (*poll)(struct napi_struct *, int), int weight)
 | 
			
		||||
 {
 | 
			
		||||
@@ -6238,6 +6327,7 @@ void netif_napi_add(struct net_device *d
 | 
			
		||||
 #ifdef CONFIG_NETPOLL
 | 
			
		||||
 	napi->poll_owner = -1;
 | 
			
		||||
 #endif
 | 
			
		||||
+	INIT_WORK(&napi->work, napi_workfn);
 | 
			
		||||
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 | 
			
		||||
 	napi_hash_add(napi);
 | 
			
		||||
 }
 | 
			
		||||
@@ -6276,6 +6366,7 @@ static void flush_gro_hash(struct napi_s
 | 
			
		||||
 void netif_napi_del(struct napi_struct *napi)
 | 
			
		||||
 {
 | 
			
		||||
 	might_sleep();
 | 
			
		||||
+	cancel_work_sync(&napi->work);
 | 
			
		||||
 	if (napi_hash_del(napi))
 | 
			
		||||
 		synchronize_net();
 | 
			
		||||
 	list_del_init(&napi->dev_list);
 | 
			
		||||
@@ -6288,50 +6379,18 @@ EXPORT_SYMBOL(netif_napi_del);
 | 
			
		||||
 
 | 
			
		||||
 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 | 
			
		||||
 {
 | 
			
		||||
+	bool do_repoll = false;
 | 
			
		||||
 	void *have;
 | 
			
		||||
-	int work, weight;
 | 
			
		||||
+	int work;
 | 
			
		||||
 
 | 
			
		||||
 	list_del_init(&n->poll_list);
 | 
			
		||||
 
 | 
			
		||||
 	have = netpoll_poll_lock(n);
 | 
			
		||||
 
 | 
			
		||||
-	weight = n->weight;
 | 
			
		||||
-
 | 
			
		||||
-	/* This NAPI_STATE_SCHED test is for avoiding a race
 | 
			
		||||
-	 * with netpoll's poll_napi().  Only the entity which
 | 
			
		||||
-	 * obtains the lock and sees NAPI_STATE_SCHED set will
 | 
			
		||||
-	 * actually make the ->poll() call.  Therefore we avoid
 | 
			
		||||
-	 * accidentally calling ->poll() when NAPI is not scheduled.
 | 
			
		||||
-	 */
 | 
			
		||||
-	work = 0;
 | 
			
		||||
-	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 | 
			
		||||
-		work = n->poll(n, weight);
 | 
			
		||||
-		trace_napi_poll(n, work, weight);
 | 
			
		||||
-	}
 | 
			
		||||
-
 | 
			
		||||
-	WARN_ON_ONCE(work > weight);
 | 
			
		||||
-
 | 
			
		||||
-	if (likely(work < weight))
 | 
			
		||||
-		goto out_unlock;
 | 
			
		||||
+	work = __napi_poll(n, &do_repoll);
 | 
			
		||||
 
 | 
			
		||||
-	/* Drivers must not modify the NAPI state if they
 | 
			
		||||
-	 * consume the entire weight.  In such cases this code
 | 
			
		||||
-	 * still "owns" the NAPI instance and therefore can
 | 
			
		||||
-	 * move the instance around on the list at-will.
 | 
			
		||||
-	 */
 | 
			
		||||
-	if (unlikely(napi_disable_pending(n))) {
 | 
			
		||||
-		napi_complete(n);
 | 
			
		||||
+	if (!do_repoll)
 | 
			
		||||
 		goto out_unlock;
 | 
			
		||||
-	}
 | 
			
		||||
-
 | 
			
		||||
-	if (n->gro_bitmask) {
 | 
			
		||||
-		/* flush too old packets
 | 
			
		||||
-		 * If HZ < 1000, flush all packets.
 | 
			
		||||
-		 */
 | 
			
		||||
-		napi_gro_flush(n, HZ >= 1000);
 | 
			
		||||
-	}
 | 
			
		||||
-
 | 
			
		||||
-	gro_normal_list(n);
 | 
			
		||||
 
 | 
			
		||||
 	/* Some drivers may have called napi_schedule
 | 
			
		||||
 	 * prior to exhausting their budget.
 | 
			
		||||
@@ -10264,6 +10323,10 @@ static int __init net_dev_init(void)
 | 
			
		||||
 		sd->backlog.weight = weight_p;
 | 
			
		||||
 	}
 | 
			
		||||
 
 | 
			
		||||
+	napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI,
 | 
			
		||||
+				     WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS);
 | 
			
		||||
+	BUG_ON(!napi_workq);
 | 
			
		||||
+
 | 
			
		||||
 	dev_boot_phase = 0;
 | 
			
		||||
 
 | 
			
		||||
 	/* The loopback device is special if any other network devices
 | 
			
		||||
--- a/net/core/net-sysfs.c
 | 
			
		||||
+++ b/net/core/net-sysfs.c
 | 
			
		||||
@@ -442,6 +442,52 @@ static ssize_t proto_down_store(struct d
 | 
			
		||||
 }
 | 
			
		||||
 NETDEVICE_SHOW_RW(proto_down, fmt_dec);
 | 
			
		||||
 
 | 
			
		||||
+static int change_napi_threaded(struct net_device *dev, unsigned long val)
 | 
			
		||||
+{
 | 
			
		||||
+	struct napi_struct *napi;
 | 
			
		||||
+
 | 
			
		||||
+	if (list_empty(&dev->napi_list))
 | 
			
		||||
+		return -EOPNOTSUPP;
 | 
			
		||||
+
 | 
			
		||||
+	list_for_each_entry(napi, &dev->napi_list, dev_list) {
 | 
			
		||||
+		if (val)
 | 
			
		||||
+			set_bit(NAPI_STATE_THREADED, &napi->state);
 | 
			
		||||
+		else
 | 
			
		||||
+			clear_bit(NAPI_STATE_THREADED, &napi->state);
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	return 0;
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+static ssize_t napi_threaded_store(struct device *dev,
 | 
			
		||||
+				struct device_attribute *attr,
 | 
			
		||||
+				const char *buf, size_t len)
 | 
			
		||||
+{
 | 
			
		||||
+	return netdev_store(dev, attr, buf, len, change_napi_threaded);
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+static ssize_t napi_threaded_show(struct device *dev,
 | 
			
		||||
+				  struct device_attribute *attr,
 | 
			
		||||
+				  char *buf)
 | 
			
		||||
+{
 | 
			
		||||
+	struct net_device *netdev = to_net_dev(dev);
 | 
			
		||||
+	struct napi_struct *napi;
 | 
			
		||||
+	bool enabled = false;
 | 
			
		||||
+
 | 
			
		||||
+	if (!rtnl_trylock())
 | 
			
		||||
+		return restart_syscall();
 | 
			
		||||
+
 | 
			
		||||
+	list_for_each_entry(napi, &netdev->napi_list, dev_list) {
 | 
			
		||||
+		if (test_bit(NAPI_STATE_THREADED, &napi->state))
 | 
			
		||||
+			enabled = true;
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	rtnl_unlock();
 | 
			
		||||
+
 | 
			
		||||
+	return sprintf(buf, fmt_dec, enabled);
 | 
			
		||||
+}
 | 
			
		||||
+static DEVICE_ATTR_RW(napi_threaded);
 | 
			
		||||
+
 | 
			
		||||
 static ssize_t phys_port_id_show(struct device *dev,
 | 
			
		||||
 				 struct device_attribute *attr, char *buf)
 | 
			
		||||
 {
 | 
			
		||||
@@ -532,6 +578,7 @@ static struct attribute *net_class_attrs
 | 
			
		||||
 	&dev_attr_flags.attr,
 | 
			
		||||
 	&dev_attr_tx_queue_len.attr,
 | 
			
		||||
 	&dev_attr_gro_flush_timeout.attr,
 | 
			
		||||
+	&dev_attr_napi_threaded.attr,
 | 
			
		||||
 	&dev_attr_phys_port_id.attr,
 | 
			
		||||
 	&dev_attr_phys_port_name.attr,
 | 
			
		||||
 	&dev_attr_phys_switch_id.attr,
 | 
			
		||||
		Reference in New Issue
	
	Block a user