kernel: backport the upstream implementation of threaded NAPI to 5.4
The workqueue-based implementation has a few problematic corner cases and
typically delivers lower performance than the upstream one.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
(cherry-picked from commit 01bebc070c)
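For reference, the userspace switch added by this series is the per-device
/sys/class/net/<iface>/threaded sysfs attribute (see the third patch below).
A minimal sketch of toggling it from userspace follows; "eth0" is only an
example interface name, and the write fails with EOPNOTSUPP if the device has
no NAPI instances registered:

/* Minimal sketch, assuming an interface named eth0 with NAPI instances
 * registered: enable threaded NAPI via the sysfs attribute added by this
 * series, no device up/down required.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/class/net/eth0/threaded", "w");

	if (!f) {
		perror("open /sys/class/net/eth0/threaded");
		return 1;
	}
	/* "1" moves every NAPI instance of the device to its own kthread,
	 * "0" returns them to softirq-based polling.
	 */
	if (fputs("1", f) == EOF || fclose(f) == EOF) {
		perror("write /sys/class/net/eth0/threaded");
		return 1;
	}
	return 0;
}

In-kernel callers can use dev_set_threaded(), declared in linux/netdevice.h by
the same patch; either way the mode switch takes effect the next time
napi_schedule() runs for a given NAPI instance.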
		| @@ -0,0 +1,88 @@ | ||||
| From: Felix Fietkau <nbd@nbd.name> | ||||
| Date: Mon, 8 Feb 2021 11:34:08 -0800 | ||||
| Subject: [PATCH] net: extract napi poll functionality to __napi_poll() | ||||
|  | ||||
| This commit introduces a new function __napi_poll() which does the main | ||||
| logic of the existing napi_poll() function, and will be called by other | ||||
| functions in later commits. | ||||
| This idea and implementation is done by Felix Fietkau <nbd@nbd.name> and | ||||
| is proposed as part of the patch to move napi work to work_queue | ||||
| context. | ||||
| This commit by itself is a code restructure. | ||||
|  | ||||
| Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
| Signed-off-by: Wei Wang <weiwan@google.com> | ||||
| Reviewed-by: Alexander Duyck <alexanderduyck@fb.com> | ||||
| Signed-off-by: David S. Miller <davem@davemloft.net> | ||||
| --- | ||||
|  | ||||
| --- a/net/core/dev.c | ||||
| +++ b/net/core/dev.c | ||||
| @@ -6322,15 +6322,10 @@ void netif_napi_del(struct napi_struct * | ||||
|  } | ||||
|  EXPORT_SYMBOL(netif_napi_del); | ||||
|   | ||||
| -static int napi_poll(struct napi_struct *n, struct list_head *repoll) | ||||
| +static int __napi_poll(struct napi_struct *n, bool *repoll) | ||||
|  { | ||||
| -	void *have; | ||||
|  	int work, weight; | ||||
|   | ||||
| -	list_del_init(&n->poll_list); | ||||
| - | ||||
| -	have = netpoll_poll_lock(n); | ||||
| - | ||||
|  	weight = n->weight; | ||||
|   | ||||
|  	/* This NAPI_STATE_SCHED test is for avoiding a race | ||||
| @@ -6348,7 +6343,7 @@ static int napi_poll(struct napi_struct | ||||
|  	WARN_ON_ONCE(work > weight); | ||||
|   | ||||
|  	if (likely(work < weight)) | ||||
| -		goto out_unlock; | ||||
| +		return work; | ||||
|   | ||||
|  	/* Drivers must not modify the NAPI state if they | ||||
|  	 * consume the entire weight.  In such cases this code | ||||
| @@ -6357,7 +6352,7 @@ static int napi_poll(struct napi_struct | ||||
|  	 */ | ||||
|  	if (unlikely(napi_disable_pending(n))) { | ||||
|  		napi_complete(n); | ||||
| -		goto out_unlock; | ||||
| +		return work; | ||||
|  	} | ||||
|   | ||||
|  	if (n->gro_bitmask) { | ||||
| @@ -6375,12 +6370,29 @@ static int napi_poll(struct napi_struct | ||||
|  	if (unlikely(!list_empty(&n->poll_list))) { | ||||
|  		pr_warn_once("%s: Budget exhausted after napi rescheduled\n", | ||||
|  			     n->dev ? n->dev->name : "backlog"); | ||||
| -		goto out_unlock; | ||||
| +		return work; | ||||
|  	} | ||||
|   | ||||
| -	list_add_tail(&n->poll_list, repoll); | ||||
| +	*repoll = true; | ||||
| + | ||||
| +	return work; | ||||
| +} | ||||
| + | ||||
| +static int napi_poll(struct napi_struct *n, struct list_head *repoll) | ||||
| +{ | ||||
| +	bool do_repoll = false; | ||||
| +	void *have; | ||||
| +	int work; | ||||
| + | ||||
| +	list_del_init(&n->poll_list); | ||||
| + | ||||
| +	have = netpoll_poll_lock(n); | ||||
| + | ||||
| +	work = __napi_poll(n, &do_repoll); | ||||
| + | ||||
| +	if (do_repoll) | ||||
| +		list_add_tail(&n->poll_list, repoll); | ||||
|   | ||||
| -out_unlock: | ||||
|  	netpoll_poll_unlock(have); | ||||
|   | ||||
|  	return work; | ||||
| @@ -0,0 +1,261 @@ | ||||
| From: Wei Wang <weiwan@google.com> | ||||
| Date: Mon, 8 Feb 2021 11:34:09 -0800 | ||||
| Subject: [PATCH] net: implement threaded-able napi poll loop support | ||||
|  | ||||
| This patch allows running each napi poll loop inside its own | ||||
| kernel thread. | ||||
| The kthread is created during netif_napi_add() if dev->threaded | ||||
| is set. And threaded mode is enabled in napi_enable(). We will | ||||
| provide a way to set dev->threaded and enable threaded mode | ||||
| without a device up/down in the following patch. | ||||
|  | ||||
| Once that threaded mode is enabled and the kthread is | ||||
| started, napi_schedule() will wake-up such thread instead | ||||
| of scheduling the softirq. | ||||
|  | ||||
| The threaded poll loop behaves quite likely the net_rx_action, | ||||
| but it does not have to manipulate local irqs and uses | ||||
| an explicit scheduling point based on netdev_budget. | ||||
|  | ||||
| Co-developed-by: Paolo Abeni <pabeni@redhat.com> | ||||
| Signed-off-by: Paolo Abeni <pabeni@redhat.com> | ||||
| Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org> | ||||
| Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org> | ||||
| Co-developed-by: Jakub Kicinski <kuba@kernel.org> | ||||
| Signed-off-by: Jakub Kicinski <kuba@kernel.org> | ||||
| Signed-off-by: Wei Wang <weiwan@google.com> | ||||
| Reviewed-by: Alexander Duyck <alexanderduyck@fb.com> | ||||
| Signed-off-by: David S. Miller <davem@davemloft.net> | ||||
| --- | ||||
|  | ||||
| --- a/include/linux/netdevice.h | ||||
| +++ b/include/linux/netdevice.h | ||||
| @@ -340,6 +340,7 @@ struct napi_struct { | ||||
|  	struct list_head	dev_list; | ||||
|  	struct hlist_node	napi_hash_node; | ||||
|  	unsigned int		napi_id; | ||||
| +	struct task_struct	*thread; | ||||
|  }; | ||||
|   | ||||
|  enum { | ||||
| @@ -350,6 +351,7 @@ enum { | ||||
|  	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */ | ||||
|  	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ | ||||
|  	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ | ||||
| +	NAPI_STATE_THREADED,		/* The poll is performed inside its own thread*/ | ||||
|  }; | ||||
|   | ||||
|  enum { | ||||
| @@ -360,6 +362,7 @@ enum { | ||||
|  	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED), | ||||
|  	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), | ||||
|  	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), | ||||
| +	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED), | ||||
|  }; | ||||
|   | ||||
|  enum gro_result { | ||||
| @@ -504,20 +507,7 @@ bool napi_hash_del(struct napi_struct *n | ||||
|   */ | ||||
|  void napi_disable(struct napi_struct *n); | ||||
|   | ||||
| -/** | ||||
| - *	napi_enable - enable NAPI scheduling | ||||
| - *	@n: NAPI context | ||||
| - * | ||||
| - * Resume NAPI from being scheduled on this context. | ||||
| - * Must be paired with napi_disable. | ||||
| - */ | ||||
| -static inline void napi_enable(struct napi_struct *n) | ||||
| -{ | ||||
| -	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); | ||||
| -	smp_mb__before_atomic(); | ||||
| -	clear_bit(NAPI_STATE_SCHED, &n->state); | ||||
| -	clear_bit(NAPI_STATE_NPSVC, &n->state); | ||||
| -} | ||||
| +void napi_enable(struct napi_struct *n); | ||||
|   | ||||
|  /** | ||||
|   *	napi_synchronize - wait until NAPI is not running | ||||
| @@ -1783,6 +1773,8 @@ enum netdev_ml_priv_type { | ||||
|   * | ||||
|   *	@wol_enabled:	Wake-on-LAN is enabled | ||||
|   * | ||||
| + *	@threaded:	napi threaded mode is enabled | ||||
| + * | ||||
|   *	FIXME: cleanup struct net_device such that network protocol info | ||||
|   *	moves out. | ||||
|   */ | ||||
| @@ -2075,6 +2067,7 @@ struct net_device { | ||||
|  	struct lock_class_key	addr_list_lock_key; | ||||
|  	bool			proto_down; | ||||
|  	unsigned		wol_enabled:1; | ||||
| +	unsigned		threaded:1; | ||||
|  }; | ||||
|  #define to_net_dev(d) container_of(d, struct net_device, dev) | ||||
|   | ||||
| --- a/net/core/dev.c | ||||
| +++ b/net/core/dev.c | ||||
| @@ -91,6 +91,7 @@ | ||||
|  #include <linux/etherdevice.h> | ||||
|  #include <linux/ethtool.h> | ||||
|  #include <linux/skbuff.h> | ||||
| +#include <linux/kthread.h> | ||||
|  #include <linux/bpf.h> | ||||
|  #include <linux/bpf_trace.h> | ||||
|  #include <net/net_namespace.h> | ||||
| @@ -1289,6 +1290,27 @@ void netdev_notify_peers(struct net_devi | ||||
|  } | ||||
|  EXPORT_SYMBOL(netdev_notify_peers); | ||||
|   | ||||
| +static int napi_threaded_poll(void *data); | ||||
| + | ||||
| +static int napi_kthread_create(struct napi_struct *n) | ||||
| +{ | ||||
| +	int err = 0; | ||||
| + | ||||
| +	/* Create and wake up the kthread once to put it in | ||||
| +	 * TASK_INTERRUPTIBLE mode to avoid the blocked task | ||||
| +	 * warning and work with loadavg. | ||||
| +	 */ | ||||
| +	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", | ||||
| +				n->dev->name, n->napi_id); | ||||
| +	if (IS_ERR(n->thread)) { | ||||
| +		err = PTR_ERR(n->thread); | ||||
| +		pr_err("kthread_run failed with err %d\n", err); | ||||
| +		n->thread = NULL; | ||||
| +	} | ||||
| + | ||||
| +	return err; | ||||
| +} | ||||
| + | ||||
|  static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) | ||||
|  { | ||||
|  	const struct net_device_ops *ops = dev->netdev_ops; | ||||
| @@ -3885,6 +3907,21 @@ int gro_normal_batch __read_mostly = 8; | ||||
|  static inline void ____napi_schedule(struct softnet_data *sd, | ||||
|  				     struct napi_struct *napi) | ||||
|  { | ||||
| +	struct task_struct *thread; | ||||
| + | ||||
| +	if (test_bit(NAPI_STATE_THREADED, &napi->state)) { | ||||
| +		/* Paired with smp_mb__before_atomic() in | ||||
| +		 * napi_enable(). Use READ_ONCE() to guarantee | ||||
| +		 * a complete read on napi->thread. Only call | ||||
| +		 * wake_up_process() when it's not NULL. | ||||
| +		 */ | ||||
| +		thread = READ_ONCE(napi->thread); | ||||
| +		if (thread) { | ||||
| +			wake_up_process(thread); | ||||
| +			return; | ||||
| +		} | ||||
| +	} | ||||
| + | ||||
|  	list_add_tail(&napi->poll_list, &sd->poll_list); | ||||
|  	__raise_softirq_irqoff(NET_RX_SOFTIRQ); | ||||
|  } | ||||
| @@ -6276,6 +6313,12 @@ void netif_napi_add(struct net_device *d | ||||
|  	set_bit(NAPI_STATE_NPSVC, &napi->state); | ||||
|  	list_add_rcu(&napi->dev_list, &dev->napi_list); | ||||
|  	napi_hash_add(napi); | ||||
| +	/* Create kthread for this napi if dev->threaded is set. | ||||
| +	 * Clear dev->threaded if kthread creation failed so that | ||||
| +	 * threaded mode will not be enabled in napi_enable(). | ||||
| +	 */ | ||||
| +	if (dev->threaded && napi_kthread_create(napi)) | ||||
| +		dev->threaded = 0; | ||||
|  } | ||||
|  EXPORT_SYMBOL(netif_napi_add); | ||||
|   | ||||
| @@ -6292,9 +6335,28 @@ void napi_disable(struct napi_struct *n) | ||||
|  	hrtimer_cancel(&n->timer); | ||||
|   | ||||
|  	clear_bit(NAPI_STATE_DISABLE, &n->state); | ||||
| +	clear_bit(NAPI_STATE_THREADED, &n->state); | ||||
|  } | ||||
|  EXPORT_SYMBOL(napi_disable); | ||||
|   | ||||
| +/** | ||||
| + *	napi_enable - enable NAPI scheduling | ||||
| + *	@n: NAPI context | ||||
| + * | ||||
| + * Resume NAPI from being scheduled on this context. | ||||
| + * Must be paired with napi_disable. | ||||
| + */ | ||||
| +void napi_enable(struct napi_struct *n) | ||||
| +{ | ||||
| +	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); | ||||
| +	smp_mb__before_atomic(); | ||||
| +	clear_bit(NAPI_STATE_SCHED, &n->state); | ||||
| +	clear_bit(NAPI_STATE_NPSVC, &n->state); | ||||
| +	if (n->dev->threaded && n->thread) | ||||
| +		set_bit(NAPI_STATE_THREADED, &n->state); | ||||
| +} | ||||
| +EXPORT_SYMBOL(napi_enable); | ||||
| + | ||||
|  static void flush_gro_hash(struct napi_struct *napi) | ||||
|  { | ||||
|  	int i; | ||||
| @@ -6319,6 +6381,11 @@ void netif_napi_del(struct napi_struct * | ||||
|   | ||||
|  	flush_gro_hash(napi); | ||||
|  	napi->gro_bitmask = 0; | ||||
| + | ||||
| +	if (napi->thread) { | ||||
| +		kthread_stop(napi->thread); | ||||
| +		napi->thread = NULL; | ||||
| +	} | ||||
|  } | ||||
|  EXPORT_SYMBOL(netif_napi_del); | ||||
|   | ||||
| @@ -6398,6 +6465,51 @@ static int napi_poll(struct napi_struct | ||||
|  	return work; | ||||
|  } | ||||
|   | ||||
| +static int napi_thread_wait(struct napi_struct *napi) | ||||
| +{ | ||||
| +	set_current_state(TASK_INTERRUPTIBLE); | ||||
| + | ||||
| +	while (!kthread_should_stop() && !napi_disable_pending(napi)) { | ||||
| +		if (test_bit(NAPI_STATE_SCHED, &napi->state)) { | ||||
| +			WARN_ON(!list_empty(&napi->poll_list)); | ||||
| +			__set_current_state(TASK_RUNNING); | ||||
| +			return 0; | ||||
| +		} | ||||
| + | ||||
| +		schedule(); | ||||
| +		set_current_state(TASK_INTERRUPTIBLE); | ||||
| +	} | ||||
| +	__set_current_state(TASK_RUNNING); | ||||
| +	return -1; | ||||
| +} | ||||
| + | ||||
| +static int napi_threaded_poll(void *data) | ||||
| +{ | ||||
| +	struct napi_struct *napi = data; | ||||
| +	void *have; | ||||
| + | ||||
| +	while (!napi_thread_wait(napi)) { | ||||
| +		for (;;) { | ||||
| +			bool repoll = false; | ||||
| + | ||||
| +			local_bh_disable(); | ||||
| + | ||||
| +			have = netpoll_poll_lock(napi); | ||||
| +			__napi_poll(napi, &repoll); | ||||
| +			netpoll_poll_unlock(have); | ||||
| + | ||||
| +			__kfree_skb_flush(); | ||||
| +			local_bh_enable(); | ||||
| + | ||||
| +			if (!repoll) | ||||
| +				break; | ||||
| + | ||||
| +			cond_resched(); | ||||
| +		} | ||||
| +	} | ||||
| +	return 0; | ||||
| +} | ||||
| + | ||||
|  static __latent_entropy void net_rx_action(struct softirq_action *h) | ||||
|  { | ||||
|  	struct softnet_data *sd = this_cpu_ptr(&softnet_data); | ||||
| @@ -0,0 +1,177 @@ | ||||
| From: Wei Wang <weiwan@google.com> | ||||
| Date: Mon, 8 Feb 2021 11:34:10 -0800 | ||||
| Subject: [PATCH] net: add sysfs attribute to control napi threaded mode | ||||
|  | ||||
| This patch adds a new sysfs attribute to the network device class. | ||||
| Said attribute provides a per-device control to enable/disable the | ||||
| threaded mode for all the napi instances of the given network device, | ||||
| without the need for a device up/down. | ||||
| User sets it to 1 or 0 to enable or disable threaded mode. | ||||
| Note: when switching between threaded and the current softirq based mode | ||||
| for a napi instance, it will not immediately take effect if the napi is | ||||
| currently being polled. The mode switch will happen for the next time | ||||
| napi_schedule() is called. | ||||
|  | ||||
| Co-developed-by: Paolo Abeni <pabeni@redhat.com> | ||||
| Signed-off-by: Paolo Abeni <pabeni@redhat.com> | ||||
| Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org> | ||||
| Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org> | ||||
| Co-developed-by: Felix Fietkau <nbd@nbd.name> | ||||
| Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
| Signed-off-by: Wei Wang <weiwan@google.com> | ||||
| Reviewed-by: Alexander Duyck <alexanderduyck@fb.com> | ||||
| Signed-off-by: David S. Miller <davem@davemloft.net> | ||||
| --- | ||||
|  | ||||
| --- a/Documentation/ABI/testing/sysfs-class-net | ||||
| +++ b/Documentation/ABI/testing/sysfs-class-net | ||||
| @@ -301,3 +301,18 @@ Contact:	netdev@vger.kernel.org | ||||
|  Description: | ||||
|  		32-bit unsigned integer counting the number of times the link has | ||||
|  		been down | ||||
| + | ||||
| +What:		/sys/class/net/<iface>/threaded | ||||
| +Date:		Jan 2021 | ||||
| +KernelVersion:	5.12 | ||||
| +Contact:	netdev@vger.kernel.org | ||||
| +Description: | ||||
| +		Boolean value to control the threaded mode per device. User could | ||||
| +		set this value to enable/disable threaded mode for all napi | ||||
| +		belonging to this device, without the need to do device up/down. | ||||
| + | ||||
| +		Possible values: | ||||
| +		== ================================== | ||||
| +		0  threaded mode disabled for this dev | ||||
| +		1  threaded mode enabled for this dev | ||||
| +		== ================================== | ||||
| --- a/include/linux/netdevice.h | ||||
| +++ b/include/linux/netdevice.h | ||||
| @@ -498,6 +498,8 @@ static inline bool napi_complete(struct | ||||
|   */ | ||||
|  bool napi_hash_del(struct napi_struct *napi); | ||||
|   | ||||
| +int dev_set_threaded(struct net_device *dev, bool threaded); | ||||
| + | ||||
|  /** | ||||
|   *	napi_disable - prevent NAPI from scheduling | ||||
|   *	@n: NAPI context | ||||
| --- a/net/core/dev.c | ||||
| +++ b/net/core/dev.c | ||||
| @@ -3911,8 +3911,9 @@ static inline void ____napi_schedule(str | ||||
|   | ||||
|  	if (test_bit(NAPI_STATE_THREADED, &napi->state)) { | ||||
|  		/* Paired with smp_mb__before_atomic() in | ||||
| -		 * napi_enable(). Use READ_ONCE() to guarantee | ||||
| -		 * a complete read on napi->thread. Only call | ||||
| +		 * napi_enable()/dev_set_threaded(). | ||||
| +		 * Use READ_ONCE() to guarantee a complete | ||||
| +		 * read on napi->thread. Only call | ||||
|  		 * wake_up_process() when it's not NULL. | ||||
|  		 */ | ||||
|  		thread = READ_ONCE(napi->thread); | ||||
| @@ -6290,6 +6291,49 @@ static void init_gro_hash(struct napi_st | ||||
|  	napi->gro_bitmask = 0; | ||||
|  } | ||||
|   | ||||
| +int dev_set_threaded(struct net_device *dev, bool threaded) | ||||
| +{ | ||||
| +	struct napi_struct *napi; | ||||
| +	int err = 0; | ||||
| + | ||||
| +	if (dev->threaded == threaded) | ||||
| +		return 0; | ||||
| + | ||||
| +	if (threaded) { | ||||
| +		list_for_each_entry(napi, &dev->napi_list, dev_list) { | ||||
| +			if (!napi->thread) { | ||||
| +				err = napi_kthread_create(napi); | ||||
| +				if (err) { | ||||
| +					threaded = false; | ||||
| +					break; | ||||
| +				} | ||||
| +			} | ||||
| +		} | ||||
| +	} | ||||
| + | ||||
| +	dev->threaded = threaded; | ||||
| + | ||||
| +	/* Make sure kthread is created before THREADED bit | ||||
| +	 * is set. | ||||
| +	 */ | ||||
| +	smp_mb__before_atomic(); | ||||
| + | ||||
| +	/* Setting/unsetting threaded mode on a napi might not immediately | ||||
| +	 * take effect, if the current napi instance is actively being | ||||
| +	 * polled. In this case, the switch between threaded mode and | ||||
| +	 * softirq mode will happen in the next round of napi_schedule(). | ||||
| +	 * This should not cause hiccups/stalls to the live traffic. | ||||
| +	 */ | ||||
| +	list_for_each_entry(napi, &dev->napi_list, dev_list) { | ||||
| +		if (threaded) | ||||
| +			set_bit(NAPI_STATE_THREADED, &napi->state); | ||||
| +		else | ||||
| +			clear_bit(NAPI_STATE_THREADED, &napi->state); | ||||
| +	} | ||||
| + | ||||
| +	return err; | ||||
| +} | ||||
| + | ||||
|  void netif_napi_add(struct net_device *dev, struct napi_struct *napi, | ||||
|  		    int (*poll)(struct napi_struct *, int), int weight) | ||||
|  { | ||||
| --- a/net/core/net-sysfs.c | ||||
| +++ b/net/core/net-sysfs.c | ||||
| @@ -557,6 +557,45 @@ static ssize_t phys_switch_id_show(struc | ||||
|  } | ||||
|  static DEVICE_ATTR_RO(phys_switch_id); | ||||
|   | ||||
| +static ssize_t threaded_show(struct device *dev, | ||||
| +			     struct device_attribute *attr, char *buf) | ||||
| +{ | ||||
| +	struct net_device *netdev = to_net_dev(dev); | ||||
| +	ssize_t ret = -EINVAL; | ||||
| + | ||||
| +	if (!rtnl_trylock()) | ||||
| +		return restart_syscall(); | ||||
| + | ||||
| +	if (dev_isalive(netdev)) | ||||
| +		ret = sprintf(buf, fmt_dec, netdev->threaded); | ||||
| + | ||||
| +	rtnl_unlock(); | ||||
| +	return ret; | ||||
| +} | ||||
| + | ||||
| +static int modify_napi_threaded(struct net_device *dev, unsigned long val) | ||||
| +{ | ||||
| +	int ret; | ||||
| + | ||||
| +	if (list_empty(&dev->napi_list)) | ||||
| +		return -EOPNOTSUPP; | ||||
| + | ||||
| +	if (val != 0 && val != 1) | ||||
| +		return -EOPNOTSUPP; | ||||
| + | ||||
| +	ret = dev_set_threaded(dev, val); | ||||
| + | ||||
| +	return ret; | ||||
| +} | ||||
| + | ||||
| +static ssize_t threaded_store(struct device *dev, | ||||
| +			      struct device_attribute *attr, | ||||
| +			      const char *buf, size_t len) | ||||
| +{ | ||||
| +	return netdev_store(dev, attr, buf, len, modify_napi_threaded); | ||||
| +} | ||||
| +static DEVICE_ATTR_RW(threaded); | ||||
| + | ||||
|  static struct attribute *net_class_attrs[] __ro_after_init = { | ||||
|  	&dev_attr_netdev_group.attr, | ||||
|  	&dev_attr_type.attr, | ||||
| @@ -587,6 +626,7 @@ static struct attribute *net_class_attrs | ||||
|  	&dev_attr_proto_down.attr, | ||||
|  	&dev_attr_carrier_up_count.attr, | ||||
|  	&dev_attr_carrier_down_count.attr, | ||||
| +	&dev_attr_threaded.attr, | ||||
|  	NULL, | ||||
|  }; | ||||
|  ATTRIBUTE_GROUPS(net_class); | ||||
| @@ -0,0 +1,93 @@ | ||||
| From: Wei Wang <weiwan@google.com> | ||||
| Date: Mon, 1 Mar 2021 17:21:13 -0800 | ||||
| Subject: [PATCH] net: fix race between napi kthread mode and busy poll | ||||
|  | ||||
| Currently, napi_thread_wait() checks for NAPI_STATE_SCHED bit to | ||||
| determine if the kthread owns this napi and could call napi->poll() on | ||||
| it. However, if socket busy poll is enabled, it is possible that the | ||||
| busy poll thread grabs this SCHED bit (after the previous napi->poll() | ||||
| invokes napi_complete_done() and clears SCHED bit) and tries to poll | ||||
| on the same napi. napi_disable() could grab the SCHED bit as well. | ||||
| This patch tries to fix this race by adding a new bit | ||||
| NAPI_STATE_SCHED_THREADED in napi->state. This bit gets set in | ||||
| ____napi_schedule() if the threaded mode is enabled, and gets cleared | ||||
| in napi_complete_done(), and we only poll the napi in kthread if this | ||||
| bit is set. This helps distinguish the ownership of the napi between | ||||
| kthread and other scenarios and fixes the race issue. | ||||
|  | ||||
| Fixes: 29863d41bb6e ("net: implement threaded-able napi poll loop support") | ||||
| Reported-by: Martin Zaharinov <micron10@gmail.com> | ||||
| Suggested-by: Jakub Kicinski <kuba@kernel.org> | ||||
| Signed-off-by: Wei Wang <weiwan@google.com> | ||||
| Cc: Alexander Duyck <alexanderduyck@fb.com> | ||||
| Cc: Eric Dumazet <edumazet@google.com> | ||||
| Cc: Paolo Abeni <pabeni@redhat.com> | ||||
| Cc: Hannes Frederic Sowa <hannes@stressinduktion.org> | ||||
| --- | ||||
|  | ||||
| --- a/include/linux/netdevice.h | ||||
| +++ b/include/linux/netdevice.h | ||||
| @@ -352,6 +352,7 @@ enum { | ||||
|  	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ | ||||
|  	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ | ||||
|  	NAPI_STATE_THREADED,		/* The poll is performed inside its own thread*/ | ||||
| +	NAPI_STATE_SCHED_THREADED,	/* Napi is currently scheduled in threaded mode */ | ||||
|  }; | ||||
|   | ||||
|  enum { | ||||
| @@ -363,6 +364,7 @@ enum { | ||||
|  	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), | ||||
|  	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), | ||||
|  	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED), | ||||
| +	NAPIF_STATE_SCHED_THREADED	= BIT(NAPI_STATE_SCHED_THREADED), | ||||
|  }; | ||||
|   | ||||
|  enum gro_result { | ||||
| --- a/net/core/dev.c | ||||
| +++ b/net/core/dev.c | ||||
| @@ -3918,6 +3918,8 @@ static inline void ____napi_schedule(str | ||||
|  		 */ | ||||
|  		thread = READ_ONCE(napi->thread); | ||||
|  		if (thread) { | ||||
| +			if (thread->state != TASK_INTERRUPTIBLE) | ||||
| +				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); | ||||
|  			wake_up_process(thread); | ||||
|  			return; | ||||
|  		} | ||||
| @@ -6078,7 +6080,8 @@ bool napi_complete_done(struct napi_stru | ||||
|   | ||||
|  		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); | ||||
|   | ||||
| -		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); | ||||
| +		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | | ||||
| +			      NAPIF_STATE_SCHED_THREADED); | ||||
|   | ||||
|  		/* If STATE_MISSED was set, leave STATE_SCHED set, | ||||
|  		 * because we will call napi->poll() one more time. | ||||
| @@ -6511,16 +6514,25 @@ static int napi_poll(struct napi_struct | ||||
|   | ||||
|  static int napi_thread_wait(struct napi_struct *napi) | ||||
|  { | ||||
| +	bool woken = false; | ||||
| + | ||||
|  	set_current_state(TASK_INTERRUPTIBLE); | ||||
|   | ||||
|  	while (!kthread_should_stop() && !napi_disable_pending(napi)) { | ||||
| -		if (test_bit(NAPI_STATE_SCHED, &napi->state)) { | ||||
| +		/* Testing SCHED_THREADED bit here to make sure the current | ||||
| +		 * kthread owns this napi and could poll on this napi. | ||||
| +		 * Testing SCHED bit is not enough because SCHED bit might be | ||||
| +		 * set by some other busy poll thread or by napi_disable(). | ||||
| +		 */ | ||||
| +		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { | ||||
|  			WARN_ON(!list_empty(&napi->poll_list)); | ||||
|  			__set_current_state(TASK_RUNNING); | ||||
|  			return 0; | ||||
|  		} | ||||
|   | ||||
|  		schedule(); | ||||
| +		/* woken being true indicates this thread owns this napi. */ | ||||
| +		woken = true; | ||||
|  		set_current_state(TASK_INTERRUPTIBLE); | ||||
|  	} | ||||
|  	__set_current_state(TASK_RUNNING); | ||||
| @@ -0,0 +1,53 @@ | ||||
| From: Paolo Abeni <pabeni@redhat.com> | ||||
| Date: Fri, 9 Apr 2021 17:24:17 +0200 | ||||
| Subject: [PATCH] net: fix hangup on napi_disable for threaded napi | ||||
|  | ||||
| napi_disable() is subject to an hangup, when the threaded | ||||
| mode is enabled and the napi is under heavy traffic. | ||||
|  | ||||
| If the relevant napi has been scheduled and the napi_disable() | ||||
| kicks in before the next napi_threaded_wait() completes - so | ||||
| that the latter quits due to the napi_disable_pending() condition, | ||||
| the existing code leaves the NAPI_STATE_SCHED bit set and the | ||||
| napi_disable() loop waiting for such bit will hang. | ||||
|  | ||||
| This patch addresses the issue by dropping the NAPI_STATE_DISABLE | ||||
| bit test in napi_thread_wait(). The later napi_threaded_poll() | ||||
| iteration will take care of clearing the NAPI_STATE_SCHED. | ||||
|  | ||||
| This also addresses a related problem reported by Jakub: | ||||
| before this patch a napi_disable()/napi_enable() pair killed | ||||
| the napi thread, effectively disabling the threaded mode. | ||||
| On the patched kernel napi_disable() simply stops scheduling | ||||
| the relevant thread. | ||||
|  | ||||
| v1 -> v2: | ||||
|   - let the main napi_thread_poll() loop clear the SCHED bit | ||||
|  | ||||
| Reported-by: Jakub Kicinski <kuba@kernel.org> | ||||
| Fixes: 29863d41bb6e ("net: implement threaded-able napi poll loop support") | ||||
| Signed-off-by: Paolo Abeni <pabeni@redhat.com> | ||||
| Reviewed-by: Eric Dumazet <edumazet@google.com> | ||||
| Link: https://lore.kernel.org/r/883923fa22745a9589e8610962b7dc59df09fb1f.1617981844.git.pabeni@redhat.com | ||||
| Signed-off-by: Jakub Kicinski <kuba@kernel.org> | ||||
| --- | ||||
|  | ||||
| --- a/net/core/dev.c | ||||
| +++ b/net/core/dev.c | ||||
| @@ -6518,7 +6518,7 @@ static int napi_thread_wait(struct napi_ | ||||
|   | ||||
|  	set_current_state(TASK_INTERRUPTIBLE); | ||||
|   | ||||
| -	while (!kthread_should_stop() && !napi_disable_pending(napi)) { | ||||
| +	while (!kthread_should_stop()) { | ||||
|  		/* Testing SCHED_THREADED bit here to make sure the current | ||||
|  		 * kthread owns this napi and could poll on this napi. | ||||
|  		 * Testing SCHED bit is not enough because SCHED bit might be | ||||
| @@ -6536,6 +6536,7 @@ static int napi_thread_wait(struct napi_ | ||||
|  		set_current_state(TASK_INTERRUPTIBLE); | ||||
|  	} | ||||
|  	__set_current_state(TASK_RUNNING); | ||||
| + | ||||
|  	return -1; | ||||
|  } | ||||
|   | ||||
| @@ -66,7 +66,7 @@ Signed-off-by: David S. Miller <davem@davemloft.net> | ||||
|  | ||||
| --- a/net/core/dev.c | ||||
| +++ b/net/core/dev.c | ||||
| @@ -5432,8 +5432,7 @@ static inline void skb_gro_reset_offset( | ||||
| @@ -5472,8 +5472,7 @@ static inline void skb_gro_reset_offset( | ||||
|  	NAPI_GRO_CB(skb)->frag0 = NULL; | ||||
|  	NAPI_GRO_CB(skb)->frag0_len = 0; | ||||
|   | ||||
|   | ||||
| @@ -15,7 +15,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|  | ||||
| --- a/include/linux/netdevice.h | ||||
| +++ b/include/linux/netdevice.h | ||||
| @@ -1549,6 +1549,7 @@ enum netdev_priv_flags { | ||||
| @@ -1540,6 +1540,7 @@ enum netdev_priv_flags { | ||||
|  	IFF_FAILOVER_SLAVE		= 1<<28, | ||||
|  	IFF_L3MDEV_RX_HANDLER		= 1<<29, | ||||
|  	IFF_LIVE_RENAME_OK		= 1<<30, | ||||
| @@ -23,7 +23,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|  }; | ||||
|   | ||||
|  #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN | ||||
| @@ -1581,6 +1582,7 @@ enum netdev_priv_flags { | ||||
| @@ -1572,6 +1573,7 @@ enum netdev_priv_flags { | ||||
|  #define IFF_FAILOVER_SLAVE		IFF_FAILOVER_SLAVE | ||||
|  #define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER | ||||
|  #define IFF_LIVE_RENAME_OK		IFF_LIVE_RENAME_OK | ||||
| @@ -31,7 +31,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|   | ||||
|  /* Specifies the type of the struct net_device::ml_priv pointer */ | ||||
|  enum netdev_ml_priv_type { | ||||
| @@ -1889,6 +1891,11 @@ struct net_device { | ||||
| @@ -1882,6 +1884,11 @@ struct net_device { | ||||
|  	const struct tlsdev_ops *tlsdev_ops; | ||||
|  #endif | ||||
|   | ||||
| @@ -43,7 +43,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|  	const struct header_ops *header_ops; | ||||
|   | ||||
|  	unsigned int		flags; | ||||
| @@ -1971,6 +1978,10 @@ struct net_device { | ||||
| @@ -1964,6 +1971,10 @@ struct net_device { | ||||
|  	struct mpls_dev __rcu	*mpls_ptr; | ||||
|  #endif | ||||
|   | ||||
| @@ -101,7 +101,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|  	help | ||||
| --- a/net/core/dev.c | ||||
| +++ b/net/core/dev.c | ||||
| @@ -3200,10 +3200,20 @@ static int xmit_one(struct sk_buff *skb, | ||||
| @@ -3221,10 +3221,20 @@ static int xmit_one(struct sk_buff *skb, | ||||
|  	if (dev_nit_active(dev)) | ||||
|  		dev_queue_xmit_nit(skb, dev); | ||||
|   | ||||
|   | ||||
| @@ -23,7 +23,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> | ||||
|  | ||||
| --- a/include/linux/netdevice.h | ||||
| +++ b/include/linux/netdevice.h | ||||
| @@ -928,6 +928,13 @@ struct devlink; | ||||
| @@ -922,6 +922,13 @@ struct devlink; | ||||
|  struct tlsdev_ops; | ||||
|   | ||||
|   | ||||
| @@ -37,7 +37,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> | ||||
|  /* | ||||
|   * This structure defines the management hooks for network devices. | ||||
|   * The following hooks can be defined; unless noted otherwise, they are | ||||
| @@ -1160,6 +1167,10 @@ struct tlsdev_ops; | ||||
| @@ -1154,6 +1161,10 @@ struct tlsdev_ops; | ||||
|   * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh, | ||||
|   *			     u16 flags); | ||||
|   * | ||||
| @@ -48,7 +48,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> | ||||
|   * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); | ||||
|   *	Called to change device carrier. Soft-devices (like dummy, team, etc) | ||||
|   *	which do not represent real hardware may define this to allow their | ||||
| @@ -1407,6 +1418,8 @@ struct net_device_ops { | ||||
| @@ -1401,6 +1412,8 @@ struct net_device_ops { | ||||
|  	int			(*ndo_bridge_dellink)(struct net_device *dev, | ||||
|  						      struct nlmsghdr *nlh, | ||||
|  						      u16 flags); | ||||
|   | ||||
| @@ -15,7 +15,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|  | ||||
| --- a/include/linux/netdevice.h | ||||
| +++ b/include/linux/netdevice.h | ||||
| @@ -929,6 +929,7 @@ struct tlsdev_ops; | ||||
| @@ -923,6 +923,7 @@ struct tlsdev_ops; | ||||
|   | ||||
|   | ||||
|  struct flow_offload; | ||||
| @@ -23,7 +23,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|   | ||||
|  enum flow_offload_type { | ||||
|  	FLOW_OFFLOAD_ADD	= 0, | ||||
| @@ -1167,8 +1168,15 @@ enum flow_offload_type { | ||||
| @@ -1161,8 +1162,15 @@ enum flow_offload_type { | ||||
|   * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh, | ||||
|   *			     u16 flags); | ||||
|   * | ||||
| @@ -40,7 +40,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|   *	Adds/deletes flow entry to/from net device flowtable. | ||||
|   * | ||||
|   * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); | ||||
| @@ -1418,8 +1426,11 @@ struct net_device_ops { | ||||
| @@ -1412,8 +1420,11 @@ struct net_device_ops { | ||||
|  	int			(*ndo_bridge_dellink)(struct net_device *dev, | ||||
|  						      struct nlmsghdr *nlh, | ||||
|  						      u16 flags); | ||||
|   | ||||
| @@ -11,7 +11,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|  | ||||
| --- a/include/linux/netdevice.h | ||||
| +++ b/include/linux/netdevice.h | ||||
| @@ -1931,6 +1931,8 @@ struct net_device { | ||||
| @@ -1927,6 +1927,8 @@ struct net_device { | ||||
|  	struct netdev_hw_addr_list	mc; | ||||
|  	struct netdev_hw_addr_list	dev_addrs; | ||||
|   | ||||
| @@ -32,7 +32,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|  	__u16			tc_index;	/* traffic control index */ | ||||
| --- a/net/core/dev.c | ||||
| +++ b/net/core/dev.c | ||||
| @@ -5498,6 +5498,9 @@ static enum gro_result dev_gro_receive(s | ||||
| @@ -5538,6 +5538,9 @@ static enum gro_result dev_gro_receive(s | ||||
|  	int same_flow; | ||||
|  	int grow; | ||||
|   | ||||
| @@ -42,7 +42,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|  	if (netif_elide_gro(skb->dev)) | ||||
|  		goto normal; | ||||
|   | ||||
| @@ -7300,6 +7303,48 @@ static void __netdev_adjacent_dev_unlink | ||||
| @@ -7481,6 +7484,48 @@ static void __netdev_adjacent_dev_unlink | ||||
|  					   &upper_dev->adj_list.lower); | ||||
|  } | ||||
|   | ||||
| @@ -91,7 +91,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|  static int __netdev_upper_dev_link(struct net_device *dev, | ||||
|  				   struct net_device *upper_dev, bool master, | ||||
|  				   void *upper_priv, void *upper_info, | ||||
| @@ -7350,6 +7395,7 @@ static int __netdev_upper_dev_link(struc | ||||
| @@ -7531,6 +7576,7 @@ static int __netdev_upper_dev_link(struc | ||||
|  	if (ret) | ||||
|  		return ret; | ||||
|   | ||||
| @@ -99,7 +99,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|  	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, | ||||
|  					    &changeupper_info.info); | ||||
|  	ret = notifier_to_errno(ret); | ||||
| @@ -7443,6 +7489,7 @@ void netdev_upper_dev_unlink(struct net_ | ||||
| @@ -7624,6 +7670,7 @@ void netdev_upper_dev_unlink(struct net_ | ||||
|   | ||||
|  	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); | ||||
|   | ||||
| @@ -107,7 +107,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
|  	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, | ||||
|  				      &changeupper_info.info); | ||||
|   | ||||
| @@ -8173,6 +8220,7 @@ int dev_set_mac_address(struct net_devic | ||||
| @@ -8354,6 +8401,7 @@ int dev_set_mac_address(struct net_devic | ||||
|  	if (err) | ||||
|  		return err; | ||||
|  	dev->addr_assign_type = NET_ADDR_SET; | ||||
|   | ||||
| @@ -1,356 +0,0 @@ | ||||
| From: Felix Fietkau <nbd@nbd.name> | ||||
| Date: Sun, 26 Jul 2020 14:03:21 +0200 | ||||
| Subject: [PATCH] net: add support for threaded NAPI polling | ||||
|  | ||||
| For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI | ||||
| poll function does not perform well. Since NAPI poll is bound to the CPU it | ||||
| was scheduled from, we can easily end up with a few very busy CPUs spending | ||||
| most of their time in softirq/ksoftirqd and some idle ones. | ||||
|  | ||||
| Introduce threaded NAPI for such drivers based on a workqueue. The API is the | ||||
| same except for using netif_threaded_napi_add instead of netif_napi_add. | ||||
|  | ||||
| In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling | ||||
| improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded | ||||
| NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling | ||||
| thread. | ||||
|  | ||||
| With threaded NAPI it seems stable and consistent (and higher than the best | ||||
| results I got without it). | ||||
|  | ||||
| Based on a patch by Hillf Danton | ||||
|  | ||||
| Cc: Hillf Danton <hdanton@sina.com> | ||||
| Signed-off-by: Felix Fietkau <nbd@nbd.name> | ||||
| --- | ||||
|  | ||||
| --- a/include/linux/netdevice.h | ||||
| +++ b/include/linux/netdevice.h | ||||
| @@ -340,6 +340,7 @@ struct napi_struct { | ||||
|  	struct list_head	dev_list; | ||||
|  	struct hlist_node	napi_hash_node; | ||||
|  	unsigned int		napi_id; | ||||
| +	struct work_struct	work; | ||||
|  }; | ||||
|   | ||||
|  enum { | ||||
| @@ -350,6 +351,7 @@ enum { | ||||
|  	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */ | ||||
|  	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ | ||||
|  	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ | ||||
| +	NAPI_STATE_THREADED,	/* Use threaded NAPI */ | ||||
|  }; | ||||
|   | ||||
|  enum { | ||||
| @@ -360,6 +362,7 @@ enum { | ||||
|  	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED), | ||||
|  	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), | ||||
|  	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), | ||||
| +	NAPIF_STATE_THREADED	 = BIT(NAPI_STATE_THREADED), | ||||
|  }; | ||||
|   | ||||
|  enum gro_result { | ||||
| @@ -2101,6 +2104,7 @@ struct net_device { | ||||
|  	struct lock_class_key	addr_list_lock_key; | ||||
|  	bool			proto_down; | ||||
|  	unsigned		wol_enabled:1; | ||||
| +	unsigned		threaded:1; | ||||
|  }; | ||||
|  #define to_net_dev(d) container_of(d, struct net_device, dev) | ||||
|   | ||||
| @@ -2281,6 +2285,26 @@ void netif_napi_add(struct net_device *d | ||||
|  		    int (*poll)(struct napi_struct *, int), int weight); | ||||
|   | ||||
|  /** | ||||
| + *	netif_threaded_napi_add - initialize a NAPI context | ||||
| + *	@dev:  network device | ||||
| + *	@napi: NAPI context | ||||
| + *	@poll: polling function | ||||
| + *	@weight: default weight | ||||
| + * | ||||
| + * This variant of netif_napi_add() should be used from drivers using NAPI | ||||
| + * with CPU intensive poll functions. | ||||
| + * This will schedule polling from a high priority workqueue | ||||
| + */ | ||||
| +static inline void netif_threaded_napi_add(struct net_device *dev, | ||||
| +					   struct napi_struct *napi, | ||||
| +					   int (*poll)(struct napi_struct *, int), | ||||
| +					   int weight) | ||||
| +{ | ||||
| +	set_bit(NAPI_STATE_THREADED, &napi->state); | ||||
| +	netif_napi_add(dev, napi, poll, weight); | ||||
| +} | ||||
| + | ||||
| +/** | ||||
|   *	netif_tx_napi_add - initialize a NAPI context | ||||
|   *	@dev:  network device | ||||
|   *	@napi: NAPI context | ||||
| --- a/net/core/dev.c | ||||
| +++ b/net/core/dev.c | ||||
| @@ -156,6 +156,7 @@ static DEFINE_SPINLOCK(offload_lock); | ||||
|  struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; | ||||
|  struct list_head ptype_all __read_mostly;	/* Taps */ | ||||
|  static struct list_head offload_base __read_mostly; | ||||
| +static struct workqueue_struct *napi_workq __read_mostly; | ||||
|   | ||||
|  static int netif_rx_internal(struct sk_buff *skb); | ||||
|  static int call_netdevice_notifiers_info(unsigned long val, | ||||
| @@ -5940,6 +5941,11 @@ void __napi_schedule(struct napi_struct | ||||
|  { | ||||
|  	unsigned long flags; | ||||
|   | ||||
| +	if (test_bit(NAPI_STATE_THREADED, &n->state)) { | ||||
| +		queue_work(napi_workq, &n->work); | ||||
| +		return; | ||||
| +	} | ||||
| + | ||||
|  	local_irq_save(flags); | ||||
|  	____napi_schedule(this_cpu_ptr(&softnet_data), n); | ||||
|  	local_irq_restore(flags); | ||||
| @@ -5991,6 +5997,11 @@ EXPORT_SYMBOL(napi_schedule_prep); | ||||
|   */ | ||||
|  void __napi_schedule_irqoff(struct napi_struct *n) | ||||
|  { | ||||
| +	if (test_bit(NAPI_STATE_THREADED, &n->state)) { | ||||
| +		queue_work(napi_workq, &n->work); | ||||
| +		return; | ||||
| +	} | ||||
| + | ||||
|  	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) | ||||
|  		____napi_schedule(this_cpu_ptr(&softnet_data), n); | ||||
|  	else | ||||
| @@ -6255,9 +6266,89 @@ static void init_gro_hash(struct napi_st | ||||
|  	napi->gro_bitmask = 0; | ||||
|  } | ||||
|   | ||||
| +static int __napi_poll(struct napi_struct *n, bool *repoll) | ||||
| +{ | ||||
| +	int work, weight; | ||||
| + | ||||
| +	weight = n->weight; | ||||
| + | ||||
| +	/* This NAPI_STATE_SCHED test is for avoiding a race | ||||
| +	 * with netpoll's poll_napi().  Only the entity which | ||||
| +	 * obtains the lock and sees NAPI_STATE_SCHED set will | ||||
| +	 * actually make the ->poll() call.  Therefore we avoid | ||||
| +	 * accidentally calling ->poll() when NAPI is not scheduled. | ||||
| +	 */ | ||||
| +	work = 0; | ||||
| +	if (test_bit(NAPI_STATE_SCHED, &n->state)) { | ||||
| +		work = n->poll(n, weight); | ||||
| +		trace_napi_poll(n, work, weight); | ||||
| +	} | ||||
| + | ||||
| +	WARN_ON_ONCE(work > weight); | ||||
| + | ||||
| +	if (likely(work < weight)) | ||||
| +		return work; | ||||
| + | ||||
| +	/* Drivers must not modify the NAPI state if they | ||||
| +	 * consume the entire weight.  In such cases this code | ||||
| +	 * still "owns" the NAPI instance and therefore can | ||||
| +	 * move the instance around on the list at-will. | ||||
| +	 */ | ||||
| +	if (unlikely(napi_disable_pending(n))) { | ||||
| +		napi_complete(n); | ||||
| +		return work; | ||||
| +	} | ||||
| + | ||||
| +	if (n->gro_bitmask) { | ||||
| +		/* flush too old packets | ||||
| +		 * If HZ < 1000, flush all packets. | ||||
| +		 */ | ||||
| +		napi_gro_flush(n, HZ >= 1000); | ||||
| +	} | ||||
| + | ||||
| +	gro_normal_list(n); | ||||
| + | ||||
| +	*repoll = true; | ||||
| + | ||||
| +	return work; | ||||
| +} | ||||
| + | ||||
| +static void napi_workfn(struct work_struct *work) | ||||
| +{ | ||||
| +	struct napi_struct *n = container_of(work, struct napi_struct, work); | ||||
| +	void *have; | ||||
| + | ||||
| +	for (;;) { | ||||
| +		bool repoll = false; | ||||
| + | ||||
| +		local_bh_disable(); | ||||
| + | ||||
| +		have = netpoll_poll_lock(n); | ||||
| +		__napi_poll(n, &repoll); | ||||
| +		netpoll_poll_unlock(have); | ||||
| + | ||||
| +		local_bh_enable(); | ||||
| + | ||||
| +		if (!repoll) | ||||
| +			return; | ||||
| + | ||||
| +		if (!need_resched()) | ||||
| +			continue; | ||||
| + | ||||
| +		/* | ||||
| +		 * have to pay for the latency of task switch even if | ||||
| +		 * napi is scheduled | ||||
| +		 */ | ||||
| +		queue_work(napi_workq, work); | ||||
| +		return; | ||||
| +	} | ||||
| +} | ||||
| + | ||||
|  void netif_napi_add(struct net_device *dev, struct napi_struct *napi, | ||||
|  		    int (*poll)(struct napi_struct *, int), int weight) | ||||
|  { | ||||
| +	if (dev->threaded) | ||||
| +		set_bit(NAPI_STATE_THREADED, &napi->state); | ||||
|  	INIT_LIST_HEAD(&napi->poll_list); | ||||
|  	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); | ||||
|  	napi->timer.function = napi_watchdog; | ||||
| @@ -6274,6 +6365,7 @@ void netif_napi_add(struct net_device *d | ||||
|  #ifdef CONFIG_NETPOLL | ||||
|  	napi->poll_owner = -1; | ||||
|  #endif | ||||
| +	INIT_WORK(&napi->work, napi_workfn); | ||||
|  	set_bit(NAPI_STATE_SCHED, &napi->state); | ||||
|  	set_bit(NAPI_STATE_NPSVC, &napi->state); | ||||
|  	list_add_rcu(&napi->dev_list, &dev->napi_list); | ||||
| @@ -6314,6 +6406,7 @@ static void flush_gro_hash(struct napi_s | ||||
|  void netif_napi_del(struct napi_struct *napi) | ||||
|  { | ||||
|  	might_sleep(); | ||||
| +	cancel_work_sync(&napi->work); | ||||
|  	if (napi_hash_del(napi)) | ||||
|  		synchronize_net(); | ||||
|  	list_del_init(&napi->dev_list); | ||||
| @@ -6326,50 +6419,18 @@ EXPORT_SYMBOL(netif_napi_del); | ||||
|   | ||||
|  static int napi_poll(struct napi_struct *n, struct list_head *repoll) | ||||
|  { | ||||
| +	bool do_repoll = false; | ||||
|  	void *have; | ||||
| -	int work, weight; | ||||
| +	int work; | ||||
|   | ||||
|  	list_del_init(&n->poll_list); | ||||
|   | ||||
|  	have = netpoll_poll_lock(n); | ||||
|   | ||||
| -	weight = n->weight; | ||||
| +	work = __napi_poll(n, &do_repoll); | ||||
|   | ||||
| -	/* This NAPI_STATE_SCHED test is for avoiding a race | ||||
| -	 * with netpoll's poll_napi().  Only the entity which | ||||
| -	 * obtains the lock and sees NAPI_STATE_SCHED set will | ||||
| -	 * actually make the ->poll() call.  Therefore we avoid | ||||
| -	 * accidentally calling ->poll() when NAPI is not scheduled. | ||||
| -	 */ | ||||
| -	work = 0; | ||||
| -	if (test_bit(NAPI_STATE_SCHED, &n->state)) { | ||||
| -		work = n->poll(n, weight); | ||||
| -		trace_napi_poll(n, work, weight); | ||||
| -	} | ||||
| - | ||||
| -	WARN_ON_ONCE(work > weight); | ||||
| - | ||||
| -	if (likely(work < weight)) | ||||
| -		goto out_unlock; | ||||
| - | ||||
| -	/* Drivers must not modify the NAPI state if they | ||||
| -	 * consume the entire weight.  In such cases this code | ||||
| -	 * still "owns" the NAPI instance and therefore can | ||||
| -	 * move the instance around on the list at-will. | ||||
| -	 */ | ||||
| -	if (unlikely(napi_disable_pending(n))) { | ||||
| -		napi_complete(n); | ||||
| +	if (!do_repoll) | ||||
|  		goto out_unlock; | ||||
| -	} | ||||
| - | ||||
| -	if (n->gro_bitmask) { | ||||
| -		/* flush too old packets | ||||
| -		 * If HZ < 1000, flush all packets. | ||||
| -		 */ | ||||
| -		napi_gro_flush(n, HZ >= 1000); | ||||
| -	} | ||||
| - | ||||
| -	gro_normal_list(n); | ||||
|   | ||||
|  	/* Some drivers may have called napi_schedule | ||||
|  	 * prior to exhausting their budget. | ||||
| @@ -10349,6 +10410,10 @@ static int __init net_dev_init(void) | ||||
|  		sd->backlog.weight = weight_p; | ||||
|  	} | ||||
|   | ||||
| +	napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI, | ||||
| +				     WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS); | ||||
| +	BUG_ON(!napi_workq); | ||||
| + | ||||
|  	dev_boot_phase = 0; | ||||
|   | ||||
|  	/* The loopback device is special if any other network devices | ||||
| --- a/net/core/net-sysfs.c | ||||
| +++ b/net/core/net-sysfs.c | ||||
| @@ -470,6 +470,52 @@ static ssize_t proto_down_store(struct d | ||||
|  } | ||||
|  NETDEVICE_SHOW_RW(proto_down, fmt_dec); | ||||
|   | ||||
| +static int change_napi_threaded(struct net_device *dev, unsigned long val) | ||||
| +{ | ||||
| +	struct napi_struct *napi; | ||||
| + | ||||
| +	if (list_empty(&dev->napi_list)) | ||||
| +		return -EOPNOTSUPP; | ||||
| + | ||||
| +	list_for_each_entry(napi, &dev->napi_list, dev_list) { | ||||
| +		if (val) | ||||
| +			set_bit(NAPI_STATE_THREADED, &napi->state); | ||||
| +		else | ||||
| +			clear_bit(NAPI_STATE_THREADED, &napi->state); | ||||
| +	} | ||||
| + | ||||
| +	return 0; | ||||
| +} | ||||
| + | ||||
| +static ssize_t napi_threaded_store(struct device *dev, | ||||
| +				struct device_attribute *attr, | ||||
| +				const char *buf, size_t len) | ||||
| +{ | ||||
| +	return netdev_store(dev, attr, buf, len, change_napi_threaded); | ||||
| +} | ||||
| + | ||||
| +static ssize_t napi_threaded_show(struct device *dev, | ||||
| +				  struct device_attribute *attr, | ||||
| +				  char *buf) | ||||
| +{ | ||||
| +	struct net_device *netdev = to_net_dev(dev); | ||||
| +	struct napi_struct *napi; | ||||
| +	bool enabled = false; | ||||
| + | ||||
| +	if (!rtnl_trylock()) | ||||
| +		return restart_syscall(); | ||||
| + | ||||
| +	list_for_each_entry(napi, &netdev->napi_list, dev_list) { | ||||
| +		if (test_bit(NAPI_STATE_THREADED, &napi->state)) | ||||
| +			enabled = true; | ||||
| +	} | ||||
| + | ||||
| +	rtnl_unlock(); | ||||
| + | ||||
| +	return sprintf(buf, fmt_dec, enabled); | ||||
| +} | ||||
| +static DEVICE_ATTR_RW(napi_threaded); | ||||
| + | ||||
|  static ssize_t phys_port_id_show(struct device *dev, | ||||
|  				 struct device_attribute *attr, char *buf) | ||||
|  { | ||||
| @@ -581,6 +627,7 @@ static struct attribute *net_class_attrs | ||||
|  	&dev_attr_flags.attr, | ||||
|  	&dev_attr_tx_queue_len.attr, | ||||
|  	&dev_attr_gro_flush_timeout.attr, | ||||
| +	&dev_attr_napi_threaded.attr, | ||||
|  	&dev_attr_phys_port_id.attr, | ||||
|  	&dev_attr_phys_port_name.attr, | ||||
|  	&dev_attr_phys_switch_id.attr, | ||||
| @@ -24,7 +24,7 @@ Reviewed-by: Grant Grundler <grundler@chromium.org> | ||||
|  | ||||
| --- a/include/linux/netdevice.h | ||||
| +++ b/include/linux/netdevice.h | ||||
| @@ -776,6 +776,16 @@ struct xps_map { | ||||
| @@ -767,6 +767,16 @@ struct xps_map { | ||||
|  #define XPS_MIN_MAP_ALLOC ((L1_CACHE_ALIGN(offsetof(struct xps_map, queues[1])) \ | ||||
|         - sizeof(struct xps_map)) / sizeof(u16)) | ||||
|   | ||||
| @@ -41,7 +41,7 @@ Reviewed-by: Grant Grundler <grundler@chromium.org> | ||||
|  /* | ||||
|   * This structure holds all XPS maps for device.  Maps are indexed by CPU. | ||||
|   */ | ||||
| @@ -1379,6 +1389,9 @@ struct net_device_ops { | ||||
| @@ -1370,6 +1380,9 @@ struct net_device_ops { | ||||
|  						     const struct sk_buff *skb, | ||||
|  						     u16 rxq_index, | ||||
|  						     u32 flow_id); | ||||
|   | ||||