generic: 6.6: replace (broken) downstream patch with upstream solution
Our downstream patch "net/core: add optional threading for backlog
processing" has been broken with the switch to Linux 6.6. Replace it by
backporting the now available upstream solution.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://github.com/openwrt/openwrt/pull/15592
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
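For reference, the dropped downstream patch exposed a runtime sysctl, while the backported upstream code is enabled at boot instead (and is always active on PREEMPT_RT). A rough usage sketch, assuming the parameter is passed through the target's usual bootargs/cmdline mechanism:

    # old, removed by this commit (runtime toggle added by the downstream patch)
    sysctl -w net.core.backlog_threaded=1

    # new, upstream backport: append to the kernel command line
    thread_backlog_napi

With the boot parameter set, backlog processing runs in the per-CPU "backlog_napi/N" smpboot threads created by the backported patches rather than in the NET_RX softirq path.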
Author:    Daniel Golle
Committer: Christian Marangi
Parent:    f3080677f5
Commit:    a5c095c453
			| @@ -0,0 +1,75 @@ | |||||||
|  | From 56364c910691f6d10ba88c964c9041b9ab777bd6 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> | ||||||
|  | Date: Mon, 25 Mar 2024 08:40:28 +0100 | ||||||
|  | Subject: [PATCH 1/4] net: Remove conditional threaded-NAPI wakeup based on | ||||||
|  |  task state. | ||||||
|  |  | ||||||
|  | A NAPI thread is scheduled by first setting NAPI_STATE_SCHED bit. If | ||||||
|  | successful (the bit was not yet set) then the NAPI_STATE_SCHED_THREADED | ||||||
|  | is set but only if thread's state is not TASK_INTERRUPTIBLE (is | ||||||
|  | TASK_RUNNING) followed by task wakeup. | ||||||
|  |  | ||||||
|  | If the task is idle (TASK_INTERRUPTIBLE) then the | ||||||
|  | NAPI_STATE_SCHED_THREADED bit is not set. The thread is no relying on | ||||||
|  | the bit but always leaving the wait-loop after returning from schedule() | ||||||
|  | because there must have been a wakeup. | ||||||
|  |  | ||||||
|  | The smpboot-threads implementation for per-CPU threads requires an | ||||||
|  | explicit condition and does not support "if we get out of schedule() | ||||||
|  | then there must be something to do". | ||||||
|  |  | ||||||
|  | Removing this optimisation simplifies the following integration. | ||||||
|  |  | ||||||
|  | Set NAPI_STATE_SCHED_THREADED unconditionally on wakeup and rely on it | ||||||
|  | in the wait path by removing the `woken' condition. | ||||||
|  |  | ||||||
|  | Acked-by: Jakub Kicinski <kuba@kernel.org> | ||||||
|  | Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> | ||||||
|  | Signed-off-by: Paolo Abeni <pabeni@redhat.com> | ||||||
|  | --- | ||||||
|  |  net/core/dev.c | 14 ++------------ | ||||||
|  |  1 file changed, 2 insertions(+), 12 deletions(-) | ||||||
|  |  | ||||||
|  | --- a/net/core/dev.c | ||||||
|  | +++ b/net/core/dev.c | ||||||
|  | @@ -4473,13 +4473,7 @@ static inline void ____napi_schedule(str | ||||||
|  |  		 */ | ||||||
|  |  		thread = READ_ONCE(napi->thread); | ||||||
|  |  		if (thread) { | ||||||
|  | -			/* Avoid doing set_bit() if the thread is in | ||||||
|  | -			 * INTERRUPTIBLE state, cause napi_thread_wait() | ||||||
|  | -			 * makes sure to proceed with napi polling | ||||||
|  | -			 * if the thread is explicitly woken from here. | ||||||
|  | -			 */ | ||||||
|  | -			if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) | ||||||
|  | -				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); | ||||||
|  | +			set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); | ||||||
|  |  			wake_up_process(thread); | ||||||
|  |  			return; | ||||||
|  |  		} | ||||||
|  | @@ -6635,8 +6629,6 @@ static int napi_poll(struct napi_struct | ||||||
|  |   | ||||||
|  |  static int napi_thread_wait(struct napi_struct *napi) | ||||||
|  |  { | ||||||
|  | -	bool woken = false; | ||||||
|  | - | ||||||
|  |  	set_current_state(TASK_INTERRUPTIBLE); | ||||||
|  |   | ||||||
|  |  	while (!kthread_should_stop()) { | ||||||
|  | @@ -6645,15 +6637,13 @@ static int napi_thread_wait(struct napi_ | ||||||
|  |  		 * Testing SCHED bit is not enough because SCHED bit might be | ||||||
|  |  		 * set by some other busy poll thread or by napi_disable(). | ||||||
|  |  		 */ | ||||||
|  | -		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { | ||||||
|  | +		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { | ||||||
|  |  			WARN_ON(!list_empty(&napi->poll_list)); | ||||||
|  |  			__set_current_state(TASK_RUNNING); | ||||||
|  |  			return 0; | ||||||
|  |  		} | ||||||
|  |   | ||||||
|  |  		schedule(); | ||||||
|  | -		/* woken being true indicates this thread owns this napi. */ | ||||||
|  | -		woken = true; | ||||||
|  |  		set_current_state(TASK_INTERRUPTIBLE); | ||||||
|  |  	} | ||||||
|  |  	__set_current_state(TASK_RUNNING); | ||||||
| @@ -0,0 +1,330 @@ | |||||||
|  | From dad6b97702639fba27a2bd3e986982ad6f0db3a7 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> | ||||||
|  | Date: Mon, 25 Mar 2024 08:40:29 +0100 | ||||||
|  | Subject: [PATCH 2/4] net: Allow to use SMP threads for backlog NAPI. | ||||||
|  |  | ||||||
|  | Backlog NAPI is a per-CPU NAPI struct only (with no device behind it) | ||||||
|  | used by drivers which don't do NAPI them self, RPS and parts of the | ||||||
|  | stack which need to avoid recursive deadlocks while processing a packet. | ||||||
|  |  | ||||||
|  | The non-NAPI driver use the CPU local backlog NAPI. If RPS is enabled | ||||||
|  | then a flow for the skb is computed and based on the flow the skb can be | ||||||
|  | enqueued on a remote CPU. Scheduling/ raising the softirq (for backlog's | ||||||
|  | NAPI) on the remote CPU isn't trivial because the softirq is only | ||||||
|  | scheduled on the local CPU and performed after the hardirq is done. | ||||||
|  | In order to schedule a softirq on the remote CPU, an IPI is sent to the | ||||||
|  | remote CPU which schedules the backlog-NAPI on the then local CPU. | ||||||
|  |  | ||||||
|  | On PREEMPT_RT interrupts are force-threaded. The soft interrupts are | ||||||
|  | raised within the interrupt thread and processed after the interrupt | ||||||
|  | handler completed still within the context of the interrupt thread. The | ||||||
|  | softirq is handled in the context where it originated. | ||||||
|  |  | ||||||
|  | With force-threaded interrupts enabled, ksoftirqd is woken up if a | ||||||
|  | softirq is raised from hardirq context. This is the case if it is raised | ||||||
|  | from an IPI. Additionally there is a warning on PREEMPT_RT if the | ||||||
|  | softirq is raised from the idle thread. | ||||||
|  | This was done for two reasons: | ||||||
|  | - With threaded interrupts the processing should happen in thread | ||||||
|  |   context (where it originated) and ksoftirqd is the only thread for | ||||||
|  |   this context if raised from hardirq. Using the currently running task | ||||||
|  |   instead would "punish" a random task. | ||||||
|  | - Once ksoftirqd is active it consumes all further softirqs until it | ||||||
|  |   stops running. This changed recently and is no longer the case. | ||||||
|  |  | ||||||
|  | Instead of keeping the backlog NAPI in ksoftirqd (in force-threaded/ | ||||||
|  | PREEMPT_RT setups) I am proposing NAPI-threads for backlog. | ||||||
|  | The "proper" setup with threaded-NAPI is not doable because the threads | ||||||
|  | are not pinned to an individual CPU and can be modified by the user. | ||||||
|  | Additionally a dummy network device would have to be assigned. Also | ||||||
|  | CPU-hotplug has to be considered if additional CPUs show up. | ||||||
|  | All this can be probably done/ solved but the smpboot-threads already | ||||||
|  | provide this infrastructure. | ||||||
|  |  | ||||||
|  | Sending UDP packets over loopback expects that the packet is processed | ||||||
|  | within the call. Delaying it by handing it over to the thread hurts | ||||||
|  | performance. It is not beneficial to the outcome if the context switch | ||||||
|  | happens immediately after enqueue or after a while to process a few | ||||||
|  | packets in a batch. | ||||||
|  | There is no need to always use the thread if the backlog NAPI is | ||||||
|  | requested on the local CPU. This restores the loopback throuput. The | ||||||
|  | performance drops mostly to the same value after enabling RPS on the | ||||||
|  | loopback comparing the IPI and the tread result. | ||||||
|  |  | ||||||
|  | Create NAPI-threads for backlog if request during boot. The thread runs | ||||||
|  | the inner loop from napi_threaded_poll(), the wait part is different. It | ||||||
|  | checks for NAPI_STATE_SCHED (the backlog NAPI can not be disabled). | ||||||
|  |  | ||||||
|  | The NAPI threads for backlog are optional, it has to be enabled via the boot | ||||||
|  | argument "thread_backlog_napi". It is mandatory for PREEMPT_RT to avoid the | ||||||
|  | wakeup of ksoftirqd from the IPI. | ||||||
|  |  | ||||||
|  | Acked-by: Jakub Kicinski <kuba@kernel.org> | ||||||
|  | Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> | ||||||
|  | Signed-off-by: Paolo Abeni <pabeni@redhat.com> | ||||||
|  | --- | ||||||
|  |  net/core/dev.c | 148 +++++++++++++++++++++++++++++++++++++------------ | ||||||
|  |  1 file changed, 113 insertions(+), 35 deletions(-) | ||||||
|  |  | ||||||
|  | --- a/net/core/dev.c | ||||||
|  | +++ b/net/core/dev.c | ||||||
|  | @@ -78,6 +78,7 @@ | ||||||
|  |  #include <linux/slab.h> | ||||||
|  |  #include <linux/sched.h> | ||||||
|  |  #include <linux/sched/mm.h> | ||||||
|  | +#include <linux/smpboot.h> | ||||||
|  |  #include <linux/mutex.h> | ||||||
|  |  #include <linux/rwsem.h> | ||||||
|  |  #include <linux/string.h> | ||||||
|  | @@ -217,6 +218,31 @@ static inline struct hlist_head *dev_ind | ||||||
|  |  	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; | ||||||
|  |  } | ||||||
|  |   | ||||||
|  | +#ifndef CONFIG_PREEMPT_RT | ||||||
|  | + | ||||||
|  | +static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); | ||||||
|  | + | ||||||
|  | +static int __init setup_backlog_napi_threads(char *arg) | ||||||
|  | +{ | ||||||
|  | +	static_branch_enable(&use_backlog_threads_key); | ||||||
|  | +	return 0; | ||||||
|  | +} | ||||||
|  | +early_param("thread_backlog_napi", setup_backlog_napi_threads); | ||||||
|  | + | ||||||
|  | +static bool use_backlog_threads(void) | ||||||
|  | +{ | ||||||
|  | +	return static_branch_unlikely(&use_backlog_threads_key); | ||||||
|  | +} | ||||||
|  | + | ||||||
|  | +#else | ||||||
|  | + | ||||||
|  | +static bool use_backlog_threads(void) | ||||||
|  | +{ | ||||||
|  | +	return true; | ||||||
|  | +} | ||||||
|  | + | ||||||
|  | +#endif | ||||||
|  | + | ||||||
|  |  static inline void rps_lock_irqsave(struct softnet_data *sd, | ||||||
|  |  				    unsigned long *flags) | ||||||
|  |  { | ||||||
|  | @@ -4441,6 +4467,7 @@ EXPORT_SYMBOL(__dev_direct_xmit); | ||||||
|  |  /************************************************************************* | ||||||
|  |   *			Receiver routines | ||||||
|  |   *************************************************************************/ | ||||||
|  | +static DEFINE_PER_CPU(struct task_struct *, backlog_napi); | ||||||
|  |   | ||||||
|  |  int netdev_max_backlog __read_mostly = 1000; | ||||||
|  |  EXPORT_SYMBOL(netdev_max_backlog); | ||||||
|  | @@ -4473,12 +4500,16 @@ static inline void ____napi_schedule(str | ||||||
|  |  		 */ | ||||||
|  |  		thread = READ_ONCE(napi->thread); | ||||||
|  |  		if (thread) { | ||||||
|  | +			if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) | ||||||
|  | +				goto use_local_napi; | ||||||
|  | + | ||||||
|  |  			set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); | ||||||
|  |  			wake_up_process(thread); | ||||||
|  |  			return; | ||||||
|  |  		} | ||||||
|  |  	} | ||||||
|  |   | ||||||
|  | +use_local_napi: | ||||||
|  |  	list_add_tail(&napi->poll_list, &sd->poll_list); | ||||||
|  |  	WRITE_ONCE(napi->list_owner, smp_processor_id()); | ||||||
|  |  	/* If not called from net_rx_action() | ||||||
|  | @@ -4724,6 +4755,11 @@ static void napi_schedule_rps(struct sof | ||||||
|  |   | ||||||
|  |  #ifdef CONFIG_RPS | ||||||
|  |  	if (sd != mysd) { | ||||||
|  | +		if (use_backlog_threads()) { | ||||||
|  | +			__napi_schedule_irqoff(&sd->backlog); | ||||||
|  | +			return; | ||||||
|  | +		} | ||||||
|  | + | ||||||
|  |  		sd->rps_ipi_next = mysd->rps_ipi_list; | ||||||
|  |  		mysd->rps_ipi_list = sd; | ||||||
|  |   | ||||||
|  | @@ -5947,7 +5983,7 @@ static void net_rps_action_and_irq_enabl | ||||||
|  |  #ifdef CONFIG_RPS | ||||||
|  |  	struct softnet_data *remsd = sd->rps_ipi_list; | ||||||
|  |   | ||||||
|  | -	if (remsd) { | ||||||
|  | +	if (!use_backlog_threads() && remsd) { | ||||||
|  |  		sd->rps_ipi_list = NULL; | ||||||
|  |   | ||||||
|  |  		local_irq_enable(); | ||||||
|  | @@ -5962,7 +5998,7 @@ static void net_rps_action_and_irq_enabl | ||||||
|  |  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) | ||||||
|  |  { | ||||||
|  |  #ifdef CONFIG_RPS | ||||||
|  | -	return sd->rps_ipi_list != NULL; | ||||||
|  | +	return !use_backlog_threads() && sd->rps_ipi_list; | ||||||
|  |  #else | ||||||
|  |  	return false; | ||||||
|  |  #endif | ||||||
|  | @@ -6006,7 +6042,7 @@ static int process_backlog(struct napi_s | ||||||
|  |  			 * We can use a plain write instead of clear_bit(), | ||||||
|  |  			 * and we dont need an smp_mb() memory barrier. | ||||||
|  |  			 */ | ||||||
|  | -			napi->state = 0; | ||||||
|  | +			napi->state &= NAPIF_STATE_THREADED; | ||||||
|  |  			again = false; | ||||||
|  |  		} else { | ||||||
|  |  			skb_queue_splice_tail_init(&sd->input_pkt_queue, | ||||||
|  | @@ -6672,43 +6708,48 @@ static void skb_defer_free_flush(struct | ||||||
|  |  	} | ||||||
|  |  } | ||||||
|  |   | ||||||
|  | -static int napi_threaded_poll(void *data) | ||||||
|  | +static void napi_threaded_poll_loop(struct napi_struct *napi) | ||||||
|  |  { | ||||||
|  | -	struct napi_struct *napi = data; | ||||||
|  |  	struct softnet_data *sd; | ||||||
|  | -	void *have; | ||||||
|  | +	unsigned long last_qs = jiffies; | ||||||
|  |   | ||||||
|  | -	while (!napi_thread_wait(napi)) { | ||||||
|  | -		unsigned long last_qs = jiffies; | ||||||
|  | +	for (;;) { | ||||||
|  | +		bool repoll = false; | ||||||
|  | +		void *have; | ||||||
|  |   | ||||||
|  | -		for (;;) { | ||||||
|  | -			bool repoll = false; | ||||||
|  | +		local_bh_disable(); | ||||||
|  | +		sd = this_cpu_ptr(&softnet_data); | ||||||
|  | +		sd->in_napi_threaded_poll = true; | ||||||
|  |   | ||||||
|  | -			local_bh_disable(); | ||||||
|  | -			sd = this_cpu_ptr(&softnet_data); | ||||||
|  | -			sd->in_napi_threaded_poll = true; | ||||||
|  | - | ||||||
|  | -			have = netpoll_poll_lock(napi); | ||||||
|  | -			__napi_poll(napi, &repoll); | ||||||
|  | -			netpoll_poll_unlock(have); | ||||||
|  | - | ||||||
|  | -			sd->in_napi_threaded_poll = false; | ||||||
|  | -			barrier(); | ||||||
|  | - | ||||||
|  | -			if (sd_has_rps_ipi_waiting(sd)) { | ||||||
|  | -				local_irq_disable(); | ||||||
|  | -				net_rps_action_and_irq_enable(sd); | ||||||
|  | -			} | ||||||
|  | -			skb_defer_free_flush(sd); | ||||||
|  | -			local_bh_enable(); | ||||||
|  | +		have = netpoll_poll_lock(napi); | ||||||
|  | +		__napi_poll(napi, &repoll); | ||||||
|  | +		netpoll_poll_unlock(have); | ||||||
|  | + | ||||||
|  | +		sd->in_napi_threaded_poll = false; | ||||||
|  | +		barrier(); | ||||||
|  | + | ||||||
|  | +		if (sd_has_rps_ipi_waiting(sd)) { | ||||||
|  | +			local_irq_disable(); | ||||||
|  | +			net_rps_action_and_irq_enable(sd); | ||||||
|  | +		} | ||||||
|  | +		skb_defer_free_flush(sd); | ||||||
|  | +		local_bh_enable(); | ||||||
|  |   | ||||||
|  | -			if (!repoll) | ||||||
|  | -				break; | ||||||
|  | +		if (!repoll) | ||||||
|  | +			break; | ||||||
|  |   | ||||||
|  | -			rcu_softirq_qs_periodic(last_qs); | ||||||
|  | -			cond_resched(); | ||||||
|  | -		} | ||||||
|  | +		rcu_softirq_qs_periodic(last_qs); | ||||||
|  | +		cond_resched(); | ||||||
|  |  	} | ||||||
|  | +} | ||||||
|  | + | ||||||
|  | +static int napi_threaded_poll(void *data) | ||||||
|  | +{ | ||||||
|  | +	struct napi_struct *napi = data; | ||||||
|  | + | ||||||
|  | +	while (!napi_thread_wait(napi)) | ||||||
|  | +		napi_threaded_poll_loop(napi); | ||||||
|  | + | ||||||
|  |  	return 0; | ||||||
|  |  } | ||||||
|  |   | ||||||
|  | @@ -11288,7 +11329,7 @@ static int dev_cpu_dead(unsigned int old | ||||||
|  |   | ||||||
|  |  		list_del_init(&napi->poll_list); | ||||||
|  |  		if (napi->poll == process_backlog) | ||||||
|  | -			napi->state = 0; | ||||||
|  | +			napi->state &= NAPIF_STATE_THREADED; | ||||||
|  |  		else | ||||||
|  |  			____napi_schedule(sd, napi); | ||||||
|  |  	} | ||||||
|  | @@ -11296,12 +11337,14 @@ static int dev_cpu_dead(unsigned int old | ||||||
|  |  	raise_softirq_irqoff(NET_TX_SOFTIRQ); | ||||||
|  |  	local_irq_enable(); | ||||||
|  |   | ||||||
|  | +	if (!use_backlog_threads()) { | ||||||
|  |  #ifdef CONFIG_RPS | ||||||
|  | -	remsd = oldsd->rps_ipi_list; | ||||||
|  | -	oldsd->rps_ipi_list = NULL; | ||||||
|  | +		remsd = oldsd->rps_ipi_list; | ||||||
|  | +		oldsd->rps_ipi_list = NULL; | ||||||
|  |  #endif | ||||||
|  | -	/* send out pending IPI's on offline CPU */ | ||||||
|  | -	net_rps_send_ipi(remsd); | ||||||
|  | +		/* send out pending IPI's on offline CPU */ | ||||||
|  | +		net_rps_send_ipi(remsd); | ||||||
|  | +	} | ||||||
|  |   | ||||||
|  |  	/* Process offline CPU's input_pkt_queue */ | ||||||
|  |  	while ((skb = __skb_dequeue(&oldsd->process_queue))) { | ||||||
|  | @@ -11564,6 +11607,38 @@ static struct pernet_operations __net_in | ||||||
|  |   * | ||||||
|  |   */ | ||||||
|  |   | ||||||
|  | +static int backlog_napi_should_run(unsigned int cpu) | ||||||
|  | +{ | ||||||
|  | +	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); | ||||||
|  | +	struct napi_struct *napi = &sd->backlog; | ||||||
|  | + | ||||||
|  | +	return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); | ||||||
|  | +} | ||||||
|  | + | ||||||
|  | +static void run_backlog_napi(unsigned int cpu) | ||||||
|  | +{ | ||||||
|  | +	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); | ||||||
|  | + | ||||||
|  | +	napi_threaded_poll_loop(&sd->backlog); | ||||||
|  | +} | ||||||
|  | + | ||||||
|  | +static void backlog_napi_setup(unsigned int cpu) | ||||||
|  | +{ | ||||||
|  | +	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); | ||||||
|  | +	struct napi_struct *napi = &sd->backlog; | ||||||
|  | + | ||||||
|  | +	napi->thread = this_cpu_read(backlog_napi); | ||||||
|  | +	set_bit(NAPI_STATE_THREADED, &napi->state); | ||||||
|  | +} | ||||||
|  | + | ||||||
|  | +static struct smp_hotplug_thread backlog_threads = { | ||||||
|  | +	.store			= &backlog_napi, | ||||||
|  | +	.thread_should_run	= backlog_napi_should_run, | ||||||
|  | +	.thread_fn		= run_backlog_napi, | ||||||
|  | +	.thread_comm		= "backlog_napi/%u", | ||||||
|  | +	.setup			= backlog_napi_setup, | ||||||
|  | +}; | ||||||
|  | + | ||||||
|  |  /* | ||||||
|  |   *       This is called single threaded during boot, so no need | ||||||
|  |   *       to take the rtnl semaphore. | ||||||
|  | @@ -11614,7 +11689,10 @@ static int __init net_dev_init(void) | ||||||
|  |  		init_gro_hash(&sd->backlog); | ||||||
|  |  		sd->backlog.poll = process_backlog; | ||||||
|  |  		sd->backlog.weight = weight_p; | ||||||
|  | +		INIT_LIST_HEAD(&sd->backlog.poll_list); | ||||||
|  |  	} | ||||||
|  | +	if (use_backlog_threads()) | ||||||
|  | +		smpboot_register_percpu_thread(&backlog_threads); | ||||||
|  |   | ||||||
|  |  	dev_boot_phase = 0; | ||||||
|  |   | ||||||
| @@ -0,0 +1,121 @@ | |||||||
|  | From 80d2eefcb4c84aa9018b2a997ab3a4c567bc821a Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> | ||||||
|  | Date: Mon, 25 Mar 2024 08:40:30 +0100 | ||||||
|  | Subject: [PATCH 3/4] net: Use backlog-NAPI to clean up the defer_list. | ||||||
|  |  | ||||||
|  | The defer_list is a per-CPU list which is used to free skbs outside of | ||||||
|  | the socket lock and on the CPU on which they have been allocated. | ||||||
|  | The list is processed during NAPI callbacks so ideally the list is | ||||||
|  | cleaned up. | ||||||
|  | Should the amount of skbs on the list exceed a certain water mark then | ||||||
|  | the softirq is triggered remotely on the target CPU by invoking a remote | ||||||
|  | function call. The raise of the softirqs via a remote function call | ||||||
|  | leads to waking the ksoftirqd on PREEMPT_RT which is undesired. | ||||||
|  | The backlog-NAPI threads already provide the infrastructure which can be | ||||||
|  | utilized to perform the cleanup of the defer_list. | ||||||
|  |  | ||||||
|  | The NAPI state is updated with the input_pkt_queue.lock acquired. It | ||||||
|  | order not to break the state, it is needed to also wake the backlog-NAPI | ||||||
|  | thread with the lock held. This requires to acquire the use the lock in | ||||||
|  | rps_lock_irq*() if the backlog-NAPI threads are used even with RPS | ||||||
|  | disabled. | ||||||
|  |  | ||||||
|  | Move the logic of remotely starting softirqs to clean up the defer_list | ||||||
|  | into kick_defer_list_purge(). Make sure a lock is held in | ||||||
|  | rps_lock_irq*() if backlog-NAPI threads are used. Schedule backlog-NAPI | ||||||
|  | for defer_list cleanup if backlog-NAPI is available. | ||||||
|  |  | ||||||
|  | Acked-by: Jakub Kicinski <kuba@kernel.org> | ||||||
|  | Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> | ||||||
|  | Signed-off-by: Paolo Abeni <pabeni@redhat.com> | ||||||
|  | --- | ||||||
|  |  include/linux/netdevice.h |  1 + | ||||||
|  |  net/core/dev.c            | 25 +++++++++++++++++++++---- | ||||||
|  |  net/core/skbuff.c         |  4 ++-- | ||||||
|  |  3 files changed, 24 insertions(+), 6 deletions(-) | ||||||
|  |  | ||||||
|  | --- a/include/linux/netdevice.h | ||||||
|  | +++ b/include/linux/netdevice.h | ||||||
|  | @@ -3300,6 +3300,7 @@ static inline void dev_xmit_recursion_de | ||||||
|  |  	__this_cpu_dec(softnet_data.xmit.recursion); | ||||||
|  |  } | ||||||
|  |   | ||||||
|  | +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); | ||||||
|  |  void __netif_schedule(struct Qdisc *q); | ||||||
|  |  void netif_schedule_queue(struct netdev_queue *txq); | ||||||
|  |   | ||||||
|  | --- a/net/core/dev.c | ||||||
|  | +++ b/net/core/dev.c | ||||||
|  | @@ -246,7 +246,7 @@ static bool use_backlog_threads(void) | ||||||
|  |  static inline void rps_lock_irqsave(struct softnet_data *sd, | ||||||
|  |  				    unsigned long *flags) | ||||||
|  |  { | ||||||
|  | -	if (IS_ENABLED(CONFIG_RPS)) | ||||||
|  | +	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) | ||||||
|  |  		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); | ||||||
|  |  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) | ||||||
|  |  		local_irq_save(*flags); | ||||||
|  | @@ -254,7 +254,7 @@ static inline void rps_lock_irqsave(stru | ||||||
|  |   | ||||||
|  |  static inline void rps_lock_irq_disable(struct softnet_data *sd) | ||||||
|  |  { | ||||||
|  | -	if (IS_ENABLED(CONFIG_RPS)) | ||||||
|  | +	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) | ||||||
|  |  		spin_lock_irq(&sd->input_pkt_queue.lock); | ||||||
|  |  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) | ||||||
|  |  		local_irq_disable(); | ||||||
|  | @@ -263,7 +263,7 @@ static inline void rps_lock_irq_disable( | ||||||
|  |  static inline void rps_unlock_irq_restore(struct softnet_data *sd, | ||||||
|  |  					  unsigned long *flags) | ||||||
|  |  { | ||||||
|  | -	if (IS_ENABLED(CONFIG_RPS)) | ||||||
|  | +	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) | ||||||
|  |  		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); | ||||||
|  |  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) | ||||||
|  |  		local_irq_restore(*flags); | ||||||
|  | @@ -271,7 +271,7 @@ static inline void rps_unlock_irq_restor | ||||||
|  |   | ||||||
|  |  static inline void rps_unlock_irq_enable(struct softnet_data *sd) | ||||||
|  |  { | ||||||
|  | -	if (IS_ENABLED(CONFIG_RPS)) | ||||||
|  | +	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) | ||||||
|  |  		spin_unlock_irq(&sd->input_pkt_queue.lock); | ||||||
|  |  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) | ||||||
|  |  		local_irq_enable(); | ||||||
|  | @@ -4774,6 +4774,23 @@ static void napi_schedule_rps(struct sof | ||||||
|  |  	__napi_schedule_irqoff(&mysd->backlog); | ||||||
|  |  } | ||||||
|  |   | ||||||
|  | +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) | ||||||
|  | +{ | ||||||
|  | +	unsigned long flags; | ||||||
|  | + | ||||||
|  | +	if (use_backlog_threads()) { | ||||||
|  | +		rps_lock_irqsave(sd, &flags); | ||||||
|  | + | ||||||
|  | +		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) | ||||||
|  | +			__napi_schedule_irqoff(&sd->backlog); | ||||||
|  | + | ||||||
|  | +		rps_unlock_irq_restore(sd, &flags); | ||||||
|  | + | ||||||
|  | +	} else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { | ||||||
|  | +		smp_call_function_single_async(cpu, &sd->defer_csd); | ||||||
|  | +	} | ||||||
|  | +} | ||||||
|  | + | ||||||
|  |  #ifdef CONFIG_NET_FLOW_LIMIT | ||||||
|  |  int netdev_flow_limit_table_len __read_mostly = (1 << 12); | ||||||
|  |  #endif | ||||||
|  | --- a/net/core/skbuff.c | ||||||
|  | +++ b/net/core/skbuff.c | ||||||
|  | @@ -6863,8 +6863,8 @@ nodefer:	__kfree_skb(skb); | ||||||
|  |  	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU | ||||||
|  |  	 * if we are unlucky enough (this seems very unlikely). | ||||||
|  |  	 */ | ||||||
|  | -	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) | ||||||
|  | -		smp_call_function_single_async(cpu, &sd->defer_csd); | ||||||
|  | +	if (unlikely(kick)) | ||||||
|  | +		kick_defer_list_purge(sd, cpu); | ||||||
|  |  } | ||||||
|  |   | ||||||
|  |  static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, | ||||||
| @@ -0,0 +1,164 @@ | |||||||
|  | From 765b11f8f4e20b7433e4ba4a3e9106a0d59501ed Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> | ||||||
|  | Date: Mon, 25 Mar 2024 08:40:31 +0100 | ||||||
|  | Subject: [PATCH 4/4] net: Rename rps_lock to backlog_lock. | ||||||
|  |  | ||||||
|  | The rps_lock.*() functions use the inner lock of a sk_buff_head for | ||||||
|  | locking. This lock is used if RPS is enabled, otherwise the list is | ||||||
|  | accessed lockless and disabling interrupts is enough for the | ||||||
|  | synchronisation because it is only accessed CPU local. Not only the list | ||||||
|  | is protected but also the NAPI state protected. | ||||||
|  | With the addition of backlog threads, the lock is also needed because of | ||||||
|  | the cross CPU access even without RPS. The clean up of the defer_list | ||||||
|  | list is also done via backlog threads (if enabled). | ||||||
|  |  | ||||||
|  | It has been suggested to rename the locking function since it is no | ||||||
|  | longer just RPS. | ||||||
|  |  | ||||||
|  | Rename the rps_lock*() functions to backlog_lock*(). | ||||||
|  |  | ||||||
|  | Suggested-by: Jakub Kicinski <kuba@kernel.org> | ||||||
|  | Acked-by: Jakub Kicinski <kuba@kernel.org> | ||||||
|  | Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> | ||||||
|  | Signed-off-by: Paolo Abeni <pabeni@redhat.com> | ||||||
|  | --- | ||||||
|  |  net/core/dev.c | 34 +++++++++++++++++----------------- | ||||||
|  |  1 file changed, 17 insertions(+), 17 deletions(-) | ||||||
|  |  | ||||||
|  | --- a/net/core/dev.c | ||||||
|  | +++ b/net/core/dev.c | ||||||
|  | @@ -243,8 +243,8 @@ static bool use_backlog_threads(void) | ||||||
|  |   | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  | -static inline void rps_lock_irqsave(struct softnet_data *sd, | ||||||
|  | -				    unsigned long *flags) | ||||||
|  | +static inline void backlog_lock_irq_save(struct softnet_data *sd, | ||||||
|  | +					 unsigned long *flags) | ||||||
|  |  { | ||||||
|  |  	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) | ||||||
|  |  		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); | ||||||
|  | @@ -252,7 +252,7 @@ static inline void rps_lock_irqsave(stru | ||||||
|  |  		local_irq_save(*flags); | ||||||
|  |  } | ||||||
|  |   | ||||||
|  | -static inline void rps_lock_irq_disable(struct softnet_data *sd) | ||||||
|  | +static inline void backlog_lock_irq_disable(struct softnet_data *sd) | ||||||
|  |  { | ||||||
|  |  	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) | ||||||
|  |  		spin_lock_irq(&sd->input_pkt_queue.lock); | ||||||
|  | @@ -260,8 +260,8 @@ static inline void rps_lock_irq_disable( | ||||||
|  |  		local_irq_disable(); | ||||||
|  |  } | ||||||
|  |   | ||||||
|  | -static inline void rps_unlock_irq_restore(struct softnet_data *sd, | ||||||
|  | -					  unsigned long *flags) | ||||||
|  | +static inline void backlog_unlock_irq_restore(struct softnet_data *sd, | ||||||
|  | +					      unsigned long *flags) | ||||||
|  |  { | ||||||
|  |  	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) | ||||||
|  |  		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); | ||||||
|  | @@ -269,7 +269,7 @@ static inline void rps_unlock_irq_restor | ||||||
|  |  		local_irq_restore(*flags); | ||||||
|  |  } | ||||||
|  |   | ||||||
|  | -static inline void rps_unlock_irq_enable(struct softnet_data *sd) | ||||||
|  | +static inline void backlog_unlock_irq_enable(struct softnet_data *sd) | ||||||
|  |  { | ||||||
|  |  	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) | ||||||
|  |  		spin_unlock_irq(&sd->input_pkt_queue.lock); | ||||||
|  | @@ -4779,12 +4779,12 @@ void kick_defer_list_purge(struct softne | ||||||
|  |  	unsigned long flags; | ||||||
|  |   | ||||||
|  |  	if (use_backlog_threads()) { | ||||||
|  | -		rps_lock_irqsave(sd, &flags); | ||||||
|  | +		backlog_lock_irq_save(sd, &flags); | ||||||
|  |   | ||||||
|  |  		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) | ||||||
|  |  			__napi_schedule_irqoff(&sd->backlog); | ||||||
|  |   | ||||||
|  | -		rps_unlock_irq_restore(sd, &flags); | ||||||
|  | +		backlog_unlock_irq_restore(sd, &flags); | ||||||
|  |   | ||||||
|  |  	} else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { | ||||||
|  |  		smp_call_function_single_async(cpu, &sd->defer_csd); | ||||||
|  | @@ -4846,7 +4846,7 @@ static int enqueue_to_backlog(struct sk_ | ||||||
|  |  	reason = SKB_DROP_REASON_NOT_SPECIFIED; | ||||||
|  |  	sd = &per_cpu(softnet_data, cpu); | ||||||
|  |   | ||||||
|  | -	rps_lock_irqsave(sd, &flags); | ||||||
|  | +	backlog_lock_irq_save(sd, &flags); | ||||||
|  |  	if (!netif_running(skb->dev)) | ||||||
|  |  		goto drop; | ||||||
|  |  	qlen = skb_queue_len(&sd->input_pkt_queue); | ||||||
|  | @@ -4855,7 +4855,7 @@ static int enqueue_to_backlog(struct sk_ | ||||||
|  |  enqueue: | ||||||
|  |  			__skb_queue_tail(&sd->input_pkt_queue, skb); | ||||||
|  |  			input_queue_tail_incr_save(sd, qtail); | ||||||
|  | -			rps_unlock_irq_restore(sd, &flags); | ||||||
|  | +			backlog_unlock_irq_restore(sd, &flags); | ||||||
|  |  			return NET_RX_SUCCESS; | ||||||
|  |  		} | ||||||
|  |   | ||||||
|  | @@ -4870,7 +4870,7 @@ enqueue: | ||||||
|  |   | ||||||
|  |  drop: | ||||||
|  |  	sd->dropped++; | ||||||
|  | -	rps_unlock_irq_restore(sd, &flags); | ||||||
|  | +	backlog_unlock_irq_restore(sd, &flags); | ||||||
|  |   | ||||||
|  |  	dev_core_stats_rx_dropped_inc(skb->dev); | ||||||
|  |  	kfree_skb_reason(skb, reason); | ||||||
|  | @@ -5901,7 +5901,7 @@ static void flush_backlog(struct work_st | ||||||
|  |  	local_bh_disable(); | ||||||
|  |  	sd = this_cpu_ptr(&softnet_data); | ||||||
|  |   | ||||||
|  | -	rps_lock_irq_disable(sd); | ||||||
|  | +	backlog_lock_irq_disable(sd); | ||||||
|  |  	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { | ||||||
|  |  		if (skb->dev->reg_state == NETREG_UNREGISTERING) { | ||||||
|  |  			__skb_unlink(skb, &sd->input_pkt_queue); | ||||||
|  | @@ -5909,7 +5909,7 @@ static void flush_backlog(struct work_st | ||||||
|  |  			input_queue_head_incr(sd); | ||||||
|  |  		} | ||||||
|  |  	} | ||||||
|  | -	rps_unlock_irq_enable(sd); | ||||||
|  | +	backlog_unlock_irq_enable(sd); | ||||||
|  |   | ||||||
|  |  	skb_queue_walk_safe(&sd->process_queue, skb, tmp) { | ||||||
|  |  		if (skb->dev->reg_state == NETREG_UNREGISTERING) { | ||||||
|  | @@ -5927,14 +5927,14 @@ static bool flush_required(int cpu) | ||||||
|  |  	struct softnet_data *sd = &per_cpu(softnet_data, cpu); | ||||||
|  |  	bool do_flush; | ||||||
|  |   | ||||||
|  | -	rps_lock_irq_disable(sd); | ||||||
|  | +	backlog_lock_irq_disable(sd); | ||||||
|  |   | ||||||
|  |  	/* as insertion into process_queue happens with the rps lock held, | ||||||
|  |  	 * process_queue access may race only with dequeue | ||||||
|  |  	 */ | ||||||
|  |  	do_flush = !skb_queue_empty(&sd->input_pkt_queue) || | ||||||
|  |  		   !skb_queue_empty_lockless(&sd->process_queue); | ||||||
|  | -	rps_unlock_irq_enable(sd); | ||||||
|  | +	backlog_unlock_irq_enable(sd); | ||||||
|  |   | ||||||
|  |  	return do_flush; | ||||||
|  |  #endif | ||||||
|  | @@ -6049,7 +6049,7 @@ static int process_backlog(struct napi_s | ||||||
|  |   | ||||||
|  |  		} | ||||||
|  |   | ||||||
|  | -		rps_lock_irq_disable(sd); | ||||||
|  | +		backlog_lock_irq_disable(sd); | ||||||
|  |  		if (skb_queue_empty(&sd->input_pkt_queue)) { | ||||||
|  |  			/* | ||||||
|  |  			 * Inline a custom version of __napi_complete(). | ||||||
|  | @@ -6065,7 +6065,7 @@ static int process_backlog(struct napi_s | ||||||
|  |  			skb_queue_splice_tail_init(&sd->input_pkt_queue, | ||||||
|  |  						   &sd->process_queue); | ||||||
|  |  		} | ||||||
|  | -		rps_unlock_irq_enable(sd); | ||||||
|  | +		backlog_unlock_irq_enable(sd); | ||||||
|  |  	} | ||||||
|  |   | ||||||
|  |  	return work; | ||||||
@@ -85,7 +85,7 @@ Signed-off-by: Paolo Abeni <pabeni@redhat.com>
  /**
 --- a/net/core/dev.c
 +++ b/net/core/dev.c
-@@ -6555,7 +6555,7 @@ static int __napi_poll(struct napi_struc
+@@ -6602,7 +6602,7 @@ static int __napi_poll(struct napi_struc
  	 * accidentally calling ->poll() when NAPI is not scheduled.
  	 */
  	work = 0;
@@ -19,7 +19,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 
 --- a/include/linux/netdevice.h
 +++ b/include/linux/netdevice.h
-@@ -1759,6 +1759,7 @@ enum netdev_priv_flags {
+@@ -1758,6 +1758,7 @@ enum netdev_priv_flags {
  	IFF_TX_SKB_NO_LINEAR		= BIT_ULL(31),
  	IFF_CHANGE_PROTO_DOWN		= BIT_ULL(32),
  	IFF_SEE_ALL_HWTSTAMP_REQUESTS	= BIT_ULL(33),
@@ -27,7 +27,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
  };
 
  #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
-@@ -1792,6 +1793,7 @@ enum netdev_priv_flags {
+@@ -1791,6 +1792,7 @@ enum netdev_priv_flags {
  #define IFF_FAILOVER_SLAVE		IFF_FAILOVER_SLAVE
  #define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER
  #define IFF_TX_SKB_NO_LINEAR		IFF_TX_SKB_NO_LINEAR
@@ -35,7 +35,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 
  /* Specifies the type of the struct net_device::ml_priv pointer */
  enum netdev_ml_priv_type {
-@@ -2184,6 +2186,11 @@ struct net_device {
+@@ -2183,6 +2185,11 @@ struct net_device {
  	const struct tlsdev_ops *tlsdev_ops;
  #endif
 
@@ -47,7 +47,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
  	const struct header_ops *header_ops;
 
  	unsigned char		operstate;
-@@ -2257,6 +2264,10 @@ struct net_device {
+@@ -2256,6 +2263,10 @@ struct net_device {
  	struct mctp_dev __rcu	*mctp_ptr;
  #endif
 
@@ -105,7 +105,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
  	help
  --- a/net/core/dev.c
  +++ b/net/core/dev.c
-@@ -3571,6 +3571,11 @@ static int xmit_one(struct sk_buff *skb,
+@@ -3597,6 +3597,11 @@ static int xmit_one(struct sk_buff *skb,
  	if (dev_nit_active(dev))
  		dev_queue_xmit_nit(skb, dev);
 
| @@ -1,227 +0,0 @@ | |||||||
| From: Felix Fietkau <nbd@nbd.name> |  | ||||||
| Date: Thu, 16 Feb 2023 18:39:04 +0100 |  | ||||||
| Subject: [PATCH] net/core: add optional threading for backlog processing |  | ||||||
|  |  | ||||||
| When dealing with few flows or an imbalance on CPU utilization, static RPS |  | ||||||
| CPU assignment can be too inflexible. Add support for enabling threaded NAPI |  | ||||||
| for backlog processing in order to allow the scheduler to better balance |  | ||||||
| processing. This helps better spread the load across idle CPUs. |  | ||||||
|  |  | ||||||
| Signed-off-by: Felix Fietkau <nbd@nbd.name> |  | ||||||
| --- |  | ||||||
|  |  | ||||||
| --- a/include/linux/netdevice.h |  | ||||||
| +++ b/include/linux/netdevice.h |  | ||||||
| @@ -558,6 +558,7 @@ static inline bool napi_complete(struct |  | ||||||
|  } |  | ||||||
|   |  | ||||||
|  int dev_set_threaded(struct net_device *dev, bool threaded); |  | ||||||
| +int backlog_set_threaded(bool threaded); |  | ||||||
|   |  | ||||||
|  /** |  | ||||||
|   *	napi_disable - prevent NAPI from scheduling |  | ||||||
| @@ -3236,6 +3237,7 @@ struct softnet_data { |  | ||||||
|  	/* stats */ |  | ||||||
|  	unsigned int		processed; |  | ||||||
|  	unsigned int		time_squeeze; |  | ||||||
| +	unsigned int		process_queue_empty; |  | ||||||
|  #ifdef CONFIG_RPS |  | ||||||
|  	struct softnet_data	*rps_ipi_list; |  | ||||||
|  #endif |  | ||||||
| --- a/net/core/dev.c |  | ||||||
| +++ b/net/core/dev.c |  | ||||||
| @@ -4729,7 +4729,7 @@ static void napi_schedule_rps(struct sof |  | ||||||
|  	struct softnet_data *mysd = this_cpu_ptr(&softnet_data); |  | ||||||
|   |  | ||||||
|  #ifdef CONFIG_RPS |  | ||||||
| -	if (sd != mysd) { |  | ||||||
| +	if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) { |  | ||||||
|  		sd->rps_ipi_next = mysd->rps_ipi_list; |  | ||||||
|  		mysd->rps_ipi_list = sd; |  | ||||||
|   |  | ||||||
| @@ -5848,6 +5848,8 @@ static DEFINE_PER_CPU(struct work_struct |  | ||||||
|  /* Network device is going away, flush any packets still pending */ |  | ||||||
|  static void flush_backlog(struct work_struct *work) |  | ||||||
|  { |  | ||||||
| +	unsigned int process_queue_empty; |  | ||||||
| +	bool threaded, flush_processq; |  | ||||||
|  	struct sk_buff *skb, *tmp; |  | ||||||
|  	struct softnet_data *sd; |  | ||||||
|   |  | ||||||
| @@ -5862,8 +5864,17 @@ static void flush_backlog(struct work_st |  | ||||||
|  			input_queue_head_incr(sd); |  | ||||||
|  		} |  | ||||||
|  	} |  | ||||||
| + |  | ||||||
| +	threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state); |  | ||||||
| +	flush_processq = threaded && |  | ||||||
| +			 !skb_queue_empty_lockless(&sd->process_queue); |  | ||||||
| +	if (flush_processq) |  | ||||||
| +		process_queue_empty = sd->process_queue_empty; |  | ||||||
|  	rps_unlock_irq_enable(sd); |  | ||||||
|   |  | ||||||
| +	if (threaded) |  | ||||||
| +		goto out; |  | ||||||
| + |  | ||||||
|  	skb_queue_walk_safe(&sd->process_queue, skb, tmp) { |  | ||||||
|  		if (skb->dev->reg_state == NETREG_UNREGISTERING) { |  | ||||||
|  			__skb_unlink(skb, &sd->process_queue); |  | ||||||
| @@ -5871,7 +5882,16 @@ static void flush_backlog(struct work_st |  | ||||||
|  			input_queue_head_incr(sd); |  | ||||||
|  		} |  | ||||||
|  	} |  | ||||||
| + |  | ||||||
| +out: |  | ||||||
|  	local_bh_enable(); |  | ||||||
| + |  | ||||||
| +	while (flush_processq) { |  | ||||||
| +		msleep(1); |  | ||||||
| +		rps_lock_irq_disable(sd); |  | ||||||
| +		flush_processq = process_queue_empty == sd->process_queue_empty; |  | ||||||
| +		rps_unlock_irq_enable(sd); |  | ||||||
| +	} |  | ||||||
|  } |  | ||||||
|   |  | ||||||
|  static bool flush_required(int cpu) |  | ||||||
| @@ -6003,6 +6023,7 @@ static int process_backlog(struct napi_s |  | ||||||
|  		} |  | ||||||
|   |  | ||||||
|  		rps_lock_irq_disable(sd); |  | ||||||
| +		sd->process_queue_empty++; |  | ||||||
|  		if (skb_queue_empty(&sd->input_pkt_queue)) { |  | ||||||
|  			/* |  | ||||||
|  			 * Inline a custom version of __napi_complete(). |  | ||||||
| @@ -6012,7 +6033,8 @@ static int process_backlog(struct napi_s |  | ||||||
|  			 * We can use a plain write instead of clear_bit(), |  | ||||||
|  			 * and we dont need an smp_mb() memory barrier. |  | ||||||
|  			 */ |  | ||||||
| -			napi->state = 0; |  | ||||||
| +			napi->state &= ~(NAPIF_STATE_SCHED | |  | ||||||
| +					 NAPIF_STATE_SCHED_THREADED); |  | ||||||
|  			again = false; |  | ||||||
|  		} else { |  | ||||||
|  			skb_queue_splice_tail_init(&sd->input_pkt_queue, |  | ||||||
| @@ -6426,6 +6448,55 @@ int dev_set_threaded(struct net_device * |  | ||||||
|  } |  | ||||||
|  EXPORT_SYMBOL(dev_set_threaded); |  | ||||||
|   |  | ||||||
| +int backlog_set_threaded(bool threaded) |  | ||||||
| +{ |  | ||||||
| +	static bool backlog_threaded; |  | ||||||
| +	int err = 0; |  | ||||||
| +	int i; |  | ||||||
| + |  | ||||||
| +	if (backlog_threaded == threaded) |  | ||||||
| +		return 0; |  | ||||||
| + |  | ||||||
| +	for_each_possible_cpu(i) { |  | ||||||
| +		struct softnet_data *sd = &per_cpu(softnet_data, i); |  | ||||||
| +		struct napi_struct *n = &sd->backlog; |  | ||||||
| + |  | ||||||
| +		if (n->thread) |  | ||||||
| +			continue; |  | ||||||
| +		n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i); |  | ||||||
| +		if (IS_ERR(n->thread)) { |  | ||||||
| +			err = PTR_ERR(n->thread); |  | ||||||
| +			pr_err("kthread_run failed with err %d\n", err); |  | ||||||
| +			n->thread = NULL; |  | ||||||
| +			threaded = false; |  | ||||||
| +			break; |  | ||||||
| +		} |  | ||||||
| + |  | ||||||
| +	} |  | ||||||
| + |  | ||||||
| +	backlog_threaded = threaded; |  | ||||||
| + |  | ||||||
| +	/* Make sure kthread is created before THREADED bit |  | ||||||
| +	 * is set. |  | ||||||
| +	 */ |  | ||||||
| +	smp_mb__before_atomic(); |  | ||||||
| + |  | ||||||
| +	for_each_possible_cpu(i) { |  | ||||||
| +		struct softnet_data *sd = &per_cpu(softnet_data, i); |  | ||||||
| +		struct napi_struct *n = &sd->backlog; |  | ||||||
| +		unsigned long flags; |  | ||||||
| + |  | ||||||
| +		rps_lock_irqsave(sd, &flags); |  | ||||||
| +		if (threaded) |  | ||||||
| +			n->state |= NAPIF_STATE_THREADED; |  | ||||||
| +		else |  | ||||||
| +			n->state &= ~NAPIF_STATE_THREADED; |  | ||||||
| +		rps_unlock_irq_restore(sd, &flags); |  | ||||||
| +	} |  | ||||||
| + |  | ||||||
| +	return err; |  | ||||||
| +} |  | ||||||
| + |  | ||||||
|  void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, |  | ||||||
|  			   int (*poll)(struct napi_struct *, int), int weight) |  | ||||||
|  { |  | ||||||
| @@ -11307,6 +11378,9 @@ static int dev_cpu_dead(unsigned int old |  | ||||||
|  	raise_softirq_irqoff(NET_TX_SOFTIRQ); |  | ||||||
|  	local_irq_enable(); |  | ||||||
|   |  | ||||||
| +	if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state)) |  | ||||||
| +		return 0; |  | ||||||
| + |  | ||||||
|  #ifdef CONFIG_RPS |  | ||||||
|  	remsd = oldsd->rps_ipi_list; |  | ||||||
|  	oldsd->rps_ipi_list = NULL; |  | ||||||
| @@ -11622,6 +11696,7 @@ static int __init net_dev_init(void) |  | ||||||
|  		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); |  | ||||||
|  		spin_lock_init(&sd->defer_lock); |  | ||||||
|   |  | ||||||
| +		INIT_LIST_HEAD(&sd->backlog.poll_list); |  | ||||||
|  		init_gro_hash(&sd->backlog); |  | ||||||
|  		sd->backlog.poll = process_backlog; |  | ||||||
|  		sd->backlog.weight = weight_p; |  | ||||||
| --- a/net/core/sysctl_net_core.c |  | ||||||
| +++ b/net/core/sysctl_net_core.c |  | ||||||
| @@ -30,6 +30,7 @@ static int int_3600 = 3600; |  | ||||||
|  static int min_sndbuf = SOCK_MIN_SNDBUF; |  | ||||||
|  static int min_rcvbuf = SOCK_MIN_RCVBUF; |  | ||||||
|  static int max_skb_frags = MAX_SKB_FRAGS; |  | ||||||
| +static int backlog_threaded; |  | ||||||
|  static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; |  | ||||||
|   |  | ||||||
|  static int net_msg_warn;	/* Unused, but still a sysctl */ |  | ||||||
| @@ -189,6 +190,23 @@ static int rps_sock_flow_sysctl(struct c |  | ||||||
|  } |  | ||||||
|  #endif /* CONFIG_RPS */ |  | ||||||
|   |  | ||||||
| +static int backlog_threaded_sysctl(struct ctl_table *table, int write, |  | ||||||
| +			       void *buffer, size_t *lenp, loff_t *ppos) |  | ||||||
| +{ |  | ||||||
| +	static DEFINE_MUTEX(backlog_threaded_mutex); |  | ||||||
| +	int ret; |  | ||||||
| + |  | ||||||
| +	mutex_lock(&backlog_threaded_mutex); |  | ||||||
| + |  | ||||||
| +	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |  | ||||||
| +	if (write && !ret) |  | ||||||
| +		ret = backlog_set_threaded(backlog_threaded); |  | ||||||
| + |  | ||||||
| +	mutex_unlock(&backlog_threaded_mutex); |  | ||||||
| + |  | ||||||
| +	return ret; |  | ||||||
| +} |  | ||||||
| + |  | ||||||
|  #ifdef CONFIG_NET_FLOW_LIMIT |  | ||||||
|  static DEFINE_MUTEX(flow_limit_update_mutex); |  | ||||||
|   |  | ||||||
| @@ -541,6 +559,15 @@ static struct ctl_table net_core_table[] |  | ||||||
|  		.proc_handler	= rps_sock_flow_sysctl |  | ||||||
|  	}, |  | ||||||
|  #endif |  | ||||||
| +	{ |  | ||||||
| +		.procname	= "backlog_threaded", |  | ||||||
| +		.data		= &backlog_threaded, |  | ||||||
| +		.maxlen		= sizeof(unsigned int), |  | ||||||
| +		.mode		= 0644, |  | ||||||
| +		.proc_handler	= backlog_threaded_sysctl, |  | ||||||
| +		.extra1		= SYSCTL_ZERO, |  | ||||||
| +		.extra2		= SYSCTL_ONE |  | ||||||
| +	}, |  | ||||||
|  #ifdef CONFIG_NET_FLOW_LIMIT |  | ||||||
|  	{ |  | ||||||
|  		.procname	= "flow_limit_cpu_bitmap", |  | ||||||