 c06fb25d1f
			
		
	
	c06fb25d1f
	
	
		
			
	
		
	
	
		
			Some checks failed
		
		
	
	Build Kernel / Build all affected Kernels (push) Has been cancelled
				
			Build all core packages / Build all core packages for selected target (push) Has been cancelled
				
			Build and Push prebuilt tools container / Build and Push all prebuilt containers (push) Has been cancelled
				
			Build Toolchains / Build Toolchains for each target (push) Has been cancelled
				
			Build host tools / Build host tools for linux and macos based systems (push) Has been cancelled
				
			Coverity scan build / Coverity x86/64 build (push) Has been cancelled
				
			
		
			
				
	
	
		
			331 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Diff
		
	
	
	
	
	
			
		
		
	
	
			331 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Diff
		
	
	
	
	
	
| From dad6b97702639fba27a2bd3e986982ad6f0db3a7 Mon Sep 17 00:00:00 2001
 | |
| From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
 | |
| Date: Mon, 25 Mar 2024 08:40:29 +0100
 | |
| Subject: [PATCH 2/4] net: Allow to use SMP threads for backlog NAPI.
 | |
| 
 | |
| Backlog NAPI is a per-CPU NAPI struct only (with no device behind it)
 | |
| used by drivers which don't do NAPI themselves, RPS and parts of the
 | |
| stack which need to avoid recursive deadlocks while processing a packet.
 | |
| 
 | |
| The non-NAPI drivers use the CPU local backlog NAPI. If RPS is enabled
 | |
| then a flow for the skb is computed and based on the flow the skb can be
 | |
| enqueued on a remote CPU. Scheduling/ raising the softirq (for backlog's
 | |
| NAPI) on the remote CPU isn't trivial because the softirq is only
 | |
| scheduled on the local CPU and performed after the hardirq is done.
 | |
| In order to schedule a softirq on the remote CPU, an IPI is sent to the
 | |
| remote CPU which schedules the backlog-NAPI on the then local CPU.
 | |
| 
 | |
| On PREEMPT_RT interrupts are force-threaded. The soft interrupts are
 | |
| raised within the interrupt thread and processed after the interrupt
 | |
| handler completed still within the context of the interrupt thread. The
 | |
| softirq is handled in the context where it originated.
 | |
| 
 | |
| With force-threaded interrupts enabled, ksoftirqd is woken up if a
 | |
| softirq is raised from hardirq context. This is the case if it is raised
 | |
| from an IPI. Additionally there is a warning on PREEMPT_RT if the
 | |
| softirq is raised from the idle thread.
 | |
| This was done for two reasons:
 | |
| - With threaded interrupts the processing should happen in thread
 | |
|   context (where it originated) and ksoftirqd is the only thread for
 | |
|   this context if raised from hardirq. Using the currently running task
 | |
|   instead would "punish" a random task.
 | |
| - Once ksoftirqd is active it consumes all further softirqs until it
 | |
|   stops running. This changed recently and is no longer the case.
 | |
| 
 | |
| Instead of keeping the backlog NAPI in ksoftirqd (in force-threaded/
 | |
| PREEMPT_RT setups) I am proposing NAPI-threads for backlog.
 | |
| The "proper" setup with threaded-NAPI is not doable because the threads
 | |
| are not pinned to an individual CPU and can be modified by the user.
 | |
| Additionally a dummy network device would have to be assigned. Also
 | |
| CPU-hotplug has to be considered if additional CPUs show up.
 | |
| All this can be probably done/ solved but the smpboot-threads already
 | |
| provide this infrastructure.
 | |
| 
 | |
| Sending UDP packets over loopback expects that the packet is processed
 | |
| within the call. Delaying it by handing it over to the thread hurts
 | |
| performance. It is not beneficial to the outcome if the context switch
 | |
| happens immediately after enqueue or after a while to process a few
 | |
| packets in a batch.
 | |
| There is no need to always use the thread if the backlog NAPI is
 | |
| requested on the local CPU. This restores the loopback throughput. The
 | |
| performance drops mostly to the same value after enabling RPS on the
 | |
| loopback comparing the IPI and the thread result.
 | |
| 
 | |
| Create NAPI-threads for backlog if requested during boot. The thread runs
 | |
| the inner loop from napi_threaded_poll(), the wait part is different. It
 | |
| checks for NAPI_STATE_SCHED (the backlog NAPI can not be disabled).
 | |
| 
 | |
| The NAPI threads for backlog are optional, it has to be enabled via the boot
 | |
| argument "thread_backlog_napi". It is mandatory for PREEMPT_RT to avoid the
 | |
| wakeup of ksoftirqd from the IPI.
 | |
| 
 | |
| Acked-by: Jakub Kicinski <kuba@kernel.org>
 | |
| Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
 | |
| Signed-off-by: Paolo Abeni <pabeni@redhat.com>
 | |
| ---
 | |
|  net/core/dev.c | 148 +++++++++++++++++++++++++++++++++++++------------
 | |
|  1 file changed, 113 insertions(+), 35 deletions(-)
 | |
| 
 | |
| --- a/net/core/dev.c
 | |
| +++ b/net/core/dev.c
 | |
| @@ -78,6 +78,7 @@
 | |
|  #include <linux/slab.h>
 | |
|  #include <linux/sched.h>
 | |
|  #include <linux/sched/mm.h>
 | |
| +#include <linux/smpboot.h>
 | |
|  #include <linux/mutex.h>
 | |
|  #include <linux/rwsem.h>
 | |
|  #include <linux/string.h>
 | |
| @@ -217,6 +218,31 @@ static inline struct hlist_head *dev_ind
 | |
|  	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 | |
|  }
 | |
|  
 | |
| +#ifndef CONFIG_PREEMPT_RT
 | |
| +
 | |
| +static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
 | |
| +
 | |
| +static int __init setup_backlog_napi_threads(char *arg)
 | |
| +{
 | |
| +	static_branch_enable(&use_backlog_threads_key);
 | |
| +	return 0;
 | |
| +}
 | |
| +early_param("thread_backlog_napi", setup_backlog_napi_threads);
 | |
| +
 | |
| +static bool use_backlog_threads(void)
 | |
| +{
 | |
| +	return static_branch_unlikely(&use_backlog_threads_key);
 | |
| +}
 | |
| +
 | |
| +#else
 | |
| +
 | |
| +static bool use_backlog_threads(void)
 | |
| +{
 | |
| +	return true;
 | |
| +}
 | |
| +
 | |
| +#endif
 | |
| +
 | |
|  static inline void rps_lock_irqsave(struct softnet_data *sd,
 | |
|  				    unsigned long *flags)
 | |
|  {
 | |
| @@ -4449,6 +4475,7 @@ EXPORT_SYMBOL(__dev_direct_xmit);
 | |
|  /*************************************************************************
 | |
|   *			Receiver routines
 | |
|   *************************************************************************/
 | |
| +static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
 | |
|  
 | |
|  int netdev_max_backlog __read_mostly = 1000;
 | |
|  EXPORT_SYMBOL(netdev_max_backlog);
 | |
| @@ -4481,12 +4508,16 @@ static inline void ____napi_schedule(str
 | |
|  		 */
 | |
|  		thread = READ_ONCE(napi->thread);
 | |
|  		if (thread) {
 | |
| +			if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
 | |
| +				goto use_local_napi;
 | |
| +
 | |
|  			set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
 | |
|  			wake_up_process(thread);
 | |
|  			return;
 | |
|  		}
 | |
|  	}
 | |
|  
 | |
| +use_local_napi:
 | |
|  	list_add_tail(&napi->poll_list, &sd->poll_list);
 | |
|  	WRITE_ONCE(napi->list_owner, smp_processor_id());
 | |
|  	/* If not called from net_rx_action()
 | |
| @@ -4732,6 +4763,11 @@ static void napi_schedule_rps(struct sof
 | |
|  
 | |
|  #ifdef CONFIG_RPS
 | |
|  	if (sd != mysd) {
 | |
| +		if (use_backlog_threads()) {
 | |
| +			__napi_schedule_irqoff(&sd->backlog);
 | |
| +			return;
 | |
| +		}
 | |
| +
 | |
|  		sd->rps_ipi_next = mysd->rps_ipi_list;
 | |
|  		mysd->rps_ipi_list = sd;
 | |
|  
 | |
| @@ -5955,7 +5991,7 @@ static void net_rps_action_and_irq_enabl
 | |
|  #ifdef CONFIG_RPS
 | |
|  	struct softnet_data *remsd = sd->rps_ipi_list;
 | |
|  
 | |
| -	if (remsd) {
 | |
| +	if (!use_backlog_threads() && remsd) {
 | |
|  		sd->rps_ipi_list = NULL;
 | |
|  
 | |
|  		local_irq_enable();
 | |
| @@ -5970,7 +6006,7 @@ static void net_rps_action_and_irq_enabl
 | |
|  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 | |
|  {
 | |
|  #ifdef CONFIG_RPS
 | |
| -	return sd->rps_ipi_list != NULL;
 | |
| +	return !use_backlog_threads() && sd->rps_ipi_list;
 | |
|  #else
 | |
|  	return false;
 | |
|  #endif
 | |
| @@ -6014,7 +6050,7 @@ static int process_backlog(struct napi_s
 | |
|  			 * We can use a plain write instead of clear_bit(),
 | |
|  			 * and we dont need an smp_mb() memory barrier.
 | |
|  			 */
 | |
| -			napi->state = 0;
 | |
| +			napi->state &= NAPIF_STATE_THREADED;
 | |
|  			again = false;
 | |
|  		} else {
 | |
|  			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 | |
| @@ -6680,43 +6716,48 @@ static void skb_defer_free_flush(struct
 | |
|  	}
 | |
|  }
 | |
|  
 | |
| -static int napi_threaded_poll(void *data)
 | |
| +static void napi_threaded_poll_loop(struct napi_struct *napi)
 | |
|  {
 | |
| -	struct napi_struct *napi = data;
 | |
|  	struct softnet_data *sd;
 | |
| -	void *have;
 | |
| +	unsigned long last_qs = jiffies;
 | |
|  
 | |
| -	while (!napi_thread_wait(napi)) {
 | |
| -		unsigned long last_qs = jiffies;
 | |
| +	for (;;) {
 | |
| +		bool repoll = false;
 | |
| +		void *have;
 | |
|  
 | |
| -		for (;;) {
 | |
| -			bool repoll = false;
 | |
| +		local_bh_disable();
 | |
| +		sd = this_cpu_ptr(&softnet_data);
 | |
| +		sd->in_napi_threaded_poll = true;
 | |
|  
 | |
| -			local_bh_disable();
 | |
| -			sd = this_cpu_ptr(&softnet_data);
 | |
| -			sd->in_napi_threaded_poll = true;
 | |
| -
 | |
| -			have = netpoll_poll_lock(napi);
 | |
| -			__napi_poll(napi, &repoll);
 | |
| -			netpoll_poll_unlock(have);
 | |
| -
 | |
| -			sd->in_napi_threaded_poll = false;
 | |
| -			barrier();
 | |
| -
 | |
| -			if (sd_has_rps_ipi_waiting(sd)) {
 | |
| -				local_irq_disable();
 | |
| -				net_rps_action_and_irq_enable(sd);
 | |
| -			}
 | |
| -			skb_defer_free_flush(sd);
 | |
| -			local_bh_enable();
 | |
| +		have = netpoll_poll_lock(napi);
 | |
| +		__napi_poll(napi, &repoll);
 | |
| +		netpoll_poll_unlock(have);
 | |
| +
 | |
| +		sd->in_napi_threaded_poll = false;
 | |
| +		barrier();
 | |
| +
 | |
| +		if (sd_has_rps_ipi_waiting(sd)) {
 | |
| +			local_irq_disable();
 | |
| +			net_rps_action_and_irq_enable(sd);
 | |
| +		}
 | |
| +		skb_defer_free_flush(sd);
 | |
| +		local_bh_enable();
 | |
|  
 | |
| -			if (!repoll)
 | |
| -				break;
 | |
| +		if (!repoll)
 | |
| +			break;
 | |
|  
 | |
| -			rcu_softirq_qs_periodic(last_qs);
 | |
| -			cond_resched();
 | |
| -		}
 | |
| +		rcu_softirq_qs_periodic(last_qs);
 | |
| +		cond_resched();
 | |
|  	}
 | |
| +}
 | |
| +
 | |
| +static int napi_threaded_poll(void *data)
 | |
| +{
 | |
| +	struct napi_struct *napi = data;
 | |
| +
 | |
| +	while (!napi_thread_wait(napi))
 | |
| +		napi_threaded_poll_loop(napi);
 | |
| +
 | |
|  	return 0;
 | |
|  }
 | |
|  
 | |
| @@ -11297,7 +11338,7 @@ static int dev_cpu_dead(unsigned int old
 | |
|  
 | |
|  		list_del_init(&napi->poll_list);
 | |
|  		if (napi->poll == process_backlog)
 | |
| -			napi->state = 0;
 | |
| +			napi->state &= NAPIF_STATE_THREADED;
 | |
|  		else
 | |
|  			____napi_schedule(sd, napi);
 | |
|  	}
 | |
| @@ -11305,12 +11346,14 @@ static int dev_cpu_dead(unsigned int old
 | |
|  	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 | |
|  	local_irq_enable();
 | |
|  
 | |
| +	if (!use_backlog_threads()) {
 | |
|  #ifdef CONFIG_RPS
 | |
| -	remsd = oldsd->rps_ipi_list;
 | |
| -	oldsd->rps_ipi_list = NULL;
 | |
| +		remsd = oldsd->rps_ipi_list;
 | |
| +		oldsd->rps_ipi_list = NULL;
 | |
|  #endif
 | |
| -	/* send out pending IPI's on offline CPU */
 | |
| -	net_rps_send_ipi(remsd);
 | |
| +		/* send out pending IPI's on offline CPU */
 | |
| +		net_rps_send_ipi(remsd);
 | |
| +	}
 | |
|  
 | |
|  	/* Process offline CPU's input_pkt_queue */
 | |
|  	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
 | |
| @@ -11573,6 +11616,38 @@ static struct pernet_operations __net_in
 | |
|   *
 | |
|   */
 | |
|  
 | |
| +static int backlog_napi_should_run(unsigned int cpu)
 | |
| +{
 | |
| +	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
 | |
| +	struct napi_struct *napi = &sd->backlog;
 | |
| +
 | |
| +	return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
 | |
| +}
 | |
| +
 | |
| +static void run_backlog_napi(unsigned int cpu)
 | |
| +{
 | |
| +	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
 | |
| +
 | |
| +	napi_threaded_poll_loop(&sd->backlog);
 | |
| +}
 | |
| +
 | |
| +static void backlog_napi_setup(unsigned int cpu)
 | |
| +{
 | |
| +	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
 | |
| +	struct napi_struct *napi = &sd->backlog;
 | |
| +
 | |
| +	napi->thread = this_cpu_read(backlog_napi);
 | |
| +	set_bit(NAPI_STATE_THREADED, &napi->state);
 | |
| +}
 | |
| +
 | |
| +static struct smp_hotplug_thread backlog_threads = {
 | |
| +	.store			= &backlog_napi,
 | |
| +	.thread_should_run	= backlog_napi_should_run,
 | |
| +	.thread_fn		= run_backlog_napi,
 | |
| +	.thread_comm		= "backlog_napi/%u",
 | |
| +	.setup			= backlog_napi_setup,
 | |
| +};
 | |
| +
 | |
|  /*
 | |
|   *       This is called single threaded during boot, so no need
 | |
|   *       to take the rtnl semaphore.
 | |
| @@ -11623,7 +11698,10 @@ static int __init net_dev_init(void)
 | |
|  		init_gro_hash(&sd->backlog);
 | |
|  		sd->backlog.poll = process_backlog;
 | |
|  		sd->backlog.weight = weight_p;
 | |
| +		INIT_LIST_HEAD(&sd->backlog.poll_list);
 | |
|  	}
 | |
| +	if (use_backlog_threads())
 | |
| +		smpboot_register_percpu_thread(&backlog_threads);
 | |
|  
 | |
|  	dev_boot_phase = 0;
 | |
|  
 |