From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 16 Feb 2023 18:39:04 +0100
Subject: [PATCH] net/core: add optional threading for backlog processing

When dealing with few flows or an imbalance on CPU utilization, static RPS
CPU assignment can be too inflexible. Add support for enabling threaded NAPI
for backlog processing in order to allow the scheduler to better balance
processing. This helps better spread the load across idle CPUs.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

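Usage sketch, assuming this patch is applied: the new entry is registered in
the net.core sysctl table, so it is expected to show up as
/proc/sys/net/core/backlog_threaded (values limited to 0 or 1 by
proc_dointvec_minmax), e.g. "sysctl -w net.core.backlog_threaded=1". The
small userspace program below is illustrative only; writing "1" should make
backlog_set_threaded() spawn one "napi/backlog-<cpu>" kthread per possible
CPU and set NAPIF_STATE_THREADED on each per-CPU backlog NAPI instance,
while writing "0" clears the flag again.

    /* Illustrative helper, not part of the kernel change below. */
    #include <stdio.h>

    int main(void)
    {
            /* Path assumes the backlog_threaded sysctl added by this patch. */
            FILE *f = fopen("/proc/sys/net/core/backlog_threaded", "w");

            if (!f) {
                    perror("backlog_threaded");
                    return 1;
            }
            fputs("1\n", f);  /* "0" would switch back to softirq processing */
            fclose(f);
            return 0;
    }
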
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -520,6 +520,7 @@ static inline bool napi_complete(struct
 }
 
 int dev_set_threaded(struct net_device *dev, bool threaded);
+int backlog_set_threaded(bool threaded);
 
 /**
  *	napi_disable - prevent NAPI from scheduling
@@ -3127,6 +3128,7 @@ struct softnet_data {
 	unsigned int		processed;
 	unsigned int		time_squeeze;
 	unsigned int		received_rps;
+	unsigned int		process_queue_empty;
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
 #endif
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4606,7 +4606,7 @@ static int napi_schedule_rps(struct soft
 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 
 #ifdef CONFIG_RPS
-	if (sd != mysd) {
+	if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
 		sd->rps_ipi_next = mysd->rps_ipi_list;
 		mysd->rps_ipi_list = sd;
 
@@ -5787,6 +5787,8 @@ static DEFINE_PER_CPU(struct work_struct
 /* Network device is going away, flush any packets still pending */
 static void flush_backlog(struct work_struct *work)
 {
+	unsigned int process_queue_empty;
+	bool threaded, flush_processq;
 	struct sk_buff *skb, *tmp;
 	struct softnet_data *sd;
 
@@ -5801,8 +5803,17 @@ static void flush_backlog(struct work_st
 			input_queue_head_incr(sd);
 		}
 	}
+
+	threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
+	flush_processq = threaded &&
+			 !skb_queue_empty_lockless(&sd->process_queue);
+	if (flush_processq)
+		process_queue_empty = sd->process_queue_empty;
 	rps_unlock_irq_enable(sd);
 
+	if (threaded)
+		goto out;
+
 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->process_queue);
@@ -5810,7 +5821,16 @@ static void flush_backlog(struct work_st
 			input_queue_head_incr(sd);
 		}
 	}
+
+out:
 	local_bh_enable();
+
+	while (flush_processq) {
+		msleep(1);
+		rps_lock_irq_disable(sd);
+		flush_processq = process_queue_empty == sd->process_queue_empty;
+		rps_unlock_irq_enable(sd);
+	}
 }
 
 static bool flush_required(int cpu)
@@ -5942,6 +5962,7 @@ static int process_backlog(struct napi_s
 		}
 
 		rps_lock_irq_disable(sd);
+		sd->process_queue_empty++;
 		if (skb_queue_empty(&sd->input_pkt_queue)) {
 			/*
 			 * Inline a custom version of __napi_complete().
@@ -5951,7 +5972,8 @@ static int process_backlog(struct napi_s
 			 * We can use a plain write instead of clear_bit(),
 			 * and we dont need an smp_mb() memory barrier.
 			 */
-			napi->state = 0;
+			napi->state &= ~(NAPIF_STATE_SCHED |
+					 NAPIF_STATE_SCHED_THREADED);
 			again = false;
 		} else {
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
@@ -6367,6 +6389,55 @@ int dev_set_threaded(struct net_device *
 }
 EXPORT_SYMBOL(dev_set_threaded);
 
+int backlog_set_threaded(bool threaded)
+{
+	static bool backlog_threaded;
+	int err = 0;
+	int i;
+
+	if (backlog_threaded == threaded)
+		return 0;
+
+	for_each_possible_cpu(i) {
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
+		struct napi_struct *n = &sd->backlog;
+
+		if (n->thread)
+			continue;
+		n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i);
+		if (IS_ERR(n->thread)) {
+			err = PTR_ERR(n->thread);
+			pr_err("kthread_run failed with err %d\n", err);
+			n->thread = NULL;
+			threaded = false;
+			break;
+		}
+
+	}
+
+	backlog_threaded = threaded;
+
+	/* Make sure kthread is created before THREADED bit
+	 * is set.
+	 */
+	smp_mb__before_atomic();
+
+	for_each_possible_cpu(i) {
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
+		struct napi_struct *n = &sd->backlog;
+		unsigned long flags;
+
+		rps_lock_irqsave(sd, &flags);
+		if (threaded)
+			n->state |= NAPIF_STATE_THREADED;
+		else
+			n->state &= ~NAPIF_STATE_THREADED;
+		rps_unlock_irq_restore(sd, &flags);
+	}
+
+	return err;
+}
+
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 			   int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -11139,6 +11210,9 @@ static int dev_cpu_dead(unsigned int old
 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 	local_irq_enable();
 
+	if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
+		return 0;
+
 #ifdef CONFIG_RPS
 	remsd = oldsd->rps_ipi_list;
 	oldsd->rps_ipi_list = NULL;
@@ -11442,6 +11516,7 @@ static int __init net_dev_init(void)
 		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
 		spin_lock_init(&sd->defer_lock);
 
+		INIT_LIST_HEAD(&sd->backlog.poll_list);
 		init_gro_hash(&sd->backlog);
 		sd->backlog.poll = process_backlog;
 		sd->backlog.weight = weight_p;
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -29,6 +29,7 @@ static int int_3600 = 3600;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
+static int backlog_threaded;
 
 static int net_msg_warn;	/* Unused, but still a sysctl */
 
@@ -112,6 +113,23 @@ static int rps_sock_flow_sysctl(struct c
 }
 #endif /* CONFIG_RPS */
 
+static int backlog_threaded_sysctl(struct ctl_table *table, int write,
+			       void *buffer, size_t *lenp, loff_t *ppos)
+{
+	static DEFINE_MUTEX(backlog_threaded_mutex);
+	int ret;
+
+	mutex_lock(&backlog_threaded_mutex);
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (write && !ret)
+		ret = backlog_set_threaded(backlog_threaded);
+
+	mutex_unlock(&backlog_threaded_mutex);
+
+	return ret;
+}
+
 #ifdef CONFIG_NET_FLOW_LIMIT
 static DEFINE_MUTEX(flow_limit_update_mutex);
 
@@ -473,6 +491,15 @@ static struct ctl_table net_core_table[]
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+	{
+		.procname	= "backlog_threaded",
+		.data		= &backlog_threaded,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= backlog_threaded_sysctl,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
 #ifdef CONFIG_NET_FLOW_LIMIT
 	{
 		.procname	= "flow_limit_cpu_bitmap",