Refresh pending patches with make target/linux/refresh.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 16 Feb 2023 18:39:04 +0100
Subject: [PATCH] net/core: add optional threading for backlog processing

When dealing with few flows or an imbalance on CPU utilization, static RPS
CPU assignment can be too inflexible. Add support for enabling threaded NAPI
for backlog processing in order to allow the scheduler to better balance
processing. This helps better spread the load across idle CPUs.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -520,6 +520,7 @@ static inline bool napi_complete(struct
 }
 
 int dev_set_threaded(struct net_device *dev, bool threaded);
+int backlog_set_threaded(bool threaded);
 
 /**
  *	napi_disable - prevent NAPI from scheduling
@@ -3126,6 +3127,7 @@ struct softnet_data {
 	unsigned int		processed;
 	unsigned int		time_squeeze;
 	unsigned int		received_rps;
+	unsigned int		process_queue_empty;
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
 #endif
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4604,7 +4604,7 @@ static int napi_schedule_rps(struct soft
 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 
 #ifdef CONFIG_RPS
-	if (sd != mysd) {
+	if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
 		sd->rps_ipi_next = mysd->rps_ipi_list;
 		mysd->rps_ipi_list = sd;
 
@@ -5785,6 +5785,8 @@ static DEFINE_PER_CPU(struct work_struct
 /* Network device is going away, flush any packets still pending */
 static void flush_backlog(struct work_struct *work)
 {
+	unsigned int process_queue_empty;
+	bool threaded, flush_processq;
 	struct sk_buff *skb, *tmp;
 	struct softnet_data *sd;
 
@@ -5799,8 +5801,17 @@ static void flush_backlog(struct work_st
 			input_queue_head_incr(sd);
 		}
 	}
+
+	threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
+	flush_processq = threaded &&
+			 !skb_queue_empty_lockless(&sd->process_queue);
+	if (flush_processq)
+		process_queue_empty = sd->process_queue_empty;
 	rps_unlock_irq_enable(sd);
 
+	if (threaded)
+		goto out;
+
 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->process_queue);
@@ -5808,7 +5819,16 @@ static void flush_backlog(struct work_st
 			input_queue_head_incr(sd);
 		}
 	}
+
+out:
 	local_bh_enable();
+
+	while (flush_processq) {
+		msleep(1);
+		rps_lock_irq_disable(sd);
+		flush_processq = process_queue_empty == sd->process_queue_empty;
+		rps_unlock_irq_enable(sd);
+	}
 }
 
 static bool flush_required(int cpu)
@@ -5940,6 +5960,7 @@ static int process_backlog(struct napi_s
 		}
 
 		rps_lock_irq_disable(sd);
+		sd->process_queue_empty++;
 		if (skb_queue_empty(&sd->input_pkt_queue)) {
 			/*
 			 * Inline a custom version of __napi_complete().
@@ -5949,7 +5970,8 @@ static int process_backlog(struct napi_s
 			 * We can use a plain write instead of clear_bit(),
 			 * and we dont need an smp_mb() memory barrier.
 			 */
-			napi->state = 0;
+			napi->state &= ~(NAPIF_STATE_SCHED |
+					 NAPIF_STATE_SCHED_THREADED);
 			again = false;
 		} else {
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
@@ -6365,6 +6387,55 @@ int dev_set_threaded(struct net_device *
 }
 EXPORT_SYMBOL(dev_set_threaded);
 
+int backlog_set_threaded(bool threaded)
+{
+	static bool backlog_threaded;
+	int err = 0;
+	int i;
+
+	if (backlog_threaded == threaded)
+		return 0;
+
+	for_each_possible_cpu(i) {
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
+		struct napi_struct *n = &sd->backlog;
+
+		if (n->thread)
+			continue;
+		n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i);
+		if (IS_ERR(n->thread)) {
+			err = PTR_ERR(n->thread);
+			pr_err("kthread_run failed with err %d\n", err);
+			n->thread = NULL;
+			threaded = false;
+			break;
+		}
+
+	}
+
+	backlog_threaded = threaded;
+
+	/* Make sure kthread is created before THREADED bit
+	 * is set.
+	 */
+	smp_mb__before_atomic();
+
+	for_each_possible_cpu(i) {
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
+		struct napi_struct *n = &sd->backlog;
+		unsigned long flags;
+
+		rps_lock_irqsave(sd, &flags);
+		if (threaded)
+			n->state |= NAPIF_STATE_THREADED;
+		else
+			n->state &= ~NAPIF_STATE_THREADED;
+		rps_unlock_irq_restore(sd, &flags);
+	}
+
+	return err;
+}
+
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 			   int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -11137,6 +11208,9 @@ static int dev_cpu_dead(unsigned int old
 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 	local_irq_enable();
 
+	if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
+		return 0;
+
 #ifdef CONFIG_RPS
 	remsd = oldsd->rps_ipi_list;
 	oldsd->rps_ipi_list = NULL;
@@ -11440,6 +11514,7 @@ static int __init net_dev_init(void)
 		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
 		spin_lock_init(&sd->defer_lock);
 
+		INIT_LIST_HEAD(&sd->backlog.poll_list);
 		init_gro_hash(&sd->backlog);
 		sd->backlog.poll = process_backlog;
 		sd->backlog.weight = weight_p;
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -29,6 +29,7 @@ static int int_3600 = 3600;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
+static int backlog_threaded;
 
 static int net_msg_warn;	/* Unused, but still a sysctl */
 
@@ -112,6 +113,23 @@ static int rps_sock_flow_sysctl(struct c
 }
 #endif /* CONFIG_RPS */
 
+static int backlog_threaded_sysctl(struct ctl_table *table, int write,
+			       void *buffer, size_t *lenp, loff_t *ppos)
+{
+	static DEFINE_MUTEX(backlog_threaded_mutex);
+	int ret;
+
+	mutex_lock(&backlog_threaded_mutex);
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (write && !ret)
+		ret = backlog_set_threaded(backlog_threaded);
+
+	mutex_unlock(&backlog_threaded_mutex);
+
+	return ret;
+}
+
 #ifdef CONFIG_NET_FLOW_LIMIT
 static DEFINE_MUTEX(flow_limit_update_mutex);
 
@@ -473,6 +491,15 @@ static struct ctl_table net_core_table[]
 		.proc_handler	= rps_sock_flow_sysctl
 	},
 #endif
+	{
+		.procname	= "backlog_threaded",
+		.data		= &backlog_threaded,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= backlog_threaded_sysctl,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
 #ifdef CONFIG_NET_FLOW_LIMIT
 	{
 		.procname	= "flow_limit_cpu_bitmap",
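
Usage note (editorial, not part of the diff above): the patch exposes the new knob as the net.core.backlog_threaded sysctl, i.e. /proc/sys/net/core/backlog_threaded, accepting 0 or 1. On a running system it is normally toggled with sysctl -w net.core.backlog_threaded=1 or an /etc/sysctl.conf entry. Below is a minimal C sketch of doing the same by writing the procfs file directly; it assumes root privileges, a default procfs mount, and a kernel built with this patch applied.

#include <stdio.h>

int main(void)
{
	/* Illustration only: write "1" to the sysctl added by this patch to
	 * move backlog processing to the per-CPU "napi/backlog-N" kthreads;
	 * writing "0" clears NAPIF_STATE_THREADED again. */
	FILE *f = fopen("/proc/sys/net/core/backlog_threaded", "w");

	if (!f) {
		perror("backlog_threaded");
		return 1;
	}
	if (fputs("1\n", f) == EOF) {
		perror("write");
		fclose(f);
		return 1;
	}
	return fclose(f) ? 1 : 0;
}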