kernel: replace threaded NAPI implementation in 5.10 with upstream backport

This uses a kthread per NAPI instance instead of the workqueue approach.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Author: Felix Fietkau <nbd@nbd.name>
Date:   2021-03-14 23:53:35 +01:00
Commit: 4a79c32fee (parent 12b5f898f9)

15 changed files with 639 additions and 395 deletions
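With the backport in place, drivers opt in through the upstream interface rather than the OpenWrt-specific netif_threaded_napi_add(). Below is a minimal sketch of what that looks like, assuming the backport matches the mainline dev_set_threaded() helper and spawns one "napi/<dev>-<id>" kthread per NAPI instance; the driver structures and poll function are placeholders, not part of this commit.

#include <linux/netdevice.h>

struct example_priv {
	struct napi_struct napi;
};

/* Placeholder poll function: would process up to @budget packets. */
static int example_poll(struct napi_struct *napi, int budget)
{
	int done = 0;

	if (done < budget)
		napi_complete_done(napi, done);
	return done;
}

static int example_open(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	netif_napi_add(dev, &priv->napi, example_poll, NAPI_POLL_WEIGHT);

	/* Poll from a per-NAPI kthread instead of softirq context
	 * (or the workqueue used by the patches removed here).
	 */
	dev_set_threaded(dev, true);

	napi_enable(&priv->napi);
	return 0;
}

Threaded mode can also be toggled per device at runtime through the upstream "threaded" sysfs attribute, assuming the backport carries that part of the mainline series as well.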

@@ -58,7 +58,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -833,6 +833,27 @@ typedef u16 (*select_queue_fallback_t)(s
@@ -827,6 +827,27 @@ typedef u16 (*select_queue_fallback_t)(s
struct sk_buff *skb,
struct net_device *sb_dev);
@@ -86,7 +86,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
enum tc_setup_type {
TC_SETUP_QDISC_MQPRIO,
TC_SETUP_CLSU32,
@@ -1279,6 +1300,8 @@ struct netdev_net_notifier {
@@ -1273,6 +1294,8 @@ struct netdev_net_notifier {
* struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
* If a device is paired with a peer device, return the peer instance.
* The caller must be under RCU read context.
@@ -95,7 +95,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
@@ -1487,6 +1510,8 @@ struct net_device_ops {
@@ -1481,6 +1504,8 @@ struct net_device_ops {
int (*ndo_tunnel_ctl)(struct net_device *dev,
struct ip_tunnel_parm *p, int cmd);
struct net_device * (*ndo_get_peer_dev)(struct net_device *dev);
@@ -104,7 +104,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
};
/**
@@ -2798,6 +2823,8 @@ void dev_remove_offload(struct packet_of
@@ -2795,6 +2820,8 @@ void dev_remove_offload(struct packet_of
int dev_get_iflink(const struct net_device *dev);
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
@@ -115,7 +115,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
struct net_device *dev_get_by_name(struct net *net, const char *name);
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -846,6 +846,52 @@ int dev_fill_metadata_dst(struct net_dev
@@ -847,6 +847,52 @@ int dev_fill_metadata_dst(struct net_dev
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

@@ -28,7 +28,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -835,11 +835,18 @@ typedef u16 (*select_queue_fallback_t)(s
@@ -829,11 +829,18 @@ typedef u16 (*select_queue_fallback_t)(s
enum net_device_path_type {
DEV_PATH_ETHERNET = 0,

@@ -9,7 +9,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -836,6 +836,7 @@ typedef u16 (*select_queue_fallback_t)(s
@@ -830,6 +830,7 @@ typedef u16 (*select_queue_fallback_t)(s
enum net_device_path_type {
DEV_PATH_ETHERNET = 0,
DEV_PATH_VLAN,

@@ -15,7 +15,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -847,10 +847,20 @@ struct net_device_path {
@@ -841,10 +841,20 @@ struct net_device_path {
u16 id;
__be16 proto;
} encap;
@@ -36,7 +36,7 @@ Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
struct net_device_path_stack {
int num_paths;
@@ -860,6 +870,12 @@ struct net_device_path_stack {
@@ -854,6 +864,12 @@ struct net_device_path_stack {
struct net_device_path_ctx {
const struct net_device *dev;
const u8 *daddr;

@@ -78,7 +78,7 @@ Pass on the PPPoE session ID and the real device.
static int pppoe_recvmsg(struct socket *sock, struct msghdr *m,
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -837,6 +837,7 @@ enum net_device_path_type {
@@ -831,6 +831,7 @@ enum net_device_path_type {
DEV_PATH_ETHERNET = 0,
DEV_PATH_VLAN,
DEV_PATH_BRIDGE,

@@ -7,7 +7,7 @@ Add .ndo_fill_forward_path for dsa slave port devices
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -838,6 +838,7 @@ enum net_device_path_type {
@@ -832,6 +832,7 @@ enum net_device_path_type {
DEV_PATH_VLAN,
DEV_PATH_BRIDGE,
DEV_PATH_PPPOE,
@@ -15,7 +15,7 @@ Add .ndo_fill_forward_path for dsa slave port devices
};
struct net_device_path {
@@ -857,6 +858,10 @@ struct net_device_path {
@@ -851,6 +852,10 @@ struct net_device_path {
u16 vlan_id;
__be16 vlan_proto;
} bridge;

@@ -27,7 +27,7 @@ Subject: [PATCH] netfilter: flowtable: add pppoe support
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -848,6 +848,7 @@ struct net_device_path {
@@ -842,6 +842,7 @@ struct net_device_path {
struct {
u16 id;
__be16 proto;

@@ -11,7 +11,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -855,6 +855,7 @@ struct net_device_path {
@@ -849,6 +849,7 @@ struct net_device_path {
DEV_PATH_BR_VLAN_KEEP,
DEV_PATH_BR_VLAN_TAG,
DEV_PATH_BR_VLAN_UNTAG,

@@ -11,7 +11,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2033,6 +2033,8 @@ struct net_device {
@@ -2029,6 +2029,8 @@ struct net_device {
struct netdev_hw_addr_list mc;
struct netdev_hw_addr_list dev_addrs;
@@ -32,7 +32,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
__u16 tc_index; /* traffic control index */
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5965,6 +5965,9 @@ static enum gro_result dev_gro_receive(s
@@ -6005,6 +6005,9 @@ static enum gro_result dev_gro_receive(s
int same_flow;
int grow;
@@ -42,7 +42,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
if (netif_elide_gro(skb->dev))
goto normal;
@@ -7793,6 +7796,48 @@ static void __netdev_adjacent_dev_unlink
@@ -7973,6 +7976,48 @@ static void __netdev_adjacent_dev_unlink
&upper_dev->adj_list.lower);
}
@@ -91,7 +91,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master,
void *upper_priv, void *upper_info,
@@ -7844,6 +7889,7 @@ static int __netdev_upper_dev_link(struc
@@ -8024,6 +8069,7 @@ static int __netdev_upper_dev_link(struc
if (ret)
return ret;
@@ -99,7 +99,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
&changeupper_info.info);
ret = notifier_to_errno(ret);
@@ -7940,6 +7986,7 @@ static void __netdev_upper_dev_unlink(st
@@ -8120,6 +8166,7 @@ static void __netdev_upper_dev_unlink(st
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
@@ -107,7 +107,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
&changeupper_info.info);
@@ -8726,6 +8773,7 @@ int dev_set_mac_address(struct net_devic
@@ -8906,6 +8953,7 @@ int dev_set_mac_address(struct net_devic
if (err)
return err;
dev->addr_assign_type = NET_ADDR_SET;

@@ -1,301 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 26 Jul 2020 14:03:21 +0200
Subject: [PATCH] net: add support for threaded NAPI polling

For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
poll function does not perform well. Since NAPI poll is bound to the CPU it
was scheduled from, we can easily end up with a few very busy CPUs spending
most of their time in softirq/ksoftirqd and some idle ones.

Introduce threaded NAPI for such drivers based on a workqueue. The API is the
same except for using netif_threaded_napi_add instead of netif_napi_add.

In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
thread.

With threaded NAPI, throughput seems stable and consistent (and higher than
the best results I got without it).

Based on a patch by Hillf Danton

Cc: Hillf Danton <hdanton@sina.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
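For reference, driver usage of this (now removed) API was a one-line change at registration time; a short sketch with placeholder driver names, matching the netif_threaded_napi_add() helper defined in the diff below:

	/* Old workqueue-based API: identical to netif_napi_add(), but marks
	 * the NAPI instance as threaded before registering it.
	 */
	netif_threaded_napi_add(dev, &priv->napi, example_poll, NAPI_POLL_WEIGHT);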
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -347,6 +347,7 @@ struct napi_struct {
struct list_head dev_list;
struct hlist_node napi_hash_node;
unsigned int napi_id;
+ struct work_struct work;
};
enum {
@@ -357,6 +358,7 @@ enum {
NAPI_STATE_LISTED, /* NAPI added to system lists */
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+ NAPI_STATE_THREADED, /* Use threaded NAPI */
};
enum {
@@ -367,6 +369,7 @@ enum {
NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+ NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
};
enum gro_result {
@@ -2211,6 +2214,7 @@ struct net_device {
struct lock_class_key *qdisc_running_key;
bool proto_down;
unsigned wol_enabled:1;
+ unsigned threaded:1;
struct list_head net_notifier_list;
@@ -2413,6 +2417,26 @@ void netif_napi_add(struct net_device *d
int (*poll)(struct napi_struct *, int), int weight);
/**
+ * netif_threaded_napi_add - initialize a NAPI context
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @weight: default weight
+ *
+ * This variant of netif_napi_add() should be used from drivers using NAPI
+ * with CPU intensive poll functions.
+ * This will schedule polling from a high priority workqueue
+ */
+static inline void netif_threaded_napi_add(struct net_device *dev,
+ struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int),
+ int weight)
+{
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+ netif_napi_add(dev, napi, poll, weight);
+}
+
+/**
* netif_tx_napi_add - initialize a NAPI context
* @dev: network device
* @napi: NAPI context
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -159,6 +159,7 @@ static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly; /* Taps */
static struct list_head offload_base __read_mostly;
+static struct workqueue_struct *napi_workq __read_mostly;
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
@@ -6407,6 +6408,11 @@ void __napi_schedule(struct napi_struct
{
unsigned long flags;
+ if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+ queue_work(napi_workq, &n->work);
+ return;
+ }
+
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
@@ -6454,6 +6460,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
*/
void __napi_schedule_irqoff(struct napi_struct *n)
{
+ if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+ queue_work(napi_workq, &n->work);
+ return;
+ }
+
____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
@@ -6715,12 +6726,94 @@ static void init_gro_hash(struct napi_st
napi->gro_bitmask = 0;
}
+static int __napi_poll(struct napi_struct *n, bool *repoll)
+{
+ int work, weight;
+
+ weight = n->weight;
+
+ /* This NAPI_STATE_SCHED test is for avoiding a race
+ * with netpoll's poll_napi(). Only the entity which
+ * obtains the lock and sees NAPI_STATE_SCHED set will
+ * actually make the ->poll() call. Therefore we avoid
+ * accidentally calling ->poll() when NAPI is not scheduled.
+ */
+ work = 0;
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ work = n->poll(n, weight);
+ trace_napi_poll(n, work, weight);
+ }
+
+ if (unlikely(work > weight))
+ pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
+ n->poll, work, weight);
+
+ if (likely(work < weight))
+ return work;
+
+ /* Drivers must not modify the NAPI state if they
+ * consume the entire weight. In such cases this code
+ * still "owns" the NAPI instance and therefore can
+ * move the instance around on the list at-will.
+ */
+ if (unlikely(napi_disable_pending(n))) {
+ napi_complete(n);
+ return work;
+ }
+
+ if (n->gro_bitmask) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
+ */
+ napi_gro_flush(n, HZ >= 1000);
+ }
+
+ gro_normal_list(n);
+
+ *repoll = true;
+
+ return work;
+}
+
+static void napi_workfn(struct work_struct *work)
+{
+ struct napi_struct *n = container_of(work, struct napi_struct, work);
+ void *have;
+
+ for (;;) {
+ bool repoll = false;
+
+ local_bh_disable();
+
+ have = netpoll_poll_lock(n);
+ __napi_poll(n, &repoll);
+ netpoll_poll_unlock(have);
+
+ local_bh_enable();
+
+ if (!repoll)
+ return;
+
+ if (!need_resched())
+ continue;
+
+ /*
+ * have to pay for the latency of task switch even if
+ * napi is scheduled
+ */
+ queue_work(napi_workq, work);
+ return;
+ }
+}
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
return;
+ if (dev->threaded)
+ set_bit(NAPI_STATE_THREADED, &napi->state);
INIT_LIST_HEAD(&napi->poll_list);
INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
@@ -6738,6 +6831,7 @@ void netif_napi_add(struct net_device *d
#ifdef CONFIG_NETPOLL
napi->poll_owner = -1;
#endif
+ INIT_WORK(&napi->work, napi_workfn);
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
@@ -6780,6 +6874,7 @@ void __netif_napi_del(struct napi_struct
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
return;
+ cancel_work_sync(&napi->work);
napi_hash_del(napi);
list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
@@ -6791,53 +6886,19 @@ EXPORT_SYMBOL(__netif_napi_del);
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
+ bool do_repoll = false;
void *have;
- int work, weight;
+ int work;
list_del_init(&n->poll_list);
have = netpoll_poll_lock(n);
- weight = n->weight;
+ work = __napi_poll(n, &do_repoll);
- /* This NAPI_STATE_SCHED test is for avoiding a race
- * with netpoll's poll_napi(). Only the entity which
- * obtains the lock and sees NAPI_STATE_SCHED set will
- * actually make the ->poll() call. Therefore we avoid
- * accidentally calling ->poll() when NAPI is not scheduled.
- */
- work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state)) {
- work = n->poll(n, weight);
- trace_napi_poll(n, work, weight);
- }
-
- if (unlikely(work > weight))
- pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
- n->poll, work, weight);
-
- if (likely(work < weight))
+ if (!do_repoll)
goto out_unlock;
- /* Drivers must not modify the NAPI state if they
- * consume the entire weight. In such cases this code
- * still "owns" the NAPI instance and therefore can
- * move the instance around on the list at-will.
- */
- if (unlikely(napi_disable_pending(n))) {
- napi_complete(n);
- goto out_unlock;
- }
-
- if (n->gro_bitmask) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- napi_gro_flush(n, HZ >= 1000);
- }
-
- gro_normal_list(n);
-
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
*/
@@ -11333,6 +11394,10 @@ static int __init net_dev_init(void)
sd->backlog.weight = weight_p;
}
+ napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI,
+ WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS);
+ BUG_ON(!napi_workq);
+
dev_boot_phase = 0;
/* The loopback device is special if any other network devices

@@ -1,74 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 21 Aug 2020 15:07:54 +0200
Subject: [PATCH] net: add sysfs attribute for enabling threaded NAPI

This can be used to enable threaded NAPI on drivers that did not explicitly
request it.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
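For reference, enabling this from userspace meant writing 1 to the per-interface napi_threaded attribute added below. A small illustrative snippet follows; the interface name is a placeholder, and the upstream backport that replaces this patch is assumed to expose a per-device "threaded" attribute instead:

#include <stdio.h>

int main(void)
{
	/* Equivalent of: echo 1 > /sys/class/net/eth0/napi_threaded */
	FILE *f = fopen("/sys/class/net/eth0/napi_threaded", "w");

	if (!f) {
		perror("napi_threaded");
		return 1;
	}
	fputs("1\n", f);
	return fclose(f) ? 1 : 0;
}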
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -472,6 +472,52 @@ static ssize_t proto_down_store(struct d
}
NETDEVICE_SHOW_RW(proto_down, fmt_dec);
+static int change_napi_threaded(struct net_device *dev, unsigned long val)
+{
+ struct napi_struct *napi;
+
+ if (list_empty(&dev->napi_list))
+ return -EOPNOTSUPP;
+
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (val)
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+ else
+ clear_bit(NAPI_STATE_THREADED, &napi->state);
+ }
+
+ return 0;
+}
+
+static ssize_t napi_threaded_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_napi_threaded);
+}
+
+static ssize_t napi_threaded_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct napi_struct *napi;
+ bool enabled = false;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ list_for_each_entry(napi, &netdev->napi_list, dev_list) {
+ if (test_bit(NAPI_STATE_THREADED, &napi->state))
+ enabled = true;
+ }
+
+ rtnl_unlock();
+
+ return sprintf(buf, fmt_dec, enabled);
+}
+static DEVICE_ATTR_RW(napi_threaded);
+
static ssize_t phys_port_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -564,6 +610,7 @@ static struct attribute *net_class_attrs
&dev_attr_tx_queue_len.attr,
&dev_attr_gro_flush_timeout.attr,
&dev_attr_napi_defer_hard_irqs.attr,
+ &dev_attr_napi_threaded.attr,
&dev_attr_phys_port_id.attr,
&dev_attr_phys_port_name.attr,
&dev_attr_phys_switch_id.attr,