These are the latest patches that just landed upstream for 5.13. They
will be backported by Greg into 5.10 (because of stable@), and are now
in the 5.4 backport branch of wireguard:

https://git.zx2c4.com/wireguard-linux/log/?h=backport-5.4.y

Cc: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Tested-by: Stijn Segers <foss@volatilesystems.org>
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 4 Jun 2021 17:17:36 +0200
Subject: [PATCH] wireguard: allowedips: remove nodes in O(1)

commit f634f418c227c912e7ea95a3299efdc9b10e4022 upstream.

Previously, deleting peers would require traversing the entire trie in
order to rebalance nodes and safely free them. This meant that removing
1000 peers from a trie with a half million nodes would take an extremely
long time, during which we're holding the rtnl lock. Large-scale users
were reporting 200ms latencies added to the networking stack as a whole
every time their userspace software would queue up significant removals.
That's a serious situation.

This commit fixes that by maintaining a double pointer to the parent's
bit pointer for each node, and then using the already existing node list
belonging to each peer to go directly to the node, fix up its pointers,
and free it with RCU. This means removal is O(1) instead of O(n), and we
don't use gobs of stack.
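
To make the diff below easier to follow, here is a minimal userspace
sketch of that parent-bit trick. The struct and field names loosely
mirror the kernel code, but RCU, locking, and the filler-node handling
are all elided, so treat it as an illustration rather than the actual
implementation:

	#include <stdlib.h>

	struct node {
		struct node *bit[2];
		/* Points at whichever pointer refers to this node:
		 * &parent->bit[b], or the trie root pointer.
		 */
		struct node **parent_bit;
	};

	/* Splice out a node with at most one child, in O(1):
	 * no traversal, no stack.
	 */
	static void remove_node(struct node *node)
	{
		/* Pick whichever child exists (may be NULL). */
		struct node *child = node->bit[0] ? node->bit[0]
						  : node->bit[1];

		if (child)
			child->parent_bit = node->parent_bit;
		*node->parent_bit = child; /* unhook from the trie */
		free(node); /* kfree_rcu() in the kernel */
	}

In the real code, a node that still has both children is only detached
from its peer and left in place as a filler, and kfree_rcu() defers the
free until concurrent readers are done.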

The removal algorithm has the same downside as the code that it fixes:
it won't collapse needlessly long runs of fillers. We can enhance that
in the future if it ever becomes a problem. This commit documents that
limitation with a TODO comment in code, a small but meaningful
improvement over the prior situation.

Currently the biggest flaw, which the next commit addresses, is that
this increases the node size on 64-bit machines from 60 bytes to 68
bytes. 60 rounds up to 64, but 68 rounds up to 128. So we wind up
using twice as much memory per node, because of power-of-two
allocations, which is a big bummer. We'll need to figure something out
there.
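
For concreteness, that rounding arithmetic can be checked with a
trivial standalone program. This is an illustration only, assuming
plain power-of-two buckets as the paragraph above does; the kernel's
actual kmalloc size classes decide the real bucket:

	#include <stdio.h>

	/* Round n up to the next power of two, standing in for a
	 * power-of-two slab bucket.
	 */
	static size_t bucket(size_t n)
	{
		size_t b = 1;

		while (b < n)
			b <<= 1;
		return b;
	}

	int main(void)
	{
		printf("60 -> %zu\n", bucket(60)); /* 64 */
		printf("68 -> %zu\n", bucket(68)); /* 128 */
		return 0;
	}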

Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 drivers/net/wireguard/allowedips.c | 132 ++++++++++++-----------------
 drivers/net/wireguard/allowedips.h |   9 +-
 2 files changed, 57 insertions(+), 84 deletions(-)

--- a/drivers/net/wireguard/allowedips.c
+++ b/drivers/net/wireguard/allowedips.c
@@ -66,60 +66,6 @@ static void root_remove_peer_lists(struc
 	}
 }
 
-static void walk_remove_by_peer(struct allowedips_node __rcu **top,
-				struct wg_peer *peer, struct mutex *lock)
-{
-#define REF(p) rcu_access_pointer(p)
-#define DEREF(p) rcu_dereference_protected(*(p), lockdep_is_held(lock))
-#define PUSH(p) ({                                                             \
-		WARN_ON(IS_ENABLED(DEBUG) && len >= 128);                      \
-		stack[len++] = p;                                              \
-	})
-
-	struct allowedips_node __rcu **stack[128], **nptr;
-	struct allowedips_node *node, *prev;
-	unsigned int len;
-
-	if (unlikely(!peer || !REF(*top)))
-		return;
-
-	for (prev = NULL, len = 0, PUSH(top); len > 0; prev = node) {
-		nptr = stack[len - 1];
-		node = DEREF(nptr);
-		if (!node) {
-			--len;
-			continue;
-		}
-		if (!prev || REF(prev->bit[0]) == node ||
-		    REF(prev->bit[1]) == node) {
-			if (REF(node->bit[0]))
-				PUSH(&node->bit[0]);
-			else if (REF(node->bit[1]))
-				PUSH(&node->bit[1]);
-		} else if (REF(node->bit[0]) == prev) {
-			if (REF(node->bit[1]))
-				PUSH(&node->bit[1]);
-		} else {
-			if (rcu_dereference_protected(node->peer,
-				lockdep_is_held(lock)) == peer) {
-				RCU_INIT_POINTER(node->peer, NULL);
-				list_del_init(&node->peer_list);
-				if (!node->bit[0] || !node->bit[1]) {
-					rcu_assign_pointer(*nptr, DEREF(
-					       &node->bit[!REF(node->bit[0])]));
-					kfree_rcu(node, rcu);
-					node = DEREF(nptr);
-				}
-			}
-			--len;
-		}
-	}
-
-#undef REF
-#undef DEREF
-#undef PUSH
-}
-
 static unsigned int fls128(u64 a, u64 b)
 {
 	return a ? fls64(a) + 64U : fls64(b);
@@ -224,6 +170,7 @@ static int add(struct allowedips_node __
 		RCU_INIT_POINTER(node->peer, peer);
 		list_add_tail(&node->peer_list, &peer->allowedips_list);
 		copy_and_assign_cidr(node, key, cidr, bits);
+		rcu_assign_pointer(node->parent_bit, trie);
 		rcu_assign_pointer(*trie, node);
 		return 0;
 	}
@@ -243,9 +190,9 @@ static int add(struct allowedips_node __
 	if (!node) {
 		down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
 	} else {
-		down = rcu_dereference_protected(CHOOSE_NODE(node, key),
-						 lockdep_is_held(lock));
+		down = rcu_dereference_protected(CHOOSE_NODE(node, key), lockdep_is_held(lock));
 		if (!down) {
+			rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, key));
 			rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
 			return 0;
 		}
@@ -254,29 +201,37 @@ static int add(struct allowedips_node __
 	parent = node;
 
 	if (newnode->cidr == cidr) {
+		rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(newnode, down->bits));
 		rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
-		if (!parent)
+		if (!parent) {
+			rcu_assign_pointer(newnode->parent_bit, trie);
 			rcu_assign_pointer(*trie, newnode);
-		else
-			rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits),
-					   newnode);
-	} else {
-		node = kzalloc(sizeof(*node), GFP_KERNEL);
-		if (unlikely(!node)) {
-			list_del(&newnode->peer_list);
-			kfree(newnode);
-			return -ENOMEM;
+		} else {
+			rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(parent, newnode->bits));
+			rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits), newnode);
 		}
-		INIT_LIST_HEAD(&node->peer_list);
-		copy_and_assign_cidr(node, newnode->bits, cidr, bits);
+		return 0;
+	}
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (unlikely(!node)) {
+		list_del(&newnode->peer_list);
+		kfree(newnode);
+		return -ENOMEM;
+	}
+	INIT_LIST_HEAD(&node->peer_list);
+	copy_and_assign_cidr(node, newnode->bits, cidr, bits);
 
-		rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
-		rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
-		if (!parent)
-			rcu_assign_pointer(*trie, node);
-		else
-			rcu_assign_pointer(CHOOSE_NODE(parent, node->bits),
-					   node);
+	rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(node, down->bits));
+	rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
+	rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, newnode->bits));
+	rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
+	if (!parent) {
+		rcu_assign_pointer(node->parent_bit, trie);
+		rcu_assign_pointer(*trie, node);
+	} else {
+		rcu_assign_pointer(node->parent_bit, &CHOOSE_NODE(parent, node->bits));
+		rcu_assign_pointer(CHOOSE_NODE(parent, node->bits), node);
 	}
 	return 0;
 }
@@ -335,9 +290,30 @@ int wg_allowedips_insert_v6(struct allow
 void wg_allowedips_remove_by_peer(struct allowedips *table,
 				  struct wg_peer *peer, struct mutex *lock)
 {
+	struct allowedips_node *node, *child, *tmp;
+
+	if (list_empty(&peer->allowedips_list))
+		return;
 	++table->seq;
-	walk_remove_by_peer(&table->root4, peer, lock);
-	walk_remove_by_peer(&table->root6, peer, lock);
+	list_for_each_entry_safe(node, tmp, &peer->allowedips_list, peer_list) {
+		list_del_init(&node->peer_list);
+		RCU_INIT_POINTER(node->peer, NULL);
+		if (node->bit[0] && node->bit[1])
+			continue;
+		child = rcu_dereference_protected(
+				node->bit[!rcu_access_pointer(node->bit[0])],
+				lockdep_is_held(lock));
+		if (child)
+			child->parent_bit = node->parent_bit;
+		*rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
+		kfree_rcu(node, rcu);
+
+		/* TODO: Note that we currently don't walk up and down in order to
+		 * free any potential filler nodes. This means that this function
+		 * doesn't free up as much as it could, which could be revisited
+		 * at some point.
+		 */
+	}
 }
 
 int wg_allowedips_read_node(struct allowedips_node *node, u8 ip[16], u8 *cidr)
--- a/drivers/net/wireguard/allowedips.h
+++ b/drivers/net/wireguard/allowedips.h
@@ -15,14 +15,11 @@ struct wg_peer;
 struct allowedips_node {
 	struct wg_peer __rcu *peer;
 	struct allowedips_node __rcu *bit[2];
-	/* While it may seem scandalous that we waste space for v4,
-	 * we're alloc'ing to the nearest power of 2 anyway, so this
-	 * doesn't actually make a difference.
-	 */
-	u8 bits[16] __aligned(__alignof(u64));
 	u8 cidr, bit_at_a, bit_at_b, bitlen;
+	u8 bits[16] __aligned(__alignof(u64));
 
-	/* Keep rarely used list at bottom to be beyond cache line. */
+	/* Keep rarely used members at bottom to be beyond cache line. */
+	struct allowedips_node *__rcu *parent_bit; /* XXX: this puts us at 68->128 bytes instead of 60->64 bytes!! */
 	union {
 		struct list_head peer_list;
 		struct rcu_head rcu;