Rather than using the clunky, old, slower wireguard-linux-compat out-of-tree
module, this commit does a patch-by-patch backport of upstream's wireguard to
5.4. This specific backport is in widespread use, being part of SUSE's
enterprise kernel, Oracle's enterprise kernel, Google's Android kernel,
Gentoo's distro kernel, and probably more I've forgotten about. It's
definitely the "more proper" way of adding wireguard to a kernel than the
ugly compat.h hell of the wireguard-linux-compat repo. And most importantly
for OpenWrt, it allows using the same module configuration code for 5.10 as
for 5.4, with no need for bifurcation.

These patches are from the backport tree which is maintained in the open
here: https://git.zx2c4.com/wireguard-linux/log/?h=backport-5.4.y

I'll be sending PRs to update this as needed.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
(cherry picked from commit 3888fa7880)
(cherry picked from commit d540725871)
(cherry picked from commit 196f3d586f)
(cherry picked from commit 3500fd7938)
(cherry picked from commit 23b801d3ba)
(cherry picked from commit 0c0cb97da7)
(cherry picked from commit 2a27f6f90a)
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 6 Nov 2020 17:39:38 +0100
Subject: [PATCH] crypto: arm64/chacha - simplify tail block handling

commit c4fc6328d6c67690a7e6e03f43a5a976a13120ef upstream.

Based on lessons learnt from optimizing the 32-bit version of this driver,
we can simplify the arm64 version considerably, by reordering the final
two stores when the last block is not a multiple of 64 bytes. This removes
the need to use permutation instructions to calculate the elements that are
clobbered by the final overlapping store, given that the store of the
penultimate block now follows it, and that one carries the correct values
for those elements already.

While at it, simplify the overlapping loads as well, by calculating the
address of the final overlapping load upfront, and switching to this
address for every load that would otherwise extend past the end of the
source buffer.

There is no impact on performance, but the resulting code is substantially
smaller and easier to follow.
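
As a rough illustration of the reordering (a minimal C sketch, not taken
from this patch; BLOCK, store_tail, dst, prev and last are made-up names,
and both blocks are assumed to be fully computed before either store is
issued):

  /*
   * Hypothetical helper, for illustration only: when BLOCK < len < 2*BLOCK,
   * the final (partial) block is written as a full-width store that ends
   * exactly at dst + len, clobbering the tail of the penultimate block's
   * region; storing the penultimate block afterwards rewrites those bytes
   * with their correct values, so no permutation/masking of the tail data
   * is needed.
   */
  #include <string.h>

  #define BLOCK 64

  static void store_tail(unsigned char *dst, size_t len,
                         const unsigned char prev[BLOCK], /* penultimate output block */
                         const unsigned char last[BLOCK]) /* last BLOCK bytes of output */
  {
          memcpy(dst + len - BLOCK, last, BLOCK); /* overlapping store, issued first */
          memcpy(dst, prev, BLOCK);               /* penultimate store restores the overlap */
  }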

Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/arm64/crypto/chacha-neon-core.S | 193 ++++++++++-----------------
 1 file changed, 69 insertions(+), 124 deletions(-)

--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -195,7 +195,6 @@ ENTRY(chacha_4block_xor_neon)
 	adr_l		x10, .Lpermute
 	and		x5, x4, #63
 	add		x10, x10, x5
-	add		x11, x10, #64
 
 	//
 	// This function encrypts four consecutive ChaCha blocks by loading
@@ -645,11 +644,11 @@ CPU_BE(	  rev		a15, a15	)
 	zip2		v31.4s, v14.4s, v15.4s
 	  eor		a15, a15, w9
 
-	mov		x3, #64
+	add		x3, x2, x4
+	sub		x3, x3, #128		// start of last block
+
 	subs		x5, x4, #128
-	add		x6, x5, x2
-	csel		x3, x3, xzr, ge
-	csel		x2, x2, x6, ge
+	csel		x2, x2, x3, ge
 
 	// interleave 64-bit words in state n, n+2
 	zip1		v0.2d, v16.2d, v18.2d
@@ -658,13 +657,10 @@ CPU_BE(	  rev		a15, a15	)
 	zip1		v8.2d, v17.2d, v19.2d
 	zip2		v12.2d, v17.2d, v19.2d
 	  stp		a2, a3, [x1, #-56]
-	ld1		{v16.16b-v19.16b}, [x2], x3
 
 	subs		x6, x4, #192
-	ccmp		x3, xzr, #4, lt
-	add		x7, x6, x2
-	csel		x3, x3, xzr, eq
-	csel		x2, x2, x7, eq
+	ld1		{v16.16b-v19.16b}, [x2], #64
+	csel		x2, x2, x3, ge
 
 	zip1		v1.2d, v20.2d, v22.2d
 	zip2		v5.2d, v20.2d, v22.2d
@@ -672,13 +668,10 @@ CPU_BE(	  rev		a15, a15	)
 	zip1		v9.2d, v21.2d, v23.2d
 	zip2		v13.2d, v21.2d, v23.2d
 	  stp		a6, a7, [x1, #-40]
-	ld1		{v20.16b-v23.16b}, [x2], x3
 
 	subs		x7, x4, #256
-	ccmp		x3, xzr, #4, lt
-	add		x8, x7, x2
-	csel		x3, x3, xzr, eq
-	csel		x2, x2, x8, eq
+	ld1		{v20.16b-v23.16b}, [x2], #64
+	csel		x2, x2, x3, ge
 
 	zip1		v2.2d, v24.2d, v26.2d
 	zip2		v6.2d, v24.2d, v26.2d
@@ -686,12 +679,10 @@ CPU_BE(	  rev		a15, a15	)
 	zip1		v10.2d, v25.2d, v27.2d
 	zip2		v14.2d, v25.2d, v27.2d
 	  stp		a10, a11, [x1, #-24]
-	ld1		{v24.16b-v27.16b}, [x2], x3
 
 	subs		x8, x4, #320
-	ccmp		x3, xzr, #4, lt
-	add		x9, x8, x2
-	csel		x2, x2, x9, eq
+	ld1		{v24.16b-v27.16b}, [x2], #64
+	csel		x2, x2, x3, ge
 
 	zip1		v3.2d, v28.2d, v30.2d
 	zip2		v7.2d, v28.2d, v30.2d
@@ -699,151 +690,105 @@ CPU_BE(	  rev		a15, a15	)
 	zip1		v11.2d, v29.2d, v31.2d
 	zip2		v15.2d, v29.2d, v31.2d
 	  stp		a14, a15, [x1, #-8]
+
+	tbnz		x5, #63, .Lt128
 	ld1		{v28.16b-v31.16b}, [x2]
 
 	// xor with corresponding input, write to output
-	tbnz		x5, #63, 0f
 	eor		v16.16b, v16.16b, v0.16b
 	eor		v17.16b, v17.16b, v1.16b
 	eor		v18.16b, v18.16b, v2.16b
 	eor		v19.16b, v19.16b, v3.16b
-	st1		{v16.16b-v19.16b}, [x1], #64
-	cbz		x5, .Lout
 
-	tbnz		x6, #63, 1f
+	tbnz		x6, #63, .Lt192
+
 	eor		v20.16b, v20.16b, v4.16b
 	eor		v21.16b, v21.16b, v5.16b
 	eor		v22.16b, v22.16b, v6.16b
 	eor		v23.16b, v23.16b, v7.16b
-	st1		{v20.16b-v23.16b}, [x1], #64
-	cbz		x6, .Lout
 
-	tbnz		x7, #63, 2f
+	st1		{v16.16b-v19.16b}, [x1], #64
+	tbnz		x7, #63, .Lt256
+
 	eor		v24.16b, v24.16b, v8.16b
 	eor		v25.16b, v25.16b, v9.16b
 	eor		v26.16b, v26.16b, v10.16b
 	eor		v27.16b, v27.16b, v11.16b
-	st1		{v24.16b-v27.16b}, [x1], #64
-	cbz		x7, .Lout
 
-	tbnz		x8, #63, 3f
+	st1		{v20.16b-v23.16b}, [x1], #64
+	tbnz		x8, #63, .Lt320
+
 	eor		v28.16b, v28.16b, v12.16b
 	eor		v29.16b, v29.16b, v13.16b
 	eor		v30.16b, v30.16b, v14.16b
 	eor		v31.16b, v31.16b, v15.16b
+
+	st1		{v24.16b-v27.16b}, [x1], #64
 	st1		{v28.16b-v31.16b}, [x1]
 
 .Lout:	frame_pop
 	ret
 
-	// fewer than 128 bytes of in/output
-0:	ld1		{v8.16b}, [x10]
-	ld1		{v9.16b}, [x11]
-	movi		v10.16b, #16
-	sub		x2, x1, #64
-	add		x1, x1, x5
-	ld1		{v16.16b-v19.16b}, [x2]
-	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
-	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
-	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
-	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
-	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b
-
-	eor		v20.16b, v20.16b, v4.16b
-	eor		v21.16b, v21.16b, v5.16b
-	eor		v22.16b, v22.16b, v6.16b
-	eor		v23.16b, v23.16b, v7.16b
-	st1		{v20.16b-v23.16b}, [x1]
-	b		.Lout
-
 	// fewer than 192 bytes of in/output
-1:	ld1		{v8.16b}, [x10]
-	ld1		{v9.16b}, [x11]
-	movi		v10.16b, #16
-	add		x1, x1, x6
-	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
-	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
-	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
-	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
-	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b
-
-	eor		v20.16b, v20.16b, v0.16b
-	eor		v21.16b, v21.16b, v1.16b
-	eor		v22.16b, v22.16b, v2.16b
-	eor		v23.16b, v23.16b, v3.16b
-	st1		{v20.16b-v23.16b}, [x1]
+.Lt192:	cbz		x5, 1f				// exactly 128 bytes?
+	ld1		{v28.16b-v31.16b}, [x10]
+	add		x5, x5, x1
+	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b
+	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b
+	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b
+	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0:	eor		v20.16b, v20.16b, v28.16b
+	eor		v21.16b, v21.16b, v29.16b
+	eor		v22.16b, v22.16b, v30.16b
+	eor		v23.16b, v23.16b, v31.16b
+	st1		{v20.16b-v23.16b}, [x5]		// overlapping stores
+1:	st1		{v16.16b-v19.16b}, [x1]
 	b		.Lout
 
+	// fewer than 128 bytes of in/output
+.Lt128:	ld1		{v28.16b-v31.16b}, [x10]
+	add		x5, x5, x1
+	sub		x1, x1, #64
+	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b
+	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b
+	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b
+	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b
+	ld1		{v16.16b-v19.16b}, [x1]		// reload first output block
+	b		0b
+
 	// fewer than 256 bytes of in/output
-2:	ld1		{v4.16b}, [x10]
-	ld1		{v5.16b}, [x11]
-	movi		v6.16b, #16
-	add		x1, x1, x7
+.Lt256:	cbz		x6, 2f				// exactly 192 bytes?
+	ld1		{v4.16b-v7.16b}, [x10]
+	add		x6, x6, x1
 	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
-	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
-	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
-	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
-	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b
-
-	eor		v24.16b, v24.16b, v0.16b
-	eor		v25.16b, v25.16b, v1.16b
-	eor		v26.16b, v26.16b, v2.16b
-	eor		v27.16b, v27.16b, v3.16b
-	st1		{v24.16b-v27.16b}, [x1]
+	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b
+	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b
+	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b
+
+	eor		v28.16b, v28.16b, v0.16b
+	eor		v29.16b, v29.16b, v1.16b
+	eor		v30.16b, v30.16b, v2.16b
+	eor		v31.16b, v31.16b, v3.16b
+	st1		{v28.16b-v31.16b}, [x6]		// overlapping stores
+2:	st1		{v20.16b-v23.16b}, [x1]
 	b		.Lout
 
 	// fewer than 320 bytes of in/output
-3:	ld1		{v4.16b}, [x10]
-	ld1		{v5.16b}, [x11]
-	movi		v6.16b, #16
-	add		x1, x1, x8
+.Lt320:	cbz		x7, 3f				// exactly 256 bytes?
+	ld1		{v4.16b-v7.16b}, [x10]
+	add		x7, x7, x1
 	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
-	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
-	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
-	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
-	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b
+	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b
+	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b
+	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b
 
 	eor		v28.16b, v28.16b, v0.16b
 	eor		v29.16b, v29.16b, v1.16b
 	eor		v30.16b, v30.16b, v2.16b
 	eor		v31.16b, v31.16b, v3.16b
-	st1		{v28.16b-v31.16b}, [x1]
+	st1		{v28.16b-v31.16b}, [x7]		// overlapping stores
+3:	st1		{v24.16b-v27.16b}, [x1]
 	b		.Lout
 ENDPROC(chacha_4block_xor_neon)
 
@@ -851,7 +796,7 @@ ENDPROC(chacha_4block_xor_neon)
 	.align		L1_CACHE_SHIFT
 .Lpermute:
 	.set		.Li, 0
-	.rept		192
+	.rept		128
 	.byte		(.Li - 64)
 	.set		.Li, .Li + 1
 	.endr