commit c0cb86e1d5
Rather than using the clunky, old, slower wireguard-linux-compat out-of-tree module, this commit does a patch-by-patch backport of upstream's wireguard to 5.4. This specific backport is in widespread use, being part of SUSE's enterprise kernel, Oracle's enterprise kernel, Google's Android kernel, Gentoo's distro kernel, and probably more I've forgotten about. It's definitely the "more proper" way of adding wireguard to a kernel than the ugly compat.h hell of the wireguard-linux-compat repo. And most importantly for OpenWrt, it allows using the same module configuration code for 5.10 as for 5.4, with no need for bifurcation.

These patches are from the backport tree which is maintained in the open here: https://git.zx2c4.com/wireguard-linux/log/?h=backport-5.4.y
I'll be sending PRs to update this as needed.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
(cherry picked from commit 3888fa7880)
(cherry picked from commit d540725871)
(cherry picked from commit 196f3d586f)
(cherry picked from commit 3500fd7938)
(cherry picked from commit 23b801d3ba)
(cherry picked from commit 0c0cb97da7)
(cherry picked from commit 2a27f6f90a)
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
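The first patch in the series, reproduced below, replaces the old SSE2/AVX2 Poly1305 assembly with Andy Polyakov's CRYPTOGAMS implementation and, as its commit message notes, moves all CPU feature checking out of the assembly and into the C glue code. The following is a minimal illustrative sketch of that dispatch pattern, assuming the usual kernel static-key and cpufeature APIs; the function and key names here are placeholders and are not the actual contents of arch/x86/crypto/poly1305_glue.c.

	/*
	 * Illustrative sketch only -- not the real poly1305_glue.c.
	 * Shows the kernel pattern of probing CPU features once in C and
	 * dispatching to the fastest available assembly routine.
	 */
	#include <linux/jump_label.h>
	#include <linux/linkage.h>
	#include <linux/types.h>
	#include <asm/cpufeature.h>

	/* Static key flipped once at init, then effectively branch-free. */
	static DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);

	asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
					       size_t len, u32 padbit);
	asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp,
					     size_t len, u32 padbit);

	static void poly1305_blocks_dispatch(void *ctx, const u8 *inp,
					     size_t len, u32 padbit)
	{
		if (static_branch_likely(&poly1305_use_avx2))
			poly1305_blocks_avx2(ctx, inp, len, padbit);
		else
			poly1305_blocks_x86_64(ctx, inp, len, padbit);
	}

	static int __init poly1305_simd_init(void)
	{
		/*
		 * Feature detection happens here, in C, not in the .S files.
		 * The real glue code also verifies OS XSAVE support before
		 * enabling any AVX path; that check is omitted in this sketch.
		 */
		if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX2))
			static_branch_enable(&poly1305_use_avx2);
		return 0;
	}

Flipping a static key once at init keeps the per-call dispatch off the hot path, which is part of why the generated assembly no longer needs to probe CPUID itself. The backported patch follows.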
| From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 | |
| From: "Jason A. Donenfeld" <Jason@zx2c4.com>
 | |
| Date: Sun, 5 Jan 2020 22:40:48 -0500
 | |
| Subject: [PATCH] crypto: x86/poly1305 - wire up faster implementations for
 | |
|  kernel
 | |
| 
 | |
| commit d7d7b853566254648df59f7ea27ea05952a6cfa8 upstream.
 | |
| 
 | |
| These x86_64 vectorized implementations support AVX, AVX-2, and AVX512F.
 | |
| The AVX-512F implementation is disabled on Skylake, due to throttling,
 | |
| but it is quite fast on >= Cannonlake.
 | |
| 
 | |
| On the left is cycle counts on a Core i7 6700HQ using the AVX-2
 | |
| codepath, comparing this implementation ("new") to the implementation in
 | |
| the current crypto api ("old"). On the right are benchmarks on a Xeon
 | |
| Gold 5120 using the AVX-512 codepath. The new implementation is faster
 | |
| on all benchmarks.
 | |
| 
 | |
|         AVX-2                  AVX-512
 | |
|       ---------              -----------
 | |
| 
 | |
|     size    old     new      size   old     new
 | |
|     ----    ----    ----     ----   ----    ----
 | |
|     0       70      68       0      74      70
 | |
|     16      92      90       16     96      92
 | |
|     32      134     104      32     136     106
 | |
|     48      172     120      48     184     124
 | |
|     64      218     136      64     218     138
 | |
|     80      254     158      80     260     160
 | |
|     96      298     174      96     300     176
 | |
|     112     342     192      112    342     194
 | |
|     128     388     212      128    384     212
 | |
|     144     428     228      144    420     226
 | |
|     160     466     246      160    464     248
 | |
|     176     510     264      176    504     264
 | |
|     192     550     282      192    544     282
 | |
|     208     594     302      208    582     300
 | |
|     224     628     316      224    624     318
 | |
|     240     676     334      240    662     338
 | |
|     256     716     354      256    708     358
 | |
|     272     764     374      272    748     372
 | |
|     288     802     352      288    788     358
 | |
|     304     420     366      304    422     370
 | |
|     320     428     360      320    432     364
 | |
|     336     484     378      336    486     380
 | |
|     352     426     384      352    434     390
 | |
|     368     478     400      368    480     408
 | |
|     384     488     394      384    490     398
 | |
|     400     542     408      400    542     412
 | |
|     416     486     416      416    492     426
 | |
|     432     534     430      432    538     436
 | |
|     448     544     422      448    546     432
 | |
|     464     600     438      464    600     448
 | |
|     480     540     448      480    548     456
 | |
|     496     594     464      496    594     476
 | |
|     512     602     456      512    606     470
 | |
|     528     656     476      528    656     480
 | |
|     544     600     480      544    606     498
 | |
|     560     650     494      560    652     512
 | |
|     576     664     490      576    662     508
 | |
|     592     714     508      592    716     522
 | |
|     608     656     514      608    664     538
 | |
|     624     708     532      624    710     552
 | |
|     640     716     524      640    720     516
 | |
|     656     770     536      656    772     526
 | |
|     672     716     548      672    722     544
 | |
|     688     770     562      688    768     556
 | |
|     704     774     552      704    778     556
 | |
|     720     826     568      720    832     568
 | |
|     736     768     574      736    780     584
 | |
|     752     822     592      752    826     600
 | |
|     768     830     584      768    836     560
 | |
|     784     884     602      784    888     572
 | |
|     800     828     610      800    838     588
 | |
|     816     884     628      816    884     604
 | |
|     832     888     618      832    894     598
 | |
|     848     942     632      848    946     612
 | |
|     864     884     644      864    896     628
 | |
|     880     936     660      880    942     644
 | |
|     896     948     652      896    952     608
 | |
|     912     1000    664      912    1004    616
 | |
|     928     942     676      928    954     634
 | |
|     944     994     690      944    1000    646
 | |
|     960     1002    680      960    1008    646
 | |
|     976     1054    694      976    1062    658
 | |
|     992     1002    706      992    1012    674
 | |
|     1008    1052    720      1008   1058    690
 | |
| 
 | |
| This commit wires in the prior implementation from Andy, and makes the
 | |
| following changes to be suitable for kernel land.
 | |
| 
 | |
|   - Some cosmetic and structural changes, like renaming labels to
 | |
|     .Lname, constants, and other Linux conventions, as well as making
 | |
|     the code easy for us to maintain moving forward.
 | |
| 
 | |
|   - CPU feature checking is done in C by the glue code.
 | |
| 
 | |
|   - We avoid jumping into the middle of functions, to appease objtool,
 | |
|     and instead parameterize shared code.
 | |
| 
 | |
|   - We maintain frame pointers so that stack traces make sense.
 | |
| 
 | |
|   - We remove the dependency on the perl xlate code, which transforms
 | |
|     the output into things that assemblers we don't care about use.
 | |
| 
 | |
| Importantly, none of our changes affect the arithmetic or core code, but
 | |
| just involve the differing environment of kernel space.
 | |
| 
 | |
| Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
 | |
| Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
 | |
| Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
 | |
| Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
 | |
| Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
 | |
| ---
 | |
|  arch/x86/crypto/.gitignore                    |   1 +
 | |
|  arch/x86/crypto/Makefile                      |  11 +-
 | |
|  arch/x86/crypto/poly1305-avx2-x86_64.S        | 390 ----------
 | |
|  arch/x86/crypto/poly1305-sse2-x86_64.S        | 590 ---------------
 | |
|  arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 682 ++++++++++--------
 | |
|  arch/x86/crypto/poly1305_glue.c               | 473 +++++-------
 | |
|  lib/crypto/Kconfig                            |   2 +-
 | |
|  7 files changed, 572 insertions(+), 1577 deletions(-)
 | |
|  create mode 100644 arch/x86/crypto/.gitignore
 | |
|  delete mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S
 | |
|  delete mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S
 | |
| 
 | |
| --- /dev/null
 | |
| +++ b/arch/x86/crypto/.gitignore
 | |
| @@ -0,0 +1 @@
 | |
| +poly1305-x86_64.S
 | |
| --- a/arch/x86/crypto/Makefile
 | |
| +++ b/arch/x86/crypto/Makefile
 | |
| @@ -73,6 +73,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o
 | |
|  
 | |
|  nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
 | |
|  blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
 | |
| +poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
 | |
| +ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
 | |
| +targets += poly1305-x86_64-cryptogams.S
 | |
| +endif
 | |
|  
 | |
|  ifeq ($(avx_supported),yes)
 | |
|  	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
 | |
| @@ -101,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni
 | |
|  aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
 | |
|  ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 | |
|  sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 | |
| -poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
 | |
|  ifeq ($(avx2_supported),yes)
 | |
|  sha1-ssse3-y += sha1_avx2_x86_64_asm.o
 | |
| -poly1305-x86_64-y += poly1305-avx2-x86_64.o
 | |
|  endif
 | |
|  ifeq ($(sha1_ni_supported),yes)
 | |
|  sha1-ssse3-y += sha1_ni_asm.o
 | |
| @@ -118,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o
 | |
|  endif
 | |
|  sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
 | |
|  crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
 | |
| +
 | |
| +quiet_cmd_perlasm = PERLASM $@
 | |
| +      cmd_perlasm = $(PERL) $< > $@
 | |
| +$(obj)/%.S: $(src)/%.pl FORCE
 | |
| +	$(call if_changed,perlasm)
 | |
| --- a/arch/x86/crypto/poly1305-avx2-x86_64.S
 | |
| +++ /dev/null
 | |
| @@ -1,390 +0,0 @@
 | |
| -/* SPDX-License-Identifier: GPL-2.0-or-later */
 | |
| -/*
 | |
| - * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
 | |
| - *
 | |
| - * Copyright (C) 2015 Martin Willi
 | |
| - */
 | |
| -
 | |
| -#include <linux/linkage.h>
 | |
| -
 | |
| -.section	.rodata.cst32.ANMASK, "aM", @progbits, 32
 | |
| -.align 32
 | |
| -ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
 | |
| -	.octa 0x0000000003ffffff0000000003ffffff
 | |
| -
 | |
| -.section	.rodata.cst32.ORMASK, "aM", @progbits, 32
 | |
| -.align 32
 | |
| -ORMASK:	.octa 0x00000000010000000000000001000000
 | |
| -	.octa 0x00000000010000000000000001000000
 | |
| -
 | |
| -.text
 | |
| -
 | |
| -#define h0 0x00(%rdi)
 | |
| -#define h1 0x04(%rdi)
 | |
| -#define h2 0x08(%rdi)
 | |
| -#define h3 0x0c(%rdi)
 | |
| -#define h4 0x10(%rdi)
 | |
| -#define r0 0x00(%rdx)
 | |
| -#define r1 0x04(%rdx)
 | |
| -#define r2 0x08(%rdx)
 | |
| -#define r3 0x0c(%rdx)
 | |
| -#define r4 0x10(%rdx)
 | |
| -#define u0 0x00(%r8)
 | |
| -#define u1 0x04(%r8)
 | |
| -#define u2 0x08(%r8)
 | |
| -#define u3 0x0c(%r8)
 | |
| -#define u4 0x10(%r8)
 | |
| -#define w0 0x18(%r8)
 | |
| -#define w1 0x1c(%r8)
 | |
| -#define w2 0x20(%r8)
 | |
| -#define w3 0x24(%r8)
 | |
| -#define w4 0x28(%r8)
 | |
| -#define y0 0x30(%r8)
 | |
| -#define y1 0x34(%r8)
 | |
| -#define y2 0x38(%r8)
 | |
| -#define y3 0x3c(%r8)
 | |
| -#define y4 0x40(%r8)
 | |
| -#define m %rsi
 | |
| -#define hc0 %ymm0
 | |
| -#define hc1 %ymm1
 | |
| -#define hc2 %ymm2
 | |
| -#define hc3 %ymm3
 | |
| -#define hc4 %ymm4
 | |
| -#define hc0x %xmm0
 | |
| -#define hc1x %xmm1
 | |
| -#define hc2x %xmm2
 | |
| -#define hc3x %xmm3
 | |
| -#define hc4x %xmm4
 | |
| -#define t1 %ymm5
 | |
| -#define t2 %ymm6
 | |
| -#define t1x %xmm5
 | |
| -#define t2x %xmm6
 | |
| -#define ruwy0 %ymm7
 | |
| -#define ruwy1 %ymm8
 | |
| -#define ruwy2 %ymm9
 | |
| -#define ruwy3 %ymm10
 | |
| -#define ruwy4 %ymm11
 | |
| -#define ruwy0x %xmm7
 | |
| -#define ruwy1x %xmm8
 | |
| -#define ruwy2x %xmm9
 | |
| -#define ruwy3x %xmm10
 | |
| -#define ruwy4x %xmm11
 | |
| -#define svxz1 %ymm12
 | |
| -#define svxz2 %ymm13
 | |
| -#define svxz3 %ymm14
 | |
| -#define svxz4 %ymm15
 | |
| -#define d0 %r9
 | |
| -#define d1 %r10
 | |
| -#define d2 %r11
 | |
| -#define d3 %r12
 | |
| -#define d4 %r13
 | |
| -
 | |
| -ENTRY(poly1305_4block_avx2)
 | |
| -	# %rdi: Accumulator h[5]
 | |
| -	# %rsi: 64 byte input block m
 | |
| -	# %rdx: Poly1305 key r[5]
 | |
| -	# %rcx: Quadblock count
 | |
| -	# %r8:  Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
 | |
| -
 | |
| -	# This four-block variant uses loop unrolled block processing. It
 | |
| -	# requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
 | |
| -	# h = (h + m) * r  =>  h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
 | |
| -
 | |
| -	vzeroupper
 | |
| -	push		%rbx
 | |
| -	push		%r12
 | |
| -	push		%r13
 | |
| -
 | |
| -	# combine r0,u0,w0,y0
 | |
| -	vmovd		y0,ruwy0x
 | |
| -	vmovd		w0,t1x
 | |
| -	vpunpcklqdq	t1,ruwy0,ruwy0
 | |
| -	vmovd		u0,t1x
 | |
| -	vmovd		r0,t2x
 | |
| -	vpunpcklqdq	t2,t1,t1
 | |
| -	vperm2i128	$0x20,t1,ruwy0,ruwy0
 | |
| -
 | |
| -	# combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
 | |
| -	vmovd		y1,ruwy1x
 | |
| -	vmovd		w1,t1x
 | |
| -	vpunpcklqdq	t1,ruwy1,ruwy1
 | |
| -	vmovd		u1,t1x
 | |
| -	vmovd		r1,t2x
 | |
| -	vpunpcklqdq	t2,t1,t1
 | |
| -	vperm2i128	$0x20,t1,ruwy1,ruwy1
 | |
| -	vpslld		$2,ruwy1,svxz1
 | |
| -	vpaddd		ruwy1,svxz1,svxz1
 | |
| -
 | |
| -	# combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
 | |
| -	vmovd		y2,ruwy2x
 | |
| -	vmovd		w2,t1x
 | |
| -	vpunpcklqdq	t1,ruwy2,ruwy2
 | |
| -	vmovd		u2,t1x
 | |
| -	vmovd		r2,t2x
 | |
| -	vpunpcklqdq	t2,t1,t1
 | |
| -	vperm2i128	$0x20,t1,ruwy2,ruwy2
 | |
| -	vpslld		$2,ruwy2,svxz2
 | |
| -	vpaddd		ruwy2,svxz2,svxz2
 | |
| -
 | |
| -	# combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
 | |
| -	vmovd		y3,ruwy3x
 | |
| -	vmovd		w3,t1x
 | |
| -	vpunpcklqdq	t1,ruwy3,ruwy3
 | |
| -	vmovd		u3,t1x
 | |
| -	vmovd		r3,t2x
 | |
| -	vpunpcklqdq	t2,t1,t1
 | |
| -	vperm2i128	$0x20,t1,ruwy3,ruwy3
 | |
| -	vpslld		$2,ruwy3,svxz3
 | |
| -	vpaddd		ruwy3,svxz3,svxz3
 | |
| -
 | |
| -	# combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
 | |
| -	vmovd		y4,ruwy4x
 | |
| -	vmovd		w4,t1x
 | |
| -	vpunpcklqdq	t1,ruwy4,ruwy4
 | |
| -	vmovd		u4,t1x
 | |
| -	vmovd		r4,t2x
 | |
| -	vpunpcklqdq	t2,t1,t1
 | |
| -	vperm2i128	$0x20,t1,ruwy4,ruwy4
 | |
| -	vpslld		$2,ruwy4,svxz4
 | |
| -	vpaddd		ruwy4,svxz4,svxz4
 | |
| -
 | |
| -.Ldoblock4:
 | |
| -	# hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
 | |
| -	#	 m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
 | |
| -	vmovd		0x00(m),hc0x
 | |
| -	vmovd		0x10(m),t1x
 | |
| -	vpunpcklqdq	t1,hc0,hc0
 | |
| -	vmovd		0x20(m),t1x
 | |
| -	vmovd		0x30(m),t2x
 | |
| -	vpunpcklqdq	t2,t1,t1
 | |
| -	vperm2i128	$0x20,t1,hc0,hc0
 | |
| -	vpand		ANMASK(%rip),hc0,hc0
 | |
| -	vmovd		h0,t1x
 | |
| -	vpaddd		t1,hc0,hc0
 | |
| -	# hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
 | |
| -	#	 (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
 | |
| -	vmovd		0x03(m),hc1x
 | |
| -	vmovd		0x13(m),t1x
 | |
| -	vpunpcklqdq	t1,hc1,hc1
 | |
| -	vmovd		0x23(m),t1x
 | |
| -	vmovd		0x33(m),t2x
 | |
| -	vpunpcklqdq	t2,t1,t1
 | |
| -	vperm2i128	$0x20,t1,hc1,hc1
 | |
| -	vpsrld		$2,hc1,hc1
 | |
| -	vpand		ANMASK(%rip),hc1,hc1
 | |
| -	vmovd		h1,t1x
 | |
| -	vpaddd		t1,hc1,hc1
 | |
| -	# hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
 | |
| -	#	 (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
 | |
| -	vmovd		0x06(m),hc2x
 | |
| -	vmovd		0x16(m),t1x
 | |
| -	vpunpcklqdq	t1,hc2,hc2
 | |
| -	vmovd		0x26(m),t1x
 | |
| -	vmovd		0x36(m),t2x
 | |
| -	vpunpcklqdq	t2,t1,t1
 | |
| -	vperm2i128	$0x20,t1,hc2,hc2
 | |
| -	vpsrld		$4,hc2,hc2
 | |
| -	vpand		ANMASK(%rip),hc2,hc2
 | |
| -	vmovd		h2,t1x
 | |
| -	vpaddd		t1,hc2,hc2
 | |
| -	# hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
 | |
| -	#	 (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
 | |
| -	vmovd		0x09(m),hc3x
 | |
| -	vmovd		0x19(m),t1x
 | |
| -	vpunpcklqdq	t1,hc3,hc3
 | |
| -	vmovd		0x29(m),t1x
 | |
| -	vmovd		0x39(m),t2x
 | |
| -	vpunpcklqdq	t2,t1,t1
 | |
| -	vperm2i128	$0x20,t1,hc3,hc3
 | |
| -	vpsrld		$6,hc3,hc3
 | |
| -	vpand		ANMASK(%rip),hc3,hc3
 | |
| -	vmovd		h3,t1x
 | |
| -	vpaddd		t1,hc3,hc3
 | |
| -	# hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
 | |
| -	#	 (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
 | |
| -	vmovd		0x0c(m),hc4x
 | |
| -	vmovd		0x1c(m),t1x
 | |
| -	vpunpcklqdq	t1,hc4,hc4
 | |
| -	vmovd		0x2c(m),t1x
 | |
| -	vmovd		0x3c(m),t2x
 | |
| -	vpunpcklqdq	t2,t1,t1
 | |
| -	vperm2i128	$0x20,t1,hc4,hc4
 | |
| -	vpsrld		$8,hc4,hc4
 | |
| -	vpor		ORMASK(%rip),hc4,hc4
 | |
| -	vmovd		h4,t1x
 | |
| -	vpaddd		t1,hc4,hc4
 | |
| -
 | |
| -	# t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
 | |
| -	vpmuludq	hc0,ruwy0,t1
 | |
| -	# t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
 | |
| -	vpmuludq	hc1,svxz4,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
 | |
| -	vpmuludq	hc2,svxz3,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
 | |
| -	vpmuludq	hc3,svxz2,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
 | |
| -	vpmuludq	hc4,svxz1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# d0 = t1[0] + t1[1] + t[2] + t[3]
 | |
| -	vpermq		$0xee,t1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	vpsrldq		$8,t1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	vmovq		t1x,d0
 | |
| -
 | |
| -	# t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
 | |
| -	vpmuludq	hc0,ruwy1,t1
 | |
| -	# t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
 | |
| -	vpmuludq	hc1,ruwy0,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
 | |
| -	vpmuludq	hc2,svxz4,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
 | |
| -	vpmuludq	hc3,svxz3,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
 | |
| -	vpmuludq	hc4,svxz2,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# d1 = t1[0] + t1[1] + t1[3] + t1[4]
 | |
| -	vpermq		$0xee,t1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	vpsrldq		$8,t1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	vmovq		t1x,d1
 | |
| -
 | |
| -	# t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
 | |
| -	vpmuludq	hc0,ruwy2,t1
 | |
| -	# t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
 | |
| -	vpmuludq	hc1,ruwy1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
 | |
| -	vpmuludq	hc2,ruwy0,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
 | |
| -	vpmuludq	hc3,svxz4,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
 | |
| -	vpmuludq	hc4,svxz3,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# d2 = t1[0] + t1[1] + t1[2] + t1[3]
 | |
| -	vpermq		$0xee,t1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	vpsrldq		$8,t1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	vmovq		t1x,d2
 | |
| -
 | |
| -	# t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
 | |
| -	vpmuludq	hc0,ruwy3,t1
 | |
| -	# t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
 | |
| -	vpmuludq	hc1,ruwy2,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
 | |
| -	vpmuludq	hc2,ruwy1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
 | |
| -	vpmuludq	hc3,ruwy0,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
 | |
| -	vpmuludq	hc4,svxz4,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# d3 = t1[0] + t1[1] + t1[2] + t1[3]
 | |
| -	vpermq		$0xee,t1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	vpsrldq		$8,t1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	vmovq		t1x,d3
 | |
| -
 | |
| -	# t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
 | |
| -	vpmuludq	hc0,ruwy4,t1
 | |
| -	# t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
 | |
| -	vpmuludq	hc1,ruwy3,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
 | |
| -	vpmuludq	hc2,ruwy2,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
 | |
| -	vpmuludq	hc3,ruwy1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
 | |
| -	vpmuludq	hc4,ruwy0,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	# d4 = t1[0] + t1[1] + t1[2] + t1[3]
 | |
| -	vpermq		$0xee,t1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	vpsrldq		$8,t1,t2
 | |
| -	vpaddq		t2,t1,t1
 | |
| -	vmovq		t1x,d4
 | |
| -
 | |
| -	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
 | |
| -	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
 | |
| -	# amount.  Careful: we must not assume the carry bits 'd0 >> 26',
 | |
| -	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
 | |
| -	# integers.  It's true in a single-block implementation, but not here.
 | |
| -
 | |
| -	# d1 += d0 >> 26
 | |
| -	mov		d0,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%rax,d1
 | |
| -	# h0 = d0 & 0x3ffffff
 | |
| -	mov		d0,%rbx
 | |
| -	and		$0x3ffffff,%ebx
 | |
| -
 | |
| -	# d2 += d1 >> 26
 | |
| -	mov		d1,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%rax,d2
 | |
| -	# h1 = d1 & 0x3ffffff
 | |
| -	mov		d1,%rax
 | |
| -	and		$0x3ffffff,%eax
 | |
| -	mov		%eax,h1
 | |
| -
 | |
| -	# d3 += d2 >> 26
 | |
| -	mov		d2,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%rax,d3
 | |
| -	# h2 = d2 & 0x3ffffff
 | |
| -	mov		d2,%rax
 | |
| -	and		$0x3ffffff,%eax
 | |
| -	mov		%eax,h2
 | |
| -
 | |
| -	# d4 += d3 >> 26
 | |
| -	mov		d3,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%rax,d4
 | |
| -	# h3 = d3 & 0x3ffffff
 | |
| -	mov		d3,%rax
 | |
| -	and		$0x3ffffff,%eax
 | |
| -	mov		%eax,h3
 | |
| -
 | |
| -	# h0 += (d4 >> 26) * 5
 | |
| -	mov		d4,%rax
 | |
| -	shr		$26,%rax
 | |
| -	lea		(%rax,%rax,4),%rax
 | |
| -	add		%rax,%rbx
 | |
| -	# h4 = d4 & 0x3ffffff
 | |
| -	mov		d4,%rax
 | |
| -	and		$0x3ffffff,%eax
 | |
| -	mov		%eax,h4
 | |
| -
 | |
| -	# h1 += h0 >> 26
 | |
| -	mov		%rbx,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%eax,h1
 | |
| -	# h0 = h0 & 0x3ffffff
 | |
| -	andl		$0x3ffffff,%ebx
 | |
| -	mov		%ebx,h0
 | |
| -
 | |
| -	add		$0x40,m
 | |
| -	dec		%rcx
 | |
| -	jnz		.Ldoblock4
 | |
| -
 | |
| -	vzeroupper
 | |
| -	pop		%r13
 | |
| -	pop		%r12
 | |
| -	pop		%rbx
 | |
| -	ret
 | |
| -ENDPROC(poly1305_4block_avx2)
 | |
| --- a/arch/x86/crypto/poly1305-sse2-x86_64.S
 | |
| +++ /dev/null
 | |
| @@ -1,590 +0,0 @@
 | |
| -/* SPDX-License-Identifier: GPL-2.0-or-later */
 | |
| -/*
 | |
| - * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
 | |
| - *
 | |
| - * Copyright (C) 2015 Martin Willi
 | |
| - */
 | |
| -
 | |
| -#include <linux/linkage.h>
 | |
| -
 | |
| -.section	.rodata.cst16.ANMASK, "aM", @progbits, 16
 | |
| -.align 16
 | |
| -ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
 | |
| -
 | |
| -.section	.rodata.cst16.ORMASK, "aM", @progbits, 16
 | |
| -.align 16
 | |
| -ORMASK:	.octa 0x00000000010000000000000001000000
 | |
| -
 | |
| -.text
 | |
| -
 | |
| -#define h0 0x00(%rdi)
 | |
| -#define h1 0x04(%rdi)
 | |
| -#define h2 0x08(%rdi)
 | |
| -#define h3 0x0c(%rdi)
 | |
| -#define h4 0x10(%rdi)
 | |
| -#define r0 0x00(%rdx)
 | |
| -#define r1 0x04(%rdx)
 | |
| -#define r2 0x08(%rdx)
 | |
| -#define r3 0x0c(%rdx)
 | |
| -#define r4 0x10(%rdx)
 | |
| -#define s1 0x00(%rsp)
 | |
| -#define s2 0x04(%rsp)
 | |
| -#define s3 0x08(%rsp)
 | |
| -#define s4 0x0c(%rsp)
 | |
| -#define m %rsi
 | |
| -#define h01 %xmm0
 | |
| -#define h23 %xmm1
 | |
| -#define h44 %xmm2
 | |
| -#define t1 %xmm3
 | |
| -#define t2 %xmm4
 | |
| -#define t3 %xmm5
 | |
| -#define t4 %xmm6
 | |
| -#define mask %xmm7
 | |
| -#define d0 %r8
 | |
| -#define d1 %r9
 | |
| -#define d2 %r10
 | |
| -#define d3 %r11
 | |
| -#define d4 %r12
 | |
| -
 | |
| -ENTRY(poly1305_block_sse2)
 | |
| -	# %rdi: Accumulator h[5]
 | |
| -	# %rsi: 16 byte input block m
 | |
| -	# %rdx: Poly1305 key r[5]
 | |
| -	# %rcx: Block count
 | |
| -
 | |
| -	# This single block variant tries to improve performance by doing two
 | |
| -	# multiplications in parallel using SSE instructions. There is quite
 | |
| -	# some quardword packing involved, hence the speedup is marginal.
 | |
| -
 | |
| -	push		%rbx
 | |
| -	push		%r12
 | |
| -	sub		$0x10,%rsp
 | |
| -
 | |
| -	# s1..s4 = r1..r4 * 5
 | |
| -	mov		r1,%eax
 | |
| -	lea		(%eax,%eax,4),%eax
 | |
| -	mov		%eax,s1
 | |
| -	mov		r2,%eax
 | |
| -	lea		(%eax,%eax,4),%eax
 | |
| -	mov		%eax,s2
 | |
| -	mov		r3,%eax
 | |
| -	lea		(%eax,%eax,4),%eax
 | |
| -	mov		%eax,s3
 | |
| -	mov		r4,%eax
 | |
| -	lea		(%eax,%eax,4),%eax
 | |
| -	mov		%eax,s4
 | |
| -
 | |
| -	movdqa		ANMASK(%rip),mask
 | |
| -
 | |
| -.Ldoblock:
 | |
| -	# h01 = [0, h1, 0, h0]
 | |
| -	# h23 = [0, h3, 0, h2]
 | |
| -	# h44 = [0, h4, 0, h4]
 | |
| -	movd		h0,h01
 | |
| -	movd		h1,t1
 | |
| -	movd		h2,h23
 | |
| -	movd		h3,t2
 | |
| -	movd		h4,h44
 | |
| -	punpcklqdq	t1,h01
 | |
| -	punpcklqdq	t2,h23
 | |
| -	punpcklqdq	h44,h44
 | |
| -
 | |
| -	# h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
 | |
| -	movd		0x00(m),t1
 | |
| -	movd		0x03(m),t2
 | |
| -	psrld		$2,t2
 | |
| -	punpcklqdq	t2,t1
 | |
| -	pand		mask,t1
 | |
| -	paddd		t1,h01
 | |
| -	# h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
 | |
| -	movd		0x06(m),t1
 | |
| -	movd		0x09(m),t2
 | |
| -	psrld		$4,t1
 | |
| -	psrld		$6,t2
 | |
| -	punpcklqdq	t2,t1
 | |
| -	pand		mask,t1
 | |
| -	paddd		t1,h23
 | |
| -	# h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
 | |
| -	mov		0x0c(m),%eax
 | |
| -	shr		$8,%eax
 | |
| -	or		$0x01000000,%eax
 | |
| -	movd		%eax,t1
 | |
| -	pshufd		$0xc4,t1,t1
 | |
| -	paddd		t1,h44
 | |
| -
 | |
| -	# t1[0] = h0 * r0 + h2 * s3
 | |
| -	# t1[1] = h1 * s4 + h3 * s2
 | |
| -	movd		r0,t1
 | |
| -	movd		s4,t2
 | |
| -	punpcklqdq	t2,t1
 | |
| -	pmuludq		h01,t1
 | |
| -	movd		s3,t2
 | |
| -	movd		s2,t3
 | |
| -	punpcklqdq	t3,t2
 | |
| -	pmuludq		h23,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t2[0] = h0 * r1 + h2 * s4
 | |
| -	# t2[1] = h1 * r0 + h3 * s3
 | |
| -	movd		r1,t2
 | |
| -	movd		r0,t3
 | |
| -	punpcklqdq	t3,t2
 | |
| -	pmuludq		h01,t2
 | |
| -	movd		s4,t3
 | |
| -	movd		s3,t4
 | |
| -	punpcklqdq	t4,t3
 | |
| -	pmuludq		h23,t3
 | |
| -	paddq		t3,t2
 | |
| -	# t3[0] = h4 * s1
 | |
| -	# t3[1] = h4 * s2
 | |
| -	movd		s1,t3
 | |
| -	movd		s2,t4
 | |
| -	punpcklqdq	t4,t3
 | |
| -	pmuludq		h44,t3
 | |
| -	# d0 = t1[0] + t1[1] + t3[0]
 | |
| -	# d1 = t2[0] + t2[1] + t3[1]
 | |
| -	movdqa		t1,t4
 | |
| -	punpcklqdq	t2,t4
 | |
| -	punpckhqdq	t2,t1
 | |
| -	paddq		t4,t1
 | |
| -	paddq		t3,t1
 | |
| -	movq		t1,d0
 | |
| -	psrldq		$8,t1
 | |
| -	movq		t1,d1
 | |
| -
 | |
| -	# t1[0] = h0 * r2 + h2 * r0
 | |
| -	# t1[1] = h1 * r1 + h3 * s4
 | |
| -	movd		r2,t1
 | |
| -	movd		r1,t2
 | |
| -	punpcklqdq 	t2,t1
 | |
| -	pmuludq		h01,t1
 | |
| -	movd		r0,t2
 | |
| -	movd		s4,t3
 | |
| -	punpcklqdq	t3,t2
 | |
| -	pmuludq		h23,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t2[0] = h0 * r3 + h2 * r1
 | |
| -	# t2[1] = h1 * r2 + h3 * r0
 | |
| -	movd		r3,t2
 | |
| -	movd		r2,t3
 | |
| -	punpcklqdq	t3,t2
 | |
| -	pmuludq		h01,t2
 | |
| -	movd		r1,t3
 | |
| -	movd		r0,t4
 | |
| -	punpcklqdq	t4,t3
 | |
| -	pmuludq		h23,t3
 | |
| -	paddq		t3,t2
 | |
| -	# t3[0] = h4 * s3
 | |
| -	# t3[1] = h4 * s4
 | |
| -	movd		s3,t3
 | |
| -	movd		s4,t4
 | |
| -	punpcklqdq	t4,t3
 | |
| -	pmuludq		h44,t3
 | |
| -	# d2 = t1[0] + t1[1] + t3[0]
 | |
| -	# d3 = t2[0] + t2[1] + t3[1]
 | |
| -	movdqa		t1,t4
 | |
| -	punpcklqdq	t2,t4
 | |
| -	punpckhqdq	t2,t1
 | |
| -	paddq		t4,t1
 | |
| -	paddq		t3,t1
 | |
| -	movq		t1,d2
 | |
| -	psrldq		$8,t1
 | |
| -	movq		t1,d3
 | |
| -
 | |
| -	# t1[0] = h0 * r4 + h2 * r2
 | |
| -	# t1[1] = h1 * r3 + h3 * r1
 | |
| -	movd		r4,t1
 | |
| -	movd		r3,t2
 | |
| -	punpcklqdq	t2,t1
 | |
| -	pmuludq		h01,t1
 | |
| -	movd		r2,t2
 | |
| -	movd		r1,t3
 | |
| -	punpcklqdq	t3,t2
 | |
| -	pmuludq		h23,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t3[0] = h4 * r0
 | |
| -	movd		r0,t3
 | |
| -	pmuludq		h44,t3
 | |
| -	# d4 = t1[0] + t1[1] + t3[0]
 | |
| -	movdqa		t1,t4
 | |
| -	psrldq		$8,t4
 | |
| -	paddq		t4,t1
 | |
| -	paddq		t3,t1
 | |
| -	movq		t1,d4
 | |
| -
 | |
| -	# d1 += d0 >> 26
 | |
| -	mov		d0,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%rax,d1
 | |
| -	# h0 = d0 & 0x3ffffff
 | |
| -	mov		d0,%rbx
 | |
| -	and		$0x3ffffff,%ebx
 | |
| -
 | |
| -	# d2 += d1 >> 26
 | |
| -	mov		d1,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%rax,d2
 | |
| -	# h1 = d1 & 0x3ffffff
 | |
| -	mov		d1,%rax
 | |
| -	and		$0x3ffffff,%eax
 | |
| -	mov		%eax,h1
 | |
| -
 | |
| -	# d3 += d2 >> 26
 | |
| -	mov		d2,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%rax,d3
 | |
| -	# h2 = d2 & 0x3ffffff
 | |
| -	mov		d2,%rax
 | |
| -	and		$0x3ffffff,%eax
 | |
| -	mov		%eax,h2
 | |
| -
 | |
| -	# d4 += d3 >> 26
 | |
| -	mov		d3,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%rax,d4
 | |
| -	# h3 = d3 & 0x3ffffff
 | |
| -	mov		d3,%rax
 | |
| -	and		$0x3ffffff,%eax
 | |
| -	mov		%eax,h3
 | |
| -
 | |
| -	# h0 += (d4 >> 26) * 5
 | |
| -	mov		d4,%rax
 | |
| -	shr		$26,%rax
 | |
| -	lea		(%rax,%rax,4),%rax
 | |
| -	add		%rax,%rbx
 | |
| -	# h4 = d4 & 0x3ffffff
 | |
| -	mov		d4,%rax
 | |
| -	and		$0x3ffffff,%eax
 | |
| -	mov		%eax,h4
 | |
| -
 | |
| -	# h1 += h0 >> 26
 | |
| -	mov		%rbx,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%eax,h1
 | |
| -	# h0 = h0 & 0x3ffffff
 | |
| -	andl		$0x3ffffff,%ebx
 | |
| -	mov		%ebx,h0
 | |
| -
 | |
| -	add		$0x10,m
 | |
| -	dec		%rcx
 | |
| -	jnz		.Ldoblock
 | |
| -
 | |
| -	# Zeroing of key material
 | |
| -	mov		%rcx,0x00(%rsp)
 | |
| -	mov		%rcx,0x08(%rsp)
 | |
| -
 | |
| -	add		$0x10,%rsp
 | |
| -	pop		%r12
 | |
| -	pop		%rbx
 | |
| -	ret
 | |
| -ENDPROC(poly1305_block_sse2)
 | |
| -
 | |
| -
 | |
| -#define u0 0x00(%r8)
 | |
| -#define u1 0x04(%r8)
 | |
| -#define u2 0x08(%r8)
 | |
| -#define u3 0x0c(%r8)
 | |
| -#define u4 0x10(%r8)
 | |
| -#define hc0 %xmm0
 | |
| -#define hc1 %xmm1
 | |
| -#define hc2 %xmm2
 | |
| -#define hc3 %xmm5
 | |
| -#define hc4 %xmm6
 | |
| -#define ru0 %xmm7
 | |
| -#define ru1 %xmm8
 | |
| -#define ru2 %xmm9
 | |
| -#define ru3 %xmm10
 | |
| -#define ru4 %xmm11
 | |
| -#define sv1 %xmm12
 | |
| -#define sv2 %xmm13
 | |
| -#define sv3 %xmm14
 | |
| -#define sv4 %xmm15
 | |
| -#undef d0
 | |
| -#define d0 %r13
 | |
| -
 | |
| -ENTRY(poly1305_2block_sse2)
 | |
| -	# %rdi: Accumulator h[5]
 | |
| -	# %rsi: 16 byte input block m
 | |
| -	# %rdx: Poly1305 key r[5]
 | |
| -	# %rcx: Doubleblock count
 | |
| -	# %r8:  Poly1305 derived key r^2 u[5]
 | |
| -
 | |
| -	# This two-block variant further improves performance by using loop
 | |
| -	# unrolled block processing. This is more straight forward and does
 | |
| -	# less byte shuffling, but requires a second Poly1305 key r^2:
 | |
| -	# h = (h + m) * r    =>    h = (h + m1) * r^2 + m2 * r
 | |
| -
 | |
| -	push		%rbx
 | |
| -	push		%r12
 | |
| -	push		%r13
 | |
| -
 | |
| -	# combine r0,u0
 | |
| -	movd		u0,ru0
 | |
| -	movd		r0,t1
 | |
| -	punpcklqdq	t1,ru0
 | |
| -
 | |
| -	# combine r1,u1 and s1=r1*5,v1=u1*5
 | |
| -	movd		u1,ru1
 | |
| -	movd		r1,t1
 | |
| -	punpcklqdq	t1,ru1
 | |
| -	movdqa		ru1,sv1
 | |
| -	pslld		$2,sv1
 | |
| -	paddd		ru1,sv1
 | |
| -
 | |
| -	# combine r2,u2 and s2=r2*5,v2=u2*5
 | |
| -	movd		u2,ru2
 | |
| -	movd		r2,t1
 | |
| -	punpcklqdq	t1,ru2
 | |
| -	movdqa		ru2,sv2
 | |
| -	pslld		$2,sv2
 | |
| -	paddd		ru2,sv2
 | |
| -
 | |
| -	# combine r3,u3 and s3=r3*5,v3=u3*5
 | |
| -	movd		u3,ru3
 | |
| -	movd		r3,t1
 | |
| -	punpcklqdq	t1,ru3
 | |
| -	movdqa		ru3,sv3
 | |
| -	pslld		$2,sv3
 | |
| -	paddd		ru3,sv3
 | |
| -
 | |
| -	# combine r4,u4 and s4=r4*5,v4=u4*5
 | |
| -	movd		u4,ru4
 | |
| -	movd		r4,t1
 | |
| -	punpcklqdq	t1,ru4
 | |
| -	movdqa		ru4,sv4
 | |
| -	pslld		$2,sv4
 | |
| -	paddd		ru4,sv4
 | |
| -
 | |
| -.Ldoblock2:
 | |
| -	# hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
 | |
| -	movd		0x00(m),hc0
 | |
| -	movd		0x10(m),t1
 | |
| -	punpcklqdq	t1,hc0
 | |
| -	pand		ANMASK(%rip),hc0
 | |
| -	movd		h0,t1
 | |
| -	paddd		t1,hc0
 | |
| -	# hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
 | |
| -	movd		0x03(m),hc1
 | |
| -	movd		0x13(m),t1
 | |
| -	punpcklqdq	t1,hc1
 | |
| -	psrld		$2,hc1
 | |
| -	pand		ANMASK(%rip),hc1
 | |
| -	movd		h1,t1
 | |
| -	paddd		t1,hc1
 | |
| -	# hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
 | |
| -	movd		0x06(m),hc2
 | |
| -	movd		0x16(m),t1
 | |
| -	punpcklqdq	t1,hc2
 | |
| -	psrld		$4,hc2
 | |
| -	pand		ANMASK(%rip),hc2
 | |
| -	movd		h2,t1
 | |
| -	paddd		t1,hc2
 | |
| -	# hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
 | |
| -	movd		0x09(m),hc3
 | |
| -	movd		0x19(m),t1
 | |
| -	punpcklqdq	t1,hc3
 | |
| -	psrld		$6,hc3
 | |
| -	pand		ANMASK(%rip),hc3
 | |
| -	movd		h3,t1
 | |
| -	paddd		t1,hc3
 | |
| -	# hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
 | |
| -	movd		0x0c(m),hc4
 | |
| -	movd		0x1c(m),t1
 | |
| -	punpcklqdq	t1,hc4
 | |
| -	psrld		$8,hc4
 | |
| -	por		ORMASK(%rip),hc4
 | |
| -	movd		h4,t1
 | |
| -	paddd		t1,hc4
 | |
| -
 | |
| -	# t1 = [ hc0[1] * r0, hc0[0] * u0 ]
 | |
| -	movdqa		ru0,t1
 | |
| -	pmuludq		hc0,t1
 | |
| -	# t1 += [ hc1[1] * s4, hc1[0] * v4 ]
 | |
| -	movdqa		sv4,t2
 | |
| -	pmuludq		hc1,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc2[1] * s3, hc2[0] * v3 ]
 | |
| -	movdqa		sv3,t2
 | |
| -	pmuludq		hc2,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc3[1] * s2, hc3[0] * v2 ]
 | |
| -	movdqa		sv2,t2
 | |
| -	pmuludq		hc3,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc4[1] * s1, hc4[0] * v1 ]
 | |
| -	movdqa		sv1,t2
 | |
| -	pmuludq		hc4,t2
 | |
| -	paddq		t2,t1
 | |
| -	# d0 = t1[0] + t1[1]
 | |
| -	movdqa		t1,t2
 | |
| -	psrldq		$8,t2
 | |
| -	paddq		t2,t1
 | |
| -	movq		t1,d0
 | |
| -
 | |
| -	# t1 = [ hc0[1] * r1, hc0[0] * u1 ]
 | |
| -	movdqa		ru1,t1
 | |
| -	pmuludq		hc0,t1
 | |
| -	# t1 += [ hc1[1] * r0, hc1[0] * u0 ]
 | |
| -	movdqa		ru0,t2
 | |
| -	pmuludq		hc1,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc2[1] * s4, hc2[0] * v4 ]
 | |
| -	movdqa		sv4,t2
 | |
| -	pmuludq		hc2,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc3[1] * s3, hc3[0] * v3 ]
 | |
| -	movdqa		sv3,t2
 | |
| -	pmuludq		hc3,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc4[1] * s2, hc4[0] * v2 ]
 | |
| -	movdqa		sv2,t2
 | |
| -	pmuludq		hc4,t2
 | |
| -	paddq		t2,t1
 | |
| -	# d1 = t1[0] + t1[1]
 | |
| -	movdqa		t1,t2
 | |
| -	psrldq		$8,t2
 | |
| -	paddq		t2,t1
 | |
| -	movq		t1,d1
 | |
| -
 | |
| -	# t1 = [ hc0[1] * r2, hc0[0] * u2 ]
 | |
| -	movdqa		ru2,t1
 | |
| -	pmuludq		hc0,t1
 | |
| -	# t1 += [ hc1[1] * r1, hc1[0] * u1 ]
 | |
| -	movdqa		ru1,t2
 | |
| -	pmuludq		hc1,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc2[1] * r0, hc2[0] * u0 ]
 | |
| -	movdqa		ru0,t2
 | |
| -	pmuludq		hc2,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc3[1] * s4, hc3[0] * v4 ]
 | |
| -	movdqa		sv4,t2
 | |
| -	pmuludq		hc3,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc4[1] * s3, hc4[0] * v3 ]
 | |
| -	movdqa		sv3,t2
 | |
| -	pmuludq		hc4,t2
 | |
| -	paddq		t2,t1
 | |
| -	# d2 = t1[0] + t1[1]
 | |
| -	movdqa		t1,t2
 | |
| -	psrldq		$8,t2
 | |
| -	paddq		t2,t1
 | |
| -	movq		t1,d2
 | |
| -
 | |
| -	# t1 = [ hc0[1] * r3, hc0[0] * u3 ]
 | |
| -	movdqa		ru3,t1
 | |
| -	pmuludq		hc0,t1
 | |
| -	# t1 += [ hc1[1] * r2, hc1[0] * u2 ]
 | |
| -	movdqa		ru2,t2
 | |
| -	pmuludq		hc1,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc2[1] * r1, hc2[0] * u1 ]
 | |
| -	movdqa		ru1,t2
 | |
| -	pmuludq		hc2,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc3[1] * r0, hc3[0] * u0 ]
 | |
| -	movdqa		ru0,t2
 | |
| -	pmuludq		hc3,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc4[1] * s4, hc4[0] * v4 ]
 | |
| -	movdqa		sv4,t2
 | |
| -	pmuludq		hc4,t2
 | |
| -	paddq		t2,t1
 | |
| -	# d3 = t1[0] + t1[1]
 | |
| -	movdqa		t1,t2
 | |
| -	psrldq		$8,t2
 | |
| -	paddq		t2,t1
 | |
| -	movq		t1,d3
 | |
| -
 | |
| -	# t1 = [ hc0[1] * r4, hc0[0] * u4 ]
 | |
| -	movdqa		ru4,t1
 | |
| -	pmuludq		hc0,t1
 | |
| -	# t1 += [ hc1[1] * r3, hc1[0] * u3 ]
 | |
| -	movdqa		ru3,t2
 | |
| -	pmuludq		hc1,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc2[1] * r2, hc2[0] * u2 ]
 | |
| -	movdqa		ru2,t2
 | |
| -	pmuludq		hc2,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc3[1] * r1, hc3[0] * u1 ]
 | |
| -	movdqa		ru1,t2
 | |
| -	pmuludq		hc3,t2
 | |
| -	paddq		t2,t1
 | |
| -	# t1 += [ hc4[1] * r0, hc4[0] * u0 ]
 | |
| -	movdqa		ru0,t2
 | |
| -	pmuludq		hc4,t2
 | |
| -	paddq		t2,t1
 | |
| -	# d4 = t1[0] + t1[1]
 | |
| -	movdqa		t1,t2
 | |
| -	psrldq		$8,t2
 | |
| -	paddq		t2,t1
 | |
| -	movq		t1,d4
 | |
| -
 | |
| -	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
 | |
| -	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
 | |
| -	# amount.  Careful: we must not assume the carry bits 'd0 >> 26',
 | |
| -	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
 | |
| -	# integers.  It's true in a single-block implementation, but not here.
 | |
| -
 | |
| -	# d1 += d0 >> 26
 | |
| -	mov		d0,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%rax,d1
 | |
| -	# h0 = d0 & 0x3ffffff
 | |
| -	mov		d0,%rbx
 | |
| -	and		$0x3ffffff,%ebx
 | |
| -
 | |
| -	# d2 += d1 >> 26
 | |
| -	mov		d1,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%rax,d2
 | |
| -	# h1 = d1 & 0x3ffffff
 | |
| -	mov		d1,%rax
 | |
| -	and		$0x3ffffff,%eax
 | |
| -	mov		%eax,h1
 | |
| -
 | |
| -	# d3 += d2 >> 26
 | |
| -	mov		d2,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%rax,d3
 | |
| -	# h2 = d2 & 0x3ffffff
 | |
| -	mov		d2,%rax
 | |
| -	and		$0x3ffffff,%eax
 | |
| -	mov		%eax,h2
 | |
| -
 | |
| -	# d4 += d3 >> 26
 | |
| -	mov		d3,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%rax,d4
 | |
| -	# h3 = d3 & 0x3ffffff
 | |
| -	mov		d3,%rax
 | |
| -	and		$0x3ffffff,%eax
 | |
| -	mov		%eax,h3
 | |
| -
 | |
| -	# h0 += (d4 >> 26) * 5
 | |
| -	mov		d4,%rax
 | |
| -	shr		$26,%rax
 | |
| -	lea		(%rax,%rax,4),%rax
 | |
| -	add		%rax,%rbx
 | |
| -	# h4 = d4 & 0x3ffffff
 | |
| -	mov		d4,%rax
 | |
| -	and		$0x3ffffff,%eax
 | |
| -	mov		%eax,h4
 | |
| -
 | |
| -	# h1 += h0 >> 26
 | |
| -	mov		%rbx,%rax
 | |
| -	shr		$26,%rax
 | |
| -	add		%eax,h1
 | |
| -	# h0 = h0 & 0x3ffffff
 | |
| -	andl		$0x3ffffff,%ebx
 | |
| -	mov		%ebx,h0
 | |
| -
 | |
| -	add		$0x20,m
 | |
| -	dec		%rcx
 | |
| -	jnz		.Ldoblock2
 | |
| -
 | |
| -	pop		%r13
 | |
| -	pop		%r12
 | |
| -	pop		%rbx
 | |
| -	ret
 | |
| -ENDPROC(poly1305_2block_sse2)
 | |
| --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
 | |
| +++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
 | |
| @@ -1,11 +1,14 @@
 | |
| -#! /usr/bin/env perl
 | |
| -# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
 | |
| +#!/usr/bin/env perl
 | |
| +# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 | |
|  #
 | |
| -# Licensed under the OpenSSL license (the "License").  You may not use
 | |
| -# this file except in compliance with the License.  You can obtain a copy
 | |
| -# in the file LICENSE in the source distribution or at
 | |
| -# https://www.openssl.org/source/license.html
 | |
| -
 | |
| +# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 | |
| +# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 | |
| +# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
 | |
| +#
 | |
| +# This code is taken from the OpenSSL project but the author, Andy Polyakov,
 | |
| +# has relicensed it under the licenses specified in the SPDX header above.
 | |
| +# The original headers, including the original license headers, are
 | |
| +# included below for completeness.
 | |
|  #
 | |
|  # ====================================================================
 | |
|  # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 | |
| @@ -32,7 +35,7 @@
 | |
|  # Skylake-X system performance. Since we are likely to suppress
 | |
|  # AVX512F capability flag [at least on Skylake-X], conversion serves
 | |
|  # as kind of "investment protection". Note that next *lake processor,
 | |
| -# Cannolake, has AVX512IFMA code path to execute...
 | |
| +# Cannonlake, has AVX512IFMA code path to execute...
 | |
|  #
 | |
|  # Numbers are cycles per processed byte with poly1305_blocks alone,
 | |
|  # measured with rdtsc at fixed clock frequency.
 | |
| @@ -68,39 +71,114 @@ $output  = shift;
 | |
|  if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
 | |
|  
 | |
|  $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 | |
| +$kernel=0; $kernel=1 if (!$flavour && !$output);
 | |
|  
 | |
| -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 | |
| -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
 | |
| -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 | |
| -die "can't locate x86_64-xlate.pl";
 | |
| -
 | |
| -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
 | |
| -		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
 | |
| -	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
 | |
| +if (!$kernel) {
 | |
| +	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 | |
| +	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
 | |
| +	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 | |
| +	die "can't locate x86_64-xlate.pl";
 | |
| +
 | |
| +	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 | |
| +	*STDOUT=*OUT;
 | |
| +
 | |
| +	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
 | |
| +	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
 | |
| +		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
 | |
| +	}
 | |
| +
 | |
| +	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
 | |
| +	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
 | |
| +		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
 | |
| +		$avx += 1 if ($1==2.11 && $2>=8);
 | |
| +	}
 | |
| +
 | |
| +	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 | |
| +	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
 | |
| +		$avx = ($1>=10) + ($1>=11);
 | |
| +	}
 | |
| +
 | |
| +	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
 | |
| +		$avx = ($2>=3.0) + ($2>3.0);
 | |
| +	}
 | |
| +} else {
 | |
| +	$avx = 4; # The kernel uses ifdefs for this.
 | |
|  }
 | |
|  
 | |
| -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
 | |
| -	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
 | |
| -	$avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
 | |
| -	$avx += 2 if ($1==2.11 && $2>=8);
 | |
| +sub declare_function() {
 | |
| +	my ($name, $align, $nargs) = @_;
 | |
| +	if($kernel) {
 | |
| +		$code .= ".align $align\n";
 | |
| +		$code .= "ENTRY($name)\n";
 | |
| +		$code .= ".L$name:\n";
 | |
| +	} else {
 | |
| +		$code .= ".globl	$name\n";
 | |
| +		$code .= ".type	$name,\@function,$nargs\n";
 | |
| +		$code .= ".align	$align\n";
 | |
| +		$code .= "$name:\n";
 | |
| +	}
 | |
|  }
 | |
|  
 | |
| -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 | |
| -	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
 | |
| -	$avx = ($1>=10) + ($1>=12);
 | |
| +sub end_function() {
 | |
| +	my ($name) = @_;
 | |
| +	if($kernel) {
 | |
| +		$code .= "ENDPROC($name)\n";
 | |
| +	} else {
 | |
| +		$code .= ".size   $name,.-$name\n";
 | |
| +	}
 | |
|  }
 | |
|  
 | |
| -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
 | |
| -	$avx = ($2>=3.0) + ($2>3.0);
 | |
| -}
 | |
| +$code.=<<___ if $kernel;
 | |
| +#include <linux/linkage.h>
 | |
| +___
 | |
|  
 | |
| -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 | |
| -*STDOUT=*OUT;
 | |
| +if ($avx) {
 | |
| +$code.=<<___ if $kernel;
 | |
| +.section .rodata
 | |
| +___
 | |
| +$code.=<<___;
 | |
| +.align	64
 | |
| +.Lconst:
 | |
| +.Lmask24:
 | |
| +.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
 | |
| +.L129:
 | |
| +.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
 | |
| +.Lmask26:
 | |
| +.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
 | |
| +.Lpermd_avx2:
 | |
| +.long	2,2,2,3,2,0,2,1
 | |
| +.Lpermd_avx512:
 | |
| +.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
 | |
| +
 | |
| +.L2_44_inp_permd:
 | |
| +.long	0,1,1,2,2,3,7,7
 | |
| +.L2_44_inp_shift:
 | |
| +.quad	0,12,24,64
 | |
| +.L2_44_mask:
 | |
| +.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
 | |
| +.L2_44_shift_rgt:
 | |
| +.quad	44,44,42,64
 | |
| +.L2_44_shift_lft:
 | |
| +.quad	8,8,10,64
 | |
| +
 | |
| +.align	64
 | |
| +.Lx_mask44:
 | |
| +.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
 | |
| +.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
 | |
| +.Lx_mask42:
 | |
| +.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 | |
| +.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 | |
| +___
 | |
| +}
 | |
| +$code.=<<___ if (!$kernel);
 | |
| +.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 | |
| +.align	16
 | |
| +___
 | |
|  
 | |
|  my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
 | |
|  my ($mac,$nonce)=($inp,$len);	# *_emit arguments
 | |
| -my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
 | |
| -my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
 | |
| +my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
 | |
| +my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
 | |
|  
 | |
|  sub poly1305_iteration {
 | |
|  # input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
 | |
| @@ -155,19 +233,19 @@ ___
 | |
|  
 | |
|  $code.=<<___;
 | |
|  .text
 | |
| -
 | |
| +___
 | |
| +$code.=<<___ if (!$kernel);
 | |
|  .extern	OPENSSL_ia32cap_P
 | |
|  
 | |
| -.globl	poly1305_init
 | |
| -.hidden	poly1305_init
 | |
| -.globl	poly1305_blocks
 | |
| -.hidden	poly1305_blocks
 | |
| -.globl	poly1305_emit
 | |
| -.hidden	poly1305_emit
 | |
| -
 | |
| -.type	poly1305_init,\@function,3
 | |
| -.align	32
 | |
| -poly1305_init:
 | |
| +.globl	poly1305_init_x86_64
 | |
| +.hidden	poly1305_init_x86_64
 | |
| +.globl	poly1305_blocks_x86_64
 | |
| +.hidden	poly1305_blocks_x86_64
 | |
| +.globl	poly1305_emit_x86_64
 | |
| +.hidden	poly1305_emit_x86_64
 | |
| +___
 | |
| +&declare_function("poly1305_init_x86_64", 32, 3);
 | |
| +$code.=<<___;
 | |
|  	xor	%rax,%rax
 | |
|  	mov	%rax,0($ctx)		# initialize hash value
 | |
|  	mov	%rax,8($ctx)
 | |
| @@ -175,11 +253,12 @@ poly1305_init:
 | |
|  
 | |
|  	cmp	\$0,$inp
 | |
|  	je	.Lno_key
 | |
| -
 | |
| -	lea	poly1305_blocks(%rip),%r10
 | |
| -	lea	poly1305_emit(%rip),%r11
 | |
|  ___
 | |
| -$code.=<<___	if ($avx);
 | |
| +$code.=<<___ if (!$kernel);
 | |
| +	lea	poly1305_blocks_x86_64(%rip),%r10
 | |
| +	lea	poly1305_emit_x86_64(%rip),%r11
 | |
| +___
 | |
| +$code.=<<___	if (!$kernel && $avx);
 | |
|  	mov	OPENSSL_ia32cap_P+4(%rip),%r9
 | |
|  	lea	poly1305_blocks_avx(%rip),%rax
 | |
|  	lea	poly1305_emit_avx(%rip),%rcx
 | |
| @@ -187,12 +266,12 @@ $code.=<<___	if ($avx);
 | |
|  	cmovc	%rax,%r10
 | |
|  	cmovc	%rcx,%r11
 | |
|  ___
 | |
| -$code.=<<___	if ($avx>1);
 | |
| +$code.=<<___	if (!$kernel && $avx>1);
 | |
|  	lea	poly1305_blocks_avx2(%rip),%rax
 | |
|  	bt	\$`5+32`,%r9		# AVX2?
 | |
|  	cmovc	%rax,%r10
 | |
|  ___
 | |
| -$code.=<<___	if ($avx>3);
 | |
| +$code.=<<___	if (!$kernel && $avx>3);
 | |
|  	mov	\$`(1<<31|1<<21|1<<16)`,%rax
 | |
|  	shr	\$32,%r9
 | |
|  	and	%rax,%r9
 | |
| @@ -207,11 +286,11 @@ $code.=<<___;
 | |
|  	mov	%rax,24($ctx)
 | |
|  	mov	%rcx,32($ctx)
 | |
|  ___
 | |
| -$code.=<<___	if ($flavour !~ /elf32/);
 | |
| +$code.=<<___	if (!$kernel && $flavour !~ /elf32/);
 | |
|  	mov	%r10,0(%rdx)
 | |
|  	mov	%r11,8(%rdx)
 | |
|  ___
 | |
| -$code.=<<___	if ($flavour =~ /elf32/);
 | |
| +$code.=<<___	if (!$kernel && $flavour =~ /elf32/);
 | |
|  	mov	%r10d,0(%rdx)
 | |
|  	mov	%r11d,4(%rdx)
 | |
|  ___
 | |
| @@ -219,11 +298,11 @@ $code.=<<___;
 | |
|  	mov	\$1,%eax
 | |
|  .Lno_key:
 | |
|  	ret
 | |
| -.size	poly1305_init,.-poly1305_init
 | |
| +___
 | |
| +&end_function("poly1305_init_x86_64");
 | |
|  
 | |
| -.type	poly1305_blocks,\@function,4
 | |
| -.align	32
 | |
| -poly1305_blocks:
 | |
| +&declare_function("poly1305_blocks_x86_64", 32, 4);
 | |
| +$code.=<<___;
 | |
|  .cfi_startproc
 | |
|  .Lblocks:
 | |
|  	shr	\$4,$len
 | |
| @@ -231,8 +310,6 @@ poly1305_blocks:
 | |
|  
 | |
|  	push	%rbx
 | |
|  .cfi_push	%rbx
 | |
| -	push	%rbp
 | |
| -.cfi_push	%rbp
 | |
|  	push	%r12
 | |
|  .cfi_push	%r12
 | |
|  	push	%r13
 | |
| @@ -241,6 +318,8 @@ poly1305_blocks:
 | |
|  .cfi_push	%r14
 | |
|  	push	%r15
 | |
|  .cfi_push	%r15
 | |
| +	push	$ctx
 | |
| +.cfi_push	$ctx
 | |
|  .Lblocks_body:
 | |
|  
 | |
|  	mov	$len,%r15		# reassign $len
 | |
| @@ -265,26 +344,29 @@ poly1305_blocks:
 | |
|  	lea	16($inp),$inp
 | |
|  	adc	$padbit,$h2
 | |
|  ___
 | |
| +
 | |
|  	&poly1305_iteration();
 | |
| +
 | |
|  $code.=<<___;
 | |
|  	mov	$r1,%rax
 | |
|  	dec	%r15			# len-=16
 | |
|  	jnz	.Loop
 | |
|  
 | |
| +	mov	0(%rsp),$ctx
 | |
| +.cfi_restore	$ctx
 | |
| +
 | |
|  	mov	$h0,0($ctx)		# store hash value
 | |
|  	mov	$h1,8($ctx)
 | |
|  	mov	$h2,16($ctx)
 | |
|  
 | |
| -	mov	0(%rsp),%r15
 | |
| +	mov	8(%rsp),%r15
 | |
|  .cfi_restore	%r15
 | |
| -	mov	8(%rsp),%r14
 | |
| +	mov	16(%rsp),%r14
 | |
|  .cfi_restore	%r14
 | |
| -	mov	16(%rsp),%r13
 | |
| +	mov	24(%rsp),%r13
 | |
|  .cfi_restore	%r13
 | |
| -	mov	24(%rsp),%r12
 | |
| +	mov	32(%rsp),%r12
 | |
|  .cfi_restore	%r12
 | |
| -	mov	32(%rsp),%rbp
 | |
| -.cfi_restore	%rbp
 | |
|  	mov	40(%rsp),%rbx
 | |
|  .cfi_restore	%rbx
 | |
|  	lea	48(%rsp),%rsp
 | |
| @@ -293,11 +375,11 @@ $code.=<<___;
 | |
|  .Lblocks_epilogue:
 | |
|  	ret
 | |
|  .cfi_endproc
 | |
| -.size	poly1305_blocks,.-poly1305_blocks
 | |
| +___
 | |
| +&end_function("poly1305_blocks_x86_64");
 | |
|  
 | |
| -.type	poly1305_emit,\@function,3
 | |
| -.align	32
 | |
| -poly1305_emit:
 | |
| +&declare_function("poly1305_emit_x86_64", 32, 3);
 | |
| +$code.=<<___;
 | |
|  .Lemit:
 | |
|  	mov	0($ctx),%r8	# load hash value
 | |
|  	mov	8($ctx),%r9
 | |
| @@ -318,10 +400,14 @@ poly1305_emit:
 | |
|  	mov	%rcx,8($mac)
 | |
|  
 | |
|  	ret
 | |
| -.size	poly1305_emit,.-poly1305_emit
 | |
|  ___
 | |
| +&end_function("poly1305_emit_x86_64");
 | |
|  if ($avx) {
 | |
|  
 | |
| +if($kernel) {
 | |
| +	$code .= "#ifdef CONFIG_AS_AVX\n";
 | |
| +}
 | |
| +
 | |
|  ########################################################################
 | |
|  # Layout of opaque area is following.
 | |
|  #
 | |
| @@ -342,15 +428,19 @@ $code.=<<___;
 | |
|  .type	__poly1305_block,\@abi-omnipotent
 | |
|  .align	32
 | |
|  __poly1305_block:
 | |
| +	push $ctx
 | |
|  ___
 | |
|  	&poly1305_iteration();
 | |
|  $code.=<<___;
 | |
| +	pop $ctx
 | |
|  	ret
 | |
|  .size	__poly1305_block,.-__poly1305_block
 | |
|  
 | |
|  .type	__poly1305_init_avx,\@abi-omnipotent
 | |
|  .align	32
 | |
|  __poly1305_init_avx:
 | |
| +	push %rbp
 | |
| +	mov %rsp,%rbp
 | |
|  	mov	$r0,$h0
 | |
|  	mov	$r1,$h1
 | |
|  	xor	$h2,$h2
 | |
| @@ -507,12 +597,13 @@ __poly1305_init_avx:
 | |
|  	mov	$d1#d,`16*8+8-64`($ctx)
 | |
|  
 | |
|  	lea	-48-64($ctx),$ctx	# size [de-]optimization
 | |
| +	pop %rbp
 | |
|  	ret
 | |
|  .size	__poly1305_init_avx,.-__poly1305_init_avx
 | |
| +___
 | |
|  
 | |
| -.type	poly1305_blocks_avx,\@function,4
 | |
| -.align	32
 | |
| -poly1305_blocks_avx:
 | |
| +&declare_function("poly1305_blocks_avx", 32, 4);
 | |
| +$code.=<<___;
 | |
|  .cfi_startproc
 | |
|  	mov	20($ctx),%r8d		# is_base2_26
 | |
|  	cmp	\$128,$len
 | |
| @@ -532,10 +623,11 @@ poly1305_blocks_avx:
 | |
|  	test	\$31,$len
 | |
|  	jz	.Leven_avx
 | |
|  
 | |
| -	push	%rbx
 | |
| -.cfi_push	%rbx
 | |
|  	push	%rbp
 | |
|  .cfi_push	%rbp
 | |
| +	mov 	%rsp,%rbp
 | |
| +	push	%rbx
 | |
| +.cfi_push	%rbx
 | |
|  	push	%r12
 | |
|  .cfi_push	%r12
 | |
|  	push	%r13
 | |
| @@ -645,20 +737,18 @@ poly1305_blocks_avx:
 | |
|  	mov	$h2#d,16($ctx)
 | |
|  .align	16
 | |
|  .Ldone_avx:
 | |
| -	mov	0(%rsp),%r15
 | |
| +	pop 		%r15
 | |
|  .cfi_restore	%r15
 | |
| -	mov	8(%rsp),%r14
 | |
| +	pop 		%r14
 | |
|  .cfi_restore	%r14
 | |
| -	mov	16(%rsp),%r13
 | |
| +	pop 		%r13
 | |
|  .cfi_restore	%r13
 | |
| -	mov	24(%rsp),%r12
 | |
| +	pop 		%r12
 | |
|  .cfi_restore	%r12
 | |
| -	mov	32(%rsp),%rbp
 | |
| -.cfi_restore	%rbp
 | |
| -	mov	40(%rsp),%rbx
 | |
| +	pop 		%rbx
 | |
|  .cfi_restore	%rbx
 | |
| -	lea	48(%rsp),%rsp
 | |
| -.cfi_adjust_cfa_offset	-48
 | |
| +	pop 		%rbp
 | |
| +.cfi_restore	%rbp
 | |
|  .Lno_data_avx:
 | |
|  .Lblocks_avx_epilogue:
 | |
|  	ret
 | |
| @@ -667,10 +757,11 @@ poly1305_blocks_avx:
 | |
|  .align	32
 | |
|  .Lbase2_64_avx:
 | |
|  .cfi_startproc
 | |
| -	push	%rbx
 | |
| -.cfi_push	%rbx
 | |
|  	push	%rbp
 | |
|  .cfi_push	%rbp
 | |
| +	mov 	%rsp,%rbp
 | |
| +	push	%rbx
 | |
| +.cfi_push	%rbx
 | |
|  	push	%r12
 | |
|  .cfi_push	%r12
 | |
|  	push	%r13
 | |
| @@ -736,22 +827,18 @@ poly1305_blocks_avx:
 | |
|  
 | |
|  .Lproceed_avx:
 | |
|  	mov	%r15,$len
 | |
| -
 | |
| -	mov	0(%rsp),%r15
 | |
| +	pop 		%r15
 | |
|  .cfi_restore	%r15
 | |
| -	mov	8(%rsp),%r14
 | |
| +	pop 		%r14
 | |
|  .cfi_restore	%r14
 | |
| -	mov	16(%rsp),%r13
 | |
| +	pop 		%r13
 | |
|  .cfi_restore	%r13
 | |
| -	mov	24(%rsp),%r12
 | |
| +	pop 		%r12
 | |
|  .cfi_restore	%r12
 | |
| -	mov	32(%rsp),%rbp
 | |
| -.cfi_restore	%rbp
 | |
| -	mov	40(%rsp),%rbx
 | |
| +	pop 		%rbx
 | |
|  .cfi_restore	%rbx
 | |
| -	lea	48(%rsp),%rax
 | |
| -	lea	48(%rsp),%rsp
 | |
| -.cfi_adjust_cfa_offset	-48
 | |
| +	pop 		%rbp
 | |
| +.cfi_restore	%rbp
 | |
|  .Lbase2_64_avx_epilogue:
 | |
|  	jmp	.Ldo_avx
 | |
|  .cfi_endproc
 | |
| @@ -768,8 +855,11 @@ poly1305_blocks_avx:
 | |
|  .Ldo_avx:
 | |
|  ___
 | |
|  $code.=<<___	if (!$win64);
 | |
| +	lea		8(%rsp),%r10
 | |
| +.cfi_def_cfa_register	%r10
 | |
| +	and		\$-32,%rsp
 | |
| +	sub		\$-8,%rsp
 | |
|  	lea		-0x58(%rsp),%r11
 | |
| -.cfi_def_cfa		%r11,0x60
 | |
|  	sub		\$0x178,%rsp
 | |
|  ___
 | |
|  $code.=<<___	if ($win64);
 | |
| @@ -1361,18 +1451,18 @@ $code.=<<___	if ($win64);
 | |
|  .Ldo_avx_epilogue:
 | |
|  ___
 | |
|  $code.=<<___	if (!$win64);
 | |
| -	lea		0x58(%r11),%rsp
 | |
| -.cfi_def_cfa		%rsp,8
 | |
| +	lea		-8(%r10),%rsp
 | |
| +.cfi_def_cfa_register	%rsp
 | |
|  ___
 | |
|  $code.=<<___;
 | |
|  	vzeroupper
 | |
|  	ret
 | |
|  .cfi_endproc
 | |
| -.size	poly1305_blocks_avx,.-poly1305_blocks_avx
 | |
| +___
 | |
| +&end_function("poly1305_blocks_avx");
 | |
|  
 | |
| -.type	poly1305_emit_avx,\@function,3
 | |
| -.align	32
 | |
| -poly1305_emit_avx:
 | |
| +&declare_function("poly1305_emit_avx", 32, 3);
 | |
| +$code.=<<___;
 | |
|  	cmpl	\$0,20($ctx)	# is_base2_26?
 | |
|  	je	.Lemit
 | |
|  
 | |
| @@ -1423,41 +1513,51 @@ poly1305_emit_avx:
 | |
|  	mov	%rcx,8($mac)
 | |
|  
 | |
|  	ret
 | |
| -.size	poly1305_emit_avx,.-poly1305_emit_avx
 | |
|  ___
 | |
| +&end_function("poly1305_emit_avx");
 | |
| +
 | |
| +if ($kernel) {
 | |
| +	$code .= "#endif\n";
 | |
| +}
 | |
|  
 | |
|  if ($avx>1) {
 | |
| +
 | |
| +if ($kernel) {
 | |
| +	$code .= "#ifdef CONFIG_AS_AVX2\n";
 | |
| +}
 | |
| +
 | |
|  my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
 | |
|      map("%ymm$_",(0..15));
 | |
|  my $S4=$MASK;
 | |
|  
 | |
| +sub poly1305_blocks_avxN {
 | |
| +	my ($avx512) = @_;
 | |
| +	my $suffix = $avx512 ? "_avx512" : "";
 | |
|  $code.=<<___;
 | |
| -.type	poly1305_blocks_avx2,\@function,4
 | |
| -.align	32
 | |
| -poly1305_blocks_avx2:
 | |
|  .cfi_startproc
 | |
|  	mov	20($ctx),%r8d		# is_base2_26
 | |
|  	cmp	\$128,$len
 | |
| -	jae	.Lblocks_avx2
 | |
| +	jae	.Lblocks_avx2$suffix
 | |
|  	test	%r8d,%r8d
 | |
|  	jz	.Lblocks
 | |
|  
 | |
| -.Lblocks_avx2:
 | |
| +.Lblocks_avx2$suffix:
 | |
|  	and	\$-16,$len
 | |
| -	jz	.Lno_data_avx2
 | |
| +	jz	.Lno_data_avx2$suffix
 | |
|  
 | |
|  	vzeroupper
 | |
|  
 | |
|  	test	%r8d,%r8d
 | |
| -	jz	.Lbase2_64_avx2
 | |
| +	jz	.Lbase2_64_avx2$suffix
 | |
|  
 | |
|  	test	\$63,$len
 | |
| -	jz	.Leven_avx2
 | |
| +	jz	.Leven_avx2$suffix
 | |
|  
 | |
| -	push	%rbx
 | |
| -.cfi_push	%rbx
 | |
|  	push	%rbp
 | |
|  .cfi_push	%rbp
 | |
| +	mov 	%rsp,%rbp
 | |
| +	push	%rbx
 | |
| +.cfi_push	%rbx
 | |
|  	push	%r12
 | |
|  .cfi_push	%r12
 | |
|  	push	%r13
 | |
| @@ -1466,7 +1566,7 @@ poly1305_blocks_avx2:
 | |
|  .cfi_push	%r14
 | |
|  	push	%r15
 | |
|  .cfi_push	%r15
 | |
| -.Lblocks_avx2_body:
 | |
| +.Lblocks_avx2_body$suffix:
 | |
|  
 | |
|  	mov	$len,%r15		# reassign $len
 | |
|  
 | |
| @@ -1513,7 +1613,7 @@ poly1305_blocks_avx2:
 | |
|  	shr	\$2,$s1
 | |
|  	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
 | |
|  
 | |
| -.Lbase2_26_pre_avx2:
 | |
| +.Lbase2_26_pre_avx2$suffix:
 | |
|  	add	0($inp),$h0		# accumulate input
 | |
|  	adc	8($inp),$h1
 | |
|  	lea	16($inp),$inp
 | |
| @@ -1524,10 +1624,10 @@ poly1305_blocks_avx2:
 | |
|  	mov	$r1,%rax
 | |
|  
 | |
|  	test	\$63,%r15
 | |
| -	jnz	.Lbase2_26_pre_avx2
 | |
| +	jnz	.Lbase2_26_pre_avx2$suffix
 | |
|  
 | |
|  	test	$padbit,$padbit		# if $padbit is zero,
 | |
| -	jz	.Lstore_base2_64_avx2	# store hash in base 2^64 format
 | |
| +	jz	.Lstore_base2_64_avx2$suffix	# store hash in base 2^64 format
 | |
|  
 | |
|  	################################# base 2^64 -> base 2^26
 | |
|  	mov	$h0,%rax
 | |
| @@ -1548,57 +1648,56 @@ poly1305_blocks_avx2:
 | |
|  	or	$r1,$h2			# h[4]
 | |
|  
 | |
|  	test	%r15,%r15
 | |
| -	jz	.Lstore_base2_26_avx2
 | |
| +	jz	.Lstore_base2_26_avx2$suffix
 | |
|  
 | |
|  	vmovd	%rax#d,%x#$H0
 | |
|  	vmovd	%rdx#d,%x#$H1
 | |
|  	vmovd	$h0#d,%x#$H2
 | |
|  	vmovd	$h1#d,%x#$H3
 | |
|  	vmovd	$h2#d,%x#$H4
 | |
| -	jmp	.Lproceed_avx2
 | |
| +	jmp	.Lproceed_avx2$suffix
 | |
|  
 | |
|  .align	32
 | |
| -.Lstore_base2_64_avx2:
 | |
| +.Lstore_base2_64_avx2$suffix:
 | |
|  	mov	$h0,0($ctx)
 | |
|  	mov	$h1,8($ctx)
 | |
|  	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
 | |
| -	jmp	.Ldone_avx2
 | |
| +	jmp	.Ldone_avx2$suffix
 | |
|  
 | |
|  .align	16
 | |
| -.Lstore_base2_26_avx2:
 | |
| +.Lstore_base2_26_avx2$suffix:
 | |
|  	mov	%rax#d,0($ctx)		# store hash value base 2^26
 | |
|  	mov	%rdx#d,4($ctx)
 | |
|  	mov	$h0#d,8($ctx)
 | |
|  	mov	$h1#d,12($ctx)
 | |
|  	mov	$h2#d,16($ctx)
 | |
|  .align	16
 | |
| -.Ldone_avx2:
 | |
| -	mov	0(%rsp),%r15
 | |
| +.Ldone_avx2$suffix:
 | |
| +	pop 		%r15
 | |
|  .cfi_restore	%r15
 | |
| -	mov	8(%rsp),%r14
 | |
| +	pop 		%r14
 | |
|  .cfi_restore	%r14
 | |
| -	mov	16(%rsp),%r13
 | |
| +	pop 		%r13
 | |
|  .cfi_restore	%r13
 | |
| -	mov	24(%rsp),%r12
 | |
| +	pop 		%r12
 | |
|  .cfi_restore	%r12
 | |
| -	mov	32(%rsp),%rbp
 | |
| -.cfi_restore	%rbp
 | |
| -	mov	40(%rsp),%rbx
 | |
| +	pop 		%rbx
 | |
|  .cfi_restore	%rbx
 | |
| -	lea	48(%rsp),%rsp
 | |
| -.cfi_adjust_cfa_offset	-48
 | |
| -.Lno_data_avx2:
 | |
| -.Lblocks_avx2_epilogue:
 | |
| +	pop 		%rbp
 | |
| +.cfi_restore 	%rbp
 | |
| +.Lno_data_avx2$suffix:
 | |
| +.Lblocks_avx2_epilogue$suffix:
 | |
|  	ret
 | |
|  .cfi_endproc
 | |
|  
 | |
|  .align	32
 | |
| -.Lbase2_64_avx2:
 | |
| +.Lbase2_64_avx2$suffix:
 | |
|  .cfi_startproc
 | |
| -	push	%rbx
 | |
| -.cfi_push	%rbx
 | |
|  	push	%rbp
 | |
|  .cfi_push	%rbp
 | |
| +	mov 	%rsp,%rbp
 | |
| +	push	%rbx
 | |
| +.cfi_push	%rbx
 | |
|  	push	%r12
 | |
|  .cfi_push	%r12
 | |
|  	push	%r13
 | |
| @@ -1607,7 +1706,7 @@ poly1305_blocks_avx2:
 | |
|  .cfi_push	%r14
 | |
|  	push	%r15
 | |
|  .cfi_push	%r15
 | |
| -.Lbase2_64_avx2_body:
 | |
| +.Lbase2_64_avx2_body$suffix:
 | |
|  
 | |
|  	mov	$len,%r15		# reassign $len
 | |
|  
 | |
| @@ -1624,9 +1723,9 @@ poly1305_blocks_avx2:
 | |
|  	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
 | |
|  
 | |
|  	test	\$63,$len
 | |
| -	jz	.Linit_avx2
 | |
| +	jz	.Linit_avx2$suffix
 | |
|  
 | |
| -.Lbase2_64_pre_avx2:
 | |
| +.Lbase2_64_pre_avx2$suffix:
 | |
|  	add	0($inp),$h0		# accumulate input
 | |
|  	adc	8($inp),$h1
 | |
|  	lea	16($inp),$inp
 | |
| @@ -1637,9 +1736,9 @@ poly1305_blocks_avx2:
 | |
|  	mov	$r1,%rax
 | |
|  
 | |
|  	test	\$63,%r15
 | |
| -	jnz	.Lbase2_64_pre_avx2
 | |
| +	jnz	.Lbase2_64_pre_avx2$suffix
 | |
|  
 | |
| -.Linit_avx2:
 | |
| +.Linit_avx2$suffix:
 | |
|  	################################# base 2^64 -> base 2^26
 | |
|  	mov	$h0,%rax
 | |
|  	mov	$h0,%rdx
 | |
| @@ -1667,69 +1766,77 @@ poly1305_blocks_avx2:
 | |
|  
 | |
|  	call	__poly1305_init_avx
 | |
|  
 | |
| -.Lproceed_avx2:
 | |
| +.Lproceed_avx2$suffix:
 | |
|  	mov	%r15,$len			# restore $len
 | |
| -	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
 | |
| +___
 | |
| +$code.=<<___ if (!$kernel);
 | |
| +	mov	OPENSSL_ia32cap_P+8(%rip),%r9d
 | |
|  	mov	\$`(1<<31|1<<30|1<<16)`,%r11d
 | |
| -
 | |
| -	mov	0(%rsp),%r15
 | |
| +___
 | |
| +$code.=<<___;
 | |
| +	pop 		%r15
 | |
|  .cfi_restore	%r15
 | |
| -	mov	8(%rsp),%r14
 | |
| +	pop 		%r14
 | |
|  .cfi_restore	%r14
 | |
| -	mov	16(%rsp),%r13
 | |
| +	pop 		%r13
 | |
|  .cfi_restore	%r13
 | |
| -	mov	24(%rsp),%r12
 | |
| +	pop 		%r12
 | |
|  .cfi_restore	%r12
 | |
| -	mov	32(%rsp),%rbp
 | |
| -.cfi_restore	%rbp
 | |
| -	mov	40(%rsp),%rbx
 | |
| +	pop 		%rbx
 | |
|  .cfi_restore	%rbx
 | |
| -	lea	48(%rsp),%rax
 | |
| -	lea	48(%rsp),%rsp
 | |
| -.cfi_adjust_cfa_offset	-48
 | |
| -.Lbase2_64_avx2_epilogue:
 | |
| -	jmp	.Ldo_avx2
 | |
| +	pop 		%rbp
 | |
| +.cfi_restore 	%rbp
 | |
| +.Lbase2_64_avx2_epilogue$suffix:
 | |
| +	jmp	.Ldo_avx2$suffix
 | |
|  .cfi_endproc
 | |
|  
 | |
|  .align	32
 | |
| -.Leven_avx2:
 | |
| +.Leven_avx2$suffix:
 | |
|  .cfi_startproc
 | |
| -	mov		OPENSSL_ia32cap_P+8(%rip),%r10d
 | |
| +___
 | |
| +$code.=<<___ if (!$kernel);
 | |
| +	mov		OPENSSL_ia32cap_P+8(%rip),%r9d
 | |
| +___
 | |
| +$code.=<<___;
 | |
|  	vmovd		4*0($ctx),%x#$H0	# load hash value base 2^26
 | |
|  	vmovd		4*1($ctx),%x#$H1
 | |
|  	vmovd		4*2($ctx),%x#$H2
 | |
|  	vmovd		4*3($ctx),%x#$H3
 | |
|  	vmovd		4*4($ctx),%x#$H4
 | |
|  
 | |
| -.Ldo_avx2:
 | |
| +.Ldo_avx2$suffix:
 | |
|  ___
 | |
| -$code.=<<___		if ($avx>2);
 | |
| +$code.=<<___		if (!$kernel && $avx>2);
 | |
|  	cmp		\$512,$len
 | |
|  	jb		.Lskip_avx512
 | |
| -	and		%r11d,%r10d
 | |
| -	test		\$`1<<16`,%r10d		# check for AVX512F
 | |
| +	and		%r11d,%r9d
 | |
| +	test		\$`1<<16`,%r9d		# check for AVX512F
 | |
|  	jnz		.Lblocks_avx512
 | |
| -.Lskip_avx512:
 | |
| +.Lskip_avx512$suffix:
 | |
| +___
 | |
| +$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
 | |
| +	cmp		\$512,$len
 | |
| +	jae		.Lblocks_avx512
 | |
|  ___
 | |
|  $code.=<<___	if (!$win64);
 | |
| -	lea		-8(%rsp),%r11
 | |
| -.cfi_def_cfa		%r11,16
 | |
| +	lea		8(%rsp),%r10
 | |
| +.cfi_def_cfa_register	%r10
 | |
|  	sub		\$0x128,%rsp
 | |
|  ___
 | |
|  $code.=<<___	if ($win64);
 | |
| -	lea		-0xf8(%rsp),%r11
 | |
| +	lea		8(%rsp),%r10
 | |
|  	sub		\$0x1c8,%rsp
 | |
| -	vmovdqa		%xmm6,0x50(%r11)
 | |
| -	vmovdqa		%xmm7,0x60(%r11)
 | |
| -	vmovdqa		%xmm8,0x70(%r11)
 | |
| -	vmovdqa		%xmm9,0x80(%r11)
 | |
| -	vmovdqa		%xmm10,0x90(%r11)
 | |
| -	vmovdqa		%xmm11,0xa0(%r11)
 | |
| -	vmovdqa		%xmm12,0xb0(%r11)
 | |
| -	vmovdqa		%xmm13,0xc0(%r11)
 | |
| -	vmovdqa		%xmm14,0xd0(%r11)
 | |
| -	vmovdqa		%xmm15,0xe0(%r11)
 | |
| -.Ldo_avx2_body:
 | |
| +	vmovdqa		%xmm6,-0xb0(%r10)
 | |
| +	vmovdqa		%xmm7,-0xa0(%r10)
 | |
| +	vmovdqa		%xmm8,-0x90(%r10)
 | |
| +	vmovdqa		%xmm9,-0x80(%r10)
 | |
| +	vmovdqa		%xmm10,-0x70(%r10)
 | |
| +	vmovdqa		%xmm11,-0x60(%r10)
 | |
| +	vmovdqa		%xmm12,-0x50(%r10)
 | |
| +	vmovdqa		%xmm13,-0x40(%r10)
 | |
| +	vmovdqa		%xmm14,-0x30(%r10)
 | |
| +	vmovdqa		%xmm15,-0x20(%r10)
 | |
| +.Ldo_avx2_body$suffix:
 | |
|  ___
 | |
|  $code.=<<___;
 | |
|  	lea		.Lconst(%rip),%rcx
 | |
| @@ -1794,11 +1901,11 @@ $code.=<<___;
 | |
|  
 | |
|  	vpaddq		$H2,$T2,$H2		# accumulate input
 | |
|  	sub		\$64,$len
 | |
| -	jz		.Ltail_avx2
 | |
| -	jmp		.Loop_avx2
 | |
| +	jz		.Ltail_avx2$suffix
 | |
| +	jmp		.Loop_avx2$suffix
 | |
|  
 | |
|  .align	32
 | |
| -.Loop_avx2:
 | |
| +.Loop_avx2$suffix:
 | |
|  	################################################################
 | |
|  	# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
 | |
|  	# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
 | |
| @@ -1946,10 +2053,10 @@ $code.=<<___;
 | |
|  	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
 | |
|  
 | |
|  	sub		\$64,$len
 | |
| -	jnz		.Loop_avx2
 | |
| +	jnz		.Loop_avx2$suffix
 | |
|  
 | |
|  	.byte		0x66,0x90
 | |
| -.Ltail_avx2:
 | |
| +.Ltail_avx2$suffix:
 | |
|  	################################################################
 | |
|  	# while above multiplications were by r^4 in all lanes, in last
 | |
|  	# iteration we multiply least significant lane by r^4 and most
 | |
| @@ -2087,37 +2194,29 @@ $code.=<<___;
 | |
|  	vmovd		%x#$H4,`4*4-48-64`($ctx)
 | |
|  ___
 | |
|  $code.=<<___	if ($win64);
 | |
| -	vmovdqa		0x50(%r11),%xmm6
 | |
| -	vmovdqa		0x60(%r11),%xmm7
 | |
| -	vmovdqa		0x70(%r11),%xmm8
 | |
| -	vmovdqa		0x80(%r11),%xmm9
 | |
| -	vmovdqa		0x90(%r11),%xmm10
 | |
| -	vmovdqa		0xa0(%r11),%xmm11
 | |
| -	vmovdqa		0xb0(%r11),%xmm12
 | |
| -	vmovdqa		0xc0(%r11),%xmm13
 | |
| -	vmovdqa		0xd0(%r11),%xmm14
 | |
| -	vmovdqa		0xe0(%r11),%xmm15
 | |
| -	lea		0xf8(%r11),%rsp
 | |
| -.Ldo_avx2_epilogue:
 | |
| +	vmovdqa		-0xb0(%r10),%xmm6
 | |
| +	vmovdqa		-0xa0(%r10),%xmm7
 | |
| +	vmovdqa		-0x90(%r10),%xmm8
 | |
| +	vmovdqa		-0x80(%r10),%xmm9
 | |
| +	vmovdqa		-0x70(%r10),%xmm10
 | |
| +	vmovdqa		-0x60(%r10),%xmm11
 | |
| +	vmovdqa		-0x50(%r10),%xmm12
 | |
| +	vmovdqa		-0x40(%r10),%xmm13
 | |
| +	vmovdqa		-0x30(%r10),%xmm14
 | |
| +	vmovdqa		-0x20(%r10),%xmm15
 | |
| +	lea		-8(%r10),%rsp
 | |
| +.Ldo_avx2_epilogue$suffix:
 | |
|  ___
 | |
|  $code.=<<___	if (!$win64);
 | |
| -	lea		8(%r11),%rsp
 | |
| -.cfi_def_cfa		%rsp,8
 | |
| +	lea		-8(%r10),%rsp
 | |
| +.cfi_def_cfa_register	%rsp
 | |
|  ___
 | |
|  $code.=<<___;
 | |
|  	vzeroupper
 | |
|  	ret
 | |
|  .cfi_endproc
 | |
| -.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
 | |
|  ___
 | |
| -#######################################################################
 | |
| -if ($avx>2) {
 | |
| -# On entry we have input length divisible by 64. But since inner loop
 | |
| -# processes 128 bytes per iteration, cases when length is not divisible
 | |
| -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
 | |
| -# reason stack layout is kept identical to poly1305_blocks_avx2. If not
 | |
| -# for this tail, we wouldn't have to even allocate stack frame...
 | |
| -
 | |
| +if($avx > 2 && $avx512) {
 | |
|  my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
 | |
|  my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
 | |
|  my $PADBIT="%zmm30";
 | |
| @@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
 | |
|  map(s/%y/%z/,($MASK));
 | |
|  
 | |
|  $code.=<<___;
 | |
| -.type	poly1305_blocks_avx512,\@function,4
 | |
| -.align	32
 | |
| -poly1305_blocks_avx512:
 | |
|  .cfi_startproc
 | |
|  .Lblocks_avx512:
 | |
|  	mov		\$15,%eax
 | |
|  	kmovw		%eax,%k2
 | |
|  ___
 | |
|  $code.=<<___	if (!$win64);
 | |
| -	lea		-8(%rsp),%r11
 | |
| -.cfi_def_cfa		%r11,16
 | |
| +	lea		8(%rsp),%r10
 | |
| +.cfi_def_cfa_register	%r10
 | |
|  	sub		\$0x128,%rsp
 | |
|  ___
 | |
|  $code.=<<___	if ($win64);
 | |
| -	lea		-0xf8(%rsp),%r11
 | |
| +	lea		8(%rsp),%r10
 | |
|  	sub		\$0x1c8,%rsp
 | |
| -	vmovdqa		%xmm6,0x50(%r11)
 | |
| -	vmovdqa		%xmm7,0x60(%r11)
 | |
| -	vmovdqa		%xmm8,0x70(%r11)
 | |
| -	vmovdqa		%xmm9,0x80(%r11)
 | |
| -	vmovdqa		%xmm10,0x90(%r11)
 | |
| -	vmovdqa		%xmm11,0xa0(%r11)
 | |
| -	vmovdqa		%xmm12,0xb0(%r11)
 | |
| -	vmovdqa		%xmm13,0xc0(%r11)
 | |
| -	vmovdqa		%xmm14,0xd0(%r11)
 | |
| -	vmovdqa		%xmm15,0xe0(%r11)
 | |
| +	vmovdqa		%xmm6,-0xb0(%r10)
 | |
| +	vmovdqa		%xmm7,-0xa0(%r10)
 | |
| +	vmovdqa		%xmm8,-0x90(%r10)
 | |
| +	vmovdqa		%xmm9,-0x80(%r10)
 | |
| +	vmovdqa		%xmm10,-0x70(%r10)
 | |
| +	vmovdqa		%xmm11,-0x60(%r10)
 | |
| +	vmovdqa		%xmm12,-0x50(%r10)
 | |
| +	vmovdqa		%xmm13,-0x40(%r10)
 | |
| +	vmovdqa		%xmm14,-0x30(%r10)
 | |
| +	vmovdqa		%xmm15,-0x20(%r10)
 | |
|  .Ldo_avx512_body:
 | |
|  ___
 | |
|  $code.=<<___;
 | |
| @@ -2679,7 +2775,7 @@ $code.=<<___;
 | |
|  
 | |
|  	lea		0x90(%rsp),%rax		# size optimization for .Ltail_avx2
 | |
|  	add		\$64,$len
 | |
| -	jnz		.Ltail_avx2
 | |
| +	jnz		.Ltail_avx2$suffix
 | |
|  
 | |
|  	vpsubq		$T2,$H2,$H2		# undo input accumulation
 | |
|  	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
 | |
| @@ -2690,29 +2786,61 @@ $code.=<<___;
 | |
|  	vzeroall
 | |
|  ___
 | |
|  $code.=<<___	if ($win64);
 | |
| -	movdqa		0x50(%r11),%xmm6
 | |
| -	movdqa		0x60(%r11),%xmm7
 | |
| -	movdqa		0x70(%r11),%xmm8
 | |
| -	movdqa		0x80(%r11),%xmm9
 | |
| -	movdqa		0x90(%r11),%xmm10
 | |
| -	movdqa		0xa0(%r11),%xmm11
 | |
| -	movdqa		0xb0(%r11),%xmm12
 | |
| -	movdqa		0xc0(%r11),%xmm13
 | |
| -	movdqa		0xd0(%r11),%xmm14
 | |
| -	movdqa		0xe0(%r11),%xmm15
 | |
| -	lea		0xf8(%r11),%rsp
 | |
| +	movdqa		-0xb0(%r10),%xmm6
 | |
| +	movdqa		-0xa0(%r10),%xmm7
 | |
| +	movdqa		-0x90(%r10),%xmm8
 | |
| +	movdqa		-0x80(%r10),%xmm9
 | |
| +	movdqa		-0x70(%r10),%xmm10
 | |
| +	movdqa		-0x60(%r10),%xmm11
 | |
| +	movdqa		-0x50(%r10),%xmm12
 | |
| +	movdqa		-0x40(%r10),%xmm13
 | |
| +	movdqa		-0x30(%r10),%xmm14
 | |
| +	movdqa		-0x20(%r10),%xmm15
 | |
| +	lea		-8(%r10),%rsp
 | |
|  .Ldo_avx512_epilogue:
 | |
|  ___
 | |
|  $code.=<<___	if (!$win64);
 | |
| -	lea		8(%r11),%rsp
 | |
| -.cfi_def_cfa		%rsp,8
 | |
| +	lea		-8(%r10),%rsp
 | |
| +.cfi_def_cfa_register	%rsp
 | |
|  ___
 | |
|  $code.=<<___;
 | |
|  	ret
 | |
|  .cfi_endproc
 | |
| -.size	poly1305_blocks_avx512,.-poly1305_blocks_avx512
 | |
|  ___
 | |
| -if ($avx>3) {
 | |
| +
 | |
| +}
 | |
| +
 | |
| +}
 | |
| +
 | |
| +&declare_function("poly1305_blocks_avx2", 32, 4);
 | |
| +poly1305_blocks_avxN(0);
 | |
| +&end_function("poly1305_blocks_avx2");
 | |
| +
 | |
| +if($kernel) {
 | |
| +	$code .= "#endif\n";
 | |
| +}
 | |
| +
 | |
| +#######################################################################
 | |
| +if ($avx>2) {
 | |
| +# On entry we have input length divisible by 64. But since inner loop
 | |
| +# processes 128 bytes per iteration, cases when length is not divisible
 | |
| +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
 | |
| +# reason stack layout is kept identical to poly1305_blocks_avx2. If not
 | |
| +# for this tail, we wouldn't have to even allocate stack frame...
 | |
| +
 | |
| +if($kernel) {
 | |
| +	$code .= "#ifdef CONFIG_AS_AVX512\n";
 | |
| +}
 | |
| +
 | |
| +&declare_function("poly1305_blocks_avx512", 32, 4);
 | |
| +poly1305_blocks_avxN(1);
 | |
| +&end_function("poly1305_blocks_avx512");
 | |
| +
 | |
| +if ($kernel) {
 | |
| +	$code .= "#endif\n";
 | |
| +}
 | |
| +
 | |
| +if (!$kernel && $avx>3) {
 | |
|  ########################################################################
 | |
|  # VPMADD52 version using 2^44 radix.
 | |
|  #
 | |
| @@ -3753,45 +3881,9 @@ poly1305_emit_base2_44:
 | |
|  .size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
 | |
|  ___
 | |
|  }	}	}
 | |
| -$code.=<<___;
 | |
| -.align	64
 | |
| -.Lconst:
 | |
| -.Lmask24:
 | |
| -.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
 | |
| -.L129:
 | |
| -.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
 | |
| -.Lmask26:
 | |
| -.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
 | |
| -.Lpermd_avx2:
 | |
| -.long	2,2,2,3,2,0,2,1
 | |
| -.Lpermd_avx512:
 | |
| -.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
 | |
| -
 | |
| -.L2_44_inp_permd:
 | |
| -.long	0,1,1,2,2,3,7,7
 | |
| -.L2_44_inp_shift:
 | |
| -.quad	0,12,24,64
 | |
| -.L2_44_mask:
 | |
| -.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
 | |
| -.L2_44_shift_rgt:
 | |
| -.quad	44,44,42,64
 | |
| -.L2_44_shift_lft:
 | |
| -.quad	8,8,10,64
 | |
| -
 | |
| -.align	64
 | |
| -.Lx_mask44:
 | |
| -.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
 | |
| -.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
 | |
| -.Lx_mask42:
 | |
| -.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 | |
| -.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 | |
| -___
 | |
|  }
 | |
| -$code.=<<___;
 | |
| -.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 | |
| -.align	16
 | |
| -___
 | |
|  
 | |
| +if (!$kernel)
 | |
|  {	# chacha20-poly1305 helpers
 | |
|  my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
 | |
|                                    ("%rdi","%rsi","%rdx","%rcx");  # Unix order
 | |
| @@ -4038,17 +4130,17 @@ avx_handler:
 | |
|  
 | |
|  .section	.pdata
 | |
|  .align	4
 | |
| -	.rva	.LSEH_begin_poly1305_init
 | |
| -	.rva	.LSEH_end_poly1305_init
 | |
| -	.rva	.LSEH_info_poly1305_init
 | |
| -
 | |
| -	.rva	.LSEH_begin_poly1305_blocks
 | |
| -	.rva	.LSEH_end_poly1305_blocks
 | |
| -	.rva	.LSEH_info_poly1305_blocks
 | |
| -
 | |
| -	.rva	.LSEH_begin_poly1305_emit
 | |
| -	.rva	.LSEH_end_poly1305_emit
 | |
| -	.rva	.LSEH_info_poly1305_emit
 | |
| +	.rva	.LSEH_begin_poly1305_init_x86_64
 | |
| +	.rva	.LSEH_end_poly1305_init_x86_64
 | |
| +	.rva	.LSEH_info_poly1305_init_x86_64
 | |
| +
 | |
| +	.rva	.LSEH_begin_poly1305_blocks_x86_64
 | |
| +	.rva	.LSEH_end_poly1305_blocks_x86_64
 | |
| +	.rva	.LSEH_info_poly1305_blocks_x86_64
 | |
| +
 | |
| +	.rva	.LSEH_begin_poly1305_emit_x86_64
 | |
| +	.rva	.LSEH_end_poly1305_emit_x86_64
 | |
| +	.rva	.LSEH_info_poly1305_emit_x86_64
 | |
|  ___
 | |
|  $code.=<<___ if ($avx);
 | |
|  	.rva	.LSEH_begin_poly1305_blocks_avx
 | |
| @@ -4088,20 +4180,20 @@ ___
 | |
|  $code.=<<___;
 | |
|  .section	.xdata
 | |
|  .align	8
 | |
| -.LSEH_info_poly1305_init:
 | |
| +.LSEH_info_poly1305_init_x86_64:
 | |
|  	.byte	9,0,0,0
 | |
|  	.rva	se_handler
 | |
| -	.rva	.LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
 | |
| +	.rva	.LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
 | |
|  
 | |
| -.LSEH_info_poly1305_blocks:
 | |
| +.LSEH_info_poly1305_blocks_x86_64:
 | |
|  	.byte	9,0,0,0
 | |
|  	.rva	se_handler
 | |
|  	.rva	.Lblocks_body,.Lblocks_epilogue
 | |
|  
 | |
| -.LSEH_info_poly1305_emit:
 | |
| +.LSEH_info_poly1305_emit_x86_64:
 | |
|  	.byte	9,0,0,0
 | |
|  	.rva	se_handler
 | |
| -	.rva	.LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
 | |
| +	.rva	.LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
 | |
|  ___
 | |
|  $code.=<<___ if ($avx);
 | |
|  .LSEH_info_poly1305_blocks_avx_1:
 | |
| @@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2);
 | |
|  ___
 | |
|  }
 | |
|  
 | |
| +open SELF,$0;
 | |
| +while(<SELF>) {
 | |
| +	next if (/^#!/);
 | |
| +	last if (!s/^#/\/\// and !/^$/);
 | |
| +	print;
 | |
| +}
 | |
| +close SELF;
 | |
| +
 | |
|  foreach (split('\n',$code)) {
 | |
|  	s/\`([^\`]*)\`/eval($1)/ge;
 | |
|  	s/%r([a-z]+)#d/%e$1/g;
 | |
|  	s/%r([0-9]+)#d/%r$1d/g;
 | |
|  	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
 | |
|  
 | |
| +	if ($kernel) {
 | |
| +		s/(^\.type.*),[0-9]+$/\1/;
 | |
| +		s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
 | |
| +		next if /^\.cfi.*/;
 | |
| +	}
 | |
| +
 | |
|  	print $_,"\n";
 | |
|  }
 | |
|  close STDOUT;
 | |
| --- a/arch/x86/crypto/poly1305_glue.c
 | |
| +++ b/arch/x86/crypto/poly1305_glue.c
 | |
| @@ -1,8 +1,6 @@
 | |
| -// SPDX-License-Identifier: GPL-2.0-or-later
 | |
| +// SPDX-License-Identifier: GPL-2.0 OR MIT
 | |
|  /*
 | |
| - * Poly1305 authenticator algorithm, RFC7539, SIMD glue code
 | |
| - *
 | |
| - * Copyright (C) 2015 Martin Willi
 | |
| + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 | |
|   */
 | |
|  
 | |
|  #include <crypto/algapi.h>
 | |
| @@ -13,279 +11,170 @@
 | |
|  #include <linux/jump_label.h>
 | |
|  #include <linux/kernel.h>
 | |
|  #include <linux/module.h>
 | |
| +#include <asm/intel-family.h>
 | |
|  #include <asm/simd.h>
 | |
|  
 | |
| -asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
 | |
| -				    const u32 *r, unsigned int blocks);
 | |
| -asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
 | |
| -				     unsigned int blocks, const u32 *u);
 | |
| -asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
 | |
| -				     unsigned int blocks, const u32 *u);
 | |
| +asmlinkage void poly1305_init_x86_64(void *ctx,
 | |
| +				     const u8 key[POLY1305_KEY_SIZE]);
 | |
| +asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
 | |
| +				       const size_t len, const u32 padbit);
 | |
| +asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
 | |
| +				     const u32 nonce[4]);
 | |
| +asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
 | |
| +				  const u32 nonce[4]);
 | |
| +asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
 | |
| +				    const u32 padbit);
 | |
| +asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
 | |
| +				     const u32 padbit);
 | |
| +asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
 | |
| +				       const size_t len, const u32 padbit);
 | |
|  
 | |
| -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
 | |
| +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
 | |
|  static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
 | |
| +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
 | |
|  
 | |
| -static inline u64 mlt(u64 a, u64 b)
 | |
| -{
 | |
| -	return a * b;
 | |
| -}
 | |
| -
 | |
| -static inline u32 sr(u64 v, u_char n)
 | |
| -{
 | |
| -	return v >> n;
 | |
| -}
 | |
| -
 | |
| -static inline u32 and(u32 v, u32 mask)
 | |
| -{
 | |
| -	return v & mask;
 | |
| -}
 | |
| -
 | |
| -static void poly1305_simd_mult(u32 *a, const u32 *b)
 | |
| -{
 | |
| -	u8 m[POLY1305_BLOCK_SIZE];
 | |
| -
 | |
| -	memset(m, 0, sizeof(m));
 | |
| -	/* The poly1305 block function adds a hi-bit to the accumulator which
 | |
| -	 * we don't need for key multiplication; compensate for it. */
 | |
| -	a[4] -= 1 << 24;
 | |
| -	poly1305_block_sse2(a, m, b, 1);
 | |
| -}
 | |
| -
 | |
| -static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key)
 | |
| -{
 | |
| -	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
 | |
| -	key->r[0] = (get_unaligned_le32(raw_key +  0) >> 0) & 0x3ffffff;
 | |
| -	key->r[1] = (get_unaligned_le32(raw_key +  3) >> 2) & 0x3ffff03;
 | |
| -	key->r[2] = (get_unaligned_le32(raw_key +  6) >> 4) & 0x3ffc0ff;
 | |
| -	key->r[3] = (get_unaligned_le32(raw_key +  9) >> 6) & 0x3f03fff;
 | |
| -	key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
 | |
| -}
 | |
| +struct poly1305_arch_internal {
 | |
| +	union {
 | |
| +		struct {
 | |
| +			u32 h[5];
 | |
| +			u32 is_base2_26;
 | |
| +		};
 | |
| +		u64 hs[3];
 | |
| +	};
 | |
| +	u64 r[2];
 | |
| +	u64 pad;
 | |
| +	struct { u32 r2, r1, r4, r3; } rn[9];
 | |
| +};
 | |
|  
 | |
| -static void poly1305_integer_blocks(struct poly1305_state *state,
 | |
| -				    const struct poly1305_key *key,
 | |
| -				    const void *src,
 | |
| -				    unsigned int nblocks, u32 hibit)
 | |
| +/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
 | |
| + * the unfortunate situation of using AVX and then having to go back to scalar
 | |
| + * -- because the user is silly and has called the update function from two
 | |
| + * separate contexts -- then we need to convert back to the original base before
 | |
| + * proceeding. It is possible to reason that the initial reduction below is
 | |
| + * sufficient given the implementation invariants. However, for an avoidance of
 | |
| + * doubt and because this is not performance critical, we do the full reduction
 | |
| + * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
 | |
| + */
 | |
| +static void convert_to_base2_64(void *ctx)
 | |
|  {
 | |
| -	u32 r0, r1, r2, r3, r4;
 | |
| -	u32 s1, s2, s3, s4;
 | |
| -	u32 h0, h1, h2, h3, h4;
 | |
| -	u64 d0, d1, d2, d3, d4;
 | |
| +	struct poly1305_arch_internal *state = ctx;
 | |
| +	u32 cy;
 | |
|  
 | |
| -	if (!nblocks)
 | |
| +	if (!state->is_base2_26)
 | |
|  		return;
 | |
|  
 | |
| -	r0 = key->r[0];
 | |
| -	r1 = key->r[1];
 | |
| -	r2 = key->r[2];
 | |
| -	r3 = key->r[3];
 | |
| -	r4 = key->r[4];
 | |
| -
 | |
| -	s1 = r1 * 5;
 | |
| -	s2 = r2 * 5;
 | |
| -	s3 = r3 * 5;
 | |
| -	s4 = r4 * 5;
 | |
| -
 | |
| -	h0 = state->h[0];
 | |
| -	h1 = state->h[1];
 | |
| -	h2 = state->h[2];
 | |
| -	h3 = state->h[3];
 | |
| -	h4 = state->h[4];
 | |
| -
 | |
| -	do {
 | |
| -		/* h += m[i] */
 | |
| -		h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
 | |
| -		h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
 | |
| -		h2 += (get_unaligned_le32(src +  6) >> 4) & 0x3ffffff;
 | |
| -		h3 += (get_unaligned_le32(src +  9) >> 6) & 0x3ffffff;
 | |
| -		h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
 | |
| -
 | |
| -		/* h *= r */
 | |
| -		d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
 | |
| -		     mlt(h3, s2) + mlt(h4, s1);
 | |
| -		d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
 | |
| -		     mlt(h3, s3) + mlt(h4, s2);
 | |
| -		d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
 | |
| -		     mlt(h3, s4) + mlt(h4, s3);
 | |
| -		d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
 | |
| -		     mlt(h3, r0) + mlt(h4, s4);
 | |
| -		d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
 | |
| -		     mlt(h3, r1) + mlt(h4, r0);
 | |
| -
 | |
| -		/* (partial) h %= p */
 | |
| -		d1 += sr(d0, 26);     h0 = and(d0, 0x3ffffff);
 | |
| -		d2 += sr(d1, 26);     h1 = and(d1, 0x3ffffff);
 | |
| -		d3 += sr(d2, 26);     h2 = and(d2, 0x3ffffff);
 | |
| -		d4 += sr(d3, 26);     h3 = and(d3, 0x3ffffff);
 | |
| -		h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
 | |
| -		h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;
 | |
| -
 | |
| -		src += POLY1305_BLOCK_SIZE;
 | |
| -	} while (--nblocks);
 | |
| -
 | |
| -	state->h[0] = h0;
 | |
| -	state->h[1] = h1;
 | |
| -	state->h[2] = h2;
 | |
| -	state->h[3] = h3;
 | |
| -	state->h[4] = h4;
 | |
| -}
 | |
| -
 | |
| -static void poly1305_integer_emit(const struct poly1305_state *state, void *dst)
 | |
| -{
 | |
| -	u32 h0, h1, h2, h3, h4;
 | |
| -	u32 g0, g1, g2, g3, g4;
 | |
| -	u32 mask;
 | |
| -
 | |
| -	/* fully carry h */
 | |
| -	h0 = state->h[0];
 | |
| -	h1 = state->h[1];
 | |
| -	h2 = state->h[2];
 | |
| -	h3 = state->h[3];
 | |
| -	h4 = state->h[4];
 | |
| -
 | |
| -	h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
 | |
| -	h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
 | |
| -	h4 += (h3 >> 26);     h3 = h3 & 0x3ffffff;
 | |
| -	h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
 | |
| -	h1 += (h0 >> 26);     h0 = h0 & 0x3ffffff;
 | |
| -
 | |
| -	/* compute h + -p */
 | |
| -	g0 = h0 + 5;
 | |
| -	g1 = h1 + (g0 >> 26);             g0 &= 0x3ffffff;
 | |
| -	g2 = h2 + (g1 >> 26);             g1 &= 0x3ffffff;
 | |
| -	g3 = h3 + (g2 >> 26);             g2 &= 0x3ffffff;
 | |
| -	g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
 | |
| -
 | |
| -	/* select h if h < p, or h + -p if h >= p */
 | |
| -	mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
 | |
| -	g0 &= mask;
 | |
| -	g1 &= mask;
 | |
| -	g2 &= mask;
 | |
| -	g3 &= mask;
 | |
| -	g4 &= mask;
 | |
| -	mask = ~mask;
 | |
| -	h0 = (h0 & mask) | g0;
 | |
| -	h1 = (h1 & mask) | g1;
 | |
| -	h2 = (h2 & mask) | g2;
 | |
| -	h3 = (h3 & mask) | g3;
 | |
| -	h4 = (h4 & mask) | g4;
 | |
| -
 | |
| -	/* h = h % (2^128) */
 | |
| -	put_unaligned_le32((h0 >>  0) | (h1 << 26), dst +  0);
 | |
| -	put_unaligned_le32((h1 >>  6) | (h2 << 20), dst +  4);
 | |
| -	put_unaligned_le32((h2 >> 12) | (h3 << 14), dst +  8);
 | |
| -	put_unaligned_le32((h3 >> 18) | (h4 <<  8), dst + 12);
 | |
| -}
 | |
| -
 | |
| -void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
 | |
| -{
 | |
| -	poly1305_integer_setkey(desc->opaque_r, key);
 | |
| -	desc->s[0] = get_unaligned_le32(key + 16);
 | |
| -	desc->s[1] = get_unaligned_le32(key + 20);
 | |
| -	desc->s[2] = get_unaligned_le32(key + 24);
 | |
| -	desc->s[3] = get_unaligned_le32(key + 28);
 | |
| -	poly1305_core_init(&desc->h);
 | |
| -	desc->buflen = 0;
 | |
| -	desc->sset = true;
 | |
| -	desc->rset = 1;
 | |
| -}
 | |
| -EXPORT_SYMBOL_GPL(poly1305_init_arch);
 | |
| -
 | |
| -static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
 | |
| -					       const u8 *src, unsigned int srclen)
 | |
| -{
 | |
| -	if (!dctx->sset) {
 | |
| -		if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
 | |
| -			poly1305_integer_setkey(dctx->r, src);
 | |
| -			src += POLY1305_BLOCK_SIZE;
 | |
| -			srclen -= POLY1305_BLOCK_SIZE;
 | |
| -			dctx->rset = 1;
 | |
| -		}
 | |
| -		if (srclen >= POLY1305_BLOCK_SIZE) {
 | |
| -			dctx->s[0] = get_unaligned_le32(src +  0);
 | |
| -			dctx->s[1] = get_unaligned_le32(src +  4);
 | |
| -			dctx->s[2] = get_unaligned_le32(src +  8);
 | |
| -			dctx->s[3] = get_unaligned_le32(src + 12);
 | |
| -			src += POLY1305_BLOCK_SIZE;
 | |
| -			srclen -= POLY1305_BLOCK_SIZE;
 | |
| -			dctx->sset = true;
 | |
| -		}
 | |
| +	cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
 | |
| +	cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
 | |
| +	cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
 | |
| +	cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
 | |
| +	state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
 | |
| +	state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
 | |
| +	state->hs[2] = state->h[4] >> 24;
 | |
| +#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
 | |
| +	cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
 | |
| +	state->hs[2] &= 3;
 | |
| +	state->hs[0] += cy;
 | |
| +	state->hs[1] += (cy = ULT(state->hs[0], cy));
 | |
| +	state->hs[2] += ULT(state->hs[1], cy);
 | |
| +#undef ULT
 | |
| +	state->is_base2_26 = 0;
 | |
| +}
 | |
| +
 | |
| +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
 | |
| +{
 | |
| +	poly1305_init_x86_64(ctx, key);
 | |
| +}
 | |
| +
 | |
| +static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
 | |
| +				 const u32 padbit)
 | |
| +{
 | |
| +	struct poly1305_arch_internal *state = ctx;
 | |
| +
 | |
| +	/* SIMD disables preemption, so relax after processing each page. */
 | |
| +	BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
 | |
| +		     PAGE_SIZE % POLY1305_BLOCK_SIZE);
 | |
| +
 | |
| +	if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
 | |
| +	    (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
 | |
| +	    !crypto_simd_usable()) {
 | |
| +		convert_to_base2_64(ctx);
 | |
| +		poly1305_blocks_x86_64(ctx, inp, len, padbit);
 | |
| +		return;
 | |
|  	}
 | |
| -	return srclen;
 | |
| -}
 | |
|  
 | |
| -static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
 | |
| -					   const u8 *src, unsigned int srclen)
 | |
| -{
 | |
| -	unsigned int datalen;
 | |
| +	for (;;) {
 | |
| +		const size_t bytes = min_t(size_t, len, PAGE_SIZE);
 | |
|  
 | |
| -	if (unlikely(!dctx->sset)) {
 | |
| -		datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
 | |
| -		src += srclen - datalen;
 | |
| -		srclen = datalen;
 | |
| -	}
 | |
| -	if (srclen >= POLY1305_BLOCK_SIZE) {
 | |
| -		poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src,
 | |
| -					srclen / POLY1305_BLOCK_SIZE, 1);
 | |
| -		srclen %= POLY1305_BLOCK_SIZE;
 | |
| +		kernel_fpu_begin();
 | |
| +		if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
 | |
| +			poly1305_blocks_avx512(ctx, inp, bytes, padbit);
 | |
| +		else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
 | |
| +			poly1305_blocks_avx2(ctx, inp, bytes, padbit);
 | |
| +		else
 | |
| +			poly1305_blocks_avx(ctx, inp, bytes, padbit);
 | |
| +		kernel_fpu_end();
 | |
| +		len -= bytes;
 | |
| +		if (!len)
 | |
| +			break;
 | |
| +		inp += bytes;
 | |
|  	}
 | |
| -	return srclen;
 | |
|  }
 | |
|  
 | |
| -static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
 | |
| -					 const u8 *src, unsigned int srclen)
 | |
| -{
 | |
| -	unsigned int blocks, datalen;
 | |
| +static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
 | |
| +			       const u32 nonce[4])
 | |
| +{
 | |
| +	struct poly1305_arch_internal *state = ctx;
 | |
| +
 | |
| +	if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
 | |
| +	    !state->is_base2_26 || !crypto_simd_usable()) {
 | |
| +		convert_to_base2_64(ctx);
 | |
| +		poly1305_emit_x86_64(ctx, mac, nonce);
 | |
| +	} else
 | |
| +		poly1305_emit_avx(ctx, mac, nonce);
 | |
| +}
 | |
| +
 | |
| +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
 | |
| +{
 | |
| +	poly1305_simd_init(&dctx->h, key);
 | |
| +	dctx->s[0] = get_unaligned_le32(&key[16]);
 | |
| +	dctx->s[1] = get_unaligned_le32(&key[20]);
 | |
| +	dctx->s[2] = get_unaligned_le32(&key[24]);
 | |
| +	dctx->s[3] = get_unaligned_le32(&key[28]);
 | |
| +	dctx->buflen = 0;
 | |
| +	dctx->sset = true;
 | |
| +}
 | |
| +EXPORT_SYMBOL(poly1305_init_arch);
 | |
|  
 | |
| +static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
 | |
| +					       const u8 *inp, unsigned int len)
 | |
| +{
 | |
| +	unsigned int acc = 0;
 | |
|  	if (unlikely(!dctx->sset)) {
 | |
| -		datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
 | |
| -		src += srclen - datalen;
 | |
| -		srclen = datalen;
 | |
| -	}
 | |
| -
 | |
| -	if (IS_ENABLED(CONFIG_AS_AVX2) &&
 | |
| -	    static_branch_likely(&poly1305_use_avx2) &&
 | |
| -	    srclen >= POLY1305_BLOCK_SIZE * 4) {
 | |
| -		if (unlikely(dctx->rset < 4)) {
 | |
| -			if (dctx->rset < 2) {
 | |
| -				dctx->r[1] = dctx->r[0];
 | |
| -				poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
 | |
| -			}
 | |
| -			dctx->r[2] = dctx->r[1];
 | |
| -			poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r);
 | |
| -			dctx->r[3] = dctx->r[2];
 | |
| -			poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r);
 | |
| -			dctx->rset = 4;
 | |
| +		if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
 | |
| +			poly1305_simd_init(&dctx->h, inp);
 | |
| +			inp += POLY1305_BLOCK_SIZE;
 | |
| +			len -= POLY1305_BLOCK_SIZE;
 | |
| +			acc += POLY1305_BLOCK_SIZE;
 | |
| +			dctx->rset = 1;
 | |
|  		}
 | |
| -		blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
 | |
| -		poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks,
 | |
| -				     dctx->r[1].r);
 | |
| -		src += POLY1305_BLOCK_SIZE * 4 * blocks;
 | |
| -		srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
 | |
| -	}
 | |
| -
 | |
| -	if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
 | |
| -		if (unlikely(dctx->rset < 2)) {
 | |
| -			dctx->r[1] = dctx->r[0];
 | |
| -			poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
 | |
| -			dctx->rset = 2;
 | |
| +		if (len >= POLY1305_BLOCK_SIZE) {
 | |
| +			dctx->s[0] = get_unaligned_le32(&inp[0]);
 | |
| +			dctx->s[1] = get_unaligned_le32(&inp[4]);
 | |
| +			dctx->s[2] = get_unaligned_le32(&inp[8]);
 | |
| +			dctx->s[3] = get_unaligned_le32(&inp[12]);
 | |
| +			inp += POLY1305_BLOCK_SIZE;
 | |
| +			len -= POLY1305_BLOCK_SIZE;
 | |
| +			acc += POLY1305_BLOCK_SIZE;
 | |
| +			dctx->sset = true;
 | |
|  		}
 | |
| -		blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
 | |
| -		poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r,
 | |
| -				     blocks, dctx->r[1].r);
 | |
| -		src += POLY1305_BLOCK_SIZE * 2 * blocks;
 | |
| -		srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
 | |
| -	}
 | |
| -	if (srclen >= POLY1305_BLOCK_SIZE) {
 | |
| -		poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1);
 | |
| -		srclen -= POLY1305_BLOCK_SIZE;
 | |
|  	}
 | |
| -	return srclen;
 | |
| +	return acc;
 | |
|  }
 | |
|  
 | |
|  void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
 | |
|  			  unsigned int srclen)
 | |
|  {
 | |
| -	unsigned int bytes;
 | |
| +	unsigned int bytes, used;
 | |
|  
 | |
|  	if (unlikely(dctx->buflen)) {
 | |
|  		bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
 | |
| @@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly130
 | |
|  		dctx->buflen += bytes;
 | |
|  
 | |
|  		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
 | |
| -			if (static_branch_likely(&poly1305_use_simd) &&
 | |
| -			    likely(crypto_simd_usable())) {
 | |
| -				kernel_fpu_begin();
 | |
| -				poly1305_simd_blocks(dctx, dctx->buf,
 | |
| -						     POLY1305_BLOCK_SIZE);
 | |
| -				kernel_fpu_end();
 | |
| -			} else {
 | |
| -				poly1305_scalar_blocks(dctx, dctx->buf,
 | |
| -						       POLY1305_BLOCK_SIZE);
 | |
| -			}
 | |
| +			if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
 | |
| +				poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
 | |
|  			dctx->buflen = 0;
 | |
|  		}
 | |
|  	}
 | |
|  
 | |
|  	if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
 | |
| -		if (static_branch_likely(&poly1305_use_simd) &&
 | |
| -		    likely(crypto_simd_usable())) {
 | |
| -			kernel_fpu_begin();
 | |
| -			bytes = poly1305_simd_blocks(dctx, src, srclen);
 | |
| -			kernel_fpu_end();
 | |
| -		} else {
 | |
| -			bytes = poly1305_scalar_blocks(dctx, src, srclen);
 | |
| -		}
 | |
| -		src += srclen - bytes;
 | |
| -		srclen = bytes;
 | |
| +		bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
 | |
| +		srclen -= bytes;
 | |
| +		used = crypto_poly1305_setdctxkey(dctx, src, bytes);
 | |
| +		if (likely(bytes - used))
 | |
| +			poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
 | |
| +		src += bytes;
 | |
|  	}
 | |
|  
 | |
|  	if (unlikely(srclen)) {
 | |
| @@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly130
 | |
|  }
 | |
|  EXPORT_SYMBOL(poly1305_update_arch);
 | |
|  
 | |
| -void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst)
 | |
| +void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
 | |
|  {
 | |
| -	__le32 digest[4];
 | |
| -	u64 f = 0;
 | |
| -
 | |
| -	if (unlikely(desc->buflen)) {
 | |
| -		desc->buf[desc->buflen++] = 1;
 | |
| -		memset(desc->buf + desc->buflen, 0,
 | |
| -		       POLY1305_BLOCK_SIZE - desc->buflen);
 | |
| -		poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0);
 | |
| +	if (unlikely(dctx->buflen)) {
 | |
| +		dctx->buf[dctx->buflen++] = 1;
 | |
| +		memset(dctx->buf + dctx->buflen, 0,
 | |
| +		       POLY1305_BLOCK_SIZE - dctx->buflen);
 | |
| +		poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
 | |
|  	}
 | |
|  
 | |
| -	poly1305_integer_emit(&desc->h, digest);
 | |
| -
 | |
| -	/* mac = (h + s) % (2^128) */
 | |
| -	f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
 | |
| -	put_unaligned_le32(f, dst + 0);
 | |
| -	f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
 | |
| -	put_unaligned_le32(f, dst + 4);
 | |
| -	f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
 | |
| -	put_unaligned_le32(f, dst + 8);
 | |
| -	f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
 | |
| -	put_unaligned_le32(f, dst + 12);
 | |
| -
 | |
| -	*desc = (struct poly1305_desc_ctx){};
 | |
| +	poly1305_simd_emit(&dctx->h, dst, dctx->s);
 | |
| +	*dctx = (struct poly1305_desc_ctx){};
 | |
|  }
 | |
|  EXPORT_SYMBOL(poly1305_final_arch);
 | |
|  
 | |
| @@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct s
 | |
|  {
 | |
|  	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 | |
|  
 | |
| -	poly1305_core_init(&dctx->h);
 | |
| -	dctx->buflen = 0;
 | |
| -	dctx->rset = 0;
 | |
| -	dctx->sset = false;
 | |
| -
 | |
| +	*dctx = (struct poly1305_desc_ctx){};
 | |
|  	return 0;
 | |
|  }
 | |
|  
 | |
| -static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
 | |
| +static int crypto_poly1305_update(struct shash_desc *desc,
 | |
| +				  const u8 *src, unsigned int srclen)
 | |
|  {
 | |
|  	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 | |
|  
 | |
| -	if (unlikely(!dctx->sset))
 | |
| -		return -ENOKEY;
 | |
| -
 | |
| -	poly1305_final_arch(dctx, dst);
 | |
| +	poly1305_update_arch(dctx, src, srclen);
 | |
|  	return 0;
 | |
|  }
 | |
|  
 | |
| -static int poly1305_simd_update(struct shash_desc *desc,
 | |
| -				const u8 *src, unsigned int srclen)
 | |
| +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
 | |
|  {
 | |
|  	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 | |
|  
 | |
| -	poly1305_update_arch(dctx, src, srclen);
 | |
| +	if (unlikely(!dctx->sset))
 | |
| +		return -ENOKEY;
 | |
| +
 | |
| +	poly1305_final_arch(dctx, dst);
 | |
|  	return 0;
 | |
|  }
 | |
|  
 | |
|  static struct shash_alg alg = {
 | |
|  	.digestsize	= POLY1305_DIGEST_SIZE,
 | |
|  	.init		= crypto_poly1305_init,
 | |
| -	.update		= poly1305_simd_update,
 | |
| +	.update		= crypto_poly1305_update,
 | |
|  	.final		= crypto_poly1305_final,
 | |
|  	.descsize	= sizeof(struct poly1305_desc_ctx),
 | |
|  	.base		= {
 | |
| @@ -406,17 +265,19 @@ static struct shash_alg alg = {
 | |
|  
 | |
|  static int __init poly1305_simd_mod_init(void)
 | |
|  {
 | |
| -	if (!boot_cpu_has(X86_FEATURE_XMM2))
 | |
| -		return 0;
 | |
| -
 | |
| -	static_branch_enable(&poly1305_use_simd);
 | |
| -
 | |
| -	if (IS_ENABLED(CONFIG_AS_AVX2) &&
 | |
| -	    boot_cpu_has(X86_FEATURE_AVX) &&
 | |
| +	if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
 | |
| +	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
 | |
| +		static_branch_enable(&poly1305_use_avx);
 | |
| +	if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
 | |
|  	    boot_cpu_has(X86_FEATURE_AVX2) &&
 | |
|  	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
 | |
|  		static_branch_enable(&poly1305_use_avx2);
 | |
| -
 | |
| +	if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
 | |
| +	    boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
 | |
| +	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
 | |
| +	    /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
 | |
| +	    boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
 | |
| +		static_branch_enable(&poly1305_use_avx512);
 | |
|  	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
 | |
|  }
 | |
|  
 | |
| @@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init);
 | |
|  module_exit(poly1305_simd_mod_exit);
 | |
|  
 | |
|  MODULE_LICENSE("GPL");
 | |
| -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
 | |
| +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
 | |
|  MODULE_DESCRIPTION("Poly1305 authenticator");
 | |
|  MODULE_ALIAS_CRYPTO("poly1305");
 | |
|  MODULE_ALIAS_CRYPTO("poly1305-simd");
 | |
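Note on the base-2^26 handling in the glue code above: the AVX paths keep the Poly1305 accumulator in five 26-bit limbs, while the scalar poly1305_blocks_x86_64() path works in base 2^64, which is why convert_to_base2_64() repacks the state before any fallback. As a rough, standalone illustration of that repacking (a sketch only, not the kernel code; limb names are invented, and it omits the final modular folding of the top word that the real helper performs), the recombination looks like this:

    #include <stdint.h>

    /* Sketch: recombine a base-2^26 Poly1305 accumulator h[0..4] into
     * base-2^64 words hs[0..2], mirroring the packing done by
     * convert_to_base2_64() in the hunk above. */
    static void base2_26_to_2_64(const uint32_t h[5], uint64_t hs[3])
    {
        uint32_t t[5], cy;
        int i;

        for (i = 0; i < 5; i++)
            t[i] = h[i];

        /* Propagate carries so the low four limbs fit in 26 bits;
         * the top limb keeps whatever overflow remains. */
        for (i = 0; i < 4; i++) {
            cy = t[i] >> 26;
            t[i] &= 0x3ffffff;
            t[i + 1] += cy;
        }

        /* h = t0 + t1*2^26 + t2*2^52 + t3*2^78 + t4*2^104, laid out as
         * bits 0..63, 64..127 and 128.. of the value. */
        hs[0] = ((uint64_t)t[2] << 52) | ((uint64_t)t[1] << 26) | t[0];
        hs[1] = ((uint64_t)t[4] << 40) | ((uint64_t)t[3] << 14) | (t[2] >> 12);
        hs[2] = t[4] >> 24;
    }

The update path is also careful about latency: poly1305_simd_blocks() processes at most one page of input per kernel_fpu_begin()/kernel_fpu_end() pair, so long messages do not keep preemption disabled for the duration of the whole call.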
| --- a/lib/crypto/Kconfig
 | |
| +++ b/lib/crypto/Kconfig
 | |
| @@ -90,7 +90,7 @@ config CRYPTO_LIB_DES
 | |
|  config CRYPTO_LIB_POLY1305_RSIZE
 | |
|  	int
 | |
|  	default 2 if MIPS
 | |
| -	default 4 if X86_64
 | |
| +	default 11 if X86_64
 | |
|  	default 9 if ARM || ARM64
 | |
|  	default 1
 | |
|  
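For completeness, the interface this backport exposes to the rest of the kernel is the usual lib/crypto Poly1305 one: poly1305_init_arch(), poly1305_update_arch() and poly1305_final_arch(), exported from the glue code above. A hypothetical one-shot caller (illustration only; in-tree users normally reach these through the generic poly1305_init()/poly1305_update()/poly1305_final() wrappers, which dispatch to the arch versions when they are built in) would look like:

    #include <crypto/poly1305.h>

    /* Illustration only: compute a one-shot Poly1305 tag using the arch
     * entry points exported above.  poly1305_final_arch() also wipes the
     * descriptor context when it is done. */
    static void poly1305_tag_example(const u8 key[POLY1305_KEY_SIZE],
                                     const u8 *msg, unsigned int len,
                                     u8 tag[POLY1305_DIGEST_SIZE])
    {
        struct poly1305_desc_ctx ctx;

        poly1305_init_arch(&ctx, key);
        poly1305_update_arch(&ctx, msg, len);
        poly1305_final_arch(&ctx, tag);
    }

The lib/crypto/Kconfig hunk raises CRYPTO_LIB_POLY1305_RSIZE on x86_64 because that option sizes the opaque per-descriptor key area in struct poly1305_desc_ctx, and the new implementation's struct poly1305_arch_internal, with its precomputed powers of r, needs more room there than the old SSE2/AVX2 code did.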