Diffstat (limited to 'lib/crypto/arm64')

 lib/crypto/arm64/.gitignore         |   4
 lib/crypto/arm64/Kconfig            |  14
 lib/crypto/arm64/Makefile           |  17
 lib/crypto/arm64/chacha-neon-core.S | 805
 lib/crypto/arm64/chacha-neon-glue.c | 119
 lib/crypto/arm64/poly1305-armv8.pl  | 917
 lib/crypto/arm64/poly1305-glue.c    |  74
 lib/crypto/arm64/sha1-ce-core.S     | 130
 lib/crypto/arm64/sha1.h             |  39
 lib/crypto/arm64/sha2-armv8.pl      | 786
 lib/crypto/arm64/sha256-ce.S        | 136
 lib/crypto/arm64/sha256.h           |  57
 lib/crypto/arm64/sha512-ce-core.S   | 197
 lib/crypto/arm64/sha512.h           |  46
 14 files changed, 3341 insertions(+), 0 deletions(-)
diff --git a/lib/crypto/arm64/.gitignore b/lib/crypto/arm64/.gitignore new file mode 100644 index 000000000000..f6c4e8ef80da --- /dev/null +++ b/lib/crypto/arm64/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-core.S +sha256-core.S +sha512-core.S diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig new file mode 100644 index 000000000000..0b903ef524d8 --- /dev/null +++ b/lib/crypto/arm64/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA20_NEON +	tristate +	depends on KERNEL_MODE_NEON +	default CRYPTO_LIB_CHACHA +	select CRYPTO_LIB_CHACHA_GENERIC +	select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_NEON +	tristate +	depends on KERNEL_MODE_NEON +	default CRYPTO_LIB_POLY1305 +	select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile new file mode 100644 index 000000000000..6207088397a7 --- /dev/null +++ b/lib/crypto/arm64/Makefile @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o + +obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o +poly1305-neon-y := poly1305-core.o poly1305-glue.o +AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch +AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch + +quiet_cmd_perlasm = PERLASM $@ +      cmd_perlasm = $(PERL) $(<) void $(@) + +$(obj)/%-core.S: $(src)/%-armv8.pl +	$(call cmd,perlasm) + +clean-files += poly1305-core.S diff --git a/lib/crypto/arm64/chacha-neon-core.S b/lib/crypto/arm64/chacha-neon-core.S new file mode 100644 index 000000000000..80079586ecc7 --- /dev/null +++ b/lib/crypto/arm64/chacha-neon-core.S @@ -0,0 +1,805 @@ +/* + * ChaCha/HChaCha NEON helper functions + * + * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Originally based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/cache.h> + +	.text +	.align		6 + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers v0-v3.  It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * The round count is given in w3. 
+ * + * Clobbers: w3, x10, v4, v12 + */ +SYM_FUNC_START_LOCAL(chacha_permute) + +	adr_l		x10, ROT8 +	ld1		{v12.4s}, [x10] + +.Ldoubleround: +	// x0 += x1, x3 = rotl32(x3 ^ x0, 16) +	add		v0.4s, v0.4s, v1.4s +	eor		v3.16b, v3.16b, v0.16b +	rev32		v3.8h, v3.8h + +	// x2 += x3, x1 = rotl32(x1 ^ x2, 12) +	add		v2.4s, v2.4s, v3.4s +	eor		v4.16b, v1.16b, v2.16b +	shl		v1.4s, v4.4s, #12 +	sri		v1.4s, v4.4s, #20 + +	// x0 += x1, x3 = rotl32(x3 ^ x0, 8) +	add		v0.4s, v0.4s, v1.4s +	eor		v3.16b, v3.16b, v0.16b +	tbl		v3.16b, {v3.16b}, v12.16b + +	// x2 += x3, x1 = rotl32(x1 ^ x2, 7) +	add		v2.4s, v2.4s, v3.4s +	eor		v4.16b, v1.16b, v2.16b +	shl		v1.4s, v4.4s, #7 +	sri		v1.4s, v4.4s, #25 + +	// x1 = shuffle32(x1, MASK(0, 3, 2, 1)) +	ext		v1.16b, v1.16b, v1.16b, #4 +	// x2 = shuffle32(x2, MASK(1, 0, 3, 2)) +	ext		v2.16b, v2.16b, v2.16b, #8 +	// x3 = shuffle32(x3, MASK(2, 1, 0, 3)) +	ext		v3.16b, v3.16b, v3.16b, #12 + +	// x0 += x1, x3 = rotl32(x3 ^ x0, 16) +	add		v0.4s, v0.4s, v1.4s +	eor		v3.16b, v3.16b, v0.16b +	rev32		v3.8h, v3.8h + +	// x2 += x3, x1 = rotl32(x1 ^ x2, 12) +	add		v2.4s, v2.4s, v3.4s +	eor		v4.16b, v1.16b, v2.16b +	shl		v1.4s, v4.4s, #12 +	sri		v1.4s, v4.4s, #20 + +	// x0 += x1, x3 = rotl32(x3 ^ x0, 8) +	add		v0.4s, v0.4s, v1.4s +	eor		v3.16b, v3.16b, v0.16b +	tbl		v3.16b, {v3.16b}, v12.16b + +	// x2 += x3, x1 = rotl32(x1 ^ x2, 7) +	add		v2.4s, v2.4s, v3.4s +	eor		v4.16b, v1.16b, v2.16b +	shl		v1.4s, v4.4s, #7 +	sri		v1.4s, v4.4s, #25 + +	// x1 = shuffle32(x1, MASK(2, 1, 0, 3)) +	ext		v1.16b, v1.16b, v1.16b, #12 +	// x2 = shuffle32(x2, MASK(1, 0, 3, 2)) +	ext		v2.16b, v2.16b, v2.16b, #8 +	// x3 = shuffle32(x3, MASK(0, 3, 2, 1)) +	ext		v3.16b, v3.16b, v3.16b, #4 + +	subs		w3, w3, #2 +	b.ne		.Ldoubleround + +	ret +SYM_FUNC_END(chacha_permute) + +SYM_FUNC_START(chacha_block_xor_neon) +	// x0: Input state matrix, s +	// x1: 1 data block output, o +	// x2: 1 data block input, i +	// w3: nrounds + +	stp		x29, x30, [sp, #-16]! +	mov		x29, sp + +	// x0..3 = s0..3 +	ld1		{v0.4s-v3.4s}, [x0] +	ld1		{v8.4s-v11.4s}, [x0] + +	bl		chacha_permute + +	ld1		{v4.16b-v7.16b}, [x2] + +	// o0 = i0 ^ (x0 + s0) +	add		v0.4s, v0.4s, v8.4s +	eor		v0.16b, v0.16b, v4.16b + +	// o1 = i1 ^ (x1 + s1) +	add		v1.4s, v1.4s, v9.4s +	eor		v1.16b, v1.16b, v5.16b + +	// o2 = i2 ^ (x2 + s2) +	add		v2.4s, v2.4s, v10.4s +	eor		v2.16b, v2.16b, v6.16b + +	// o3 = i3 ^ (x3 + s3) +	add		v3.4s, v3.4s, v11.4s +	eor		v3.16b, v3.16b, v7.16b + +	st1		{v0.16b-v3.16b}, [x1] + +	ldp		x29, x30, [sp], #16 +	ret +SYM_FUNC_END(chacha_block_xor_neon) + +SYM_FUNC_START(hchacha_block_neon) +	// x0: Input state matrix, s +	// x1: output (8 32-bit words) +	// w2: nrounds + +	stp		x29, x30, [sp, #-16]! +	mov		x29, sp + +	ld1		{v0.4s-v3.4s}, [x0] + +	mov		w3, w2 +	bl		chacha_permute + +	st1		{v0.4s}, [x1], #16 +	st1		{v3.4s}, [x1] + +	ldp		x29, x30, [sp], #16 +	ret +SYM_FUNC_END(hchacha_block_neon) + +	a0		.req	w12 +	a1		.req	w13 +	a2		.req	w14 +	a3		.req	w15 +	a4		.req	w16 +	a5		.req	w17 +	a6		.req	w19 +	a7		.req	w20 +	a8		.req	w21 +	a9		.req	w22 +	a10		.req	w23 +	a11		.req	w24 +	a12		.req	w25 +	a13		.req	w26 +	a14		.req	w27 +	a15		.req	w28 + +	.align		6 +SYM_FUNC_START(chacha_4block_xor_neon) +	frame_push	10 + +	// x0: Input state matrix, s +	// x1: 4 data blocks output, o +	// x2: 4 data blocks input, i +	// w3: nrounds +	// x4: byte count + +	adr_l		x10, .Lpermute +	and		x5, x4, #63 +	add		x10, x10, x5 + +	// +	// This function encrypts four consecutive ChaCha blocks by loading +	// the state matrix in NEON registers four times. 
The algorithm performs +	// each operation on the corresponding word of each state matrix, hence +	// requires no word shuffling. For final XORing step we transpose the +	// matrix by interleaving 32- and then 64-bit words, which allows us to +	// do XOR in NEON registers. +	// +	// At the same time, a fifth block is encrypted in parallel using +	// scalar registers +	// +	adr_l		x9, CTRINC		// ... and ROT8 +	ld1		{v30.4s-v31.4s}, [x9] + +	// x0..15[0-3] = s0..3[0..3] +	add		x8, x0, #16 +	ld4r		{ v0.4s- v3.4s}, [x0] +	ld4r		{ v4.4s- v7.4s}, [x8], #16 +	ld4r		{ v8.4s-v11.4s}, [x8], #16 +	ld4r		{v12.4s-v15.4s}, [x8] + +	mov		a0, v0.s[0] +	mov		a1, v1.s[0] +	mov		a2, v2.s[0] +	mov		a3, v3.s[0] +	mov		a4, v4.s[0] +	mov		a5, v5.s[0] +	mov		a6, v6.s[0] +	mov		a7, v7.s[0] +	mov		a8, v8.s[0] +	mov		a9, v9.s[0] +	mov		a10, v10.s[0] +	mov		a11, v11.s[0] +	mov		a12, v12.s[0] +	mov		a13, v13.s[0] +	mov		a14, v14.s[0] +	mov		a15, v15.s[0] + +	// x12 += counter values 1-4 +	add		v12.4s, v12.4s, v30.4s + +.Ldoubleround4: +	// x0 += x4, x12 = rotl32(x12 ^ x0, 16) +	// x1 += x5, x13 = rotl32(x13 ^ x1, 16) +	// x2 += x6, x14 = rotl32(x14 ^ x2, 16) +	// x3 += x7, x15 = rotl32(x15 ^ x3, 16) +	add		v0.4s, v0.4s, v4.4s +	  add		a0, a0, a4 +	add		v1.4s, v1.4s, v5.4s +	  add		a1, a1, a5 +	add		v2.4s, v2.4s, v6.4s +	  add		a2, a2, a6 +	add		v3.4s, v3.4s, v7.4s +	  add		a3, a3, a7 + +	eor		v12.16b, v12.16b, v0.16b +	  eor		a12, a12, a0 +	eor		v13.16b, v13.16b, v1.16b +	  eor		a13, a13, a1 +	eor		v14.16b, v14.16b, v2.16b +	  eor		a14, a14, a2 +	eor		v15.16b, v15.16b, v3.16b +	  eor		a15, a15, a3 + +	rev32		v12.8h, v12.8h +	  ror		a12, a12, #16 +	rev32		v13.8h, v13.8h +	  ror		a13, a13, #16 +	rev32		v14.8h, v14.8h +	  ror		a14, a14, #16 +	rev32		v15.8h, v15.8h +	  ror		a15, a15, #16 + +	// x8 += x12, x4 = rotl32(x4 ^ x8, 12) +	// x9 += x13, x5 = rotl32(x5 ^ x9, 12) +	// x10 += x14, x6 = rotl32(x6 ^ x10, 12) +	// x11 += x15, x7 = rotl32(x7 ^ x11, 12) +	add		v8.4s, v8.4s, v12.4s +	  add		a8, a8, a12 +	add		v9.4s, v9.4s, v13.4s +	  add		a9, a9, a13 +	add		v10.4s, v10.4s, v14.4s +	  add		a10, a10, a14 +	add		v11.4s, v11.4s, v15.4s +	  add		a11, a11, a15 + +	eor		v16.16b, v4.16b, v8.16b +	  eor		a4, a4, a8 +	eor		v17.16b, v5.16b, v9.16b +	  eor		a5, a5, a9 +	eor		v18.16b, v6.16b, v10.16b +	  eor		a6, a6, a10 +	eor		v19.16b, v7.16b, v11.16b +	  eor		a7, a7, a11 + +	shl		v4.4s, v16.4s, #12 +	shl		v5.4s, v17.4s, #12 +	shl		v6.4s, v18.4s, #12 +	shl		v7.4s, v19.4s, #12 + +	sri		v4.4s, v16.4s, #20 +	  ror		a4, a4, #20 +	sri		v5.4s, v17.4s, #20 +	  ror		a5, a5, #20 +	sri		v6.4s, v18.4s, #20 +	  ror		a6, a6, #20 +	sri		v7.4s, v19.4s, #20 +	  ror		a7, a7, #20 + +	// x0 += x4, x12 = rotl32(x12 ^ x0, 8) +	// x1 += x5, x13 = rotl32(x13 ^ x1, 8) +	// x2 += x6, x14 = rotl32(x14 ^ x2, 8) +	// x3 += x7, x15 = rotl32(x15 ^ x3, 8) +	add		v0.4s, v0.4s, v4.4s +	  add		a0, a0, a4 +	add		v1.4s, v1.4s, v5.4s +	  add		a1, a1, a5 +	add		v2.4s, v2.4s, v6.4s +	  add		a2, a2, a6 +	add		v3.4s, v3.4s, v7.4s +	  add		a3, a3, a7 + +	eor		v12.16b, v12.16b, v0.16b +	  eor		a12, a12, a0 +	eor		v13.16b, v13.16b, v1.16b +	  eor		a13, a13, a1 +	eor		v14.16b, v14.16b, v2.16b +	  eor		a14, a14, a2 +	eor		v15.16b, v15.16b, v3.16b +	  eor		a15, a15, a3 + +	tbl		v12.16b, {v12.16b}, v31.16b +	  ror		a12, a12, #24 +	tbl		v13.16b, {v13.16b}, v31.16b +	  ror		a13, a13, #24 +	tbl		v14.16b, {v14.16b}, v31.16b +	  ror		a14, a14, #24 +	tbl		v15.16b, {v15.16b}, v31.16b +	  ror		a15, a15, #24 + +	// x8 += x12, x4 = rotl32(x4 ^ x8, 7) +	// x9 += x13, x5 = rotl32(x5 ^ x9, 7) 
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 7) +	// x11 += x15, x7 = rotl32(x7 ^ x11, 7) +	add		v8.4s, v8.4s, v12.4s +	  add		a8, a8, a12 +	add		v9.4s, v9.4s, v13.4s +	  add		a9, a9, a13 +	add		v10.4s, v10.4s, v14.4s +	  add		a10, a10, a14 +	add		v11.4s, v11.4s, v15.4s +	  add		a11, a11, a15 + +	eor		v16.16b, v4.16b, v8.16b +	  eor		a4, a4, a8 +	eor		v17.16b, v5.16b, v9.16b +	  eor		a5, a5, a9 +	eor		v18.16b, v6.16b, v10.16b +	  eor		a6, a6, a10 +	eor		v19.16b, v7.16b, v11.16b +	  eor		a7, a7, a11 + +	shl		v4.4s, v16.4s, #7 +	shl		v5.4s, v17.4s, #7 +	shl		v6.4s, v18.4s, #7 +	shl		v7.4s, v19.4s, #7 + +	sri		v4.4s, v16.4s, #25 +	  ror		a4, a4, #25 +	sri		v5.4s, v17.4s, #25 +	  ror		a5, a5, #25 +	sri		v6.4s, v18.4s, #25 +	 ror		a6, a6, #25 +	sri		v7.4s, v19.4s, #25 +	  ror		a7, a7, #25 + +	// x0 += x5, x15 = rotl32(x15 ^ x0, 16) +	// x1 += x6, x12 = rotl32(x12 ^ x1, 16) +	// x2 += x7, x13 = rotl32(x13 ^ x2, 16) +	// x3 += x4, x14 = rotl32(x14 ^ x3, 16) +	add		v0.4s, v0.4s, v5.4s +	  add		a0, a0, a5 +	add		v1.4s, v1.4s, v6.4s +	  add		a1, a1, a6 +	add		v2.4s, v2.4s, v7.4s +	  add		a2, a2, a7 +	add		v3.4s, v3.4s, v4.4s +	  add		a3, a3, a4 + +	eor		v15.16b, v15.16b, v0.16b +	  eor		a15, a15, a0 +	eor		v12.16b, v12.16b, v1.16b +	  eor		a12, a12, a1 +	eor		v13.16b, v13.16b, v2.16b +	  eor		a13, a13, a2 +	eor		v14.16b, v14.16b, v3.16b +	  eor		a14, a14, a3 + +	rev32		v15.8h, v15.8h +	  ror		a15, a15, #16 +	rev32		v12.8h, v12.8h +	  ror		a12, a12, #16 +	rev32		v13.8h, v13.8h +	  ror		a13, a13, #16 +	rev32		v14.8h, v14.8h +	  ror		a14, a14, #16 + +	// x10 += x15, x5 = rotl32(x5 ^ x10, 12) +	// x11 += x12, x6 = rotl32(x6 ^ x11, 12) +	// x8 += x13, x7 = rotl32(x7 ^ x8, 12) +	// x9 += x14, x4 = rotl32(x4 ^ x9, 12) +	add		v10.4s, v10.4s, v15.4s +	  add		a10, a10, a15 +	add		v11.4s, v11.4s, v12.4s +	  add		a11, a11, a12 +	add		v8.4s, v8.4s, v13.4s +	  add		a8, a8, a13 +	add		v9.4s, v9.4s, v14.4s +	  add		a9, a9, a14 + +	eor		v16.16b, v5.16b, v10.16b +	  eor		a5, a5, a10 +	eor		v17.16b, v6.16b, v11.16b +	  eor		a6, a6, a11 +	eor		v18.16b, v7.16b, v8.16b +	  eor		a7, a7, a8 +	eor		v19.16b, v4.16b, v9.16b +	  eor		a4, a4, a9 + +	shl		v5.4s, v16.4s, #12 +	shl		v6.4s, v17.4s, #12 +	shl		v7.4s, v18.4s, #12 +	shl		v4.4s, v19.4s, #12 + +	sri		v5.4s, v16.4s, #20 +	  ror		a5, a5, #20 +	sri		v6.4s, v17.4s, #20 +	  ror		a6, a6, #20 +	sri		v7.4s, v18.4s, #20 +	  ror		a7, a7, #20 +	sri		v4.4s, v19.4s, #20 +	  ror		a4, a4, #20 + +	// x0 += x5, x15 = rotl32(x15 ^ x0, 8) +	// x1 += x6, x12 = rotl32(x12 ^ x1, 8) +	// x2 += x7, x13 = rotl32(x13 ^ x2, 8) +	// x3 += x4, x14 = rotl32(x14 ^ x3, 8) +	add		v0.4s, v0.4s, v5.4s +	  add		a0, a0, a5 +	add		v1.4s, v1.4s, v6.4s +	  add		a1, a1, a6 +	add		v2.4s, v2.4s, v7.4s +	  add		a2, a2, a7 +	add		v3.4s, v3.4s, v4.4s +	  add		a3, a3, a4 + +	eor		v15.16b, v15.16b, v0.16b +	  eor		a15, a15, a0 +	eor		v12.16b, v12.16b, v1.16b +	  eor		a12, a12, a1 +	eor		v13.16b, v13.16b, v2.16b +	  eor		a13, a13, a2 +	eor		v14.16b, v14.16b, v3.16b +	  eor		a14, a14, a3 + +	tbl		v15.16b, {v15.16b}, v31.16b +	  ror		a15, a15, #24 +	tbl		v12.16b, {v12.16b}, v31.16b +	  ror		a12, a12, #24 +	tbl		v13.16b, {v13.16b}, v31.16b +	  ror		a13, a13, #24 +	tbl		v14.16b, {v14.16b}, v31.16b +	  ror		a14, a14, #24 + +	// x10 += x15, x5 = rotl32(x5 ^ x10, 7) +	// x11 += x12, x6 = rotl32(x6 ^ x11, 7) +	// x8 += x13, x7 = rotl32(x7 ^ x8, 7) +	// x9 += x14, x4 = rotl32(x4 ^ x9, 7) +	add		v10.4s, v10.4s, v15.4s +	  add		a10, a10, a15 +	add		v11.4s, v11.4s, v12.4s +	  add		a11, a11, a12 +	add		v8.4s, v8.4s, v13.4s +	  add		a8, 
a8, a13 +	add		v9.4s, v9.4s, v14.4s +	  add		a9, a9, a14 + +	eor		v16.16b, v5.16b, v10.16b +	  eor		a5, a5, a10 +	eor		v17.16b, v6.16b, v11.16b +	  eor		a6, a6, a11 +	eor		v18.16b, v7.16b, v8.16b +	  eor		a7, a7, a8 +	eor		v19.16b, v4.16b, v9.16b +	  eor		a4, a4, a9 + +	shl		v5.4s, v16.4s, #7 +	shl		v6.4s, v17.4s, #7 +	shl		v7.4s, v18.4s, #7 +	shl		v4.4s, v19.4s, #7 + +	sri		v5.4s, v16.4s, #25 +	  ror		a5, a5, #25 +	sri		v6.4s, v17.4s, #25 +	  ror		a6, a6, #25 +	sri		v7.4s, v18.4s, #25 +	  ror		a7, a7, #25 +	sri		v4.4s, v19.4s, #25 +	  ror		a4, a4, #25 + +	subs		w3, w3, #2 +	b.ne		.Ldoubleround4 + +	ld4r		{v16.4s-v19.4s}, [x0], #16 +	ld4r		{v20.4s-v23.4s}, [x0], #16 + +	// x12 += counter values 0-3 +	add		v12.4s, v12.4s, v30.4s + +	// x0[0-3] += s0[0] +	// x1[0-3] += s0[1] +	// x2[0-3] += s0[2] +	// x3[0-3] += s0[3] +	add		v0.4s, v0.4s, v16.4s +	  mov		w6, v16.s[0] +	  mov		w7, v17.s[0] +	add		v1.4s, v1.4s, v17.4s +	  mov		w8, v18.s[0] +	  mov		w9, v19.s[0] +	add		v2.4s, v2.4s, v18.4s +	  add		a0, a0, w6 +	  add		a1, a1, w7 +	add		v3.4s, v3.4s, v19.4s +	  add		a2, a2, w8 +	  add		a3, a3, w9 +CPU_BE(	  rev		a0, a0		) +CPU_BE(	  rev		a1, a1		) +CPU_BE(	  rev		a2, a2		) +CPU_BE(	  rev		a3, a3		) + +	ld4r		{v24.4s-v27.4s}, [x0], #16 +	ld4r		{v28.4s-v31.4s}, [x0] + +	// x4[0-3] += s1[0] +	// x5[0-3] += s1[1] +	// x6[0-3] += s1[2] +	// x7[0-3] += s1[3] +	add		v4.4s, v4.4s, v20.4s +	  mov		w6, v20.s[0] +	  mov		w7, v21.s[0] +	add		v5.4s, v5.4s, v21.4s +	  mov		w8, v22.s[0] +	  mov		w9, v23.s[0] +	add		v6.4s, v6.4s, v22.4s +	  add		a4, a4, w6 +	  add		a5, a5, w7 +	add		v7.4s, v7.4s, v23.4s +	  add		a6, a6, w8 +	  add		a7, a7, w9 +CPU_BE(	  rev		a4, a4		) +CPU_BE(	  rev		a5, a5		) +CPU_BE(	  rev		a6, a6		) +CPU_BE(	  rev		a7, a7		) + +	// x8[0-3] += s2[0] +	// x9[0-3] += s2[1] +	// x10[0-3] += s2[2] +	// x11[0-3] += s2[3] +	add		v8.4s, v8.4s, v24.4s +	  mov		w6, v24.s[0] +	  mov		w7, v25.s[0] +	add		v9.4s, v9.4s, v25.4s +	  mov		w8, v26.s[0] +	  mov		w9, v27.s[0] +	add		v10.4s, v10.4s, v26.4s +	  add		a8, a8, w6 +	  add		a9, a9, w7 +	add		v11.4s, v11.4s, v27.4s +	  add		a10, a10, w8 +	  add		a11, a11, w9 +CPU_BE(	  rev		a8, a8		) +CPU_BE(	  rev		a9, a9		) +CPU_BE(	  rev		a10, a10	) +CPU_BE(	  rev		a11, a11	) + +	// x12[0-3] += s3[0] +	// x13[0-3] += s3[1] +	// x14[0-3] += s3[2] +	// x15[0-3] += s3[3] +	add		v12.4s, v12.4s, v28.4s +	  mov		w6, v28.s[0] +	  mov		w7, v29.s[0] +	add		v13.4s, v13.4s, v29.4s +	  mov		w8, v30.s[0] +	  mov		w9, v31.s[0] +	add		v14.4s, v14.4s, v30.4s +	  add		a12, a12, w6 +	  add		a13, a13, w7 +	add		v15.4s, v15.4s, v31.4s +	  add		a14, a14, w8 +	  add		a15, a15, w9 +CPU_BE(	  rev		a12, a12	) +CPU_BE(	  rev		a13, a13	) +CPU_BE(	  rev		a14, a14	) +CPU_BE(	  rev		a15, a15	) + +	// interleave 32-bit words in state n, n+1 +	  ldp		w6, w7, [x2], #64 +	zip1		v16.4s, v0.4s, v1.4s +	  ldp		w8, w9, [x2, #-56] +	  eor		a0, a0, w6 +	zip2		v17.4s, v0.4s, v1.4s +	  eor		a1, a1, w7 +	zip1		v18.4s, v2.4s, v3.4s +	  eor		a2, a2, w8 +	zip2		v19.4s, v2.4s, v3.4s +	  eor		a3, a3, w9 +	  ldp		w6, w7, [x2, #-48] +	zip1		v20.4s, v4.4s, v5.4s +	  ldp		w8, w9, [x2, #-40] +	  eor		a4, a4, w6 +	zip2		v21.4s, v4.4s, v5.4s +	  eor		a5, a5, w7 +	zip1		v22.4s, v6.4s, v7.4s +	  eor		a6, a6, w8 +	zip2		v23.4s, v6.4s, v7.4s +	  eor		a7, a7, w9 +	  ldp		w6, w7, [x2, #-32] +	zip1		v24.4s, v8.4s, v9.4s +	  ldp		w8, w9, [x2, #-24] +	  eor		a8, a8, w6 +	zip2		v25.4s, v8.4s, v9.4s +	  eor		a9, a9, w7 +	zip1		v26.4s, v10.4s, v11.4s +	  eor		a10, a10, w8 +	zip2		v27.4s, v10.4s, v11.4s +	  eor		a11, a11, w9 +	  ldp	
	w6, w7, [x2, #-16] +	zip1		v28.4s, v12.4s, v13.4s +	  ldp		w8, w9, [x2, #-8] +	  eor		a12, a12, w6 +	zip2		v29.4s, v12.4s, v13.4s +	  eor		a13, a13, w7 +	zip1		v30.4s, v14.4s, v15.4s +	  eor		a14, a14, w8 +	zip2		v31.4s, v14.4s, v15.4s +	  eor		a15, a15, w9 + +	add		x3, x2, x4 +	sub		x3, x3, #128		// start of last block + +	subs		x5, x4, #128 +	csel		x2, x2, x3, ge + +	// interleave 64-bit words in state n, n+2 +	zip1		v0.2d, v16.2d, v18.2d +	zip2		v4.2d, v16.2d, v18.2d +	  stp		a0, a1, [x1], #64 +	zip1		v8.2d, v17.2d, v19.2d +	zip2		v12.2d, v17.2d, v19.2d +	  stp		a2, a3, [x1, #-56] + +	subs		x6, x4, #192 +	ld1		{v16.16b-v19.16b}, [x2], #64 +	csel		x2, x2, x3, ge + +	zip1		v1.2d, v20.2d, v22.2d +	zip2		v5.2d, v20.2d, v22.2d +	  stp		a4, a5, [x1, #-48] +	zip1		v9.2d, v21.2d, v23.2d +	zip2		v13.2d, v21.2d, v23.2d +	  stp		a6, a7, [x1, #-40] + +	subs		x7, x4, #256 +	ld1		{v20.16b-v23.16b}, [x2], #64 +	csel		x2, x2, x3, ge + +	zip1		v2.2d, v24.2d, v26.2d +	zip2		v6.2d, v24.2d, v26.2d +	  stp		a8, a9, [x1, #-32] +	zip1		v10.2d, v25.2d, v27.2d +	zip2		v14.2d, v25.2d, v27.2d +	  stp		a10, a11, [x1, #-24] + +	subs		x8, x4, #320 +	ld1		{v24.16b-v27.16b}, [x2], #64 +	csel		x2, x2, x3, ge + +	zip1		v3.2d, v28.2d, v30.2d +	zip2		v7.2d, v28.2d, v30.2d +	  stp		a12, a13, [x1, #-16] +	zip1		v11.2d, v29.2d, v31.2d +	zip2		v15.2d, v29.2d, v31.2d +	  stp		a14, a15, [x1, #-8] + +	tbnz		x5, #63, .Lt128 +	ld1		{v28.16b-v31.16b}, [x2] + +	// xor with corresponding input, write to output +	eor		v16.16b, v16.16b, v0.16b +	eor		v17.16b, v17.16b, v1.16b +	eor		v18.16b, v18.16b, v2.16b +	eor		v19.16b, v19.16b, v3.16b + +	tbnz		x6, #63, .Lt192 + +	eor		v20.16b, v20.16b, v4.16b +	eor		v21.16b, v21.16b, v5.16b +	eor		v22.16b, v22.16b, v6.16b +	eor		v23.16b, v23.16b, v7.16b + +	st1		{v16.16b-v19.16b}, [x1], #64 +	tbnz		x7, #63, .Lt256 + +	eor		v24.16b, v24.16b, v8.16b +	eor		v25.16b, v25.16b, v9.16b +	eor		v26.16b, v26.16b, v10.16b +	eor		v27.16b, v27.16b, v11.16b + +	st1		{v20.16b-v23.16b}, [x1], #64 +	tbnz		x8, #63, .Lt320 + +	eor		v28.16b, v28.16b, v12.16b +	eor		v29.16b, v29.16b, v13.16b +	eor		v30.16b, v30.16b, v14.16b +	eor		v31.16b, v31.16b, v15.16b + +	st1		{v24.16b-v27.16b}, [x1], #64 +	st1		{v28.16b-v31.16b}, [x1] + +.Lout:	frame_pop +	ret + +	// fewer than 192 bytes of in/output +.Lt192:	cbz		x5, 1f				// exactly 128 bytes? +	ld1		{v28.16b-v31.16b}, [x10] +	add		x5, x5, x1 +	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b +	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b +	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b +	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b + +0:	eor		v20.16b, v20.16b, v28.16b +	eor		v21.16b, v21.16b, v29.16b +	eor		v22.16b, v22.16b, v30.16b +	eor		v23.16b, v23.16b, v31.16b +	st1		{v20.16b-v23.16b}, [x5]		// overlapping stores +1:	st1		{v16.16b-v19.16b}, [x1] +	b		.Lout + +	// fewer than 128 bytes of in/output +.Lt128:	ld1		{v28.16b-v31.16b}, [x10] +	add		x5, x5, x1 +	sub		x1, x1, #64 +	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b +	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b +	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b +	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b +	ld1		{v16.16b-v19.16b}, [x1]		// reload first output block +	b		0b + +	// fewer than 256 bytes of in/output +.Lt256:	cbz		x6, 2f				// exactly 192 bytes? 
+	ld1		{v4.16b-v7.16b}, [x10] +	add		x6, x6, x1 +	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b +	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b +	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b +	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b + +	eor		v28.16b, v28.16b, v0.16b +	eor		v29.16b, v29.16b, v1.16b +	eor		v30.16b, v30.16b, v2.16b +	eor		v31.16b, v31.16b, v3.16b +	st1		{v28.16b-v31.16b}, [x6]		// overlapping stores +2:	st1		{v20.16b-v23.16b}, [x1] +	b		.Lout + +	// fewer than 320 bytes of in/output +.Lt320:	cbz		x7, 3f				// exactly 256 bytes? +	ld1		{v4.16b-v7.16b}, [x10] +	add		x7, x7, x1 +	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b +	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b +	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b +	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b + +	eor		v28.16b, v28.16b, v0.16b +	eor		v29.16b, v29.16b, v1.16b +	eor		v30.16b, v30.16b, v2.16b +	eor		v31.16b, v31.16b, v3.16b +	st1		{v28.16b-v31.16b}, [x7]		// overlapping stores +3:	st1		{v24.16b-v27.16b}, [x1] +	b		.Lout +SYM_FUNC_END(chacha_4block_xor_neon) + +	.section	".rodata", "a", %progbits +	.align		L1_CACHE_SHIFT +.Lpermute: +	.set		.Li, 0 +	.rept		128 +	.byte		(.Li - 64) +	.set		.Li, .Li + 1 +	.endr + +CTRINC:	.word		1, 2, 3, 4 +ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/lib/crypto/arm64/chacha-neon-glue.c b/lib/crypto/arm64/chacha-neon-glue.c new file mode 100644 index 000000000000..d0188f974ca5 --- /dev/null +++ b/lib/crypto/arm64/chacha-neon-glue.c @@ -0,0 +1,119 @@ +/* + * ChaCha and HChaCha functions (ARM64 optimized) + * + * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include <crypto/chacha.h> +#include <crypto/internal/simd.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, +				      u8 *dst, const u8 *src, int nrounds); +asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, +				       u8 *dst, const u8 *src, +				       int nrounds, int bytes); +asmlinkage void hchacha_block_neon(const struct chacha_state *state, +				   u32 out[HCHACHA_OUT_WORDS], int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, +			  int bytes, int nrounds) +{ +	while (bytes > 0) { +		int l = min(bytes, CHACHA_BLOCK_SIZE * 5); + +		if (l <= CHACHA_BLOCK_SIZE) { +			u8 buf[CHACHA_BLOCK_SIZE]; + +			memcpy(buf, src, l); +			chacha_block_xor_neon(state, buf, buf, nrounds); +			memcpy(dst, buf, l); +			state->x[12] += 1; +			break; +		} +		chacha_4block_xor_neon(state, dst, src, nrounds, l); +		bytes -= l; +		src += l; +		dst += l; +		state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); +	} +} + +void hchacha_block_arch(const struct chacha_state *state, +			u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ +	if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { +		hchacha_block_generic(state, out, nrounds); +	} else { +		kernel_neon_begin(); +		hchacha_block_neon(state, out, nrounds); +		kernel_neon_end(); +	} +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, +		       unsigned int bytes, int nrounds) +{ +	if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || +	    !crypto_simd_usable()) +		return chacha_crypt_generic(state, dst, src, bytes, nrounds); + +	do { +		unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + +		kernel_neon_begin(); +		chacha_doneon(state, dst, src, todo, nrounds); +		kernel_neon_end(); + +		bytes -= todo; +		src += todo; +		dst += todo; +	} while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ +	return static_key_enabled(&have_neon); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_simd_mod_init(void) +{ +	if (cpu_have_named_feature(ASIMD)) +		static_branch_enable(&have_neon); +	return 0; +} +subsys_initcall(chacha_simd_mod_init); + +static void __exit chacha_simd_mod_exit(void) +{ +} +module_exit(chacha_simd_mod_exit); + +MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/poly1305-armv8.pl b/lib/crypto/arm64/poly1305-armv8.pl new file mode 100644 index 000000000000..22c9069c0650 --- /dev/null +++ b/lib/crypto/arm64/poly1305-armv8.pl @@ -0,0 +1,917 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# This module implements Poly1305 hash for ARMv8. +# +# June 2015 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. 
+# +#		IALU/gcc-4.9	NEON +# +# Apple A7	1.86/+5%	0.72 +# Cortex-A53	2.69/+58%	1.47 +# Cortex-A57	2.70/+7%	1.14 +# Denver	1.64/+50%	1.18(*) +# X-Gene	2.13/+68%	2.27 +# Mongoose	1.77/+75%	1.12 +# Kryo		2.70/+55%	1.13 +# ThunderX2	1.17/+95%	1.36 +# +# (*)	estimate based on resources availability is less than 1.0, +#	i.e. measured result is worse than expected, presumably binary +#	translator is not almighty; + +$flavour=shift; +$output=shift; + +if ($flavour && $flavour ne "void") { +    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +    die "can't locate arm-xlate.pl"; + +    open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { +    open STDOUT,">$output"; +} + +my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); +my ($mac,$nonce)=($inp,$len); + +my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +.extern	OPENSSL_armcap_P +#endif + +.text + +// forward "declarations" are required for Apple +.globl	poly1305_blocks +.globl	poly1305_emit + +.globl	poly1305_init +.type	poly1305_init,%function +.align	5 +poly1305_init: +	cmp	$inp,xzr +	stp	xzr,xzr,[$ctx]		// zero hash value +	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26] + +	csel	x0,xzr,x0,eq +	b.eq	.Lno_key + +#ifndef	__KERNEL__ +	adrp	x17,OPENSSL_armcap_P +	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P] +#endif + +	ldp	$r0,$r1,[$inp]		// load key +	mov	$s1,#0xfffffffc0fffffff +	movk	$s1,#0x0fff,lsl#48 +#ifdef	__AARCH64EB__ +	rev	$r0,$r0			// flip bytes +	rev	$r1,$r1 +#endif +	and	$r0,$r0,$s1		// &=0ffffffc0fffffff +	and	$s1,$s1,#-4 +	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc +	mov	w#$s1,#-1 +	stp	$r0,$r1,[$ctx,#32]	// save key value +	str	w#$s1,[$ctx,#48]	// impossible key power value + +#ifndef	__KERNEL__ +	tst	w17,#ARMV7_NEON + +	adr	$d0,.Lpoly1305_blocks +	adr	$r0,.Lpoly1305_blocks_neon +	adr	$d1,.Lpoly1305_emit + +	csel	$d0,$d0,$r0,eq + +# ifdef	__ILP32__ +	stp	w#$d0,w#$d1,[$len] +# else +	stp	$d0,$d1,[$len] +# endif +#endif +	mov	x0,#1 +.Lno_key: +	ret +.size	poly1305_init,.-poly1305_init + +.type	poly1305_blocks,%function +.align	5 +poly1305_blocks: +.Lpoly1305_blocks: +	ands	$len,$len,#-16 +	b.eq	.Lno_data + +	ldp	$h0,$h1,[$ctx]		// load hash value +	ldp	$h2,x17,[$ctx,#16]	// [along with is_base2_26] +	ldp	$r0,$r1,[$ctx,#32]	// load key value + +#ifdef	__AARCH64EB__ +	lsr	$d0,$h0,#32 +	mov	w#$d1,w#$h0 +	lsr	$d2,$h1,#32 +	mov	w15,w#$h1 +	lsr	x16,$h2,#32 +#else +	mov	w#$d0,w#$h0 +	lsr	$d1,$h0,#32 +	mov	w#$d2,w#$h1 +	lsr	x15,$h1,#32 +	mov	w16,w#$h2 +#endif + +	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64 +	lsr	$d1,$d2,#12 +	adds	$d0,$d0,$d2,lsl#52 +	add	$d1,$d1,x15,lsl#14 +	adc	$d1,$d1,xzr +	lsr	$d2,x16,#24 +	adds	$d1,$d1,x16,lsl#40 +	adc	$d2,$d2,xzr + +	cmp	x17,#0			// is_base2_26? 
+	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2) +	csel	$h0,$h0,$d0,eq		// choose between radixes +	csel	$h1,$h1,$d1,eq +	csel	$h2,$h2,$d2,eq + +.Loop: +	ldp	$t0,$t1,[$inp],#16	// load input +	sub	$len,$len,#16 +#ifdef	__AARCH64EB__ +	rev	$t0,$t0 +	rev	$t1,$t1 +#endif +	adds	$h0,$h0,$t0		// accumulate input +	adcs	$h1,$h1,$t1 + +	mul	$d0,$h0,$r0		// h0*r0 +	adc	$h2,$h2,$padbit +	umulh	$d1,$h0,$r0 + +	mul	$t0,$h1,$s1		// h1*5*r1 +	umulh	$t1,$h1,$s1 + +	adds	$d0,$d0,$t0 +	mul	$t0,$h0,$r1		// h0*r1 +	adc	$d1,$d1,$t1 +	umulh	$d2,$h0,$r1 + +	adds	$d1,$d1,$t0 +	mul	$t0,$h1,$r0		// h1*r0 +	adc	$d2,$d2,xzr +	umulh	$t1,$h1,$r0 + +	adds	$d1,$d1,$t0 +	mul	$t0,$h2,$s1		// h2*5*r1 +	adc	$d2,$d2,$t1 +	mul	$t1,$h2,$r0		// h2*r0 + +	adds	$d1,$d1,$t0 +	adc	$d2,$d2,$t1 + +	and	$t0,$d2,#-4		// final reduction +	and	$h2,$d2,#3 +	add	$t0,$t0,$d2,lsr#2 +	adds	$h0,$d0,$t0 +	adcs	$h1,$d1,xzr +	adc	$h2,$h2,xzr + +	cbnz	$len,.Loop + +	stp	$h0,$h1,[$ctx]		// store hash value +	stp	$h2,xzr,[$ctx,#16]	// [and clear is_base2_26] + +.Lno_data: +	ret +.size	poly1305_blocks,.-poly1305_blocks + +.type	poly1305_emit,%function +.align	5 +poly1305_emit: +.Lpoly1305_emit: +	ldp	$h0,$h1,[$ctx]		// load hash base 2^64 +	ldp	$h2,$r0,[$ctx,#16]	// [along with is_base2_26] +	ldp	$t0,$t1,[$nonce]	// load nonce + +#ifdef	__AARCH64EB__ +	lsr	$d0,$h0,#32 +	mov	w#$d1,w#$h0 +	lsr	$d2,$h1,#32 +	mov	w15,w#$h1 +	lsr	x16,$h2,#32 +#else +	mov	w#$d0,w#$h0 +	lsr	$d1,$h0,#32 +	mov	w#$d2,w#$h1 +	lsr	x15,$h1,#32 +	mov	w16,w#$h2 +#endif + +	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64 +	lsr	$d1,$d2,#12 +	adds	$d0,$d0,$d2,lsl#52 +	add	$d1,$d1,x15,lsl#14 +	adc	$d1,$d1,xzr +	lsr	$d2,x16,#24 +	adds	$d1,$d1,x16,lsl#40 +	adc	$d2,$d2,xzr + +	cmp	$r0,#0			// is_base2_26? +	csel	$h0,$h0,$d0,eq		// choose between radixes +	csel	$h1,$h1,$d1,eq +	csel	$h2,$h2,$d2,eq + +	adds	$d0,$h0,#5		// compare to modulus +	adcs	$d1,$h1,xzr +	adc	$d2,$h2,xzr + +	tst	$d2,#-4			// see if it's carried/borrowed + +	csel	$h0,$h0,$d0,eq +	csel	$h1,$h1,$d1,eq + +#ifdef	__AARCH64EB__ +	ror	$t0,$t0,#32		// flip nonce words +	ror	$t1,$t1,#32 +#endif +	adds	$h0,$h0,$t0		// accumulate nonce +	adc	$h1,$h1,$t1 +#ifdef	__AARCH64EB__ +	rev	$h0,$h0			// flip output bytes +	rev	$h1,$h1 +#endif +	stp	$h0,$h1,[$mac]		// write result + +	ret +.size	poly1305_emit,.-poly1305_emit +___ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); +my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); +my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); +my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); +my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); +my ($T0,$T1,$MASK) = map("v$_",(29..31)); + +my ($in2,$zeros)=("x16","x17"); +my $is_base2_26 = $zeros;		# borrow + +$code.=<<___; +.type	poly1305_mult,%function +.align	5 +poly1305_mult: +	mul	$d0,$h0,$r0		// h0*r0 +	umulh	$d1,$h0,$r0 + +	mul	$t0,$h1,$s1		// h1*5*r1 +	umulh	$t1,$h1,$s1 + +	adds	$d0,$d0,$t0 +	mul	$t0,$h0,$r1		// h0*r1 +	adc	$d1,$d1,$t1 +	umulh	$d2,$h0,$r1 + +	adds	$d1,$d1,$t0 +	mul	$t0,$h1,$r0		// h1*r0 +	adc	$d2,$d2,xzr +	umulh	$t1,$h1,$r0 + +	adds	$d1,$d1,$t0 +	mul	$t0,$h2,$s1		// h2*5*r1 +	adc	$d2,$d2,$t1 +	mul	$t1,$h2,$r0		// h2*r0 + +	adds	$d1,$d1,$t0 +	adc	$d2,$d2,$t1 + +	and	$t0,$d2,#-4		// final reduction +	and	$h2,$d2,#3 +	add	$t0,$t0,$d2,lsr#2 +	adds	$h0,$d0,$t0 +	adcs	$h1,$d1,xzr +	adc	$h2,$h2,xzr + +	ret +.size	poly1305_mult,.-poly1305_mult + +.type	poly1305_splat,%function +.align	4 +poly1305_splat: +	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26 +	ubfx	
x13,$h0,#26,#26 +	extr	x14,$h1,$h0,#52 +	and	x14,x14,#0x03ffffff +	ubfx	x15,$h1,#14,#26 +	extr	x16,$h2,$h1,#40 + +	str	w12,[$ctx,#16*0]	// r0 +	add	w12,w13,w13,lsl#2	// r1*5 +	str	w13,[$ctx,#16*1]	// r1 +	add	w13,w14,w14,lsl#2	// r2*5 +	str	w12,[$ctx,#16*2]	// s1 +	str	w14,[$ctx,#16*3]	// r2 +	add	w14,w15,w15,lsl#2	// r3*5 +	str	w13,[$ctx,#16*4]	// s2 +	str	w15,[$ctx,#16*5]	// r3 +	add	w15,w16,w16,lsl#2	// r4*5 +	str	w14,[$ctx,#16*6]	// s3 +	str	w16,[$ctx,#16*7]	// r4 +	str	w15,[$ctx,#16*8]	// s4 + +	ret +.size	poly1305_splat,.-poly1305_splat + +#ifdef	__KERNEL__ +.globl	poly1305_blocks_neon +#endif +.type	poly1305_blocks_neon,%function +.align	5 +poly1305_blocks_neon: +.Lpoly1305_blocks_neon: +	ldr	$is_base2_26,[$ctx,#24] +	cmp	$len,#128 +	b.lo	.Lpoly1305_blocks + +	.inst	0xd503233f		// paciasp +	stp	x29,x30,[sp,#-80]! +	add	x29,sp,#0 + +	stp	d8,d9,[sp,#16]		// meet ABI requirements +	stp	d10,d11,[sp,#32] +	stp	d12,d13,[sp,#48] +	stp	d14,d15,[sp,#64] + +	cbz	$is_base2_26,.Lbase2_64_neon + +	ldp	w10,w11,[$ctx]		// load hash value base 2^26 +	ldp	w12,w13,[$ctx,#8] +	ldr	w14,[$ctx,#16] + +	tst	$len,#31 +	b.eq	.Leven_neon + +	ldp	$r0,$r1,[$ctx,#32]	// load key value + +	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64 +	lsr	$h1,x12,#12 +	adds	$h0,$h0,x12,lsl#52 +	add	$h1,$h1,x13,lsl#14 +	adc	$h1,$h1,xzr +	lsr	$h2,x14,#24 +	adds	$h1,$h1,x14,lsl#40 +	adc	$d2,$h2,xzr		// can be partially reduced... + +	ldp	$d0,$d1,[$inp],#16	// load input +	sub	$len,$len,#16 +	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2) + +#ifdef	__AARCH64EB__ +	rev	$d0,$d0 +	rev	$d1,$d1 +#endif +	adds	$h0,$h0,$d0		// accumulate input +	adcs	$h1,$h1,$d1 +	adc	$h2,$h2,$padbit + +	bl	poly1305_mult + +	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26 +	ubfx	x11,$h0,#26,#26 +	extr	x12,$h1,$h0,#52 +	and	x12,x12,#0x03ffffff +	ubfx	x13,$h1,#14,#26 +	extr	x14,$h2,$h1,#40 + +	b	.Leven_neon + +.align	4 +.Lbase2_64_neon: +	ldp	$r0,$r1,[$ctx,#32]	// load key value + +	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64 +	ldr	$h2,[$ctx,#16] + +	tst	$len,#31 +	b.eq	.Linit_neon + +	ldp	$d0,$d1,[$inp],#16	// load input +	sub	$len,$len,#16 +	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2) +#ifdef	__AARCH64EB__ +	rev	$d0,$d0 +	rev	$d1,$d1 +#endif +	adds	$h0,$h0,$d0		// accumulate input +	adcs	$h1,$h1,$d1 +	adc	$h2,$h2,$padbit + +	bl	poly1305_mult + +.Linit_neon: +	ldr	w17,[$ctx,#48]		// first table element +	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26 +	ubfx	x11,$h0,#26,#26 +	extr	x12,$h1,$h0,#52 +	and	x12,x12,#0x03ffffff +	ubfx	x13,$h1,#14,#26 +	extr	x14,$h2,$h1,#40 + +	cmp	w17,#-1			// is value impossible? 
+	b.ne	.Leven_neon + +	fmov	${H0},x10 +	fmov	${H1},x11 +	fmov	${H2},x12 +	fmov	${H3},x13 +	fmov	${H4},x14 + +	////////////////////////////////// initialize r^n table +	mov	$h0,$r0			// r^1 +	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2) +	mov	$h1,$r1 +	mov	$h2,xzr +	add	$ctx,$ctx,#48+12 +	bl	poly1305_splat + +	bl	poly1305_mult		// r^2 +	sub	$ctx,$ctx,#4 +	bl	poly1305_splat + +	bl	poly1305_mult		// r^3 +	sub	$ctx,$ctx,#4 +	bl	poly1305_splat + +	bl	poly1305_mult		// r^4 +	sub	$ctx,$ctx,#4 +	bl	poly1305_splat +	sub	$ctx,$ctx,#48		// restore original $ctx +	b	.Ldo_neon + +.align	4 +.Leven_neon: +	fmov	${H0},x10 +	fmov	${H1},x11 +	fmov	${H2},x12 +	fmov	${H3},x13 +	fmov	${H4},x14 + +.Ldo_neon: +	ldp	x8,x12,[$inp,#32]	// inp[2:3] +	subs	$len,$len,#64 +	ldp	x9,x13,[$inp,#48] +	add	$in2,$inp,#96 +	adrp	$zeros,.Lzeros +	add	$zeros,$zeros,#:lo12:.Lzeros + +	lsl	$padbit,$padbit,#24 +	add	x15,$ctx,#48 + +#ifdef	__AARCH64EB__ +	rev	x8,x8 +	rev	x12,x12 +	rev	x9,x9 +	rev	x13,x13 +#endif +	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26 +	and	x5,x9,#0x03ffffff +	ubfx	x6,x8,#26,#26 +	ubfx	x7,x9,#26,#26 +	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32 +	extr	x8,x12,x8,#52 +	extr	x9,x13,x9,#52 +	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32 +	fmov	$IN23_0,x4 +	and	x8,x8,#0x03ffffff +	and	x9,x9,#0x03ffffff +	ubfx	x10,x12,#14,#26 +	ubfx	x11,x13,#14,#26 +	add	x12,$padbit,x12,lsr#40 +	add	x13,$padbit,x13,lsr#40 +	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32 +	fmov	$IN23_1,x6 +	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32 +	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32 +	fmov	$IN23_2,x8 +	fmov	$IN23_3,x10 +	fmov	$IN23_4,x12 + +	ldp	x8,x12,[$inp],#16	// inp[0:1] +	ldp	x9,x13,[$inp],#48 + +	ld1	{$R0,$R1,$S1,$R2},[x15],#64 +	ld1	{$S2,$R3,$S3,$R4},[x15],#64 +	ld1	{$S4},[x15] + +#ifdef	__AARCH64EB__ +	rev	x8,x8 +	rev	x12,x12 +	rev	x9,x9 +	rev	x13,x13 +#endif +	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26 +	and	x5,x9,#0x03ffffff +	ubfx	x6,x8,#26,#26 +	ubfx	x7,x9,#26,#26 +	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32 +	extr	x8,x12,x8,#52 +	extr	x9,x13,x9,#52 +	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32 +	fmov	$IN01_0,x4 +	and	x8,x8,#0x03ffffff +	and	x9,x9,#0x03ffffff +	ubfx	x10,x12,#14,#26 +	ubfx	x11,x13,#14,#26 +	add	x12,$padbit,x12,lsr#40 +	add	x13,$padbit,x13,lsr#40 +	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32 +	fmov	$IN01_1,x6 +	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32 +	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32 +	movi	$MASK.2d,#-1 +	fmov	$IN01_2,x8 +	fmov	$IN01_3,x10 +	fmov	$IN01_4,x12 +	ushr	$MASK.2d,$MASK.2d,#38 + +	b.ls	.Lskip_loop + +.align	4 +.Loop_neon: +	//////////////////////////////////////////////////////////////// +	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 +	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r +	//   \___________________/ +	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 +	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r +	//   \___________________/ \____________________/ +	// +	// Note that we start with inp[2:3]*r^2. This is because it +	// doesn't depend on reduction in previous iteration. 
+	//////////////////////////////////////////////////////////////// +	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0 +	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4 +	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3 +	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2 +	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 + +	subs	$len,$len,#64 +	umull	$ACC4,$IN23_0,${R4}[2] +	csel	$in2,$zeros,$in2,lo +	umull	$ACC3,$IN23_0,${R3}[2] +	umull	$ACC2,$IN23_0,${R2}[2] +	 ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero) +	umull	$ACC1,$IN23_0,${R1}[2] +	 ldp	x9,x13,[$in2],#48 +	umull	$ACC0,$IN23_0,${R0}[2] +#ifdef	__AARCH64EB__ +	 rev	x8,x8 +	 rev	x12,x12 +	 rev	x9,x9 +	 rev	x13,x13 +#endif + +	umlal	$ACC4,$IN23_1,${R3}[2] +	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26 +	umlal	$ACC3,$IN23_1,${R2}[2] +	 and	x5,x9,#0x03ffffff +	umlal	$ACC2,$IN23_1,${R1}[2] +	 ubfx	x6,x8,#26,#26 +	umlal	$ACC1,$IN23_1,${R0}[2] +	 ubfx	x7,x9,#26,#26 +	umlal	$ACC0,$IN23_1,${S4}[2] +	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32 + +	umlal	$ACC4,$IN23_2,${R2}[2] +	 extr	x8,x12,x8,#52 +	umlal	$ACC3,$IN23_2,${R1}[2] +	 extr	x9,x13,x9,#52 +	umlal	$ACC2,$IN23_2,${R0}[2] +	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32 +	umlal	$ACC1,$IN23_2,${S4}[2] +	 fmov	$IN23_0,x4 +	umlal	$ACC0,$IN23_2,${S3}[2] +	 and	x8,x8,#0x03ffffff + +	umlal	$ACC4,$IN23_3,${R1}[2] +	 and	x9,x9,#0x03ffffff +	umlal	$ACC3,$IN23_3,${R0}[2] +	 ubfx	x10,x12,#14,#26 +	umlal	$ACC2,$IN23_3,${S4}[2] +	 ubfx	x11,x13,#14,#26 +	umlal	$ACC1,$IN23_3,${S3}[2] +	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32 +	umlal	$ACC0,$IN23_3,${S2}[2] +	 fmov	$IN23_1,x6 + +	add	$IN01_2,$IN01_2,$H2 +	 add	x12,$padbit,x12,lsr#40 +	umlal	$ACC4,$IN23_4,${R0}[2] +	 add	x13,$padbit,x13,lsr#40 +	umlal	$ACC3,$IN23_4,${S4}[2] +	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32 +	umlal	$ACC2,$IN23_4,${S3}[2] +	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32 +	umlal	$ACC1,$IN23_4,${S2}[2] +	 fmov	$IN23_2,x8 +	umlal	$ACC0,$IN23_4,${S1}[2] +	 fmov	$IN23_3,x10 + +	//////////////////////////////////////////////////////////////// +	// (hash+inp[0:1])*r^4 and accumulate + +	add	$IN01_0,$IN01_0,$H0 +	 fmov	$IN23_4,x12 +	umlal	$ACC3,$IN01_2,${R1}[0] +	 ldp	x8,x12,[$inp],#16	// inp[0:1] +	umlal	$ACC0,$IN01_2,${S3}[0] +	 ldp	x9,x13,[$inp],#48 +	umlal	$ACC4,$IN01_2,${R2}[0] +	umlal	$ACC1,$IN01_2,${S4}[0] +	umlal	$ACC2,$IN01_2,${R0}[0] +#ifdef	__AARCH64EB__ +	 rev	x8,x8 +	 rev	x12,x12 +	 rev	x9,x9 +	 rev	x13,x13 +#endif + +	add	$IN01_1,$IN01_1,$H1 +	umlal	$ACC3,$IN01_0,${R3}[0] +	umlal	$ACC4,$IN01_0,${R4}[0] +	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26 +	umlal	$ACC2,$IN01_0,${R2}[0] +	 and	x5,x9,#0x03ffffff +	umlal	$ACC0,$IN01_0,${R0}[0] +	 ubfx	x6,x8,#26,#26 +	umlal	$ACC1,$IN01_0,${R1}[0] +	 ubfx	x7,x9,#26,#26 + +	add	$IN01_3,$IN01_3,$H3 +	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32 +	umlal	$ACC3,$IN01_1,${R2}[0] +	 extr	x8,x12,x8,#52 +	umlal	$ACC4,$IN01_1,${R3}[0] +	 extr	x9,x13,x9,#52 +	umlal	$ACC0,$IN01_1,${S4}[0] +	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32 +	umlal	$ACC2,$IN01_1,${R1}[0] +	 fmov	$IN01_0,x4 +	umlal	$ACC1,$IN01_1,${R0}[0] +	 and	x8,x8,#0x03ffffff + +	add	$IN01_4,$IN01_4,$H4 +	 and	x9,x9,#0x03ffffff +	umlal	$ACC3,$IN01_3,${R0}[0] +	 ubfx	x10,x12,#14,#26 +	umlal	$ACC0,$IN01_3,${S2}[0] +	 ubfx	x11,x13,#14,#26 +	umlal	$ACC4,$IN01_3,${R1}[0] +	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32 +	umlal	$ACC1,$IN01_3,${S3}[0] +	 fmov	$IN01_1,x6 +	umlal	$ACC2,$IN01_3,${S4}[0] +	 add	x12,$padbit,x12,lsr#40 + +	umlal	$ACC3,$IN01_4,${S4}[0] +	 add	x13,$padbit,x13,lsr#40 +	umlal	
$ACC0,$IN01_4,${S1}[0] +	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32 +	umlal	$ACC4,$IN01_4,${R0}[0] +	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32 +	umlal	$ACC1,$IN01_4,${S2}[0] +	 fmov	$IN01_2,x8 +	umlal	$ACC2,$IN01_4,${S3}[0] +	 fmov	$IN01_3,x10 +	 fmov	$IN01_4,x12 + +	///////////////////////////////////////////////////////////////// +	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein +	// and P. Schwabe +	// +	// [see discussion in poly1305-armv4 module] + +	ushr	$T0.2d,$ACC3,#26 +	xtn	$H3,$ACC3 +	 ushr	$T1.2d,$ACC0,#26 +	 and	$ACC0,$ACC0,$MASK.2d +	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4 +	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff +	 add	$ACC1,$ACC1,$T1.2d	// h0 -> h1 + +	ushr	$T0.2d,$ACC4,#26 +	xtn	$H4,$ACC4 +	 ushr	$T1.2d,$ACC1,#26 +	 xtn	$H1,$ACC1 +	bic	$H4,#0xfc,lsl#24 +	 add	$ACC2,$ACC2,$T1.2d	// h1 -> h2 + +	add	$ACC0,$ACC0,$T0.2d +	shl	$T0.2d,$T0.2d,#2 +	 shrn	$T1.2s,$ACC2,#26 +	 xtn	$H2,$ACC2 +	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0 +	 bic	$H1,#0xfc,lsl#24 +	 add	$H3,$H3,$T1.2s		// h2 -> h3 +	 bic	$H2,#0xfc,lsl#24 + +	shrn	$T0.2s,$ACC0,#26 +	xtn	$H0,$ACC0 +	 ushr	$T1.2s,$H3,#26 +	 bic	$H3,#0xfc,lsl#24 +	 bic	$H0,#0xfc,lsl#24 +	add	$H1,$H1,$T0.2s		// h0 -> h1 +	 add	$H4,$H4,$T1.2s		// h3 -> h4 + +	b.hi	.Loop_neon + +.Lskip_loop: +	dup	$IN23_2,${IN23_2}[0] +	add	$IN01_2,$IN01_2,$H2 + +	//////////////////////////////////////////////////////////////// +	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + +	adds	$len,$len,#32 +	b.ne	.Long_tail + +	dup	$IN23_2,${IN01_2}[0] +	add	$IN23_0,$IN01_0,$H0 +	add	$IN23_3,$IN01_3,$H3 +	add	$IN23_1,$IN01_1,$H1 +	add	$IN23_4,$IN01_4,$H4 + +.Long_tail: +	dup	$IN23_0,${IN23_0}[0] +	umull2	$ACC0,$IN23_2,${S3} +	umull2	$ACC3,$IN23_2,${R1} +	umull2	$ACC4,$IN23_2,${R2} +	umull2	$ACC2,$IN23_2,${R0} +	umull2	$ACC1,$IN23_2,${S4} + +	dup	$IN23_1,${IN23_1}[0] +	umlal2	$ACC0,$IN23_0,${R0} +	umlal2	$ACC2,$IN23_0,${R2} +	umlal2	$ACC3,$IN23_0,${R3} +	umlal2	$ACC4,$IN23_0,${R4} +	umlal2	$ACC1,$IN23_0,${R1} + +	dup	$IN23_3,${IN23_3}[0] +	umlal2	$ACC0,$IN23_1,${S4} +	umlal2	$ACC3,$IN23_1,${R2} +	umlal2	$ACC2,$IN23_1,${R1} +	umlal2	$ACC4,$IN23_1,${R3} +	umlal2	$ACC1,$IN23_1,${R0} + +	dup	$IN23_4,${IN23_4}[0] +	umlal2	$ACC3,$IN23_3,${R0} +	umlal2	$ACC4,$IN23_3,${R1} +	umlal2	$ACC0,$IN23_3,${S2} +	umlal2	$ACC1,$IN23_3,${S3} +	umlal2	$ACC2,$IN23_3,${S4} + +	umlal2	$ACC3,$IN23_4,${S4} +	umlal2	$ACC0,$IN23_4,${S1} +	umlal2	$ACC4,$IN23_4,${R0} +	umlal2	$ACC1,$IN23_4,${S2} +	umlal2	$ACC2,$IN23_4,${S3} + +	b.eq	.Lshort_tail + +	//////////////////////////////////////////////////////////////// +	// (hash+inp[0:1])*r^4:r^3 and accumulate + +	add	$IN01_0,$IN01_0,$H0 +	umlal	$ACC3,$IN01_2,${R1} +	umlal	$ACC0,$IN01_2,${S3} +	umlal	$ACC4,$IN01_2,${R2} +	umlal	$ACC1,$IN01_2,${S4} +	umlal	$ACC2,$IN01_2,${R0} + +	add	$IN01_1,$IN01_1,$H1 +	umlal	$ACC3,$IN01_0,${R3} +	umlal	$ACC0,$IN01_0,${R0} +	umlal	$ACC4,$IN01_0,${R4} +	umlal	$ACC1,$IN01_0,${R1} +	umlal	$ACC2,$IN01_0,${R2} + +	add	$IN01_3,$IN01_3,$H3 +	umlal	$ACC3,$IN01_1,${R2} +	umlal	$ACC0,$IN01_1,${S4} +	umlal	$ACC4,$IN01_1,${R3} +	umlal	$ACC1,$IN01_1,${R0} +	umlal	$ACC2,$IN01_1,${R1} + +	add	$IN01_4,$IN01_4,$H4 +	umlal	$ACC3,$IN01_3,${R0} +	umlal	$ACC0,$IN01_3,${S2} +	umlal	$ACC4,$IN01_3,${R1} +	umlal	$ACC1,$IN01_3,${S3} +	umlal	$ACC2,$IN01_3,${S4} + +	umlal	$ACC3,$IN01_4,${S4} +	umlal	$ACC0,$IN01_4,${S1} +	umlal	$ACC4,$IN01_4,${R0} +	umlal	$ACC1,$IN01_4,${S2} +	umlal	$ACC2,$IN01_4,${S3} + +.Lshort_tail: +	//////////////////////////////////////////////////////////////// +	// horizontal add + +	addp	$ACC3,$ACC3,$ACC3 +	 
ldp	d8,d9,[sp,#16]		// meet ABI requirements +	addp	$ACC0,$ACC0,$ACC0 +	 ldp	d10,d11,[sp,#32] +	addp	$ACC4,$ACC4,$ACC4 +	 ldp	d12,d13,[sp,#48] +	addp	$ACC1,$ACC1,$ACC1 +	 ldp	d14,d15,[sp,#64] +	addp	$ACC2,$ACC2,$ACC2 +	 ldr	x30,[sp,#8] + +	//////////////////////////////////////////////////////////////// +	// lazy reduction, but without narrowing + +	ushr	$T0.2d,$ACC3,#26 +	and	$ACC3,$ACC3,$MASK.2d +	 ushr	$T1.2d,$ACC0,#26 +	 and	$ACC0,$ACC0,$MASK.2d + +	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4 +	 add	$ACC1,$ACC1,$T1.2d	// h0 -> h1 + +	ushr	$T0.2d,$ACC4,#26 +	and	$ACC4,$ACC4,$MASK.2d +	 ushr	$T1.2d,$ACC1,#26 +	 and	$ACC1,$ACC1,$MASK.2d +	 add	$ACC2,$ACC2,$T1.2d	// h1 -> h2 + +	add	$ACC0,$ACC0,$T0.2d +	shl	$T0.2d,$T0.2d,#2 +	 ushr	$T1.2d,$ACC2,#26 +	 and	$ACC2,$ACC2,$MASK.2d +	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0 +	 add	$ACC3,$ACC3,$T1.2d	// h2 -> h3 + +	ushr	$T0.2d,$ACC0,#26 +	and	$ACC0,$ACC0,$MASK.2d +	 ushr	$T1.2d,$ACC3,#26 +	 and	$ACC3,$ACC3,$MASK.2d +	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1 +	 add	$ACC4,$ACC4,$T1.2d	// h3 -> h4 + +	//////////////////////////////////////////////////////////////// +	// write the result, can be partially reduced + +	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 +	mov	x4,#1 +	st1	{$ACC4}[0],[$ctx] +	str	x4,[$ctx,#8]		// set is_base2_26 + +	ldr	x29,[sp],#80 +	 .inst	0xd50323bf		// autiasp +	ret +.size	poly1305_blocks_neon,.-poly1305_blocks_neon + +.pushsection .rodata +.align	5 +.Lzeros: +.long	0,0,0,0,0,0,0,0 +.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" +.popsection + +.align	2 +#if !defined(__KERNEL__) && !defined(_WIN64) +.comm	OPENSSL_armcap_P,4,4 +.hidden	OPENSSL_armcap_P +#endif +___ + +foreach (split("\n",$code)) { +	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or +	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or +	(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))			or +	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or +	(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))		or +	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or +	(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); + +	s/\.[124]([sd])\[/.$1\[/; +	s/w#x([0-9]+)/w$1/g; + +	print $_,"\n"; +} +close STDOUT; diff --git a/lib/crypto/arm64/poly1305-glue.c b/lib/crypto/arm64/poly1305-glue.c new file mode 100644 index 000000000000..31aea21ce42f --- /dev/null +++ b/lib/crypto/arm64/poly1305-glue.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 + * + * Copyright (C) 2019 Linaro Ltd. 
<ard.biesheuvel@linaro.org> + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/poly1305.h> +#include <linux/cpufeature.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/unaligned.h> + +asmlinkage void poly1305_block_init_arch( +	struct poly1305_block_state *state, +	const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks(struct poly1305_block_state *state, +				const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, +				     const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, +				   u8 digest[POLY1305_DIGEST_SIZE], +				   const u32 nonce[4]); +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, +			  unsigned int len, u32 padbit) +{ +	len = round_down(len, POLY1305_BLOCK_SIZE); +	if (static_branch_likely(&have_neon) && likely(may_use_simd())) { +		do { +			unsigned int todo = min_t(unsigned int, len, SZ_4K); + +			kernel_neon_begin(); +			poly1305_blocks_neon(state, src, todo, padbit); +			kernel_neon_end(); + +			len -= todo; +			src += todo; +		} while (len); +	} else +		poly1305_blocks(state, src, len, padbit); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +bool poly1305_is_arch_optimized(void) +{ +	/* We always can use at least the ARM64 scalar implementation. */ +	return true; +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init neon_poly1305_mod_init(void) +{ +	if (cpu_have_named_feature(ASIMD)) +		static_branch_enable(&have_neon); +	return 0; +} +subsys_initcall(neon_poly1305_mod_init); + +static void __exit neon_poly1305_mod_exit(void) +{ +} +module_exit(neon_poly1305_mod_exit); + +MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm64/sha1-ce-core.S b/lib/crypto/arm64/sha1-ce-core.S new file mode 100644 index 000000000000..21efbbafd7d6 --- /dev/null +++ b/lib/crypto/arm64/sha1-ce-core.S @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +	.text +	.arch		armv8-a+crypto + +	k0		.req	v0 +	k1		.req	v1 +	k2		.req	v2 +	k3		.req	v3 + +	t0		.req	v4 +	t1		.req	v5 + +	dga		.req	q6 +	dgav		.req	v6 +	dgb		.req	s7 +	dgbv		.req	v7 + +	dg0q		.req	q12 +	dg0s		.req	s12 +	dg0v		.req	v12 +	dg1s		.req	s13 +	dg1v		.req	v13 +	dg2s		.req	s14 + +	.macro		add_only, op, ev, rc, s0, dg1 +	.ifc		\ev, ev +	add		t1.4s, v\s0\().4s, \rc\().4s +	sha1h		dg2s, dg0s +	.ifnb		\dg1 +	sha1\op		dg0q, \dg1, t0.4s +	.else +	sha1\op		dg0q, dg1s, t0.4s +	.endif +	.else +	.ifnb		\s0 +	add		t0.4s, v\s0\().4s, \rc\().4s +	.endif +	sha1h		dg1s, dg0s +	sha1\op		dg0q, dg2s, t1.4s +	.endif +	.endm + +	.macro		add_update, op, ev, rc, s0, s1, s2, s3, dg1 +	sha1su0		v\s0\().4s, v\s1\().4s, v\s2\().4s +	add_only	\op, \ev, \rc, \s1, \dg1 +	sha1su1		v\s0\().4s, v\s3\().4s +	.endm + +	.macro		loadrc, k, val, tmp +	movz		\tmp, :abs_g0_nc:\val +	movk		\tmp, :abs_g1:\val +	dup		\k, \tmp +	.endm + +	/* +	 * size_t __sha1_ce_transform(struct sha1_block_state *state, +	 *			      const u8 *data, size_t nblocks); +	 */ 
+SYM_FUNC_START(__sha1_ce_transform) +	/* load round constants */ +	loadrc		k0.4s, 0x5a827999, w6 +	loadrc		k1.4s, 0x6ed9eba1, w6 +	loadrc		k2.4s, 0x8f1bbcdc, w6 +	loadrc		k3.4s, 0xca62c1d6, w6 + +	/* load state */ +	ld1		{dgav.4s}, [x0] +	ldr		dgb, [x0, #16] + +	/* load input */ +0:	ld1		{v8.4s-v11.4s}, [x1], #64 +	sub		x2, x2, #1 + +CPU_LE(	rev32		v8.16b, v8.16b		) +CPU_LE(	rev32		v9.16b, v9.16b		) +CPU_LE(	rev32		v10.16b, v10.16b	) +CPU_LE(	rev32		v11.16b, v11.16b	) + +	add		t0.4s, v8.4s, k0.4s +	mov		dg0v.16b, dgav.16b + +	add_update	c, ev, k0,  8,  9, 10, 11, dgb +	add_update	c, od, k0,  9, 10, 11,  8 +	add_update	c, ev, k0, 10, 11,  8,  9 +	add_update	c, od, k0, 11,  8,  9, 10 +	add_update	c, ev, k1,  8,  9, 10, 11 + +	add_update	p, od, k1,  9, 10, 11,  8 +	add_update	p, ev, k1, 10, 11,  8,  9 +	add_update	p, od, k1, 11,  8,  9, 10 +	add_update	p, ev, k1,  8,  9, 10, 11 +	add_update	p, od, k2,  9, 10, 11,  8 + +	add_update	m, ev, k2, 10, 11,  8,  9 +	add_update	m, od, k2, 11,  8,  9, 10 +	add_update	m, ev, k2,  8,  9, 10, 11 +	add_update	m, od, k2,  9, 10, 11,  8 +	add_update	m, ev, k3, 10, 11,  8,  9 + +	add_update	p, od, k3, 11,  8,  9, 10 +	add_only	p, ev, k3,  9 +	add_only	p, od, k3, 10 +	add_only	p, ev, k3, 11 +	add_only	p, od + +	/* update state */ +	add		dgbv.2s, dgbv.2s, dg1v.2s +	add		dgav.4s, dgav.4s, dg0v.4s + +	/* return early if voluntary preemption is needed */ +	cond_yield	1f, x5, x6 + +	/* handled all input blocks? */ +	cbnz		x2, 0b + +	/* store new state */ +1:	st1		{dgav.4s}, [x0] +	str		dgb, [x0, #16] +	mov		x0, x2 +	ret +SYM_FUNC_END(__sha1_ce_transform) diff --git a/lib/crypto/arm64/sha1.h b/lib/crypto/arm64/sha1.h new file mode 100644 index 000000000000..f822563538cc --- /dev/null +++ b/lib/crypto/arm64/sha1.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for ARM64 + * + * Copyright 2025 Google LLC + */ +#include <asm/neon.h> +#include <asm/simd.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +asmlinkage size_t __sha1_ce_transform(struct sha1_block_state *state, +				      const u8 *data, size_t nblocks); + +static void sha1_blocks(struct sha1_block_state *state, +			const u8 *data, size_t nblocks) +{ +	if (static_branch_likely(&have_ce) && likely(may_use_simd())) { +		do { +			size_t rem; + +			kernel_neon_begin(); +			rem = __sha1_ce_transform(state, data, nblocks); +			kernel_neon_end(); +			data += (nblocks - rem) * SHA1_BLOCK_SIZE; +			nblocks = rem; +		} while (nblocks); +	} else { +		sha1_blocks_generic(state, data, nblocks); +	} +} + +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ +	if (cpu_have_named_feature(SHA1)) +		static_branch_enable(&have_ce); +} diff --git a/lib/crypto/arm64/sha2-armv8.pl b/lib/crypto/arm64/sha2-armv8.pl new file mode 100644 index 000000000000..35ec9ae99fe1 --- /dev/null +++ b/lib/crypto/arm64/sha2-armv8.pl @@ -0,0 +1,786 @@ +#! /usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. 
+# +# Licensed under the OpenSSL license (the "License").  You may not use +# this file except in compliance with the License.  You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256/512 for ARMv8. +# +# Performance in cycles per processed byte and improvement coefficient +# over code generated with "default" compiler: +# +#		SHA256-hw	SHA256(*)	SHA512 +# Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**)) +# Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***)) +# Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***)) +# Denver	2.01		10.5 (+26%)	6.70 (+8%) +# X-Gene			20.0 (+100%)	12.8 (+300%(***)) +# Mongoose	2.36		13.0 (+50%)	8.36 (+33%) +# +# (*)	Software SHA256 results are of lesser relevance, presented +#	mostly for informational purposes. +# (**)	The result is a trade-off: it's possible to improve it by +#	10% (or by 1 cycle per round), but at the cost of 20% loss +#	on Cortex-A53 (or by 4 cycles per round). +# (***)	Super-impressive coefficients over gcc-generated code are +#	indication of some compiler "pathology", most notably code +#	generated with -mgeneral-regs-only is significantly faster +#	and the gap is only 40-90%. +# +# October 2016. +# +# Originally it was reckoned that it makes no sense to implement NEON +# version of SHA256 for 64-bit processors. This is because performance +# improvement on most wide-spread Cortex-A5x processors was observed +# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was +# observed that 32-bit NEON SHA256 performs significantly better than +# 64-bit scalar version on *some* of the more recent processors. As +# result 64-bit NEON version of SHA256 was added to provide best +# all-round performance. For example it executes ~30% faster on X-Gene +# and Mongoose. [For reference, NEON version of SHA512 is bound to +# deliver much less improvement, likely *negative* on Cortex-A5x. +# Which is why NEON support is limited to SHA256.] 
+ +$output=pop; +$flavour=pop; + +if ($flavour && $flavour ne "void") { +    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +    die "can't locate arm-xlate.pl"; + +    open OUT,"| \"$^X\" $xlate $flavour $output"; +    *STDOUT=*OUT; +} else { +    open STDOUT,">$output"; +} + +if ($output =~ /512/) { +	$BITS=512; +	$SZ=8; +	@Sigma0=(28,34,39); +	@Sigma1=(14,18,41); +	@sigma0=(1,  8, 7); +	@sigma1=(19,61, 6); +	$rounds=80; +	$reg_t="x"; +} else { +	$BITS=256; +	$SZ=4; +	@Sigma0=( 2,13,22); +	@Sigma1=( 6,11,25); +	@sigma0=( 7,18, 3); +	@sigma1=(17,19,10); +	$rounds=64; +	$reg_t="w"; +} + +$func="sha${BITS}_block_data_order"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +@X=map("$reg_t$_",(3..15,0..2)); +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); + +sub BODY_00_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my $j=($i+1)&15; +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); +   $T0=@X[$i+3] if ($i<11); + +$code.=<<___	if ($i<16); +#ifndef	__AARCH64EB__ +	rev	@X[$i],@X[$i]			// $i +#endif +___ +$code.=<<___	if ($i<13 && ($i&1)); +	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ +___ +$code.=<<___	if ($i==13); +	ldp	@X[14],@X[15],[$inp] +___ +$code.=<<___	if ($i>=14); +	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] +___ +$code.=<<___	if ($i>0 && $i<16); +	add	$a,$a,$t1			// h+=Sigma0(a) +___ +$code.=<<___	if ($i>=11); +	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] +___ +# While ARMv8 specifies merged rotate-n-logical operation such as +# 'eor x,y,z,ror#n', it was found to negatively affect performance +# on Apple A7. The reason seems to be that it requires even 'y' to +# be available earlier. This means that such merged instruction is +# not necessarily best choice on critical path... On the other hand +# Cortex-A5x handles merged instructions much better than disjoint +# rotate and logical... See (**) footnote above. 
+$code.=<<___	if ($i<15); +	ror	$t0,$e,#$Sigma1[0] +	add	$h,$h,$t2			// h+=K[i] +	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` +	and	$t1,$f,$e +	bic	$t2,$g,$e +	add	$h,$h,@X[$i&15]			// h+=X[i] +	orr	$t1,$t1,$t2			// Ch(e,f,g) +	eor	$t2,$a,$b			// a^b, b^c in next round +	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e) +	ror	$T0,$a,#$Sigma0[0] +	add	$h,$h,$t1			// h+=Ch(e,f,g) +	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` +	add	$h,$h,$t0			// h+=Sigma1(e) +	and	$t3,$t3,$t2			// (b^c)&=(a^b) +	add	$d,$d,$h			// d+=h +	eor	$t3,$t3,$b			// Maj(a,b,c) +	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a) +	add	$h,$h,$t3			// h+=Maj(a,b,c) +	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round +	//add	$h,$h,$t1			// h+=Sigma0(a) +___ +$code.=<<___	if ($i>=15); +	ror	$t0,$e,#$Sigma1[0] +	add	$h,$h,$t2			// h+=K[i] +	ror	$T1,@X[($j+1)&15],#$sigma0[0] +	and	$t1,$f,$e +	ror	$T2,@X[($j+14)&15],#$sigma1[0] +	bic	$t2,$g,$e +	ror	$T0,$a,#$Sigma0[0] +	add	$h,$h,@X[$i&15]			// h+=X[i] +	eor	$t0,$t0,$e,ror#$Sigma1[1] +	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1] +	orr	$t1,$t1,$t2			// Ch(e,f,g) +	eor	$t2,$a,$b			// a^b, b^c in next round +	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e) +	eor	$T0,$T0,$a,ror#$Sigma0[1] +	add	$h,$h,$t1			// h+=Ch(e,f,g) +	and	$t3,$t3,$t2			// (b^c)&=(a^b) +	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1] +	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1]) +	add	$h,$h,$t0			// h+=Sigma1(e) +	eor	$t3,$t3,$b			// Maj(a,b,c) +	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a) +	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14]) +	add	@X[$j],@X[$j],@X[($j+9)&15] +	add	$d,$d,$h			// d+=h +	add	$h,$h,$t3			// h+=Maj(a,b,c) +	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round +	add	@X[$j],@X[$j],$T1 +	add	$h,$h,$t1			// h+=Sigma0(a) +	add	@X[$j],@X[$j],$T2 +___ +	($t2,$t3)=($t3,$t2); +} + +$code.=<<___; +#ifndef	__KERNEL__ +# include "arm_arch.h" +#endif + +.text + +.extern	OPENSSL_armcap_P +.globl	$func +.type	$func,%function +.align	6 +$func: +___ +$code.=<<___	if ($SZ==4); +#ifndef	__KERNEL__ +# ifdef	__ILP32__ +	ldrsw	x16,.LOPENSSL_armcap_P +# else +	ldr	x16,.LOPENSSL_armcap_P +# endif +	adr	x17,.LOPENSSL_armcap_P +	add	x16,x16,x17 +	ldr	w16,[x16] +	tst	w16,#ARMV8_SHA256 +	b.ne	.Lv8_entry +	tst	w16,#ARMV7_NEON +	b.ne	.Lneon_entry +#endif +___ +$code.=<<___; +	stp	x29,x30,[sp,#-128]! 
+	add	x29,sp,#0 + +	stp	x19,x20,[sp,#16] +	stp	x21,x22,[sp,#32] +	stp	x23,x24,[sp,#48] +	stp	x25,x26,[sp,#64] +	stp	x27,x28,[sp,#80] +	sub	sp,sp,#4*$SZ + +	ldp	$A,$B,[$ctx]				// load context +	ldp	$C,$D,[$ctx,#2*$SZ] +	ldp	$E,$F,[$ctx,#4*$SZ] +	add	$num,$inp,$num,lsl#`log(16*$SZ)/log(2)`	// end of input +	ldp	$G,$H,[$ctx,#6*$SZ] +	adr	$Ktbl,.LK$BITS +	stp	$ctx,$num,[x29,#96] + +.Loop: +	ldp	@X[0],@X[1],[$inp],#2*$SZ +	ldr	$t2,[$Ktbl],#$SZ			// *K++ +	eor	$t3,$B,$C				// magic seed +	str	$inp,[x29,#112] +___ +for ($i=0;$i<16;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=".Loop_16_xx:\n"; +for (;$i<32;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +	cbnz	$t2,.Loop_16_xx + +	ldp	$ctx,$num,[x29,#96] +	ldr	$inp,[x29,#112] +	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`		// rewind + +	ldp	@X[0],@X[1],[$ctx] +	ldp	@X[2],@X[3],[$ctx,#2*$SZ] +	add	$inp,$inp,#14*$SZ			// advance input pointer +	ldp	@X[4],@X[5],[$ctx,#4*$SZ] +	add	$A,$A,@X[0] +	ldp	@X[6],@X[7],[$ctx,#6*$SZ] +	add	$B,$B,@X[1] +	add	$C,$C,@X[2] +	add	$D,$D,@X[3] +	stp	$A,$B,[$ctx] +	add	$E,$E,@X[4] +	add	$F,$F,@X[5] +	stp	$C,$D,[$ctx,#2*$SZ] +	add	$G,$G,@X[6] +	add	$H,$H,@X[7] +	cmp	$inp,$num +	stp	$E,$F,[$ctx,#4*$SZ] +	stp	$G,$H,[$ctx,#6*$SZ] +	b.ne	.Loop + +	ldp	x19,x20,[x29,#16] +	add	sp,sp,#4*$SZ +	ldp	x21,x22,[x29,#32] +	ldp	x23,x24,[x29,#48] +	ldp	x25,x26,[x29,#64] +	ldp	x27,x28,[x29,#80] +	ldp	x29,x30,[sp],#128 +	ret +.size	$func,.-$func + +.align	6 +.type	.LK$BITS,%object +.LK$BITS: +___ +$code.=<<___ if ($SZ==8); +	.quad	0x428a2f98d728ae22,0x7137449123ef65cd +	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +	.quad	0x3956c25bf348b538,0x59f111f1b605d019 +	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118 +	.quad	0xd807aa98a3030242,0x12835b0145706fbe +	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1 +	.quad	0x9bdc06a725c71235,0xc19bf174cf692694 +	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3 +	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483 +	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +	.quad	0x983e5152ee66dfab,0xa831c66d2db43210 +	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4 +	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725 +	.quad	0x06ca6351e003826f,0x142929670a0e6e70 +	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926 +	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df +	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8 +	.quad	0x81c2c92e47edaee6,0x92722c851482353b +	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001 +	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30 +	.quad	0xd192e819d6ef5218,0xd69906245565a910 +	.quad	0xf40e35855771202a,0x106aa07032bbd1b8 +	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53 +	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60 +	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec +	.quad	0x90befffa23631e28,0xa4506cebde82bde9 +	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b +	.quad	0xca273eceea26619c,0xd186b8c721c0c207 +	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6 +	.quad	0x113f9804bef90dae,0x1b710b35131c471b +	.quad	0x28db77f523047d84,0x32caab7b40c72493 +	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a +	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817 +	.quad	0	// terminator +___ +$code.=<<___ if ($SZ==4); +	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +	.long	
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +	.long	0	//terminator +___ +$code.=<<___; +.size	.LK$BITS,.-.LK$BITS +#ifndef	__KERNEL__ +.align	3 +.LOPENSSL_armcap_P: +# ifdef	__ILP32__ +	.long	OPENSSL_armcap_P-. +# else +	.quad	OPENSSL_armcap_P-. +# endif +#endif +.asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align	2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +#ifndef	__KERNEL__ +.type	sha256_block_armv8,%function +.align	6 +sha256_block_armv8: +.Lv8_entry: +	stp		x29,x30,[sp,#-16]! +	add		x29,sp,#0 + +	ld1.32		{$ABCD,$EFGH},[$ctx] +	adr		$Ktbl,.LK256 + +.Loop_hw: +	ld1		{@MSG[0]-@MSG[3]},[$inp],#64 +	sub		$num,$num,#1 +	ld1.32		{$W0},[$Ktbl],#16 +	rev32		@MSG[0],@MSG[0] +	rev32		@MSG[1],@MSG[1] +	rev32		@MSG[2],@MSG[2] +	rev32		@MSG[3],@MSG[3] +	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload +	orr		$EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; +	ld1.32		{$W1},[$Ktbl],#16 +	add.i32		$W0,$W0,@MSG[0] +	sha256su0	@MSG[0],@MSG[1] +	orr		$abcd,$ABCD,$ABCD +	sha256h		$ABCD,$EFGH,$W0 +	sha256h2	$EFGH,$abcd,$W0 +	sha256su1	@MSG[0],@MSG[2],@MSG[3] +___ +	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG)); +} +$code.=<<___; +	ld1.32		{$W1},[$Ktbl],#16 +	add.i32		$W0,$W0,@MSG[0] +	orr		$abcd,$ABCD,$ABCD +	sha256h		$ABCD,$EFGH,$W0 +	sha256h2	$EFGH,$abcd,$W0 + +	ld1.32		{$W0},[$Ktbl],#16 +	add.i32		$W1,$W1,@MSG[1] +	orr		$abcd,$ABCD,$ABCD +	sha256h		$ABCD,$EFGH,$W1 +	sha256h2	$EFGH,$abcd,$W1 + +	ld1.32		{$W1},[$Ktbl] +	add.i32		$W0,$W0,@MSG[2] +	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind +	orr		$abcd,$ABCD,$ABCD +	sha256h		$ABCD,$EFGH,$W0 +	sha256h2	$EFGH,$abcd,$W0 + +	add.i32		$W1,$W1,@MSG[3] +	orr		$abcd,$ABCD,$ABCD +	sha256h		$ABCD,$EFGH,$W1 +	sha256h2	$EFGH,$abcd,$W1 + +	add.i32		$ABCD,$ABCD,$ABCD_SAVE +	add.i32		$EFGH,$EFGH,$EFGH_SAVE + +	cbnz		$num,.Loop_hw + +	st1.32		{$ABCD,$EFGH},[$ctx] + +	ldr		x29,[sp],#16 +	ret +.size	sha256_block_armv8,.-sha256_block_armv8 +#endif +___ +} + +if ($SZ==4) {	######################################### NEON stuff # +# You'll surely note a lot of similarities with sha256-armv4 module, +# and of course it's not a coincidence. sha256-armv4 was used as +# initial template, but was adapted for ARMv8 instruction set and +# extensively re-tuned for all-round performance. 
+ +my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); +my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); +my $Ktbl="x16"; +my $Xfer="x17"; +my @X = map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); +my $j=0; + +sub AUTOLOAD()          # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; +  my $arg = pop; +    $arg = "#$arg" if ($arg*1 eq $arg); +    $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } +sub Dlo     { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } +sub Dhi     { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } + +sub Xupdate() +{ use integer; +  my $body = shift; +  my @insns = (&$body,&$body,&$body,&$body); +  my ($a,$b,$c,$d,$e,$f,$g,$h); + +	&ext_8		($T0,@X[0],@X[1],4);	# X[1..4] +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&ext_8		($T3,@X[2],@X[3],4);	# X[9..12] +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&mov		(&Dscalar($T7),&Dhi(@X[3]));	# X[14..15] +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&ushr_32	($T2,$T0,$sigma0[0]); +	 eval(shift(@insns)); +	&ushr_32	($T1,$T0,$sigma0[2]); +	 eval(shift(@insns)); +	&add_32 	(@X[0],@X[0],$T3);	# X[0..3] += X[9..12] +	 eval(shift(@insns)); +	&sli_32		($T2,$T0,32-$sigma0[0]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&ushr_32	($T3,$T0,$sigma0[1]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&eor_8		($T1,$T1,$T2); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&sli_32		($T3,$T0,32-$sigma0[1]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &ushr_32	($T4,$T7,$sigma1[0]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&eor_8		($T1,$T1,$T3);		# sigma0(X[1..4]) +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &sli_32	($T4,$T7,32-$sigma1[0]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &ushr_32	($T5,$T7,$sigma1[2]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &ushr_32	($T3,$T7,$sigma1[1]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&add_32		(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4]) +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &sli_u32	($T3,$T7,32-$sigma1[1]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &eor_8	($T5,$T5,$T4); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &eor_8	($T5,$T5,$T3);		# sigma1(X[14..15]) +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&add_32		(@X[0],@X[0],$T5);	# X[0..1] += sigma1(X[14..15]) +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &ushr_32	($T6,@X[0],$sigma1[0]); +	 eval(shift(@insns)); +	  &ushr_32	($T7,@X[0],$sigma1[2]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &sli_32	($T6,@X[0],32-$sigma1[0]); +	 eval(shift(@insns)); +	  &ushr_32	($T5,@X[0],$sigma1[1]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &eor_8	($T7,$T7,$T6); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	  &sli_32	($T5,@X[0],32-$sigma1[1]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&ld1_32		("{$T0}","[$Ktbl], #16"); +	 eval(shift(@insns)); +	  &eor_8	($T7,$T7,$T5);		# sigma1(X[16..17]) +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&eor_8		($T5,$T5,$T5); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&mov		(&Dhi($T5), &Dlo($T7)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&add_32		(@X[0],@X[0],$T5);	# X[2..3] += sigma1(X[16..17]) +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&add_32		($T0,$T0,@X[0]); +	 while($#insns>=1) { 
eval(shift(@insns)); } +	&st1_32		("{$T0}","[$Xfer], #16"); +	 eval(shift(@insns)); + +	push(@X,shift(@X));		# "rotate" X[] +} + +sub Xpreload() +{ use integer; +  my $body = shift; +  my @insns = (&$body,&$body,&$body,&$body); +  my ($a,$b,$c,$d,$e,$f,$g,$h); + +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&ld1_8		("{@X[0]}","[$inp],#16"); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&ld1_32		("{$T0}","[$Ktbl],#16"); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&rev32		(@X[0],@X[0]); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	 eval(shift(@insns)); +	&add_32		($T0,$T0,@X[0]); +	 foreach (@insns) { eval; }	# remaining instructions +	&st1_32		("{$T0}","[$Xfer], #16"); + +	push(@X,shift(@X));		# "rotate" X[] +} + +sub body_00_15 () { +	( +	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. +	'&add	($h,$h,$t1)',			# h+=X[i]+K[i] +	'&add	($a,$a,$t4);'.			# h+=Sigma0(a) from the past +	'&and	($t1,$f,$e)', +	'&bic	($t4,$g,$e)', +	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', +	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past +	'&orr	($t1,$t1,$t4)',			# Ch(e,f,g) +	'&eor	($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e) +	'&eor	($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', +	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g) +	'&ror	($t0,$t0,"#$Sigma1[0]")', +	'&eor	($t2,$a,$b)',			# a^b, b^c in next round +	'&eor	($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a) +	'&add	($h,$h,$t0)',			# h+=Sigma1(e) +	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'. +	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'. +	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b) +	'&ror	($t4,$t4,"#$Sigma0[0]")', +	'&add	($d,$d,$h)',			# d+=h +	'&eor	($t3,$t3,$b)',			# Maj(a,b,c) +	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' +	) +} + +$code.=<<___; +#ifdef	__KERNEL__ +.globl	sha256_block_neon +#endif +.type	sha256_block_neon,%function +.align	4 +sha256_block_neon: +.Lneon_entry: +	stp	x29, x30, [sp, #-16]! 
+	mov	x29, sp +	sub	sp,sp,#16*4 + +	adr	$Ktbl,.LK256 +	add	$num,$inp,$num,lsl#6	// len to point at the end of inp + +	ld1.8	{@X[0]},[$inp], #16 +	ld1.8	{@X[1]},[$inp], #16 +	ld1.8	{@X[2]},[$inp], #16 +	ld1.8	{@X[3]},[$inp], #16 +	ld1.32	{$T0},[$Ktbl], #16 +	ld1.32	{$T1},[$Ktbl], #16 +	ld1.32	{$T2},[$Ktbl], #16 +	ld1.32	{$T3},[$Ktbl], #16 +	rev32	@X[0],@X[0]		// yes, even on +	rev32	@X[1],@X[1]		// big-endian +	rev32	@X[2],@X[2] +	rev32	@X[3],@X[3] +	mov	$Xfer,sp +	add.32	$T0,$T0,@X[0] +	add.32	$T1,$T1,@X[1] +	add.32	$T2,$T2,@X[2] +	st1.32	{$T0-$T1},[$Xfer], #32 +	add.32	$T3,$T3,@X[3] +	st1.32	{$T2-$T3},[$Xfer] +	sub	$Xfer,$Xfer,#32 + +	ldp	$A,$B,[$ctx] +	ldp	$C,$D,[$ctx,#8] +	ldp	$E,$F,[$ctx,#16] +	ldp	$G,$H,[$ctx,#24] +	ldr	$t1,[sp,#0] +	mov	$t2,wzr +	eor	$t3,$B,$C +	mov	$t4,wzr +	b	.L_00_48 + +.align	4 +.L_00_48: +___ +	&Xupdate(\&body_00_15); +	&Xupdate(\&body_00_15); +	&Xupdate(\&body_00_15); +	&Xupdate(\&body_00_15); +$code.=<<___; +	cmp	$t1,#0				// check for K256 terminator +	ldr	$t1,[sp,#0] +	sub	$Xfer,$Xfer,#64 +	bne	.L_00_48 + +	sub	$Ktbl,$Ktbl,#256		// rewind $Ktbl +	cmp	$inp,$num +	mov	$Xfer, #64 +	csel	$Xfer, $Xfer, xzr, eq +	sub	$inp,$inp,$Xfer			// avoid SEGV +	mov	$Xfer,sp +___ +	&Xpreload(\&body_00_15); +	&Xpreload(\&body_00_15); +	&Xpreload(\&body_00_15); +	&Xpreload(\&body_00_15); +$code.=<<___; +	add	$A,$A,$t4			// h+=Sigma0(a) from the past +	ldp	$t0,$t1,[$ctx,#0] +	add	$A,$A,$t2			// h+=Maj(a,b,c) from the past +	ldp	$t2,$t3,[$ctx,#8] +	add	$A,$A,$t0			// accumulate +	add	$B,$B,$t1 +	ldp	$t0,$t1,[$ctx,#16] +	add	$C,$C,$t2 +	add	$D,$D,$t3 +	ldp	$t2,$t3,[$ctx,#24] +	add	$E,$E,$t0 +	add	$F,$F,$t1 +	 ldr	$t1,[sp,#0] +	stp	$A,$B,[$ctx,#0] +	add	$G,$G,$t2 +	 mov	$t2,wzr +	stp	$C,$D,[$ctx,#8] +	add	$H,$H,$t3 +	stp	$E,$F,[$ctx,#16] +	 eor	$t3,$B,$C +	stp	$G,$H,[$ctx,#24] +	 mov	$t4,wzr +	 mov	$Xfer,sp +	b.ne	.L_00_48 + +	ldr	x29,[x29] +	add	sp,sp,#16*4+16 +	ret +.size	sha256_block_neon,.-sha256_block_neon +___ +} + +$code.=<<___; +#ifndef	__KERNEL__ +.comm	OPENSSL_armcap_P,4,4 +#endif +___ + +{   my  %opcode = ( +	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000, +	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	); + +    sub unsha256 { +	my ($mnemonic,$arg)=@_; + +	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o +	&& +	sprintf ".inst\t0x%08x\t//%s %s", +			$opcode{$mnemonic}|$1|($2<<5)|($3<<16), +			$mnemonic,$arg; +    } +} + +open SELF,$0; +while(<SELF>) { +        next if (/^#!/); +        last if (!s/^#/\/\// and !/^$/); +        print; +} +close SELF; + +foreach(split("\n",$code)) { + +	s/\`([^\`]*)\`/eval($1)/ge; + +	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; + +	s/\bq([0-9]+)\b/v$1.16b/g;		# old->new registers + +	s/\.[ui]?8(\s)/$1/; +	s/\.\w?32\b//		and s/\.16b/\.4s/g; +	m/(ld|st)1[^\[]+\[0\]/	and s/\.4s/\.s/g; + +	print $_,"\n"; +} + +close STDOUT; diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S new file mode 100644 index 000000000000..b99d9589c421 --- /dev/null +++ b/lib/crypto/arm64/sha256-ce.S @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +	.text +	.arch		armv8-a+crypto + +	dga		.req	q20 +	dgav		.req	v20 +	dgb		.req	q21 +	dgbv		.req	v21 + +	t0		.req	v22 +	t1		.req	v23 + +	dg0q		.req	q24 +	dg0v		.req	v24 +	dg1q		.req	q25 +	dg1v		.req	v25 +	dg2q		.req	q26 +	dg2v		.req	v26 + +	
.macro		add_only, ev, rc, s0 +	mov		dg2v.16b, dg0v.16b +	.ifeq		\ev +	add		t1.4s, v\s0\().4s, \rc\().4s +	sha256h		dg0q, dg1q, t0.4s +	sha256h2	dg1q, dg2q, t0.4s +	.else +	.ifnb		\s0 +	add		t0.4s, v\s0\().4s, \rc\().4s +	.endif +	sha256h		dg0q, dg1q, t1.4s +	sha256h2	dg1q, dg2q, t1.4s +	.endif +	.endm + +	.macro		add_update, ev, rc, s0, s1, s2, s3 +	sha256su0	v\s0\().4s, v\s1\().4s +	add_only	\ev, \rc, \s1 +	sha256su1	v\s0\().4s, v\s2\().4s, v\s3\().4s +	.endm + +	/* +	 * The SHA-256 round constants +	 */ +	.section	".rodata", "a" +	.align		4 +.Lsha2_rcon: +	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 +	.word		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 +	.word		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 +	.word		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 +	.word		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc +	.word		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da +	.word		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 +	.word		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 +	.word		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 +	.word		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 +	.word		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 +	.word		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 +	.word		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 +	.word		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 +	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 +	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + +	/* +	 * size_t __sha256_ce_transform(struct sha256_block_state *state, +	 *				const u8 *data, size_t nblocks); +	 */ +	.text +SYM_FUNC_START(__sha256_ce_transform) +	/* load round constants */ +	adr_l		x8, .Lsha2_rcon +	ld1		{ v0.4s- v3.4s}, [x8], #64 +	ld1		{ v4.4s- v7.4s}, [x8], #64 +	ld1		{ v8.4s-v11.4s}, [x8], #64 +	ld1		{v12.4s-v15.4s}, [x8] + +	/* load state */ +	ld1		{dgav.4s, dgbv.4s}, [x0] + +	/* load input */ +0:	ld1		{v16.4s-v19.4s}, [x1], #64 +	sub		x2, x2, #1 + +CPU_LE(	rev32		v16.16b, v16.16b	) +CPU_LE(	rev32		v17.16b, v17.16b	) +CPU_LE(	rev32		v18.16b, v18.16b	) +CPU_LE(	rev32		v19.16b, v19.16b	) + +	add		t0.4s, v16.4s, v0.4s +	mov		dg0v.16b, dgav.16b +	mov		dg1v.16b, dgbv.16b + +	add_update	0,  v1, 16, 17, 18, 19 +	add_update	1,  v2, 17, 18, 19, 16 +	add_update	0,  v3, 18, 19, 16, 17 +	add_update	1,  v4, 19, 16, 17, 18 + +	add_update	0,  v5, 16, 17, 18, 19 +	add_update	1,  v6, 17, 18, 19, 16 +	add_update	0,  v7, 18, 19, 16, 17 +	add_update	1,  v8, 19, 16, 17, 18 + +	add_update	0,  v9, 16, 17, 18, 19 +	add_update	1, v10, 17, 18, 19, 16 +	add_update	0, v11, 18, 19, 16, 17 +	add_update	1, v12, 19, 16, 17, 18 + +	add_only	0, v13, 17 +	add_only	1, v14, 18 +	add_only	0, v15, 19 +	add_only	1 + +	/* update state */ +	add		dgav.4s, dgav.4s, dg0v.4s +	add		dgbv.4s, dgbv.4s, dg1v.4s + +	/* return early if voluntary preemption is needed */ +	cond_yield	1f, x5, x6 + +	/* handled all input blocks? 
*/ +	cbnz		x2, 0b + +	/* store new state */ +1:	st1		{dgav.4s, dgbv.4s}, [x0] +	mov		x0, x2 +	ret +SYM_FUNC_END(__sha256_ce_transform) diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h new file mode 100644 index 000000000000..a211966c124a --- /dev/null +++ b/lib/crypto/arm64/sha256.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized for ARM64 + * + * Copyright 2025 Google LLC + */ +#include <asm/neon.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> + +asmlinkage void sha256_block_data_order(struct sha256_block_state *state, +					const u8 *data, size_t nblocks); +asmlinkage void sha256_block_neon(struct sha256_block_state *state, +				  const u8 *data, size_t nblocks); +asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state, +					const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +static void sha256_blocks(struct sha256_block_state *state, +			  const u8 *data, size_t nblocks) +{ +	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && +	    static_branch_likely(&have_neon) && crypto_simd_usable()) { +		if (static_branch_likely(&have_ce)) { +			do { +				size_t rem; + +				kernel_neon_begin(); +				rem = __sha256_ce_transform(state, +							    data, nblocks); +				kernel_neon_end(); +				data += (nblocks - rem) * SHA256_BLOCK_SIZE; +				nblocks = rem; +			} while (nblocks); +		} else { +			kernel_neon_begin(); +			sha256_block_neon(state, data, nblocks); +			kernel_neon_end(); +		} +	} else { +		sha256_block_data_order(state, data, nblocks); +	} +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ +	if (cpu_have_named_feature(ASIMD)) { +		static_branch_enable(&have_neon); +		if (cpu_have_named_feature(SHA2)) +			static_branch_enable(&have_ce); +	} +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm64/sha512-ce-core.S b/lib/crypto/arm64/sha512-ce-core.S new file mode 100644 index 000000000000..22f1ded89bc8 --- /dev/null +++ b/lib/crypto/arm64/sha512-ce-core.S @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +	/* +	 * We have to specify the "sha3" feature here, since the GNU and clang +	 * assemblers both consider the SHA-512 instructions to be part of the +	 * "sha3" feature.  (Except binutils 2.30 through 2.42, which used +	 * "sha2".  But "sha3" implies "sha2", so "sha3" still works in those +	 * versions.)  "sha3" doesn't make a lot of sense, since SHA-512 is part +	 * of the SHA-2 family of algorithms, and also the Arm Architecture +	 * Reference Manual defines FEAT_SHA512 and FEAT_SHA3 separately. +	 * Regardless, we must use "sha3" to be compatible with the assemblers. 
+	 */ +	.arch		armv8-a+sha3 + +	/* +	 * The SHA-512 round constants +	 */ +	.section	".rodata", "a" +	.align		4 +.Lsha512_rcon: +	.quad		0x428a2f98d728ae22, 0x7137449123ef65cd +	.quad		0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc +	.quad		0x3956c25bf348b538, 0x59f111f1b605d019 +	.quad		0x923f82a4af194f9b, 0xab1c5ed5da6d8118 +	.quad		0xd807aa98a3030242, 0x12835b0145706fbe +	.quad		0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 +	.quad		0x72be5d74f27b896f, 0x80deb1fe3b1696b1 +	.quad		0x9bdc06a725c71235, 0xc19bf174cf692694 +	.quad		0xe49b69c19ef14ad2, 0xefbe4786384f25e3 +	.quad		0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 +	.quad		0x2de92c6f592b0275, 0x4a7484aa6ea6e483 +	.quad		0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 +	.quad		0x983e5152ee66dfab, 0xa831c66d2db43210 +	.quad		0xb00327c898fb213f, 0xbf597fc7beef0ee4 +	.quad		0xc6e00bf33da88fc2, 0xd5a79147930aa725 +	.quad		0x06ca6351e003826f, 0x142929670a0e6e70 +	.quad		0x27b70a8546d22ffc, 0x2e1b21385c26c926 +	.quad		0x4d2c6dfc5ac42aed, 0x53380d139d95b3df +	.quad		0x650a73548baf63de, 0x766a0abb3c77b2a8 +	.quad		0x81c2c92e47edaee6, 0x92722c851482353b +	.quad		0xa2bfe8a14cf10364, 0xa81a664bbc423001 +	.quad		0xc24b8b70d0f89791, 0xc76c51a30654be30 +	.quad		0xd192e819d6ef5218, 0xd69906245565a910 +	.quad		0xf40e35855771202a, 0x106aa07032bbd1b8 +	.quad		0x19a4c116b8d2d0c8, 0x1e376c085141ab53 +	.quad		0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 +	.quad		0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb +	.quad		0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 +	.quad		0x748f82ee5defb2fc, 0x78a5636f43172f60 +	.quad		0x84c87814a1f0ab72, 0x8cc702081a6439ec +	.quad		0x90befffa23631e28, 0xa4506cebde82bde9 +	.quad		0xbef9a3f7b2c67915, 0xc67178f2e372532b +	.quad		0xca273eceea26619c, 0xd186b8c721c0c207 +	.quad		0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 +	.quad		0x06f067aa72176fba, 0x0a637dc5a2c898a6 +	.quad		0x113f9804bef90dae, 0x1b710b35131c471b +	.quad		0x28db77f523047d84, 0x32caab7b40c72493 +	.quad		0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c +	.quad		0x4cc5d4becb3e42b6, 0x597f299cfc657e2a +	.quad		0x5fcb6fab3ad6faec, 0x6c44198c4a475817 + +	.macro		dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4 +	.ifnb		\rc1 +	ld1		{v\rc1\().2d}, [x4], #16 +	.endif +	add		v5.2d, v\rc0\().2d, v\in0\().2d +	ext		v6.16b, v\i2\().16b, v\i3\().16b, #8 +	ext		v5.16b, v5.16b, v5.16b, #8 +	ext		v7.16b, v\i1\().16b, v\i2\().16b, #8 +	add		v\i3\().2d, v\i3\().2d, v5.2d +	.ifnb		\in1 +	ext		v5.16b, v\in3\().16b, v\in4\().16b, #8 +	sha512su0	v\in0\().2d, v\in1\().2d +	.endif +	sha512h		q\i3, q6, v7.2d +	.ifnb		\in1 +	sha512su1	v\in0\().2d, v\in2\().2d, v5.2d +	.endif +	add		v\i4\().2d, v\i1\().2d, v\i3\().2d +	sha512h2	q\i3, q\i1, v\i0\().2d +	.endm + +	/* +	 * size_t __sha512_ce_transform(struct sha512_block_state *state, +	 *				const u8 *data, size_t nblocks); +	 */ +	.text +SYM_FUNC_START(__sha512_ce_transform) +	/* load state */ +	ld1		{v8.2d-v11.2d}, [x0] + +	/* load first 4 round constants */ +	adr_l		x3, .Lsha512_rcon +	ld1		{v20.2d-v23.2d}, [x3], #64 + +	/* load input */ +0:	ld1		{v12.2d-v15.2d}, [x1], #64 +	ld1		{v16.2d-v19.2d}, [x1], #64 +	sub		x2, x2, #1 + +CPU_LE(	rev64		v12.16b, v12.16b	) +CPU_LE(	rev64		v13.16b, v13.16b	) +CPU_LE(	rev64		v14.16b, v14.16b	) +CPU_LE(	rev64		v15.16b, v15.16b	) +CPU_LE(	rev64		v16.16b, v16.16b	) +CPU_LE(	rev64		v17.16b, v17.16b	) +CPU_LE(	rev64		v18.16b, v18.16b	) +CPU_LE(	rev64		v19.16b, v19.16b	) + +	mov		x4, x3				// rc pointer + +	mov		v0.16b, v8.16b +	mov		v1.16b, v9.16b +	mov		v2.16b, v10.16b +	mov		v3.16b, v11.16b + +	// v0  ab  cd  --  ef  gh  ab +	// v1  cd  --  ef  gh  ab  
cd +	// v2  ef  gh  ab  cd  --  ef +	// v3  gh  ab  cd  --  ef  gh +	// v4  --  ef  gh  ab  cd  -- + +	dround		0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17 +	dround		3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18 +	dround		2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19 +	dround		4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12 +	dround		1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13 + +	dround		0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14 +	dround		3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15 +	dround		2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16 +	dround		4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17 +	dround		1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18 + +	dround		0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19 +	dround		3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12 +	dround		2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13 +	dround		4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14 +	dround		1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15 + +	dround		0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16 +	dround		3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17 +	dround		2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18 +	dround		4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19 +	dround		1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12 + +	dround		0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13 +	dround		3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14 +	dround		2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15 +	dround		4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16 +	dround		1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17 + +	dround		0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18 +	dround		3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19 +	dround		2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12 +	dround		4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13 +	dround		1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14 + +	dround		0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15 +	dround		3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16 +	dround		2, 3, 1, 4, 0, 28, 24, 12 +	dround		4, 2, 0, 1, 3, 29, 25, 13 +	dround		1, 4, 3, 0, 2, 30, 26, 14 + +	dround		0, 1, 2, 3, 4, 31, 27, 15 +	dround		3, 0, 4, 2, 1, 24,   , 16 +	dround		2, 3, 1, 4, 0, 25,   , 17 +	dround		4, 2, 0, 1, 3, 26,   , 18 +	dround		1, 4, 3, 0, 2, 27,   , 19 + +	/* update state */ +	add		v8.2d, v8.2d, v0.2d +	add		v9.2d, v9.2d, v1.2d +	add		v10.2d, v10.2d, v2.2d +	add		v11.2d, v11.2d, v3.2d + +	cond_yield	3f, x4, x5 +	/* handled all input blocks? 
*/ +	cbnz		x2, 0b + +	/* store new state */ +3:	st1		{v8.2d-v11.2d}, [x0] +	mov		x0, x2 +	ret +SYM_FUNC_END(__sha512_ce_transform) diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h new file mode 100644 index 000000000000..6abb40b467f2 --- /dev/null +++ b/lib/crypto/arm64/sha512.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * arm64-optimized SHA-512 block function + * + * Copyright 2025 Google LLC + */ + +#include <asm/neon.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_insns); + +asmlinkage void sha512_block_data_order(struct sha512_block_state *state, +					const u8 *data, size_t nblocks); +asmlinkage size_t __sha512_ce_transform(struct sha512_block_state *state, +					const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, +			  const u8 *data, size_t nblocks) +{ +	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && +	    static_branch_likely(&have_sha512_insns) && +	    likely(crypto_simd_usable())) { +		do { +			size_t rem; + +			kernel_neon_begin(); +			rem = __sha512_ce_transform(state, data, nblocks); +			kernel_neon_end(); +			data += (nblocks - rem) * SHA512_BLOCK_SIZE; +			nblocks = rem; +		} while (nblocks); +	} else { +		sha512_block_data_order(state, data, nblocks); +	} +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ +	if (cpu_have_named_feature(SHA512)) +		static_branch_enable(&have_sha512_insns); +} +#endif /* CONFIG_KERNEL_MODE_NEON */  | 
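
The sha1.h, sha256.h, and sha512.h glue headers above all export a *_blocks() helper that consumes whole blocks only: the arm64 code advances the chaining value by nblocks blocks and, when the Crypto Extensions path yields for preemption, returns how many blocks remain so the C loop can resume them. Splitting arbitrary-length input into whole blocks and buffering the tail is left to the generic lib/crypto layer. As a rough illustration of how a caller might drive sha256_blocks(), here is a minimal C sketch of an update routine; the sha256_example_ctx structure, the sha256_example_update() name, and the assumption that struct sha256_block_state carries only the chaining words are inventions for this example, and the real generic code in lib/crypto may differ.

#include <crypto/sha2.h>	/* SHA256_BLOCK_SIZE; block-state type (assumed) */
#include <linux/string.h>	/* memcpy() */
#include <linux/types.h>	/* u8, size_t */

/* Hypothetical example context: chaining value plus a partial-block buffer. */
struct sha256_example_ctx {
	struct sha256_block_state state;
	u8 buf[SHA256_BLOCK_SIZE];
	size_t buflen;				/* bytes currently buffered */
};

static void sha256_example_update(struct sha256_example_ctx *ctx,
				  const u8 *data, size_t len)
{
	/* Top up a previously buffered partial block first. */
	if (ctx->buflen) {
		size_t n = SHA256_BLOCK_SIZE - ctx->buflen;

		if (n > len)
			n = len;
		memcpy(&ctx->buf[ctx->buflen], data, n);
		ctx->buflen += n;
		data += n;
		len -= n;
		if (ctx->buflen < SHA256_BLOCK_SIZE)
			return;			/* still not a full block */
		sha256_blocks(&ctx->state, ctx->buf, 1);
		ctx->buflen = 0;
	}

	/* Hash all remaining whole blocks directly from the input. */
	if (len >= SHA256_BLOCK_SIZE) {
		size_t nblocks = len / SHA256_BLOCK_SIZE;

		sha256_blocks(&ctx->state, data, nblocks);
		data += nblocks * SHA256_BLOCK_SIZE;
		len -= nblocks * SHA256_BLOCK_SIZE;
	}

	/* Stash any trailing partial block for the next call. */
	memcpy(ctx->buf, data, len);
	ctx->buflen = len;
}

Keeping the arch helpers block-only means the kernel_neon_begin()/kernel_neon_end() sections cover exactly the SIMD work, and the same buffering logic can be shared by every architecture's sha256_blocks() implementation.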
