diff options
Diffstat (limited to 'lib/crypto/arm')
| -rw-r--r-- | lib/crypto/arm/blake2b-neon-core.S | 350 | ||||
| -rw-r--r-- | lib/crypto/arm/blake2b.h | 40 | ||||
| -rw-r--r-- | lib/crypto/arm/blake2s-core.S | 22 | ||||
| -rw-r--r-- | lib/crypto/arm/blake2s.h | 4 | ||||
| -rw-r--r-- | lib/crypto/arm/chacha.h | 11 | ||||
| -rw-r--r-- | lib/crypto/arm/curve25519.h | 5 | ||||
| -rw-r--r-- | lib/crypto/arm/poly1305.h | 6 | ||||
| -rw-r--r-- | lib/crypto/arm/sha1-armv7-neon.S | 2 | ||||
| -rw-r--r-- | lib/crypto/arm/sha1-ce-core.S | 2 | ||||
| -rw-r--r-- | lib/crypto/arm/sha1.h | 13 | ||||
| -rw-r--r-- | lib/crypto/arm/sha256-ce.S | 2 | ||||
| -rw-r--r-- | lib/crypto/arm/sha256.h | 12 | ||||
| -rw-r--r-- | lib/crypto/arm/sha512.h | 5 |
13 files changed, 428 insertions, 46 deletions
diff --git a/lib/crypto/arm/blake2b-neon-core.S b/lib/crypto/arm/blake2b-neon-core.S new file mode 100644 index 000000000000..b55c37f0b88f --- /dev/null +++ b/lib/crypto/arm/blake2b-neon-core.S @@ -0,0 +1,350 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * BLAKE2b digest algorithm optimized with ARM NEON instructions. On ARM + * processors that have NEON support but not the ARMv8 Crypto Extensions, + * typically this BLAKE2b implementation is much faster than the SHA-2 family + * and slightly faster than SHA-1. + * + * Copyright 2020 Google LLC + * + * Author: Eric Biggers <ebiggers@google.com> + */ + +#include <linux/linkage.h> + + .text + .fpu neon + + // The arguments to blake2b_compress_neon() + CTX .req r0 + DATA .req r1 + NBLOCKS .req r2 + INC .req r3 + + // Pointers to the rotation tables + ROR24_TABLE .req r4 + ROR16_TABLE .req r5 + + // The original stack pointer + ORIG_SP .req r6 + + // NEON registers which contain the message words of the current block. + // M_0-M_3 are occasionally used for other purposes too. + M_0 .req d16 + M_1 .req d17 + M_2 .req d18 + M_3 .req d19 + M_4 .req d20 + M_5 .req d21 + M_6 .req d22 + M_7 .req d23 + M_8 .req d24 + M_9 .req d25 + M_10 .req d26 + M_11 .req d27 + M_12 .req d28 + M_13 .req d29 + M_14 .req d30 + M_15 .req d31 + + .align 4 + // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8 + // instruction. This is the most efficient way to implement these + // rotation amounts with NEON. (On Cortex-A53 it's the same speed as + // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.) +.Lror24_table: + .byte 3, 4, 5, 6, 7, 0, 1, 2 +.Lror16_table: + .byte 2, 3, 4, 5, 6, 7, 0, 1 + // The BLAKE2b initialization vector +.Lblake2b_IV: + .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b + .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 + .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f + .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 + +// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the +// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack +// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9 +// (M_0-M_3), so that they can be reloaded if they are used as temporary +// registers. The macro arguments s0-s15 give the order in which the message +// words are used in this round. 'final' is 1 if this is the final round. +.macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \ + s8, s9, s10, s11, s12, s13, s14, s15, final=0 + + // Mix the columns: + // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]), + // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]). + + // a += b + m[blake2b_sigma[r][2*i + 0]]; + vadd.u64 q0, q0, q2 + vadd.u64 q1, q1, q3 + vadd.u64 d0, d0, M_\s0 + vadd.u64 d1, d1, M_\s2 + vadd.u64 d2, d2, M_\s4 + vadd.u64 d3, d3, M_\s6 + + // d = ror64(d ^ a, 32); + veor q6, q6, q0 + veor q7, q7, q1 + vrev64.32 q6, q6 + vrev64.32 q7, q7 + + // c += d; + vadd.u64 q4, q4, q6 + vadd.u64 q5, q5, q7 + + // b = ror64(b ^ c, 24); + vld1.8 {M_0}, [ROR24_TABLE, :64] + veor q2, q2, q4 + veor q3, q3, q5 + vtbl.8 d4, {d4}, M_0 + vtbl.8 d5, {d5}, M_0 + vtbl.8 d6, {d6}, M_0 + vtbl.8 d7, {d7}, M_0 + + // a += b + m[blake2b_sigma[r][2*i + 1]]; + // + // M_0 got clobbered above, so we have to reload it if any of the four + // message words this step needs happens to be M_0. Otherwise we don't + // need to reload it here, as it will just get clobbered again below. +.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0 + vld1.8 {M_0}, [sp, :64] +.endif + vadd.u64 q0, q0, q2 + vadd.u64 q1, q1, q3 + vadd.u64 d0, d0, M_\s1 + vadd.u64 d1, d1, M_\s3 + vadd.u64 d2, d2, M_\s5 + vadd.u64 d3, d3, M_\s7 + + // d = ror64(d ^ a, 16); + vld1.8 {M_0}, [ROR16_TABLE, :64] + veor q6, q6, q0 + veor q7, q7, q1 + vtbl.8 d12, {d12}, M_0 + vtbl.8 d13, {d13}, M_0 + vtbl.8 d14, {d14}, M_0 + vtbl.8 d15, {d15}, M_0 + + // c += d; + vadd.u64 q4, q4, q6 + vadd.u64 q5, q5, q7 + + // b = ror64(b ^ c, 63); + // + // This rotation amount isn't a multiple of 8, so it has to be + // implemented using a pair of shifts, which requires temporary + // registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards. + veor q8, q2, q4 + veor q9, q3, q5 + vshr.u64 q2, q8, #63 + vshr.u64 q3, q9, #63 + vsli.u64 q2, q8, #1 + vsli.u64 q3, q9, #1 + vld1.8 {q8-q9}, [sp, :256] + + // Mix the diagonals: + // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]), + // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]). + // + // There are two possible ways to do this: use 'vext' instructions to + // shift the rows of the matrix so that the diagonals become columns, + // and undo it afterwards; or just use 64-bit operations on 'd' + // registers instead of 128-bit operations on 'q' registers. We use the + // latter approach, as it performs much better on Cortex-A7. + + // a += b + m[blake2b_sigma[r][2*i + 0]]; + vadd.u64 d0, d0, d5 + vadd.u64 d1, d1, d6 + vadd.u64 d2, d2, d7 + vadd.u64 d3, d3, d4 + vadd.u64 d0, d0, M_\s8 + vadd.u64 d1, d1, M_\s10 + vadd.u64 d2, d2, M_\s12 + vadd.u64 d3, d3, M_\s14 + + // d = ror64(d ^ a, 32); + veor d15, d15, d0 + veor d12, d12, d1 + veor d13, d13, d2 + veor d14, d14, d3 + vrev64.32 d15, d15 + vrev64.32 d12, d12 + vrev64.32 d13, d13 + vrev64.32 d14, d14 + + // c += d; + vadd.u64 d10, d10, d15 + vadd.u64 d11, d11, d12 + vadd.u64 d8, d8, d13 + vadd.u64 d9, d9, d14 + + // b = ror64(b ^ c, 24); + vld1.8 {M_0}, [ROR24_TABLE, :64] + veor d5, d5, d10 + veor d6, d6, d11 + veor d7, d7, d8 + veor d4, d4, d9 + vtbl.8 d5, {d5}, M_0 + vtbl.8 d6, {d6}, M_0 + vtbl.8 d7, {d7}, M_0 + vtbl.8 d4, {d4}, M_0 + + // a += b + m[blake2b_sigma[r][2*i + 1]]; +.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0 + vld1.8 {M_0}, [sp, :64] +.endif + vadd.u64 d0, d0, d5 + vadd.u64 d1, d1, d6 + vadd.u64 d2, d2, d7 + vadd.u64 d3, d3, d4 + vadd.u64 d0, d0, M_\s9 + vadd.u64 d1, d1, M_\s11 + vadd.u64 d2, d2, M_\s13 + vadd.u64 d3, d3, M_\s15 + + // d = ror64(d ^ a, 16); + vld1.8 {M_0}, [ROR16_TABLE, :64] + veor d15, d15, d0 + veor d12, d12, d1 + veor d13, d13, d2 + veor d14, d14, d3 + vtbl.8 d12, {d12}, M_0 + vtbl.8 d13, {d13}, M_0 + vtbl.8 d14, {d14}, M_0 + vtbl.8 d15, {d15}, M_0 + + // c += d; + vadd.u64 d10, d10, d15 + vadd.u64 d11, d11, d12 + vadd.u64 d8, d8, d13 + vadd.u64 d9, d9, d14 + + // b = ror64(b ^ c, 63); + veor d16, d4, d9 + veor d17, d5, d10 + veor d18, d6, d11 + veor d19, d7, d8 + vshr.u64 q2, q8, #63 + vshr.u64 q3, q9, #63 + vsli.u64 q2, q8, #1 + vsli.u64 q3, q9, #1 + // Reloading q8-q9 can be skipped on the final round. +.if ! \final + vld1.8 {q8-q9}, [sp, :256] +.endif +.endm + +// +// void blake2b_compress_neon(struct blake2b_ctx *ctx, +// const u8 *data, size_t nblocks, u32 inc); +// +// Only the first three fields of struct blake2b_ctx are used: +// u64 h[8]; (inout) +// u64 t[2]; (inout) +// u64 f[2]; (in) +// + .align 5 +ENTRY(blake2b_compress_neon) + push {r4-r10} + + // Allocate a 32-byte stack buffer that is 32-byte aligned. + mov ORIG_SP, sp + sub ip, sp, #32 + bic ip, ip, #31 + mov sp, ip + + adr ROR24_TABLE, .Lror24_table + adr ROR16_TABLE, .Lror16_table + + mov ip, CTX + vld1.64 {q0-q1}, [ip]! // Load h[0..3] + vld1.64 {q2-q3}, [ip]! // Load h[4..7] +.Lnext_block: + adr r10, .Lblake2b_IV + vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1] + vld1.64 {q4-q5}, [r10]! // Load IV[0..3] + vmov r7, r8, d28 // Copy t[0] to (r7, r8) + vld1.64 {q6-q7}, [r10] // Load IV[4..7] + adds r7, r7, INC // Increment counter + bcs .Lslow_inc_ctr + vmov.i32 d28[0], r7 + vst1.64 {d28}, [ip] // Update t[0] +.Linc_ctr_done: + + // Load the next message block and finish initializing the state matrix + // 'v'. Fortunately, there are exactly enough NEON registers to fit the + // entire state matrix in q0-q7 and the entire message block in q8-15. + // + // However, _blake2b_round also needs some extra registers for rotates, + // so we have to spill some registers. It's better to spill the message + // registers than the state registers, as the message doesn't change. + // Therefore we store a copy of the first 32 bytes of the message block + // (q8-q9) in an aligned buffer on the stack so that they can be + // reloaded when needed. (We could just reload directly from the + // message buffer, but it's faster to use aligned loads.) + vld1.8 {q8-q9}, [DATA]! + veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1] + vld1.8 {q10-q11}, [DATA]! + veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1] + vld1.8 {q12-q13}, [DATA]! + vst1.8 {q8-q9}, [sp, :256] + mov ip, CTX + vld1.8 {q14-q15}, [DATA]! + + // Execute the rounds. Each round is provided the order in which it + // needs to use the message words. + _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 + _blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 + _blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 + _blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 + _blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 + _blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 + _blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 + _blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 + _blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 + _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \ + final=1 + + // Fold the final state matrix into the hash chaining value: + // + // for (i = 0; i < 8; i++) + // h[i] ^= v[i] ^ v[i + 8]; + // + vld1.64 {q8-q9}, [ip]! // Load old h[0..3] + veor q0, q0, q4 // v[0..1] ^= v[8..9] + veor q1, q1, q5 // v[2..3] ^= v[10..11] + vld1.64 {q10-q11}, [ip] // Load old h[4..7] + veor q2, q2, q6 // v[4..5] ^= v[12..13] + veor q3, q3, q7 // v[6..7] ^= v[14..15] + veor q0, q0, q8 // v[0..1] ^= h[0..1] + veor q1, q1, q9 // v[2..3] ^= h[2..3] + mov ip, CTX + subs NBLOCKS, NBLOCKS, #1 // nblocks-- + vst1.64 {q0-q1}, [ip]! // Store new h[0..3] + veor q2, q2, q10 // v[4..5] ^= h[4..5] + veor q3, q3, q11 // v[6..7] ^= h[6..7] + vst1.64 {q2-q3}, [ip]! // Store new h[4..7] + + // Advance to the next block, if there is one. + bne .Lnext_block // nblocks != 0? + + mov sp, ORIG_SP + pop {r4-r10} + mov pc, lr + +.Lslow_inc_ctr: + // Handle the case where the counter overflowed its low 32 bits, by + // carrying the overflow bit into the full 128-bit counter. + vmov r9, r10, d29 + adcs r8, r8, #0 + adcs r9, r9, #0 + adc r10, r10, #0 + vmov d28, r7, r8 + vmov d29, r9, r10 + vst1.64 {q14}, [ip] // Update t[0] and t[1] + b .Linc_ctr_done +ENDPROC(blake2b_compress_neon) diff --git a/lib/crypto/arm/blake2b.h b/lib/crypto/arm/blake2b.h new file mode 100644 index 000000000000..5c76498521e6 --- /dev/null +++ b/lib/crypto/arm/blake2b.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * BLAKE2b digest algorithm, NEON accelerated + * + * Copyright 2020 Google LLC + */ + +#include <asm/neon.h> +#include <asm/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +asmlinkage void blake2b_compress_neon(struct blake2b_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc); + +static void blake2b_compress(struct blake2b_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc) +{ + if (!static_branch_likely(&have_neon) || !may_use_simd()) { + blake2b_compress_generic(ctx, data, nblocks, inc); + return; + } + do { + const size_t blocks = min_t(size_t, nblocks, + SZ_4K / BLAKE2B_BLOCK_SIZE); + + scoped_ksimd() + blake2b_compress_neon(ctx, data, blocks, inc); + + data += blocks * BLAKE2B_BLOCK_SIZE; + nblocks -= blocks; + } while (nblocks); +} + +#define blake2b_mod_init_arch blake2b_mod_init_arch +static void blake2b_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) + static_branch_enable(&have_neon); +} diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S index 293f44fa8f31..933f0558b7cd 100644 --- a/lib/crypto/arm/blake2s-core.S +++ b/lib/crypto/arm/blake2s-core.S @@ -115,7 +115,7 @@ // Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9] // are in r0..r9. The stack pointer points to 8 bytes of scratch space for -// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and +// spilling v[8..9], then to v[10..15], then to the message block. r10-r12 and // r14 are free to use. The macro arguments s0-s15 give the order in which the // message words are used in this round. // @@ -170,10 +170,10 @@ .endm // -// void blake2s_compress(struct blake2s_state *state, -// const u8 *block, size_t nblocks, u32 inc); +// void blake2s_compress(struct blake2s_ctx *ctx, +// const u8 *data, size_t nblocks, u32 inc); // -// Only the first three fields of struct blake2s_state are used: +// Only the first three fields of struct blake2s_ctx are used: // u32 h[8]; (inout) // u32 t[2]; (inout) // u32 f[2]; (in) @@ -183,8 +183,8 @@ ENTRY(blake2s_compress) push {r0-r2,r4-r11,lr} // keep this an even number .Lnext_block: - // r0 is 'state' - // r1 is 'block' + // r0 is 'ctx' + // r1 is 'data' // r3 is 'inc' // Load and increment the counter t[0..1]. @@ -209,18 +209,18 @@ ENTRY(blake2s_compress) .Lcopy_block_done: str r1, [sp, #68] // Update message pointer - // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space + // Calculate v[8..15]. Push v[10..15] onto the stack, and leave space // for spilling v[8..9]. Leave v[8..9] in r8-r9. - mov r14, r0 // r14 = state + mov r14, r0 // r14 = ctx adr r12, .Lblake2s_IV ldmia r12!, {r8-r9} // load IV[0..1] __ldrd r0, r1, r14, 40 // load f[0..1] - ldm r12, {r2-r7} // load IV[3..7] + ldm r12, {r2-r7} // load IV[2..7] eor r4, r4, r10 // v[12] = IV[4] ^ t[0] eor r5, r5, r11 // v[13] = IV[5] ^ t[1] eor r6, r6, r0 // v[14] = IV[6] ^ f[0] eor r7, r7, r1 // v[15] = IV[7] ^ f[1] - push {r2-r7} // push v[9..15] + push {r2-r7} // push v[10..15] sub sp, sp, #8 // leave space for v[8..9] // Load h[0..7] == v[0..7]. @@ -275,7 +275,7 @@ ENTRY(blake2s_compress) // Advance to the next block, if there is one. Note that if there are // multiple blocks, then 'inc' (the counter increment amount) must be // 64. So we can simply set it to 64 without re-loading it. - ldm sp, {r0, r1, r2} // load (state, block, nblocks) + ldm sp, {r0, r1, r2} // load (ctx, data, nblocks) mov r3, #64 // set 'inc' subs r2, r2, #1 // nblocks-- str r2, [sp, #8] diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h index aa7a97139ea7..42c04440c191 100644 --- a/lib/crypto/arm/blake2s.h +++ b/lib/crypto/arm/blake2s.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* defined in blake2s-core.S */ -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, u32 inc); +void blake2s_compress(struct blake2s_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc); diff --git a/lib/crypto/arm/chacha.h b/lib/crypto/arm/chacha.h index 0cae30f8ee5d..836e49088e98 100644 --- a/lib/crypto/arm/chacha.h +++ b/lib/crypto/arm/chacha.h @@ -12,7 +12,6 @@ #include <asm/cputype.h> #include <asm/hwcap.h> -#include <asm/neon.h> #include <asm/simd.h> asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, @@ -68,9 +67,8 @@ static void hchacha_block_arch(const struct chacha_state *state, if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { hchacha_block_arm(state, out, nrounds); } else { - kernel_neon_begin(); - hchacha_block_neon(state, out, nrounds); - kernel_neon_end(); + scoped_ksimd() + hchacha_block_neon(state, out, nrounds); } } @@ -87,9 +85,8 @@ static void chacha_crypt_arch(struct chacha_state *state, u8 *dst, do { unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - kernel_neon_begin(); - chacha_doneon(state, dst, src, todo, nrounds); - kernel_neon_end(); + scoped_ksimd() + chacha_doneon(state, dst, src, todo, nrounds); bytes -= todo; src += todo; diff --git a/lib/crypto/arm/curve25519.h b/lib/crypto/arm/curve25519.h index f6d66494eb8f..b1a566885e95 100644 --- a/lib/crypto/arm/curve25519.h +++ b/lib/crypto/arm/curve25519.h @@ -25,9 +25,8 @@ static void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], const u8 point[CURVE25519_KEY_SIZE]) { if (static_branch_likely(&have_neon) && crypto_simd_usable()) { - kernel_neon_begin(); - curve25519_neon(out, scalar, point); - kernel_neon_end(); + scoped_ksimd() + curve25519_neon(out, scalar, point); } else { curve25519_generic(out, scalar, point); } diff --git a/lib/crypto/arm/poly1305.h b/lib/crypto/arm/poly1305.h index 0021cf368307..0fe903d8de55 100644 --- a/lib/crypto/arm/poly1305.h +++ b/lib/crypto/arm/poly1305.h @@ -6,7 +6,6 @@ */ #include <asm/hwcap.h> -#include <asm/neon.h> #include <asm/simd.h> #include <linux/cpufeature.h> #include <linux/jump_label.h> @@ -32,9 +31,8 @@ static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src, do { unsigned int todo = min_t(unsigned int, len, SZ_4K); - kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, padbit); - kernel_neon_end(); + scoped_ksimd() + poly1305_blocks_neon(state, src, todo, padbit); len -= todo; src += todo; diff --git a/lib/crypto/arm/sha1-armv7-neon.S b/lib/crypto/arm/sha1-armv7-neon.S index 6edba3ab62e8..a0323fa5c58a 100644 --- a/lib/crypto/arm/sha1-armv7-neon.S +++ b/lib/crypto/arm/sha1-armv7-neon.S @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function +/* ARM/NEON accelerated SHA-1 transform function * * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> */ diff --git a/lib/crypto/arm/sha1-ce-core.S b/lib/crypto/arm/sha1-ce-core.S index 2de40dd25e47..7d6b2631ca8d 100644 --- a/lib/crypto/arm/sha1-ce-core.S +++ b/lib/crypto/arm/sha1-ce-core.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * SHA-1 secure hash using ARMv8 Crypto Extensions * * Copyright (C) 2015 Linaro Ltd. * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h index 29f8bcad0447..3e2d8c7cab9f 100644 --- a/lib/crypto/arm/sha1.h +++ b/lib/crypto/arm/sha1.h @@ -4,7 +4,6 @@ * * Copyright 2025 Google LLC */ -#include <asm/neon.h> #include <asm/simd.h> static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); @@ -22,12 +21,12 @@ static void sha1_blocks(struct sha1_block_state *state, { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && static_branch_likely(&have_neon) && likely(may_use_simd())) { - kernel_neon_begin(); - if (static_branch_likely(&have_ce)) - sha1_ce_transform(state, data, nblocks); - else - sha1_transform_neon(state, data, nblocks); - kernel_neon_end(); + scoped_ksimd() { + if (static_branch_likely(&have_ce)) + sha1_ce_transform(state, data, nblocks); + else + sha1_transform_neon(state, data, nblocks); + } } else { sha1_block_data_order(state, data, nblocks); } diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S index 7481ac8e6c0d..144ee805f64a 100644 --- a/lib/crypto/arm/sha256-ce.S +++ b/lib/crypto/arm/sha256-ce.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions + * SHA-224/256 secure hash using ARMv8 Crypto Extensions * * Copyright (C) 2015 Linaro Ltd. * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h index 7556457b3094..ae7e52dd6e3b 100644 --- a/lib/crypto/arm/sha256.h +++ b/lib/crypto/arm/sha256.h @@ -22,12 +22,12 @@ static void sha256_blocks(struct sha256_block_state *state, { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && static_branch_likely(&have_neon) && likely(may_use_simd())) { - kernel_neon_begin(); - if (static_branch_likely(&have_ce)) - sha256_ce_transform(state, data, nblocks); - else - sha256_block_data_order_neon(state, data, nblocks); - kernel_neon_end(); + scoped_ksimd() { + if (static_branch_likely(&have_ce)) + sha256_ce_transform(state, data, nblocks); + else + sha256_block_data_order_neon(state, data, nblocks); + } } else { sha256_block_data_order(state, data, nblocks); } diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h index d1b485dd275d..ed9bd81d6d78 100644 --- a/lib/crypto/arm/sha512.h +++ b/lib/crypto/arm/sha512.h @@ -19,9 +19,8 @@ static void sha512_blocks(struct sha512_block_state *state, { if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && static_branch_likely(&have_neon) && likely(may_use_simd())) { - kernel_neon_begin(); - sha512_block_data_order_neon(state, data, nblocks); - kernel_neon_end(); + scoped_ksimd() + sha512_block_data_order_neon(state, data, nblocks); } else { sha512_block_data_order(state, data, nblocks); } |
