diff options
Diffstat (limited to 'lib/crypto')
-rw-r--r-- | lib/crypto/arm64/sha256-ce.S | 284 | ||||
-rw-r--r-- | lib/crypto/arm64/sha256.h | 37 | ||||
-rw-r--r-- | lib/crypto/sha256.c | 71 | ||||
-rw-r--r-- | lib/crypto/tests/sha256_kunit.c | 184 | ||||
-rw-r--r-- | lib/crypto/x86/sha256-ni-asm.S | 368 | ||||
-rw-r--r-- | lib/crypto/x86/sha256.h | 39 |
6 files changed, 972 insertions, 11 deletions
diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S index b99d9589c421..410174ba5237 100644 --- a/lib/crypto/arm64/sha256-ce.S +++ b/lib/crypto/arm64/sha256-ce.S @@ -70,18 +70,22 @@ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + .macro load_round_constants tmp + adr_l \tmp, .Lsha2_rcon + ld1 { v0.4s- v3.4s}, [\tmp], #64 + ld1 { v4.4s- v7.4s}, [\tmp], #64 + ld1 { v8.4s-v11.4s}, [\tmp], #64 + ld1 {v12.4s-v15.4s}, [\tmp] + .endm + /* * size_t __sha256_ce_transform(struct sha256_block_state *state, * const u8 *data, size_t nblocks); */ .text SYM_FUNC_START(__sha256_ce_transform) - /* load round constants */ - adr_l x8, .Lsha2_rcon - ld1 { v0.4s- v3.4s}, [x8], #64 - ld1 { v4.4s- v7.4s}, [x8], #64 - ld1 { v8.4s-v11.4s}, [x8], #64 - ld1 {v12.4s-v15.4s}, [x8] + + load_round_constants x8 /* load state */ ld1 {dgav.4s, dgbv.4s}, [x0] @@ -134,3 +138,271 @@ CPU_LE( rev32 v19.16b, v19.16b ) mov x0, x2 ret SYM_FUNC_END(__sha256_ce_transform) + + .unreq dga + .unreq dgav + .unreq dgb + .unreq dgbv + .unreq t0 + .unreq t1 + .unreq dg0q + .unreq dg0v + .unreq dg1q + .unreq dg1v + .unreq dg2q + .unreq dg2v + + // parameters for sha256_ce_finup2x() + ctx .req x0 + data1 .req x1 + data2 .req x2 + len .req w3 + out1 .req x4 + out2 .req x5 + + // other scalar variables + count .req x6 + final_step .req w7 + + // x8-x9 are used as temporaries. + + // v0-v15 are used to cache the SHA-256 round constants. + // v16-v19 are used for the message schedule for the first message. + // v20-v23 are used for the message schedule for the second message. + // v24-v31 are used for the state and temporaries as given below. + // *_a are for the first message and *_b for the second. + state0_a_q .req q24 + state0_a .req v24 + state1_a_q .req q25 + state1_a .req v25 + state0_b_q .req q26 + state0_b .req v26 + state1_b_q .req q27 + state1_b .req v27 + t0_a .req v28 + t0_b .req v29 + t1_a_q .req q30 + t1_a .req v30 + t1_b_q .req q31 + t1_b .req v31 + +#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount) +#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf) +// offsetof(struct __sha256_ctx, state) is assumed to be 0. + + // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a + // and m0_b contain the current 4 message schedule words for the first + // and second message respectively. + // + // If not all the message schedule words have been computed yet, then + // this also computes 4 more message schedule words for each message. + // m1_a-m3_a contain the next 3 groups of 4 message schedule words for + // the first message, and likewise m1_b-m3_b for the second. After + // consuming the current value of m0_a, this macro computes the group + // after m3_a and writes it to m0_a, and likewise for *_b. This means + // that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a, + // m3_a, m0_a), and likewise for *_b, so the caller must cycle through + // the registers accordingly. + .macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \ + m0_b, m1_b, m2_b, m3_b + add t0_a\().4s, \m0_a\().4s, \k\().4s + add t0_b\().4s, \m0_b\().4s, \k\().4s + .if \i < 48 + sha256su0 \m0_a\().4s, \m1_a\().4s + sha256su0 \m0_b\().4s, \m1_b\().4s + sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s + sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s + .endif + mov t1_a.16b, state0_a.16b + mov t1_b.16b, state0_b.16b + sha256h state0_a_q, state1_a_q, t0_a\().4s + sha256h state0_b_q, state1_b_q, t0_b\().4s + sha256h2 state1_a_q, t1_a_q, t0_a\().4s + sha256h2 state1_b_q, t1_b_q, t0_b\().4s + .endm + + .macro do_16rounds_2x i, k0, k1, k2, k3 + do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23 + do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20 + do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21 + do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22 + .endm + +// +// void sha256_ce_finup2x(const struct __sha256_ctx *ctx, +// const u8 *data1, const u8 *data2, int len, +// u8 out1[SHA256_DIGEST_SIZE], +// u8 out2[SHA256_DIGEST_SIZE]); +// +// This function computes the SHA-256 digests of two messages |data1| and +// |data2| that are both |len| bytes long, starting from the initial context +// |ctx|. |len| must be at least SHA256_BLOCK_SIZE. +// +// The instructions for the two SHA-256 operations are interleaved. On many +// CPUs, this is almost twice as fast as hashing each message individually due +// to taking better advantage of the CPU's SHA-256 and SIMD throughput. +// +SYM_FUNC_START(sha256_ce_finup2x) + sub sp, sp, #128 + mov final_step, #0 + load_round_constants x8 + + // Load the initial state from ctx->state. + ld1 {state0_a.4s-state1_a.4s}, [ctx] + + // Load ctx->bytecount. Take the mod 64 of it to get the number of + // bytes that are buffered in ctx->buf. Also save it in a register with + // len added to it. + ldr x8, [ctx, #OFFSETOF_BYTECOUNT] + add count, x8, len, sxtw + and x8, x8, #63 + cbz x8, .Lfinup2x_enter_loop // No bytes buffered? + + // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them + // followed by the first 64 - x8 bytes of data. Since len >= 64, we + // just load 64 bytes from each of ctx->buf, data1, and data2 + // unconditionally and rearrange the data as needed. + add x9, ctx, #OFFSETOF_BUF + ld1 {v16.16b-v19.16b}, [x9] + st1 {v16.16b-v19.16b}, [sp] + + ld1 {v16.16b-v19.16b}, [data1], #64 + add x9, sp, x8 + st1 {v16.16b-v19.16b}, [x9] + ld1 {v16.4s-v19.4s}, [sp] + + ld1 {v20.16b-v23.16b}, [data2], #64 + st1 {v20.16b-v23.16b}, [x9] + ld1 {v20.4s-v23.4s}, [sp] + + sub len, len, #64 + sub data1, data1, x8 + sub data2, data2, x8 + add len, len, w8 + mov state0_b.16b, state0_a.16b + mov state1_b.16b, state1_a.16b + b .Lfinup2x_loop_have_data + +.Lfinup2x_enter_loop: + sub len, len, #64 + mov state0_b.16b, state0_a.16b + mov state1_b.16b, state1_a.16b +.Lfinup2x_loop: + // Load the next two data blocks. + ld1 {v16.4s-v19.4s}, [data1], #64 + ld1 {v20.4s-v23.4s}, [data2], #64 +.Lfinup2x_loop_have_data: + // Convert the words of the data blocks from big endian. +CPU_LE( rev32 v16.16b, v16.16b ) +CPU_LE( rev32 v17.16b, v17.16b ) +CPU_LE( rev32 v18.16b, v18.16b ) +CPU_LE( rev32 v19.16b, v19.16b ) +CPU_LE( rev32 v20.16b, v20.16b ) +CPU_LE( rev32 v21.16b, v21.16b ) +CPU_LE( rev32 v22.16b, v22.16b ) +CPU_LE( rev32 v23.16b, v23.16b ) +.Lfinup2x_loop_have_bswapped_data: + + // Save the original state for each block. + st1 {state0_a.4s-state1_b.4s}, [sp] + + // Do the SHA-256 rounds on each block. + do_16rounds_2x 0, v0, v1, v2, v3 + do_16rounds_2x 16, v4, v5, v6, v7 + do_16rounds_2x 32, v8, v9, v10, v11 + do_16rounds_2x 48, v12, v13, v14, v15 + + // Add the original state for each block. + ld1 {v16.4s-v19.4s}, [sp] + add state0_a.4s, state0_a.4s, v16.4s + add state1_a.4s, state1_a.4s, v17.4s + add state0_b.4s, state0_b.4s, v18.4s + add state1_b.4s, state1_b.4s, v19.4s + + // Update len and loop back if more blocks remain. + sub len, len, #64 + tbz len, #31, .Lfinup2x_loop // len >= 0? + + // Check if any final blocks need to be handled. + // final_step = 2: all done + // final_step = 1: need to do count-only padding block + // final_step = 0: need to do the block with 0x80 padding byte + tbnz final_step, #1, .Lfinup2x_done + tbnz final_step, #0, .Lfinup2x_finalize_countonly + add len, len, #64 + cbz len, .Lfinup2x_finalize_blockaligned + + // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block. + // To do this, write the padding starting with the 0x80 byte to + // &sp[64]. Then for each message, copy the last 64 data bytes to sp + // and load from &sp[64 - len] to get the needed padding block. This + // code relies on the data buffers being >= 64 bytes in length. + sub w8, len, #64 // w8 = len - 64 + add data1, data1, w8, sxtw // data1 += len - 64 + add data2, data2, w8, sxtw // data2 += len - 64 +CPU_LE( mov x9, #0x80 ) +CPU_LE( fmov d16, x9 ) +CPU_BE( movi v16.16b, #0 ) +CPU_BE( mov x9, #0x8000000000000000 ) +CPU_BE( mov v16.d[1], x9 ) + movi v17.16b, #0 + stp q16, q17, [sp, #64] + stp q17, q17, [sp, #96] + sub x9, sp, w8, sxtw // x9 = &sp[64 - len] + cmp len, #56 + b.ge 1f // will count spill into its own block? + lsl count, count, #3 +CPU_LE( rev count, count ) + str count, [x9, #56] + mov final_step, #2 // won't need count-only block + b 2f +1: + mov final_step, #1 // will need count-only block +2: + ld1 {v16.16b-v19.16b}, [data1] + st1 {v16.16b-v19.16b}, [sp] + ld1 {v16.4s-v19.4s}, [x9] + ld1 {v20.16b-v23.16b}, [data2] + st1 {v20.16b-v23.16b}, [sp] + ld1 {v20.4s-v23.4s}, [x9] + b .Lfinup2x_loop_have_data + + // Prepare a padding block, either: + // + // {0x80, 0, 0, 0, ..., count (as __be64)} + // This is for a block aligned message. + // + // { 0, 0, 0, 0, ..., count (as __be64)} + // This is for a message whose length mod 64 is >= 56. + // + // Pre-swap the endianness of the words. +.Lfinup2x_finalize_countonly: + movi v16.2d, #0 + b 1f +.Lfinup2x_finalize_blockaligned: + mov x8, #0x80000000 + fmov d16, x8 +1: + movi v17.2d, #0 + movi v18.2d, #0 + ror count, count, #29 // ror(lsl(count, 3), 32) + mov v19.d[0], xzr + mov v19.d[1], count + mov v20.16b, v16.16b + movi v21.2d, #0 + movi v22.2d, #0 + mov v23.16b, v19.16b + mov final_step, #2 + b .Lfinup2x_loop_have_bswapped_data + +.Lfinup2x_done: + // Write the two digests with all bytes in the correct order. +CPU_LE( rev32 state0_a.16b, state0_a.16b ) +CPU_LE( rev32 state1_a.16b, state1_a.16b ) +CPU_LE( rev32 state0_b.16b, state0_b.16b ) +CPU_LE( rev32 state1_b.16b, state1_b.16b ) + st1 {state0_a.4s-state1_a.4s}, [out1] + st1 {state0_b.4s-state1_b.4s}, [out2] + add sp, sp, #128 + ret +SYM_FUNC_END(sha256_ce_finup2x) diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h index be4aeda9d0e6..80d06df27d3a 100644 --- a/lib/crypto/arm64/sha256.h +++ b/lib/crypto/arm64/sha256.h @@ -44,6 +44,43 @@ static void sha256_blocks(struct sha256_block_state *state, } } +static_assert(offsetof(struct __sha256_ctx, state) == 0); +static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); +static_assert(offsetof(struct __sha256_ctx, buf) == 40); +asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, int len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]); + +#define sha256_finup_2x_arch sha256_finup_2x_arch +static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, size_t len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + /* + * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX. + * Further limit len to 65536 to avoid spending too long with preemption + * disabled. (Of course, in practice len is nearly always 4096 anyway.) + */ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE && + len <= 65536 && likely(may_use_simd())) { + kernel_neon_begin(); + sha256_ce_finup2x(ctx, data1, data2, len, out1, out2); + kernel_neon_end(); + kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); + kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); + return true; + } + return false; +} + +static bool sha256_finup_2x_is_optimized_arch(void) +{ + return static_key_enabled(&have_ce); +} + #ifdef CONFIG_KERNEL_MODE_NEON #define sha256_mod_init_arch sha256_mod_init_arch static void sha256_mod_init_arch(void) diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 8fa15165d23e..881b935418ce 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -25,13 +25,20 @@ static const struct sha256_block_state sha224_iv = { }, }; -static const struct sha256_block_state sha256_iv = { - .h = { - SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, - SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, +static const struct sha256_ctx initial_sha256_ctx = { + .ctx = { + .state = { + .h = { + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, + }, + }, + .bytecount = 0, }, }; +#define sha256_iv (initial_sha256_ctx.ctx.state) + static const u32 sha256_K[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -261,8 +268,62 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) } EXPORT_SYMBOL(sha256); -/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */ +/* + * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined) + * doesn't need either HMAC support or interleaved hashing support + */ #ifndef __DISABLE_EXPORTS + +#ifndef sha256_finup_2x_arch +static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, size_t len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + return false; +} +static bool sha256_finup_2x_is_optimized_arch(void) +{ + return false; +} +#endif + +/* Sequential fallback implementation of sha256_finup_2x() */ +static noinline_for_stack void sha256_finup_2x_sequential( + const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, + size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) +{ + struct __sha256_ctx mut_ctx; + + mut_ctx = *ctx; + __sha256_update(&mut_ctx, data1, len); + __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE); + + mut_ctx = *ctx; + __sha256_update(&mut_ctx, data2, len); + __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE); +} + +void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1, + const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + if (ctx == NULL) + ctx = &initial_sha256_ctx; + + if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1, + out2))) + return; + sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2); +} +EXPORT_SYMBOL_GPL(sha256_finup_2x); + +bool sha256_finup_2x_is_optimized(void) +{ + return sha256_finup_2x_is_optimized_arch(); +} +EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized); + static void __hmac_sha256_preparekey(struct sha256_block_state *istate, struct sha256_block_state *ostate, const u8 *raw_key, size_t raw_key_len, diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c index 1cd4caee6010..dcedfca06df6 100644 --- a/lib/crypto/tests/sha256_kunit.c +++ b/lib/crypto/tests/sha256_kunit.c @@ -5,6 +5,7 @@ #include <crypto/sha2.h> #include "sha256-testvecs.h" +/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */ #define HASH sha256 #define HASH_CTX sha256_ctx #define HASH_SIZE SHA256_DIGEST_SIZE @@ -21,9 +22,192 @@ #define HMAC_USINGRAWKEY hmac_sha256_usingrawkey #include "hash-test-template.h" +static void free_guarded_buf(void *buf) +{ + vfree(buf); +} + +/* + * Allocate a KUnit-managed buffer that has length @len bytes immediately + * followed by an unmapped page, and assert that the allocation succeeds. + */ +static void *alloc_guarded_buf(struct kunit *test, size_t len) +{ + size_t full_len = round_up(len, PAGE_SIZE); + void *buf = vmalloc(full_len); + + KUNIT_ASSERT_NOT_NULL(test, buf); + KUNIT_ASSERT_EQ(test, 0, + kunit_add_action_or_reset(test, free_guarded_buf, buf)); + return buf + full_len - len; +} + +/* + * Test for sha256_finup_2x(). Specifically, choose various data lengths and + * salt lengths, and for each one, verify that sha256_finup_2x() produces the + * same results as sha256_update() and sha256_final(). + * + * Use guarded buffers for all inputs and outputs to reliably detect any + * out-of-bounds reads or writes, even if they occur in assembly code. + */ +static void test_sha256_finup_2x(struct kunit *test) +{ + const size_t max_data_len = 16384; + u8 *data1_buf, *data2_buf, *hash1, *hash2; + u8 expected_hash1[SHA256_DIGEST_SIZE]; + u8 expected_hash2[SHA256_DIGEST_SIZE]; + u8 salt[SHA256_BLOCK_SIZE]; + struct sha256_ctx *ctx; + + data1_buf = alloc_guarded_buf(test, max_data_len); + data2_buf = alloc_guarded_buf(test, max_data_len); + hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE); + hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE); + ctx = alloc_guarded_buf(test, sizeof(*ctx)); + + rand_bytes(data1_buf, max_data_len); + rand_bytes(data2_buf, max_data_len); + rand_bytes(salt, sizeof(salt)); + + for (size_t i = 0; i < 500; i++) { + size_t salt_len = rand_length(sizeof(salt)); + size_t data_len = rand_length(max_data_len); + const u8 *data1 = data1_buf + max_data_len - data_len; + const u8 *data2 = data2_buf + max_data_len - data_len; + struct sha256_ctx orig_ctx; + + sha256_init(ctx); + sha256_update(ctx, salt, salt_len); + orig_ctx = *ctx; + + sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2); + KUNIT_ASSERT_MEMEQ_MSG( + test, ctx, &orig_ctx, sizeof(*ctx), + "sha256_finup_2x() modified its ctx argument"); + + sha256_update(ctx, data1, data_len); + sha256_final(ctx, expected_hash1); + sha256_update(&orig_ctx, data2, data_len); + sha256_final(&orig_ctx, expected_hash2); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash1, expected_hash1, SHA256_DIGEST_SIZE, + "Wrong hash1 with salt_len=%zu data_len=%zu", salt_len, + data_len); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash2, expected_hash2, SHA256_DIGEST_SIZE, + "Wrong hash2 with salt_len=%zu data_len=%zu", salt_len, + data_len); + } +} + +/* Test sha256_finup_2x() with ctx == NULL */ +static void test_sha256_finup_2x_defaultctx(struct kunit *test) +{ + const size_t data_len = 128; + struct sha256_ctx ctx; + u8 hash1_a[SHA256_DIGEST_SIZE]; + u8 hash2_a[SHA256_DIGEST_SIZE]; + u8 hash1_b[SHA256_DIGEST_SIZE]; + u8 hash2_b[SHA256_DIGEST_SIZE]; + + rand_bytes(test_buf, 2 * data_len); + + sha256_init(&ctx); + sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a, + hash2_a); + + sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b, + hash2_b); + + KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE); + KUNIT_ASSERT_MEMEQ(test, hash2_a, hash2_b, SHA256_DIGEST_SIZE); +} + +/* + * Test that sha256_finup_2x() and sha256_update/final() produce consistent + * results with total message lengths that require more than 32 bits. + */ +static void test_sha256_finup_2x_hugelen(struct kunit *test) +{ + const size_t data_len = 4 * SHA256_BLOCK_SIZE; + struct sha256_ctx ctx = {}; + u8 expected_hash[SHA256_DIGEST_SIZE]; + u8 hash[SHA256_DIGEST_SIZE]; + + rand_bytes(test_buf, data_len); + for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) { + sha256_init(&ctx); + ctx.ctx.bytecount = 0x123456789abcd00 + align; + + sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash); + + sha256_update(&ctx, test_buf, data_len); + sha256_final(&ctx, expected_hash); + + KUNIT_ASSERT_MEMEQ(test, hash, expected_hash, + SHA256_DIGEST_SIZE); + } +} + +/* Benchmark for sha256_finup_2x() */ +static void benchmark_sha256_finup_2x(struct kunit *test) +{ + /* + * Try a few different salt lengths, since sha256_finup_2x() performance + * may vary slightly for the same data_len depending on how many bytes + * were already processed in the initial context. + */ + static const size_t salt_lens_to_test[] = { 0, 32, 64 }; + const size_t data_len = 4096; + const size_t num_iters = 4096; + struct sha256_ctx ctx; + u8 hash1[SHA256_DIGEST_SIZE]; + u8 hash2[SHA256_DIGEST_SIZE]; + + if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK)) + kunit_skip(test, "not enabled"); + if (!sha256_finup_2x_is_optimized()) + kunit_skip(test, "not relevant"); + + rand_bytes(test_buf, data_len * 2); + + /* Warm-up */ + for (size_t i = 0; i < num_iters; i++) + sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len], + data_len, hash1, hash2); + + for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) { + size_t salt_len = salt_lens_to_test[i]; + u64 t0, t1; + + /* + * Prepare the initial context. The time to process the salt is + * not measured; we're just interested in sha256_finup_2x(). + */ + sha256_init(&ctx); + sha256_update(&ctx, test_buf, salt_len); + + preempt_disable(); + t0 = ktime_get_ns(); + for (size_t j = 0; j < num_iters; j++) + sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len], + data_len, hash1, hash2); + t1 = ktime_get_ns(); + preempt_enable(); + kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s", + data_len, salt_len, + div64_u64((u64)data_len * 2 * num_iters * 1000, + t1 - t0 ?: 1)); + } +} + static struct kunit_case hash_test_cases[] = { HASH_KUNIT_CASES, + KUNIT_CASE(test_sha256_finup_2x), + KUNIT_CASE(test_sha256_finup_2x_defaultctx), + KUNIT_CASE(test_sha256_finup_2x_hugelen), KUNIT_CASE(benchmark_hash), + KUNIT_CASE(benchmark_sha256_finup_2x), {}, }; diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S index 4bd9490ffc66..de5f707e7ef7 100644 --- a/lib/crypto/x86/sha256-ni-asm.S +++ b/lib/crypto/x86/sha256-ni-asm.S @@ -165,6 +165,374 @@ SYM_FUNC_START(sha256_ni_transform) RET SYM_FUNC_END(sha256_ni_transform) +#undef DIGEST_PTR +#undef DATA_PTR +#undef NUM_BLKS +#undef SHA256CONSTANTS +#undef MSG +#undef STATE0 +#undef STATE1 +#undef MSG0 +#undef MSG1 +#undef MSG2 +#undef MSG3 +#undef TMP +#undef SHUF_MASK +#undef ABEF_SAVE +#undef CDGH_SAVE + +// parameters for sha256_ni_finup2x() +#define CTX %rdi +#define DATA1 %rsi +#define DATA2 %rdx +#define LEN %ecx +#define LEN8 %cl +#define LEN64 %rcx +#define OUT1 %r8 +#define OUT2 %r9 + +// other scalar variables +#define SHA256CONSTANTS %rax +#define COUNT %r10 +#define COUNT32 %r10d +#define FINAL_STEP %r11d + +// rbx is used as a temporary. + +#define MSG %xmm0 // sha256rnds2 implicit operand +#define STATE0_A %xmm1 +#define STATE1_A %xmm2 +#define STATE0_B %xmm3 +#define STATE1_B %xmm4 +#define TMP_A %xmm5 +#define TMP_B %xmm6 +#define MSG0_A %xmm7 +#define MSG1_A %xmm8 +#define MSG2_A %xmm9 +#define MSG3_A %xmm10 +#define MSG0_B %xmm11 +#define MSG1_B %xmm12 +#define MSG2_B %xmm13 +#define MSG3_B %xmm14 +#define SHUF_MASK %xmm15 + +#define OFFSETOF_STATE 0 // offsetof(struct __sha256_ctx, state) +#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount) +#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf) + +// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a and m0_b +// contain the current 4 message schedule words for the first and second message +// respectively. +// +// If not all the message schedule words have been computed yet, then this also +// computes 4 more message schedule words for each message. m1_a-m3_a contain +// the next 3 groups of 4 message schedule words for the first message, and +// likewise m1_b-m3_b for the second. After consuming the current value of +// m0_a, this macro computes the group after m3_a and writes it to m0_a, and +// likewise for *_b. This means that the next (m0_a, m1_a, m2_a, m3_a) is the +// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must +// cycle through the registers accordingly. +.macro do_4rounds_2x i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b + movdqa (\i-32)*4(SHA256CONSTANTS), TMP_A + movdqa TMP_A, TMP_B + paddd \m0_a, TMP_A + paddd \m0_b, TMP_B +.if \i < 48 + sha256msg1 \m1_a, \m0_a + sha256msg1 \m1_b, \m0_b +.endif + movdqa TMP_A, MSG + sha256rnds2 STATE0_A, STATE1_A + movdqa TMP_B, MSG + sha256rnds2 STATE0_B, STATE1_B + pshufd $0x0E, TMP_A, MSG + sha256rnds2 STATE1_A, STATE0_A + pshufd $0x0E, TMP_B, MSG + sha256rnds2 STATE1_B, STATE0_B +.if \i < 48 + movdqa \m3_a, TMP_A + movdqa \m3_b, TMP_B + palignr $4, \m2_a, TMP_A + palignr $4, \m2_b, TMP_B + paddd TMP_A, \m0_a + paddd TMP_B, \m0_b + sha256msg2 \m3_a, \m0_a + sha256msg2 \m3_b, \m0_b +.endif +.endm + +// +// void sha256_ni_finup2x(const struct __sha256_ctx *ctx, +// const u8 *data1, const u8 *data2, int len, +// u8 out1[SHA256_DIGEST_SIZE], +// u8 out2[SHA256_DIGEST_SIZE]); +// +// This function computes the SHA-256 digests of two messages |data1| and +// |data2| that are both |len| bytes long, starting from the initial context +// |ctx|. |len| must be at least SHA256_BLOCK_SIZE. +// +// The instructions for the two SHA-256 operations are interleaved. On many +// CPUs, this is almost twice as fast as hashing each message individually due +// to taking better advantage of the CPU's SHA-256 and SIMD throughput. +// +SYM_FUNC_START(sha256_ni_finup2x) + // Allocate 128 bytes of stack space, 16-byte aligned. + push %rbx + push %rbp + mov %rsp, %rbp + sub $128, %rsp + and $~15, %rsp + + // Load the shuffle mask for swapping the endianness of 32-bit words. + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + + // Set up pointer to the round constants. + lea K256+32*4(%rip), SHA256CONSTANTS + + // Initially we're not processing the final blocks. + xor FINAL_STEP, FINAL_STEP + + // Load the initial state from ctx->state. + movdqu OFFSETOF_STATE+0*16(CTX), STATE0_A // DCBA + movdqu OFFSETOF_STATE+1*16(CTX), STATE1_A // HGFE + movdqa STATE0_A, TMP_A + punpcklqdq STATE1_A, STATE0_A // FEBA + punpckhqdq TMP_A, STATE1_A // DCHG + pshufd $0x1B, STATE0_A, STATE0_A // ABEF + pshufd $0xB1, STATE1_A, STATE1_A // CDGH + + // Load ctx->bytecount. Take the mod 64 of it to get the number of + // bytes that are buffered in ctx->buf. Also save it in a register with + // LEN added to it. + mov LEN, LEN + mov OFFSETOF_BYTECOUNT(CTX), %rbx + lea (%rbx, LEN64, 1), COUNT + and $63, %ebx + jz .Lfinup2x_enter_loop // No bytes buffered? + + // %ebx bytes (1 to 63) are currently buffered in ctx->buf. Load them + // followed by the first 64 - %ebx bytes of data. Since LEN >= 64, we + // just load 64 bytes from each of ctx->buf, DATA1, and DATA2 + // unconditionally and rearrange the data as needed. + + movdqu OFFSETOF_BUF+0*16(CTX), MSG0_A + movdqu OFFSETOF_BUF+1*16(CTX), MSG1_A + movdqu OFFSETOF_BUF+2*16(CTX), MSG2_A + movdqu OFFSETOF_BUF+3*16(CTX), MSG3_A + movdqa MSG0_A, 0*16(%rsp) + movdqa MSG1_A, 1*16(%rsp) + movdqa MSG2_A, 2*16(%rsp) + movdqa MSG3_A, 3*16(%rsp) + + movdqu 0*16(DATA1), MSG0_A + movdqu 1*16(DATA1), MSG1_A + movdqu 2*16(DATA1), MSG2_A + movdqu 3*16(DATA1), MSG3_A + movdqu MSG0_A, 0*16(%rsp,%rbx) + movdqu MSG1_A, 1*16(%rsp,%rbx) + movdqu MSG2_A, 2*16(%rsp,%rbx) + movdqu MSG3_A, 3*16(%rsp,%rbx) + movdqa 0*16(%rsp), MSG0_A + movdqa 1*16(%rsp), MSG1_A + movdqa 2*16(%rsp), MSG2_A + movdqa 3*16(%rsp), MSG3_A + + movdqu 0*16(DATA2), MSG0_B + movdqu 1*16(DATA2), MSG1_B + movdqu 2*16(DATA2), MSG2_B + movdqu 3*16(DATA2), MSG3_B + movdqu MSG0_B, 0*16(%rsp,%rbx) + movdqu MSG1_B, 1*16(%rsp,%rbx) + movdqu MSG2_B, 2*16(%rsp,%rbx) + movdqu MSG3_B, 3*16(%rsp,%rbx) + movdqa 0*16(%rsp), MSG0_B + movdqa 1*16(%rsp), MSG1_B + movdqa 2*16(%rsp), MSG2_B + movdqa 3*16(%rsp), MSG3_B + + sub $64, %rbx // rbx = buffered - 64 + sub %rbx, DATA1 // DATA1 += 64 - buffered + sub %rbx, DATA2 // DATA2 += 64 - buffered + add %ebx, LEN // LEN += buffered - 64 + movdqa STATE0_A, STATE0_B + movdqa STATE1_A, STATE1_B + jmp .Lfinup2x_loop_have_data + +.Lfinup2x_enter_loop: + sub $64, LEN + movdqa STATE0_A, STATE0_B + movdqa STATE1_A, STATE1_B +.Lfinup2x_loop: + // Load the next two data blocks. + movdqu 0*16(DATA1), MSG0_A + movdqu 0*16(DATA2), MSG0_B + movdqu 1*16(DATA1), MSG1_A + movdqu 1*16(DATA2), MSG1_B + movdqu 2*16(DATA1), MSG2_A + movdqu 2*16(DATA2), MSG2_B + movdqu 3*16(DATA1), MSG3_A + movdqu 3*16(DATA2), MSG3_B + add $64, DATA1 + add $64, DATA2 +.Lfinup2x_loop_have_data: + // Convert the words of the data blocks from big endian. + pshufb SHUF_MASK, MSG0_A + pshufb SHUF_MASK, MSG0_B + pshufb SHUF_MASK, MSG1_A + pshufb SHUF_MASK, MSG1_B + pshufb SHUF_MASK, MSG2_A + pshufb SHUF_MASK, MSG2_B + pshufb SHUF_MASK, MSG3_A + pshufb SHUF_MASK, MSG3_B +.Lfinup2x_loop_have_bswapped_data: + + // Save the original state for each block. + movdqa STATE0_A, 0*16(%rsp) + movdqa STATE0_B, 1*16(%rsp) + movdqa STATE1_A, 2*16(%rsp) + movdqa STATE1_B, 3*16(%rsp) + + // Do the SHA-256 rounds on each block. +.irp i, 0, 16, 32, 48 + do_4rounds_2x (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, \ + MSG0_B, MSG1_B, MSG2_B, MSG3_B + do_4rounds_2x (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, \ + MSG1_B, MSG2_B, MSG3_B, MSG0_B + do_4rounds_2x (\i + 8), MSG2_A, MSG3_A, MSG0_A, MSG1_A, \ + MSG2_B, MSG3_B, MSG0_B, MSG1_B + do_4rounds_2x (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \ + MSG3_B, MSG0_B, MSG1_B, MSG2_B +.endr + + // Add the original state for each block. + paddd 0*16(%rsp), STATE0_A + paddd 1*16(%rsp), STATE0_B + paddd 2*16(%rsp), STATE1_A + paddd 3*16(%rsp), STATE1_B + + // Update LEN and loop back if more blocks remain. + sub $64, LEN + jge .Lfinup2x_loop + + // Check if any final blocks need to be handled. + // FINAL_STEP = 2: all done + // FINAL_STEP = 1: need to do count-only padding block + // FINAL_STEP = 0: need to do the block with 0x80 padding byte + cmp $1, FINAL_STEP + jg .Lfinup2x_done + je .Lfinup2x_finalize_countonly + add $64, LEN + jz .Lfinup2x_finalize_blockaligned + + // Not block-aligned; 1 <= LEN <= 63 data bytes remain. Pad the block. + // To do this, write the padding starting with the 0x80 byte to + // &sp[64]. Then for each message, copy the last 64 data bytes to sp + // and load from &sp[64 - LEN] to get the needed padding block. This + // code relies on the data buffers being >= 64 bytes in length. + mov $64, %ebx + sub LEN, %ebx // ebx = 64 - LEN + sub %rbx, DATA1 // DATA1 -= 64 - LEN + sub %rbx, DATA2 // DATA2 -= 64 - LEN + mov $0x80, FINAL_STEP // using FINAL_STEP as a temporary + movd FINAL_STEP, MSG0_A + pxor MSG1_A, MSG1_A + movdqa MSG0_A, 4*16(%rsp) + movdqa MSG1_A, 5*16(%rsp) + movdqa MSG1_A, 6*16(%rsp) + movdqa MSG1_A, 7*16(%rsp) + cmp $56, LEN + jge 1f // will COUNT spill into its own block? + shl $3, COUNT + bswap COUNT + mov COUNT, 56(%rsp,%rbx) + mov $2, FINAL_STEP // won't need count-only block + jmp 2f +1: + mov $1, FINAL_STEP // will need count-only block +2: + movdqu 0*16(DATA1), MSG0_A + movdqu 1*16(DATA1), MSG1_A + movdqu 2*16(DATA1), MSG2_A + movdqu 3*16(DATA1), MSG3_A + movdqa MSG0_A, 0*16(%rsp) + movdqa MSG1_A, 1*16(%rsp) + movdqa MSG2_A, 2*16(%rsp) + movdqa MSG3_A, 3*16(%rsp) + movdqu 0*16(%rsp,%rbx), MSG0_A + movdqu 1*16(%rsp,%rbx), MSG1_A + movdqu 2*16(%rsp,%rbx), MSG2_A + movdqu 3*16(%rsp,%rbx), MSG3_A + + movdqu 0*16(DATA2), MSG0_B + movdqu 1*16(DATA2), MSG1_B + movdqu 2*16(DATA2), MSG2_B + movdqu 3*16(DATA2), MSG3_B + movdqa MSG0_B, 0*16(%rsp) + movdqa MSG1_B, 1*16(%rsp) + movdqa MSG2_B, 2*16(%rsp) + movdqa MSG3_B, 3*16(%rsp) + movdqu 0*16(%rsp,%rbx), MSG0_B + movdqu 1*16(%rsp,%rbx), MSG1_B + movdqu 2*16(%rsp,%rbx), MSG2_B + movdqu 3*16(%rsp,%rbx), MSG3_B + jmp .Lfinup2x_loop_have_data + + // Prepare a padding block, either: + // + // {0x80, 0, 0, 0, ..., count (as __be64)} + // This is for a block aligned message. + // + // { 0, 0, 0, 0, ..., count (as __be64)} + // This is for a message whose length mod 64 is >= 56. + // + // Pre-swap the endianness of the words. +.Lfinup2x_finalize_countonly: + pxor MSG0_A, MSG0_A + jmp 1f + +.Lfinup2x_finalize_blockaligned: + mov $0x80000000, %ebx + movd %ebx, MSG0_A +1: + pxor MSG1_A, MSG1_A + pxor MSG2_A, MSG2_A + ror $29, COUNT + movq COUNT, MSG3_A + pslldq $8, MSG3_A + movdqa MSG0_A, MSG0_B + pxor MSG1_B, MSG1_B + pxor MSG2_B, MSG2_B + movdqa MSG3_A, MSG3_B + mov $2, FINAL_STEP + jmp .Lfinup2x_loop_have_bswapped_data + +.Lfinup2x_done: + // Write the two digests with all bytes in the correct order. + movdqa STATE0_A, TMP_A + movdqa STATE0_B, TMP_B + punpcklqdq STATE1_A, STATE0_A // GHEF + punpcklqdq STATE1_B, STATE0_B + punpckhqdq TMP_A, STATE1_A // ABCD + punpckhqdq TMP_B, STATE1_B + pshufd $0xB1, STATE0_A, STATE0_A // HGFE + pshufd $0xB1, STATE0_B, STATE0_B + pshufd $0x1B, STATE1_A, STATE1_A // DCBA + pshufd $0x1B, STATE1_B, STATE1_B + pshufb SHUF_MASK, STATE0_A + pshufb SHUF_MASK, STATE0_B + pshufb SHUF_MASK, STATE1_A + pshufb SHUF_MASK, STATE1_B + movdqu STATE0_A, 1*16(OUT1) + movdqu STATE0_B, 1*16(OUT2) + movdqu STATE1_A, 0*16(OUT1) + movdqu STATE1_B, 0*16(OUT2) + + mov %rbp, %rsp + pop %rbp + pop %rbx + RET +SYM_FUNC_END(sha256_ni_finup2x) + .section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 K256: diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h index 41fa95fbc3bf..38e33b22a092 100644 --- a/lib/crypto/x86/sha256.h +++ b/lib/crypto/x86/sha256.h @@ -7,6 +7,8 @@ #include <asm/fpu/api.h> #include <linux/static_call.h> +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni); + DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); #define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \ @@ -35,11 +37,48 @@ static void sha256_blocks(struct sha256_block_state *state, static_call(sha256_blocks_x86)(state, data, nblocks); } +static_assert(offsetof(struct __sha256_ctx, state) == 0); +static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); +static_assert(offsetof(struct __sha256_ctx, buf) == 40); +asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, int len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]); + +#define sha256_finup_2x_arch sha256_finup_2x_arch +static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, size_t len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + /* + * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX. + * Further limit len to 65536 to avoid spending too long with preemption + * disabled. (Of course, in practice len is nearly always 4096 anyway.) + */ + if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE && + len <= 65536 && likely(irq_fpu_usable())) { + kernel_fpu_begin(); + sha256_ni_finup2x(ctx, data1, data2, len, out1, out2); + kernel_fpu_end(); + kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); + kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); + return true; + } + return false; +} + +static bool sha256_finup_2x_is_optimized_arch(void) +{ + return static_key_enabled(&have_sha_ni); +} + #define sha256_mod_init_arch sha256_mod_init_arch static void sha256_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SHA_NI)) { static_call_update(sha256_blocks_x86, sha256_blocks_ni); + static_branch_enable(&have_sha_ni); } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { |