Diffstat (limited to 'lib')
-rw-r--r-- | lib/Kconfig.debug | 16
-rw-r--r-- | lib/clz_ctz.c | 8
-rw-r--r-- | lib/crypto/arm64/sha256-ce.S | 284
-rw-r--r-- | lib/crypto/arm64/sha256.h | 37
-rw-r--r-- | lib/crypto/sha256.c | 71
-rw-r--r-- | lib/crypto/tests/sha256_kunit.c | 184
-rw-r--r-- | lib/crypto/x86/sha256-ni-asm.S | 368
-rw-r--r-- | lib/crypto/x86/sha256.h | 39
-rw-r--r-- | lib/raid6/recov_rvv.c | 2
-rw-r--r-- | lib/raid6/rvv.c | 63
-rw-r--r-- | lib/tests/Makefile | 1
-rw-r--r-- | lib/tests/ffs_kunit.c | 566
12 files changed, 1588 insertions, 51 deletions
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index dc0e0c6ed075..24939b8553e6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2479,6 +2479,20 @@ config STRING_HELPERS_KUNIT_TEST depends on KUNIT default KUNIT_ALL_TESTS +config FFS_KUNIT_TEST + tristate "KUnit test ffs-family functions at runtime" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This builds KUnit tests for ffs-family bit manipulation functions + including ffs(), __ffs(), fls(), __fls(), fls64(), and __ffs64(). + + These tests validate mathematical correctness, edge case handling, + and cross-architecture consistency of bit scanning functions. + + For more information on KUnit and unit tests in general, + please refer to Documentation/dev-tools/kunit/. + config TEST_KSTRTOX tristate "Test kstrto*() family of functions at runtime" @@ -2894,7 +2908,7 @@ config FORTIFY_KUNIT_TEST config LONGEST_SYM_KUNIT_TEST tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS depends on KUNIT && KPROBES - depends on !PREFIX_SYMBOLS && !CFI_CLANG && !GCOV_KERNEL + depends on !PREFIX_SYMBOLS && !CFI && !GCOV_KERNEL default KUNIT_ALL_TESTS help Tests the longest symbol possible diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c index fb8c0c5c2bd2..8778ec44bf63 100644 --- a/lib/clz_ctz.c +++ b/lib/clz_ctz.c @@ -15,28 +15,28 @@ #include <linux/kernel.h> int __weak __ctzsi2(int val); -int __weak __ctzsi2(int val) +int __weak __attribute_const__ __ctzsi2(int val) { return __ffs(val); } EXPORT_SYMBOL(__ctzsi2); int __weak __clzsi2(int val); -int __weak __clzsi2(int val) +int __weak __attribute_const__ __clzsi2(int val) { return 32 - fls(val); } EXPORT_SYMBOL(__clzsi2); int __weak __clzdi2(u64 val); -int __weak __clzdi2(u64 val) +int __weak __attribute_const__ __clzdi2(u64 val) { return 64 - fls64(val); } EXPORT_SYMBOL(__clzdi2); int __weak __ctzdi2(u64 val); -int __weak __ctzdi2(u64 val) +int __weak __attribute_const__ __ctzdi2(u64 val) { return __ffs64(val); } diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S index b99d9589c421..410174ba5237 100644 --- a/lib/crypto/arm64/sha256-ce.S +++ b/lib/crypto/arm64/sha256-ce.S @@ -70,18 +70,22 @@ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + .macro load_round_constants tmp + adr_l \tmp, .Lsha2_rcon + ld1 { v0.4s- v3.4s}, [\tmp], #64 + ld1 { v4.4s- v7.4s}, [\tmp], #64 + ld1 { v8.4s-v11.4s}, [\tmp], #64 + ld1 {v12.4s-v15.4s}, [\tmp] + .endm + /* * size_t __sha256_ce_transform(struct sha256_block_state *state, * const u8 *data, size_t nblocks); */ .text SYM_FUNC_START(__sha256_ce_transform) - /* load round constants */ - adr_l x8, .Lsha2_rcon - ld1 { v0.4s- v3.4s}, [x8], #64 - ld1 { v4.4s- v7.4s}, [x8], #64 - ld1 { v8.4s-v11.4s}, [x8], #64 - ld1 {v12.4s-v15.4s}, [x8] + + load_round_constants x8 /* load state */ ld1 {dgav.4s, dgbv.4s}, [x0] @@ -134,3 +138,271 @@ CPU_LE( rev32 v19.16b, v19.16b ) mov x0, x2 ret SYM_FUNC_END(__sha256_ce_transform) + + .unreq dga + .unreq dgav + .unreq dgb + .unreq dgbv + .unreq t0 + .unreq t1 + .unreq dg0q + .unreq dg0v + .unreq dg1q + .unreq dg1v + .unreq dg2q + .unreq dg2v + + // parameters for sha256_ce_finup2x() + ctx .req x0 + data1 .req x1 + data2 .req x2 + len .req w3 + out1 .req x4 + out2 .req x5 + + // other scalar variables + count .req x6 + final_step .req w7 + + // x8-x9 are used as temporaries. + + // v0-v15 are used to cache the SHA-256 round constants. + // v16-v19 are used for the message schedule for the first message. 
+ // v20-v23 are used for the message schedule for the second message. + // v24-v31 are used for the state and temporaries as given below. + // *_a are for the first message and *_b for the second. + state0_a_q .req q24 + state0_a .req v24 + state1_a_q .req q25 + state1_a .req v25 + state0_b_q .req q26 + state0_b .req v26 + state1_b_q .req q27 + state1_b .req v27 + t0_a .req v28 + t0_b .req v29 + t1_a_q .req q30 + t1_a .req v30 + t1_b_q .req q31 + t1_b .req v31 + +#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount) +#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf) +// offsetof(struct __sha256_ctx, state) is assumed to be 0. + + // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a + // and m0_b contain the current 4 message schedule words for the first + // and second message respectively. + // + // If not all the message schedule words have been computed yet, then + // this also computes 4 more message schedule words for each message. + // m1_a-m3_a contain the next 3 groups of 4 message schedule words for + // the first message, and likewise m1_b-m3_b for the second. After + // consuming the current value of m0_a, this macro computes the group + // after m3_a and writes it to m0_a, and likewise for *_b. This means + // that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a, + // m3_a, m0_a), and likewise for *_b, so the caller must cycle through + // the registers accordingly. + .macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \ + m0_b, m1_b, m2_b, m3_b + add t0_a\().4s, \m0_a\().4s, \k\().4s + add t0_b\().4s, \m0_b\().4s, \k\().4s + .if \i < 48 + sha256su0 \m0_a\().4s, \m1_a\().4s + sha256su0 \m0_b\().4s, \m1_b\().4s + sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s + sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s + .endif + mov t1_a.16b, state0_a.16b + mov t1_b.16b, state0_b.16b + sha256h state0_a_q, state1_a_q, t0_a\().4s + sha256h state0_b_q, state1_b_q, t0_b\().4s + sha256h2 state1_a_q, t1_a_q, t0_a\().4s + sha256h2 state1_b_q, t1_b_q, t0_b\().4s + .endm + + .macro do_16rounds_2x i, k0, k1, k2, k3 + do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23 + do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20 + do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21 + do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22 + .endm + +// +// void sha256_ce_finup2x(const struct __sha256_ctx *ctx, +// const u8 *data1, const u8 *data2, int len, +// u8 out1[SHA256_DIGEST_SIZE], +// u8 out2[SHA256_DIGEST_SIZE]); +// +// This function computes the SHA-256 digests of two messages |data1| and +// |data2| that are both |len| bytes long, starting from the initial context +// |ctx|. |len| must be at least SHA256_BLOCK_SIZE. +// +// The instructions for the two SHA-256 operations are interleaved. On many +// CPUs, this is almost twice as fast as hashing each message individually due +// to taking better advantage of the CPU's SHA-256 and SIMD throughput. +// +SYM_FUNC_START(sha256_ce_finup2x) + sub sp, sp, #128 + mov final_step, #0 + load_round_constants x8 + + // Load the initial state from ctx->state. + ld1 {state0_a.4s-state1_a.4s}, [ctx] + + // Load ctx->bytecount. Take the mod 64 of it to get the number of + // bytes that are buffered in ctx->buf. Also save it in a register with + // len added to it. + ldr x8, [ctx, #OFFSETOF_BYTECOUNT] + add count, x8, len, sxtw + and x8, x8, #63 + cbz x8, .Lfinup2x_enter_loop // No bytes buffered? 
+ + // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them + // followed by the first 64 - x8 bytes of data. Since len >= 64, we + // just load 64 bytes from each of ctx->buf, data1, and data2 + // unconditionally and rearrange the data as needed. + add x9, ctx, #OFFSETOF_BUF + ld1 {v16.16b-v19.16b}, [x9] + st1 {v16.16b-v19.16b}, [sp] + + ld1 {v16.16b-v19.16b}, [data1], #64 + add x9, sp, x8 + st1 {v16.16b-v19.16b}, [x9] + ld1 {v16.4s-v19.4s}, [sp] + + ld1 {v20.16b-v23.16b}, [data2], #64 + st1 {v20.16b-v23.16b}, [x9] + ld1 {v20.4s-v23.4s}, [sp] + + sub len, len, #64 + sub data1, data1, x8 + sub data2, data2, x8 + add len, len, w8 + mov state0_b.16b, state0_a.16b + mov state1_b.16b, state1_a.16b + b .Lfinup2x_loop_have_data + +.Lfinup2x_enter_loop: + sub len, len, #64 + mov state0_b.16b, state0_a.16b + mov state1_b.16b, state1_a.16b +.Lfinup2x_loop: + // Load the next two data blocks. + ld1 {v16.4s-v19.4s}, [data1], #64 + ld1 {v20.4s-v23.4s}, [data2], #64 +.Lfinup2x_loop_have_data: + // Convert the words of the data blocks from big endian. +CPU_LE( rev32 v16.16b, v16.16b ) +CPU_LE( rev32 v17.16b, v17.16b ) +CPU_LE( rev32 v18.16b, v18.16b ) +CPU_LE( rev32 v19.16b, v19.16b ) +CPU_LE( rev32 v20.16b, v20.16b ) +CPU_LE( rev32 v21.16b, v21.16b ) +CPU_LE( rev32 v22.16b, v22.16b ) +CPU_LE( rev32 v23.16b, v23.16b ) +.Lfinup2x_loop_have_bswapped_data: + + // Save the original state for each block. + st1 {state0_a.4s-state1_b.4s}, [sp] + + // Do the SHA-256 rounds on each block. + do_16rounds_2x 0, v0, v1, v2, v3 + do_16rounds_2x 16, v4, v5, v6, v7 + do_16rounds_2x 32, v8, v9, v10, v11 + do_16rounds_2x 48, v12, v13, v14, v15 + + // Add the original state for each block. + ld1 {v16.4s-v19.4s}, [sp] + add state0_a.4s, state0_a.4s, v16.4s + add state1_a.4s, state1_a.4s, v17.4s + add state0_b.4s, state0_b.4s, v18.4s + add state1_b.4s, state1_b.4s, v19.4s + + // Update len and loop back if more blocks remain. + sub len, len, #64 + tbz len, #31, .Lfinup2x_loop // len >= 0? + + // Check if any final blocks need to be handled. + // final_step = 2: all done + // final_step = 1: need to do count-only padding block + // final_step = 0: need to do the block with 0x80 padding byte + tbnz final_step, #1, .Lfinup2x_done + tbnz final_step, #0, .Lfinup2x_finalize_countonly + add len, len, #64 + cbz len, .Lfinup2x_finalize_blockaligned + + // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block. + // To do this, write the padding starting with the 0x80 byte to + // &sp[64]. Then for each message, copy the last 64 data bytes to sp + // and load from &sp[64 - len] to get the needed padding block. This + // code relies on the data buffers being >= 64 bytes in length. + sub w8, len, #64 // w8 = len - 64 + add data1, data1, w8, sxtw // data1 += len - 64 + add data2, data2, w8, sxtw // data2 += len - 64 +CPU_LE( mov x9, #0x80 ) +CPU_LE( fmov d16, x9 ) +CPU_BE( movi v16.16b, #0 ) +CPU_BE( mov x9, #0x8000000000000000 ) +CPU_BE( mov v16.d[1], x9 ) + movi v17.16b, #0 + stp q16, q17, [sp, #64] + stp q17, q17, [sp, #96] + sub x9, sp, w8, sxtw // x9 = &sp[64 - len] + cmp len, #56 + b.ge 1f // will count spill into its own block? 
+ lsl count, count, #3 +CPU_LE( rev count, count ) + str count, [x9, #56] + mov final_step, #2 // won't need count-only block + b 2f +1: + mov final_step, #1 // will need count-only block +2: + ld1 {v16.16b-v19.16b}, [data1] + st1 {v16.16b-v19.16b}, [sp] + ld1 {v16.4s-v19.4s}, [x9] + ld1 {v20.16b-v23.16b}, [data2] + st1 {v20.16b-v23.16b}, [sp] + ld1 {v20.4s-v23.4s}, [x9] + b .Lfinup2x_loop_have_data + + // Prepare a padding block, either: + // + // {0x80, 0, 0, 0, ..., count (as __be64)} + // This is for a block aligned message. + // + // { 0, 0, 0, 0, ..., count (as __be64)} + // This is for a message whose length mod 64 is >= 56. + // + // Pre-swap the endianness of the words. +.Lfinup2x_finalize_countonly: + movi v16.2d, #0 + b 1f +.Lfinup2x_finalize_blockaligned: + mov x8, #0x80000000 + fmov d16, x8 +1: + movi v17.2d, #0 + movi v18.2d, #0 + ror count, count, #29 // ror(lsl(count, 3), 32) + mov v19.d[0], xzr + mov v19.d[1], count + mov v20.16b, v16.16b + movi v21.2d, #0 + movi v22.2d, #0 + mov v23.16b, v19.16b + mov final_step, #2 + b .Lfinup2x_loop_have_bswapped_data + +.Lfinup2x_done: + // Write the two digests with all bytes in the correct order. +CPU_LE( rev32 state0_a.16b, state0_a.16b ) +CPU_LE( rev32 state1_a.16b, state1_a.16b ) +CPU_LE( rev32 state0_b.16b, state0_b.16b ) +CPU_LE( rev32 state1_b.16b, state1_b.16b ) + st1 {state0_a.4s-state1_a.4s}, [out1] + st1 {state0_b.4s-state1_b.4s}, [out2] + add sp, sp, #128 + ret +SYM_FUNC_END(sha256_ce_finup2x) diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h index be4aeda9d0e6..80d06df27d3a 100644 --- a/lib/crypto/arm64/sha256.h +++ b/lib/crypto/arm64/sha256.h @@ -44,6 +44,43 @@ static void sha256_blocks(struct sha256_block_state *state, } } +static_assert(offsetof(struct __sha256_ctx, state) == 0); +static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); +static_assert(offsetof(struct __sha256_ctx, buf) == 40); +asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, int len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]); + +#define sha256_finup_2x_arch sha256_finup_2x_arch +static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, size_t len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + /* + * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX. + * Further limit len to 65536 to avoid spending too long with preemption + * disabled. (Of course, in practice len is nearly always 4096 anyway.) 
+ */ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE && + len <= 65536 && likely(may_use_simd())) { + kernel_neon_begin(); + sha256_ce_finup2x(ctx, data1, data2, len, out1, out2); + kernel_neon_end(); + kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); + kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); + return true; + } + return false; +} + +static bool sha256_finup_2x_is_optimized_arch(void) +{ + return static_key_enabled(&have_ce); +} + #ifdef CONFIG_KERNEL_MODE_NEON #define sha256_mod_init_arch sha256_mod_init_arch static void sha256_mod_init_arch(void) diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 8fa15165d23e..881b935418ce 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -25,13 +25,20 @@ static const struct sha256_block_state sha224_iv = { }, }; -static const struct sha256_block_state sha256_iv = { - .h = { - SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, - SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, +static const struct sha256_ctx initial_sha256_ctx = { + .ctx = { + .state = { + .h = { + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, + }, + }, + .bytecount = 0, }, }; +#define sha256_iv (initial_sha256_ctx.ctx.state) + static const u32 sha256_K[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -261,8 +268,62 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) } EXPORT_SYMBOL(sha256); -/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */ +/* + * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined) + * doesn't need either HMAC support or interleaved hashing support + */ #ifndef __DISABLE_EXPORTS + +#ifndef sha256_finup_2x_arch +static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, size_t len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + return false; +} +static bool sha256_finup_2x_is_optimized_arch(void) +{ + return false; +} +#endif + +/* Sequential fallback implementation of sha256_finup_2x() */ +static noinline_for_stack void sha256_finup_2x_sequential( + const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, + size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) +{ + struct __sha256_ctx mut_ctx; + + mut_ctx = *ctx; + __sha256_update(&mut_ctx, data1, len); + __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE); + + mut_ctx = *ctx; + __sha256_update(&mut_ctx, data2, len); + __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE); +} + +void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1, + const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + if (ctx == NULL) + ctx = &initial_sha256_ctx; + + if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1, + out2))) + return; + sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2); +} +EXPORT_SYMBOL_GPL(sha256_finup_2x); + +bool sha256_finup_2x_is_optimized(void) +{ + return sha256_finup_2x_is_optimized_arch(); +} +EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized); + static void __hmac_sha256_preparekey(struct sha256_block_state *istate, struct sha256_block_state *ostate, const u8 *raw_key, size_t raw_key_len, diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c index 1cd4caee6010..dcedfca06df6 100644 --- a/lib/crypto/tests/sha256_kunit.c +++ 
b/lib/crypto/tests/sha256_kunit.c @@ -5,6 +5,7 @@ #include <crypto/sha2.h> #include "sha256-testvecs.h" +/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */ #define HASH sha256 #define HASH_CTX sha256_ctx #define HASH_SIZE SHA256_DIGEST_SIZE @@ -21,9 +22,192 @@ #define HMAC_USINGRAWKEY hmac_sha256_usingrawkey #include "hash-test-template.h" +static void free_guarded_buf(void *buf) +{ + vfree(buf); +} + +/* + * Allocate a KUnit-managed buffer that has length @len bytes immediately + * followed by an unmapped page, and assert that the allocation succeeds. + */ +static void *alloc_guarded_buf(struct kunit *test, size_t len) +{ + size_t full_len = round_up(len, PAGE_SIZE); + void *buf = vmalloc(full_len); + + KUNIT_ASSERT_NOT_NULL(test, buf); + KUNIT_ASSERT_EQ(test, 0, + kunit_add_action_or_reset(test, free_guarded_buf, buf)); + return buf + full_len - len; +} + +/* + * Test for sha256_finup_2x(). Specifically, choose various data lengths and + * salt lengths, and for each one, verify that sha256_finup_2x() produces the + * same results as sha256_update() and sha256_final(). + * + * Use guarded buffers for all inputs and outputs to reliably detect any + * out-of-bounds reads or writes, even if they occur in assembly code. + */ +static void test_sha256_finup_2x(struct kunit *test) +{ + const size_t max_data_len = 16384; + u8 *data1_buf, *data2_buf, *hash1, *hash2; + u8 expected_hash1[SHA256_DIGEST_SIZE]; + u8 expected_hash2[SHA256_DIGEST_SIZE]; + u8 salt[SHA256_BLOCK_SIZE]; + struct sha256_ctx *ctx; + + data1_buf = alloc_guarded_buf(test, max_data_len); + data2_buf = alloc_guarded_buf(test, max_data_len); + hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE); + hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE); + ctx = alloc_guarded_buf(test, sizeof(*ctx)); + + rand_bytes(data1_buf, max_data_len); + rand_bytes(data2_buf, max_data_len); + rand_bytes(salt, sizeof(salt)); + + for (size_t i = 0; i < 500; i++) { + size_t salt_len = rand_length(sizeof(salt)); + size_t data_len = rand_length(max_data_len); + const u8 *data1 = data1_buf + max_data_len - data_len; + const u8 *data2 = data2_buf + max_data_len - data_len; + struct sha256_ctx orig_ctx; + + sha256_init(ctx); + sha256_update(ctx, salt, salt_len); + orig_ctx = *ctx; + + sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2); + KUNIT_ASSERT_MEMEQ_MSG( + test, ctx, &orig_ctx, sizeof(*ctx), + "sha256_finup_2x() modified its ctx argument"); + + sha256_update(ctx, data1, data_len); + sha256_final(ctx, expected_hash1); + sha256_update(&orig_ctx, data2, data_len); + sha256_final(&orig_ctx, expected_hash2); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash1, expected_hash1, SHA256_DIGEST_SIZE, + "Wrong hash1 with salt_len=%zu data_len=%zu", salt_len, + data_len); + KUNIT_ASSERT_MEMEQ_MSG( + test, hash2, expected_hash2, SHA256_DIGEST_SIZE, + "Wrong hash2 with salt_len=%zu data_len=%zu", salt_len, + data_len); + } +} + +/* Test sha256_finup_2x() with ctx == NULL */ +static void test_sha256_finup_2x_defaultctx(struct kunit *test) +{ + const size_t data_len = 128; + struct sha256_ctx ctx; + u8 hash1_a[SHA256_DIGEST_SIZE]; + u8 hash2_a[SHA256_DIGEST_SIZE]; + u8 hash1_b[SHA256_DIGEST_SIZE]; + u8 hash2_b[SHA256_DIGEST_SIZE]; + + rand_bytes(test_buf, 2 * data_len); + + sha256_init(&ctx); + sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a, + hash2_a); + + sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b, + hash2_b); + + KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE); + KUNIT_ASSERT_MEMEQ(test, 
hash2_a, hash2_b, SHA256_DIGEST_SIZE); +} + +/* + * Test that sha256_finup_2x() and sha256_update/final() produce consistent + * results with total message lengths that require more than 32 bits. + */ +static void test_sha256_finup_2x_hugelen(struct kunit *test) +{ + const size_t data_len = 4 * SHA256_BLOCK_SIZE; + struct sha256_ctx ctx = {}; + u8 expected_hash[SHA256_DIGEST_SIZE]; + u8 hash[SHA256_DIGEST_SIZE]; + + rand_bytes(test_buf, data_len); + for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) { + sha256_init(&ctx); + ctx.ctx.bytecount = 0x123456789abcd00 + align; + + sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash); + + sha256_update(&ctx, test_buf, data_len); + sha256_final(&ctx, expected_hash); + + KUNIT_ASSERT_MEMEQ(test, hash, expected_hash, + SHA256_DIGEST_SIZE); + } +} + +/* Benchmark for sha256_finup_2x() */ +static void benchmark_sha256_finup_2x(struct kunit *test) +{ + /* + * Try a few different salt lengths, since sha256_finup_2x() performance + * may vary slightly for the same data_len depending on how many bytes + * were already processed in the initial context. + */ + static const size_t salt_lens_to_test[] = { 0, 32, 64 }; + const size_t data_len = 4096; + const size_t num_iters = 4096; + struct sha256_ctx ctx; + u8 hash1[SHA256_DIGEST_SIZE]; + u8 hash2[SHA256_DIGEST_SIZE]; + + if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK)) + kunit_skip(test, "not enabled"); + if (!sha256_finup_2x_is_optimized()) + kunit_skip(test, "not relevant"); + + rand_bytes(test_buf, data_len * 2); + + /* Warm-up */ + for (size_t i = 0; i < num_iters; i++) + sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len], + data_len, hash1, hash2); + + for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) { + size_t salt_len = salt_lens_to_test[i]; + u64 t0, t1; + + /* + * Prepare the initial context. The time to process the salt is + * not measured; we're just interested in sha256_finup_2x(). + */ + sha256_init(&ctx); + sha256_update(&ctx, test_buf, salt_len); + + preempt_disable(); + t0 = ktime_get_ns(); + for (size_t j = 0; j < num_iters; j++) + sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len], + data_len, hash1, hash2); + t1 = ktime_get_ns(); + preempt_enable(); + kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s", + data_len, salt_len, + div64_u64((u64)data_len * 2 * num_iters * 1000, + t1 - t0 ?: 1)); + } +} + static struct kunit_case hash_test_cases[] = { HASH_KUNIT_CASES, + KUNIT_CASE(test_sha256_finup_2x), + KUNIT_CASE(test_sha256_finup_2x_defaultctx), + KUNIT_CASE(test_sha256_finup_2x_hugelen), KUNIT_CASE(benchmark_hash), + KUNIT_CASE(benchmark_sha256_finup_2x), {}, }; diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S index 4bd9490ffc66..de5f707e7ef7 100644 --- a/lib/crypto/x86/sha256-ni-asm.S +++ b/lib/crypto/x86/sha256-ni-asm.S @@ -165,6 +165,374 @@ SYM_FUNC_START(sha256_ni_transform) RET SYM_FUNC_END(sha256_ni_transform) +#undef DIGEST_PTR +#undef DATA_PTR +#undef NUM_BLKS +#undef SHA256CONSTANTS +#undef MSG +#undef STATE0 +#undef STATE1 +#undef MSG0 +#undef MSG1 +#undef MSG2 +#undef MSG3 +#undef TMP +#undef SHUF_MASK +#undef ABEF_SAVE +#undef CDGH_SAVE + +// parameters for sha256_ni_finup2x() +#define CTX %rdi +#define DATA1 %rsi +#define DATA2 %rdx +#define LEN %ecx +#define LEN8 %cl +#define LEN64 %rcx +#define OUT1 %r8 +#define OUT2 %r9 + +// other scalar variables +#define SHA256CONSTANTS %rax +#define COUNT %r10 +#define COUNT32 %r10d +#define FINAL_STEP %r11d + +// rbx is used as a temporary. 
+ +#define MSG %xmm0 // sha256rnds2 implicit operand +#define STATE0_A %xmm1 +#define STATE1_A %xmm2 +#define STATE0_B %xmm3 +#define STATE1_B %xmm4 +#define TMP_A %xmm5 +#define TMP_B %xmm6 +#define MSG0_A %xmm7 +#define MSG1_A %xmm8 +#define MSG2_A %xmm9 +#define MSG3_A %xmm10 +#define MSG0_B %xmm11 +#define MSG1_B %xmm12 +#define MSG2_B %xmm13 +#define MSG3_B %xmm14 +#define SHUF_MASK %xmm15 + +#define OFFSETOF_STATE 0 // offsetof(struct __sha256_ctx, state) +#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount) +#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf) + +// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a and m0_b +// contain the current 4 message schedule words for the first and second message +// respectively. +// +// If not all the message schedule words have been computed yet, then this also +// computes 4 more message schedule words for each message. m1_a-m3_a contain +// the next 3 groups of 4 message schedule words for the first message, and +// likewise m1_b-m3_b for the second. After consuming the current value of +// m0_a, this macro computes the group after m3_a and writes it to m0_a, and +// likewise for *_b. This means that the next (m0_a, m1_a, m2_a, m3_a) is the +// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must +// cycle through the registers accordingly. +.macro do_4rounds_2x i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b + movdqa (\i-32)*4(SHA256CONSTANTS), TMP_A + movdqa TMP_A, TMP_B + paddd \m0_a, TMP_A + paddd \m0_b, TMP_B +.if \i < 48 + sha256msg1 \m1_a, \m0_a + sha256msg1 \m1_b, \m0_b +.endif + movdqa TMP_A, MSG + sha256rnds2 STATE0_A, STATE1_A + movdqa TMP_B, MSG + sha256rnds2 STATE0_B, STATE1_B + pshufd $0x0E, TMP_A, MSG + sha256rnds2 STATE1_A, STATE0_A + pshufd $0x0E, TMP_B, MSG + sha256rnds2 STATE1_B, STATE0_B +.if \i < 48 + movdqa \m3_a, TMP_A + movdqa \m3_b, TMP_B + palignr $4, \m2_a, TMP_A + palignr $4, \m2_b, TMP_B + paddd TMP_A, \m0_a + paddd TMP_B, \m0_b + sha256msg2 \m3_a, \m0_a + sha256msg2 \m3_b, \m0_b +.endif +.endm + +// +// void sha256_ni_finup2x(const struct __sha256_ctx *ctx, +// const u8 *data1, const u8 *data2, int len, +// u8 out1[SHA256_DIGEST_SIZE], +// u8 out2[SHA256_DIGEST_SIZE]); +// +// This function computes the SHA-256 digests of two messages |data1| and +// |data2| that are both |len| bytes long, starting from the initial context +// |ctx|. |len| must be at least SHA256_BLOCK_SIZE. +// +// The instructions for the two SHA-256 operations are interleaved. On many +// CPUs, this is almost twice as fast as hashing each message individually due +// to taking better advantage of the CPU's SHA-256 and SIMD throughput. +// +SYM_FUNC_START(sha256_ni_finup2x) + // Allocate 128 bytes of stack space, 16-byte aligned. + push %rbx + push %rbp + mov %rsp, %rbp + sub $128, %rsp + and $~15, %rsp + + // Load the shuffle mask for swapping the endianness of 32-bit words. + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + + // Set up pointer to the round constants. + lea K256+32*4(%rip), SHA256CONSTANTS + + // Initially we're not processing the final blocks. + xor FINAL_STEP, FINAL_STEP + + // Load the initial state from ctx->state. + movdqu OFFSETOF_STATE+0*16(CTX), STATE0_A // DCBA + movdqu OFFSETOF_STATE+1*16(CTX), STATE1_A // HGFE + movdqa STATE0_A, TMP_A + punpcklqdq STATE1_A, STATE0_A // FEBA + punpckhqdq TMP_A, STATE1_A // DCHG + pshufd $0x1B, STATE0_A, STATE0_A // ABEF + pshufd $0xB1, STATE1_A, STATE1_A // CDGH + + // Load ctx->bytecount. 
Take the mod 64 of it to get the number of + // bytes that are buffered in ctx->buf. Also save it in a register with + // LEN added to it. + mov LEN, LEN + mov OFFSETOF_BYTECOUNT(CTX), %rbx + lea (%rbx, LEN64, 1), COUNT + and $63, %ebx + jz .Lfinup2x_enter_loop // No bytes buffered? + + // %ebx bytes (1 to 63) are currently buffered in ctx->buf. Load them + // followed by the first 64 - %ebx bytes of data. Since LEN >= 64, we + // just load 64 bytes from each of ctx->buf, DATA1, and DATA2 + // unconditionally and rearrange the data as needed. + + movdqu OFFSETOF_BUF+0*16(CTX), MSG0_A + movdqu OFFSETOF_BUF+1*16(CTX), MSG1_A + movdqu OFFSETOF_BUF+2*16(CTX), MSG2_A + movdqu OFFSETOF_BUF+3*16(CTX), MSG3_A + movdqa MSG0_A, 0*16(%rsp) + movdqa MSG1_A, 1*16(%rsp) + movdqa MSG2_A, 2*16(%rsp) + movdqa MSG3_A, 3*16(%rsp) + + movdqu 0*16(DATA1), MSG0_A + movdqu 1*16(DATA1), MSG1_A + movdqu 2*16(DATA1), MSG2_A + movdqu 3*16(DATA1), MSG3_A + movdqu MSG0_A, 0*16(%rsp,%rbx) + movdqu MSG1_A, 1*16(%rsp,%rbx) + movdqu MSG2_A, 2*16(%rsp,%rbx) + movdqu MSG3_A, 3*16(%rsp,%rbx) + movdqa 0*16(%rsp), MSG0_A + movdqa 1*16(%rsp), MSG1_A + movdqa 2*16(%rsp), MSG2_A + movdqa 3*16(%rsp), MSG3_A + + movdqu 0*16(DATA2), MSG0_B + movdqu 1*16(DATA2), MSG1_B + movdqu 2*16(DATA2), MSG2_B + movdqu 3*16(DATA2), MSG3_B + movdqu MSG0_B, 0*16(%rsp,%rbx) + movdqu MSG1_B, 1*16(%rsp,%rbx) + movdqu MSG2_B, 2*16(%rsp,%rbx) + movdqu MSG3_B, 3*16(%rsp,%rbx) + movdqa 0*16(%rsp), MSG0_B + movdqa 1*16(%rsp), MSG1_B + movdqa 2*16(%rsp), MSG2_B + movdqa 3*16(%rsp), MSG3_B + + sub $64, %rbx // rbx = buffered - 64 + sub %rbx, DATA1 // DATA1 += 64 - buffered + sub %rbx, DATA2 // DATA2 += 64 - buffered + add %ebx, LEN // LEN += buffered - 64 + movdqa STATE0_A, STATE0_B + movdqa STATE1_A, STATE1_B + jmp .Lfinup2x_loop_have_data + +.Lfinup2x_enter_loop: + sub $64, LEN + movdqa STATE0_A, STATE0_B + movdqa STATE1_A, STATE1_B +.Lfinup2x_loop: + // Load the next two data blocks. + movdqu 0*16(DATA1), MSG0_A + movdqu 0*16(DATA2), MSG0_B + movdqu 1*16(DATA1), MSG1_A + movdqu 1*16(DATA2), MSG1_B + movdqu 2*16(DATA1), MSG2_A + movdqu 2*16(DATA2), MSG2_B + movdqu 3*16(DATA1), MSG3_A + movdqu 3*16(DATA2), MSG3_B + add $64, DATA1 + add $64, DATA2 +.Lfinup2x_loop_have_data: + // Convert the words of the data blocks from big endian. + pshufb SHUF_MASK, MSG0_A + pshufb SHUF_MASK, MSG0_B + pshufb SHUF_MASK, MSG1_A + pshufb SHUF_MASK, MSG1_B + pshufb SHUF_MASK, MSG2_A + pshufb SHUF_MASK, MSG2_B + pshufb SHUF_MASK, MSG3_A + pshufb SHUF_MASK, MSG3_B +.Lfinup2x_loop_have_bswapped_data: + + // Save the original state for each block. + movdqa STATE0_A, 0*16(%rsp) + movdqa STATE0_B, 1*16(%rsp) + movdqa STATE1_A, 2*16(%rsp) + movdqa STATE1_B, 3*16(%rsp) + + // Do the SHA-256 rounds on each block. +.irp i, 0, 16, 32, 48 + do_4rounds_2x (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, \ + MSG0_B, MSG1_B, MSG2_B, MSG3_B + do_4rounds_2x (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, \ + MSG1_B, MSG2_B, MSG3_B, MSG0_B + do_4rounds_2x (\i + 8), MSG2_A, MSG3_A, MSG0_A, MSG1_A, \ + MSG2_B, MSG3_B, MSG0_B, MSG1_B + do_4rounds_2x (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \ + MSG3_B, MSG0_B, MSG1_B, MSG2_B +.endr + + // Add the original state for each block. + paddd 0*16(%rsp), STATE0_A + paddd 1*16(%rsp), STATE0_B + paddd 2*16(%rsp), STATE1_A + paddd 3*16(%rsp), STATE1_B + + // Update LEN and loop back if more blocks remain. + sub $64, LEN + jge .Lfinup2x_loop + + // Check if any final blocks need to be handled. 
+ // FINAL_STEP = 2: all done + // FINAL_STEP = 1: need to do count-only padding block + // FINAL_STEP = 0: need to do the block with 0x80 padding byte + cmp $1, FINAL_STEP + jg .Lfinup2x_done + je .Lfinup2x_finalize_countonly + add $64, LEN + jz .Lfinup2x_finalize_blockaligned + + // Not block-aligned; 1 <= LEN <= 63 data bytes remain. Pad the block. + // To do this, write the padding starting with the 0x80 byte to + // &sp[64]. Then for each message, copy the last 64 data bytes to sp + // and load from &sp[64 - LEN] to get the needed padding block. This + // code relies on the data buffers being >= 64 bytes in length. + mov $64, %ebx + sub LEN, %ebx // ebx = 64 - LEN + sub %rbx, DATA1 // DATA1 -= 64 - LEN + sub %rbx, DATA2 // DATA2 -= 64 - LEN + mov $0x80, FINAL_STEP // using FINAL_STEP as a temporary + movd FINAL_STEP, MSG0_A + pxor MSG1_A, MSG1_A + movdqa MSG0_A, 4*16(%rsp) + movdqa MSG1_A, 5*16(%rsp) + movdqa MSG1_A, 6*16(%rsp) + movdqa MSG1_A, 7*16(%rsp) + cmp $56, LEN + jge 1f // will COUNT spill into its own block? + shl $3, COUNT + bswap COUNT + mov COUNT, 56(%rsp,%rbx) + mov $2, FINAL_STEP // won't need count-only block + jmp 2f +1: + mov $1, FINAL_STEP // will need count-only block +2: + movdqu 0*16(DATA1), MSG0_A + movdqu 1*16(DATA1), MSG1_A + movdqu 2*16(DATA1), MSG2_A + movdqu 3*16(DATA1), MSG3_A + movdqa MSG0_A, 0*16(%rsp) + movdqa MSG1_A, 1*16(%rsp) + movdqa MSG2_A, 2*16(%rsp) + movdqa MSG3_A, 3*16(%rsp) + movdqu 0*16(%rsp,%rbx), MSG0_A + movdqu 1*16(%rsp,%rbx), MSG1_A + movdqu 2*16(%rsp,%rbx), MSG2_A + movdqu 3*16(%rsp,%rbx), MSG3_A + + movdqu 0*16(DATA2), MSG0_B + movdqu 1*16(DATA2), MSG1_B + movdqu 2*16(DATA2), MSG2_B + movdqu 3*16(DATA2), MSG3_B + movdqa MSG0_B, 0*16(%rsp) + movdqa MSG1_B, 1*16(%rsp) + movdqa MSG2_B, 2*16(%rsp) + movdqa MSG3_B, 3*16(%rsp) + movdqu 0*16(%rsp,%rbx), MSG0_B + movdqu 1*16(%rsp,%rbx), MSG1_B + movdqu 2*16(%rsp,%rbx), MSG2_B + movdqu 3*16(%rsp,%rbx), MSG3_B + jmp .Lfinup2x_loop_have_data + + // Prepare a padding block, either: + // + // {0x80, 0, 0, 0, ..., count (as __be64)} + // This is for a block aligned message. + // + // { 0, 0, 0, 0, ..., count (as __be64)} + // This is for a message whose length mod 64 is >= 56. + // + // Pre-swap the endianness of the words. +.Lfinup2x_finalize_countonly: + pxor MSG0_A, MSG0_A + jmp 1f + +.Lfinup2x_finalize_blockaligned: + mov $0x80000000, %ebx + movd %ebx, MSG0_A +1: + pxor MSG1_A, MSG1_A + pxor MSG2_A, MSG2_A + ror $29, COUNT + movq COUNT, MSG3_A + pslldq $8, MSG3_A + movdqa MSG0_A, MSG0_B + pxor MSG1_B, MSG1_B + pxor MSG2_B, MSG2_B + movdqa MSG3_A, MSG3_B + mov $2, FINAL_STEP + jmp .Lfinup2x_loop_have_bswapped_data + +.Lfinup2x_done: + // Write the two digests with all bytes in the correct order. 
+ movdqa STATE0_A, TMP_A + movdqa STATE0_B, TMP_B + punpcklqdq STATE1_A, STATE0_A // GHEF + punpcklqdq STATE1_B, STATE0_B + punpckhqdq TMP_A, STATE1_A // ABCD + punpckhqdq TMP_B, STATE1_B + pshufd $0xB1, STATE0_A, STATE0_A // HGFE + pshufd $0xB1, STATE0_B, STATE0_B + pshufd $0x1B, STATE1_A, STATE1_A // DCBA + pshufd $0x1B, STATE1_B, STATE1_B + pshufb SHUF_MASK, STATE0_A + pshufb SHUF_MASK, STATE0_B + pshufb SHUF_MASK, STATE1_A + pshufb SHUF_MASK, STATE1_B + movdqu STATE0_A, 1*16(OUT1) + movdqu STATE0_B, 1*16(OUT2) + movdqu STATE1_A, 0*16(OUT1) + movdqu STATE1_B, 0*16(OUT2) + + mov %rbp, %rsp + pop %rbp + pop %rbx + RET +SYM_FUNC_END(sha256_ni_finup2x) + .section .rodata.cst256.K256, "aM", @progbits, 256 .align 64 K256: diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h index 41fa95fbc3bf..38e33b22a092 100644 --- a/lib/crypto/x86/sha256.h +++ b/lib/crypto/x86/sha256.h @@ -7,6 +7,8 @@ #include <asm/fpu/api.h> #include <linux/static_call.h> +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni); + DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic); #define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \ @@ -35,11 +37,48 @@ static void sha256_blocks(struct sha256_block_state *state, static_call(sha256_blocks_x86)(state, data, nblocks); } +static_assert(offsetof(struct __sha256_ctx, state) == 0); +static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); +static_assert(offsetof(struct __sha256_ctx, buf) == 40); +asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, int len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]); + +#define sha256_finup_2x_arch sha256_finup_2x_arch +static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, + const u8 *data1, const u8 *data2, size_t len, + u8 out1[SHA256_DIGEST_SIZE], + u8 out2[SHA256_DIGEST_SIZE]) +{ + /* + * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX. + * Further limit len to 65536 to avoid spending too long with preemption + * disabled. (Of course, in practice len is nearly always 4096 anyway.) + */ + if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE && + len <= 65536 && likely(irq_fpu_usable())) { + kernel_fpu_begin(); + sha256_ni_finup2x(ctx, data1, data2, len, out1, out2); + kernel_fpu_end(); + kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE); + kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE); + return true; + } + return false; +} + +static bool sha256_finup_2x_is_optimized_arch(void) +{ + return static_key_enabled(&have_sha_ni); +} + #define sha256_mod_init_arch sha256_mod_init_arch static void sha256_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SHA_NI)) { static_call_update(sha256_blocks_x86, sha256_blocks_ni); + static_branch_enable(&have_sha_ni); } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c index 5d54c4b437df..5f779719c3d3 100644 --- a/lib/raid6/recov_rvv.c +++ b/lib/raid6/recov_rvv.c @@ -4,9 +4,7 @@ * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn> */ -#include <asm/simd.h> #include <asm/vector.h> -#include <crypto/internal/simd.h> #include <linux/raid/pq.h> static int rvv_has_vector(void) diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c index 7d82efa5b14f..89da5fc247aa 100644 --- a/lib/raid6/rvv.c +++ b/lib/raid6/rvv.c @@ -9,11 +9,8 @@ * Copyright 2002-2004 H. 
Peter Anvin */ -#include <asm/simd.h> #include <asm/vector.h> -#include <crypto/internal/simd.h> #include <linux/raid/pq.h> -#include <linux/types.h> #include "rvv.h" #define NSIZE (riscv_v_vsize / 32) /* NSIZE = vlenb */ @@ -47,7 +44,7 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]) @@ -120,7 +117,7 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]) @@ -221,9 +218,9 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -313,9 +310,9 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -443,13 +440,13 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" "vle8.v v8, (%[wp2])\n" - "vle8.v v9, (%[wp2])\n" + "vmv.v.v v9, v8\n" "vle8.v v12, (%[wp3])\n" - "vle8.v v13, (%[wp3])\n" + "vmv.v.v v13, v12\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -569,13 +566,13 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" "vle8.v v8, (%[wp2])\n" - "vle8.v v9, (%[wp2])\n" + "vmv.v.v v9, v8\n" "vle8.v v12, (%[wp3])\n" - "vle8.v v13, (%[wp3])\n" + "vmv.v.v v13, v12\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -757,21 +754,21 @@ static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void ** asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" "vle8.v v8, (%[wp2])\n" - "vle8.v v9, (%[wp2])\n" + "vmv.v.v v9, v8\n" "vle8.v v12, (%[wp3])\n" - "vle8.v v13, (%[wp3])\n" + "vmv.v.v v13, v12\n" "vle8.v v16, (%[wp4])\n" - "vle8.v v17, (%[wp4])\n" + "vmv.v.v v17, v16\n" "vle8.v v20, (%[wp5])\n" - "vle8.v v21, (%[wp5])\n" + "vmv.v.v v21, v20\n" "vle8.v v24, (%[wp6])\n" - "vle8.v v25, (%[wp6])\n" + "vmv.v.v v25, v24\n" "vle8.v v28, (%[wp7])\n" - "vle8.v v29, (%[wp7])\n" + "vmv.v.v v29, v28\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), @@ -951,21 +948,21 @@ static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop, asm volatile (".option push\n" ".option arch,+v\n" "vle8.v v0, (%[wp0])\n" - "vle8.v v1, (%[wp0])\n" + "vmv.v.v v1, v0\n" "vle8.v v4, (%[wp1])\n" - "vle8.v v5, (%[wp1])\n" + "vmv.v.v v5, v4\n" "vle8.v v8, 
(%[wp2])\n" - "vle8.v v9, (%[wp2])\n" + "vmv.v.v v9, v8\n" "vle8.v v12, (%[wp3])\n" - "vle8.v v13, (%[wp3])\n" + "vmv.v.v v13, v12\n" "vle8.v v16, (%[wp4])\n" - "vle8.v v17, (%[wp4])\n" + "vmv.v.v v17, v16\n" "vle8.v v20, (%[wp5])\n" - "vle8.v v21, (%[wp5])\n" + "vmv.v.v v21, v20\n" "vle8.v v24, (%[wp6])\n" - "vle8.v v25, (%[wp6])\n" + "vmv.v.v v25, v24\n" "vle8.v v28, (%[wp7])\n" - "vle8.v v29, (%[wp7])\n" + "vmv.v.v v29, v28\n" ".option pop\n" : : [wp0]"r"(&dptr[z0][d + 0 * NSIZE]), diff --git a/lib/tests/Makefile b/lib/tests/Makefile index fa6d728a8b5b..f7460831cfdd 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_BLACKHOLE_DEV_KUNIT_TEST) += blackhole_dev_kunit.o obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o +obj-$(CONFIG_FFS_KUNIT_TEST) += ffs_kunit.o CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced) CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-overread) CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-truncation) diff --git a/lib/tests/ffs_kunit.c b/lib/tests/ffs_kunit.c new file mode 100644 index 000000000000..9a329cdc09c2 --- /dev/null +++ b/lib/tests/ffs_kunit.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KUnit tests for ffs()-family functions + */ +#include <kunit/test.h> +#include <linux/bitops.h> + +/* + * Test data structures + */ +struct ffs_test_case { + unsigned long input; + int expected_ffs; /* ffs() result (1-based) */ + int expected_fls; /* fls() result (1-based) */ + const char *description; +}; + +struct ffs64_test_case { + u64 input; + int expected_fls64; /* fls64() result (1-based) */ + unsigned int expected_ffs64_0based; /* __ffs64() result (0-based) */ + const char *description; +}; + +/* + * Basic edge cases - core functionality validation + */ +static const struct ffs_test_case basic_test_cases[] = { + /* Zero case - special handling */ + {0x00000000, 0, 0, "zero value"}, + + /* Single bit patterns - powers of 2 */ + {0x00000001, 1, 1, "bit 0 set"}, + {0x00000002, 2, 2, "bit 1 set"}, + {0x00000004, 3, 3, "bit 2 set"}, + {0x00000008, 4, 4, "bit 3 set"}, + {0x00000010, 5, 5, "bit 4 set"}, + {0x00000020, 6, 6, "bit 5 set"}, + {0x00000040, 7, 7, "bit 6 set"}, + {0x00000080, 8, 8, "bit 7 set"}, + {0x00000100, 9, 9, "bit 8 set"}, + {0x00008000, 16, 16, "bit 15 set"}, + {0x00010000, 17, 17, "bit 16 set"}, + {0x40000000, 31, 31, "bit 30 set"}, + {0x80000000, 32, 32, "bit 31 set (sign bit)"}, + + /* Maximum values */ + {0xFFFFFFFF, 1, 32, "all bits set"}, + + /* Multiple bit patterns */ + {0x00000003, 1, 2, "bits 0-1 set"}, + {0x00000007, 1, 3, "bits 0-2 set"}, + {0x0000000F, 1, 4, "bits 0-3 set"}, + {0x000000FF, 1, 8, "bits 0-7 set"}, + {0x0000FFFF, 1, 16, "bits 0-15 set"}, + {0x7FFFFFFF, 1, 31, "bits 0-30 set"}, + + /* Sparse patterns */ + {0x00000101, 1, 9, "bits 0,8 set"}, + {0x00001001, 1, 13, "bits 0,12 set"}, + {0x80000001, 1, 32, "bits 0,31 set"}, + {0x40000002, 2, 31, "bits 1,30 set"}, +}; + +/* + * 64-bit test cases + */ +static const struct ffs64_test_case ffs64_test_cases[] = { + /* Zero case */ + {0x0000000000000000ULL, 0, 0, "zero value"}, + + /* Single bit patterns */ + {0x0000000000000001ULL, 1, 0, "bit 0 set"}, + {0x0000000000000002ULL, 2, 1, "bit 1 set"}, + {0x0000000000000004ULL, 3, 2, "bit 2 set"}, + {0x0000000000000008ULL, 4, 3, "bit 3 set"}, + {0x0000000000008000ULL, 16, 15, "bit 15 set"}, + {0x0000000000010000ULL, 17, 16, "bit 16 set"}, 
+ {0x0000000080000000ULL, 32, 31, "bit 31 set"}, + {0x0000000100000000ULL, 33, 32, "bit 32 set"}, + {0x0000000200000000ULL, 34, 33, "bit 33 set"}, + {0x4000000000000000ULL, 63, 62, "bit 62 set"}, + {0x8000000000000000ULL, 64, 63, "bit 63 set (sign bit)"}, + + /* Maximum values */ + {0xFFFFFFFFFFFFFFFFULL, 64, 0, "all bits set"}, + + /* Cross 32-bit boundary patterns */ + {0x00000000FFFFFFFFULL, 32, 0, "lower 32 bits set"}, + {0xFFFFFFFF00000000ULL, 64, 32, "upper 32 bits set"}, + {0x8000000000000001ULL, 64, 0, "bits 0,63 set"}, + {0x4000000000000002ULL, 63, 1, "bits 1,62 set"}, + + /* Mixed patterns */ + {0x00000001FFFFFFFFULL, 33, 0, "bit 32 + lower 32 bits"}, + {0xFFFFFFFF80000000ULL, 64, 31, "upper 32 bits + bit 31"}, +}; + +/* + * Helper function to validate ffs results with detailed error messages + */ +static void validate_ffs_result(struct kunit *test, unsigned long input, + int actual, int expected, const char *func_name, + const char *description) +{ + KUNIT_EXPECT_EQ_MSG(test, actual, expected, + "%s(0x%08lx) [%s]: expected %d, got %d", + func_name, input, description, expected, actual); +} + +/* + * Helper function to validate 64-bit ffs results + */ +static void validate_ffs64_result(struct kunit *test, u64 input, + int actual, int expected, const char *func_name, + const char *description) +{ + KUNIT_EXPECT_EQ_MSG(test, actual, expected, + "%s(0x%016llx) [%s]: expected %d, got %d", + func_name, input, description, expected, actual); +} + +/* + * Helper function to validate mathematical relationships between functions + */ +static void validate_ffs_relationships(struct kunit *test, unsigned long input) +{ + int ffs_result; + int fls_result; + unsigned int ffs_0based; + unsigned int fls_0based; + + if (input == 0) { + /* Special case: zero input */ + KUNIT_EXPECT_EQ(test, ffs(input), 0); + KUNIT_EXPECT_EQ(test, fls(input), 0); + /* __ffs and __fls are undefined for 0, but often return specific values */ + return; + } + + ffs_result = ffs(input); + fls_result = fls(input); + ffs_0based = __ffs(input); + fls_0based = __fls(input); + + /* Relationship: ffs(x) == __ffs(x) + 1 for x != 0 */ + KUNIT_EXPECT_EQ_MSG(test, ffs_result, ffs_0based + 1, + "ffs(0x%08lx) != __ffs(0x%08lx) + 1: %d != %u + 1", + input, input, ffs_result, ffs_0based); + + /* Relationship: fls(x) == __fls(x) + 1 for x != 0 */ + KUNIT_EXPECT_EQ_MSG(test, fls_result, fls_0based + 1, + "fls(0x%08lx) != __fls(0x%08lx) + 1: %d != %u + 1", + input, input, fls_result, fls_0based); + + /* Range validation */ + KUNIT_EXPECT_GE(test, ffs_result, 1); + KUNIT_EXPECT_LE(test, ffs_result, BITS_PER_LONG); + KUNIT_EXPECT_GE(test, fls_result, 1); + KUNIT_EXPECT_LE(test, fls_result, BITS_PER_LONG); +} + +/* + * Helper function to validate 64-bit relationships + */ +static void validate_ffs64_relationships(struct kunit *test, u64 input) +{ + int fls64_result; + unsigned int ffs64_0based; + + if (input == 0) { + KUNIT_EXPECT_EQ(test, fls64(input), 0); + return; + } + + fls64_result = fls64(input); + ffs64_0based = __ffs64(input); + + /* Range validation */ + KUNIT_EXPECT_GE(test, fls64_result, 1); + KUNIT_EXPECT_LE(test, fls64_result, 64); + KUNIT_EXPECT_LT(test, ffs64_0based, 64); + + /* + * Relationships with 32-bit functions should hold for small values + * on all architectures. 
+ */ + if (input <= 0xFFFFFFFFULL) { + unsigned long input_32 = (unsigned long)input; + KUNIT_EXPECT_EQ_MSG(test, fls64(input), fls(input_32), + "fls64(0x%llx) != fls(0x%lx): %d != %d", + input, input_32, fls64(input), fls(input_32)); + + if (input != 0) { + KUNIT_EXPECT_EQ_MSG(test, __ffs64(input), __ffs(input_32), + "__ffs64(0x%llx) != __ffs(0x%lx): %lu != %lu", + input, input_32, + (unsigned long)__ffs64(input), + (unsigned long)__ffs(input_32)); + } + } +} + +/* + * Test basic correctness of all ffs-family functions + */ +static void ffs_basic_correctness_test(struct kunit *test) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) { + const struct ffs_test_case *tc = &basic_test_cases[i]; + + /* Test ffs() */ + validate_ffs_result(test, tc->input, ffs(tc->input), + tc->expected_ffs, "ffs", tc->description); + + /* Test fls() */ + validate_ffs_result(test, tc->input, fls(tc->input), + tc->expected_fls, "fls", tc->description); + + /* Test __ffs() - skip zero case as it's undefined */ + if (tc->input != 0) { + /* Calculate expected __ffs() result: __ffs(x) == ffs(x) - 1 */ + unsigned int expected_ffs_0based = tc->expected_ffs - 1; + validate_ffs_result(test, tc->input, __ffs(tc->input), + expected_ffs_0based, "__ffs", tc->description); + } + + /* Test __fls() - skip zero case as it's undefined */ + if (tc->input != 0) { + /* Calculate expected __fls() result: __fls(x) == fls(x) - 1 */ + unsigned int expected_fls_0based = tc->expected_fls - 1; + validate_ffs_result(test, tc->input, __fls(tc->input), + expected_fls_0based, "__fls", tc->description); + } + } +} + +/* + * Test 64-bit function correctness + */ +static void ffs64_correctness_test(struct kunit *test) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) { + const struct ffs64_test_case *tc = &ffs64_test_cases[i]; + + /* Test fls64() */ + validate_ffs64_result(test, tc->input, fls64(tc->input), + tc->expected_fls64, "fls64", tc->description); + + /* Test __ffs64() - skip zero case as it's undefined */ + if (tc->input != 0) { + validate_ffs64_result(test, tc->input, __ffs64(tc->input), + tc->expected_ffs64_0based, "__ffs64", + tc->description); + } + } +} + +/* + * Test mathematical relationships between functions + */ +static void ffs_mathematical_relationships_test(struct kunit *test) +{ + int i; + + /* Test basic cases */ + for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) { + validate_ffs_relationships(test, basic_test_cases[i].input); + } + + /* Test 64-bit cases */ + for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) { + validate_ffs64_relationships(test, ffs64_test_cases[i].input); + } +} + +/* + * Test edge cases and boundary conditions + */ +static void ffs_edge_cases_test(struct kunit *test) +{ + unsigned long test_patterns[] = { + /* Powers of 2 */ + 1UL, 2UL, 4UL, 8UL, 16UL, 32UL, 64UL, 128UL, + 256UL, 512UL, 1024UL, 2048UL, 4096UL, 8192UL, + + /* Powers of 2 minus 1 */ + 1UL, 3UL, 7UL, 15UL, 31UL, 63UL, 127UL, 255UL, + 511UL, 1023UL, 2047UL, 4095UL, 8191UL, + + /* Boundary values */ + 0x7FFFFFFFUL, /* Maximum positive 32-bit */ + 0x80000000UL, /* Minimum negative 32-bit */ + 0xFFFFFFFFUL, /* Maximum 32-bit unsigned */ + }; + int i; + + for (i = 0; i < ARRAY_SIZE(test_patterns); i++) { + validate_ffs_relationships(test, test_patterns[i]); + } +} + +/* + * Test 64-bit edge cases + */ +static void ffs64_edge_cases_test(struct kunit *test) +{ + u64 test_patterns_64[] = { + /* 64-bit powers of 2 */ + 0x0000000100000000ULL, /* 2^32 */ + 0x0000000200000000ULL, /* 2^33 */ + 0x0000000400000000ULL, 
/* 2^34 */ + 0x0000001000000000ULL, /* 2^36 */ + 0x0000010000000000ULL, /* 2^40 */ + 0x0001000000000000ULL, /* 2^48 */ + 0x0100000000000000ULL, /* 2^56 */ + 0x4000000000000000ULL, /* 2^62 */ + 0x8000000000000000ULL, /* 2^63 */ + + /* Cross-boundary patterns */ + 0x00000000FFFFFFFFULL, /* Lower 32 bits */ + 0xFFFFFFFF00000000ULL, /* Upper 32 bits */ + 0x7FFFFFFFFFFFFFFFULL, /* Maximum positive 64-bit */ + 0xFFFFFFFFFFFFFFFFULL, /* Maximum 64-bit unsigned */ + }; + int i; + + for (i = 0; i < ARRAY_SIZE(test_patterns_64); i++) { + validate_ffs64_relationships(test, test_patterns_64[i]); + } +} + +/* + * ffz() test data - Find First Zero bit test cases + */ +struct ffz_test_case { + unsigned long input; + unsigned long expected_ffz; + const char *description; +}; + +static const struct ffz_test_case ffz_test_cases[] = { + /* Zero bits in specific positions */ + {0xFFFFFFFE, 0, "bit 0 is zero"}, /* ...11111110 */ + {0xFFFFFFFD, 1, "bit 1 is zero"}, /* ...11111101 */ + {0xFFFFFFFB, 2, "bit 2 is zero"}, /* ...11111011 */ + {0xFFFFFFF7, 3, "bit 3 is zero"}, /* ...11110111 */ + {0xFFFFFFEF, 4, "bit 4 is zero"}, /* ...11101111 */ + {0xFFFFFFDF, 5, "bit 5 is zero"}, /* ...11011111 */ + {0xFFFFFFBF, 6, "bit 6 is zero"}, /* ...10111111 */ + {0xFFFFFF7F, 7, "bit 7 is zero"}, /* ...01111111 */ + {0xFFFFFEFF, 8, "bit 8 is zero"}, /* Gap in bit 8 */ + {0xFFFF7FFF, 15, "bit 15 is zero"}, /* Gap in bit 15 */ + {0xFFFEFFFF, 16, "bit 16 is zero"}, /* Gap in bit 16 */ + {0xBFFFFFFF, 30, "bit 30 is zero"}, /* Gap in bit 30 */ + {0x7FFFFFFF, 31, "bit 31 is zero"}, /* 01111111... */ + + /* Multiple zero patterns */ + {0xFFFFFFFC, 0, "bits 0-1 are zero"}, /* ...11111100 */ + {0xFFFFFFF8, 0, "bits 0-2 are zero"}, /* ...11111000 */ + {0xFFFFFFF0, 0, "bits 0-3 are zero"}, /* ...11110000 */ + {0xFFFFFF00, 0, "bits 0-7 are zero"}, /* ...00000000 */ + {0xFFFF0000, 0, "bits 0-15 are zero"}, /* Lower 16 bits zero */ + + /* All zeros (special case) */ + {0x00000000, 0, "all bits zero"}, + + /* Complex patterns */ + {0xFFFDFFFF, 17, "bit 17 is zero"}, /* Gap in bit 17 */ + {0xFFF7FFFF, 19, "bit 19 is zero"}, /* Gap in bit 19 */ + {0xF7FFFFFF, 27, "bit 27 is zero"}, /* Gap in bit 27 */ + {0xDFFFFFFF, 29, "bit 29 is zero"}, /* Gap in bit 29 */ +}; + +/* + * Test basic correctness of ffz() function + */ +static void ffz_basic_correctness_test(struct kunit *test) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) { + const struct ffz_test_case *tc = &ffz_test_cases[i]; + unsigned long result = ffz(tc->input); + + KUNIT_EXPECT_EQ_MSG(test, result, tc->expected_ffz, + "ffz(0x%08lx) [%s]: expected %lu, got %lu", + tc->input, tc->description, tc->expected_ffz, result); + } +} + +/* + * Test mathematical relationships between ffz() and other functions + */ +static void validate_ffz_relationships(struct kunit *test, unsigned long input) +{ + unsigned long ffz_result; + + if (input == 0) { + /* ffz(0) should return 0 (first zero bit is at position 0) */ + KUNIT_EXPECT_EQ(test, ffz(input), 0); + return; + } + + if (input == ~0UL) { + /* ffz(~0) is undefined (no zero bits) - just verify it doesn't crash */ + ffz_result = ffz(input); + /* Implementation-defined behavior, just ensure it completes */ + return; + } + + ffz_result = ffz(input); + + /* Range validation - result should be within valid bit range */ + KUNIT_EXPECT_LT(test, ffz_result, BITS_PER_LONG); + + /* Verify the bit at ffz_result position is actually zero */ + KUNIT_EXPECT_EQ_MSG(test, (input >> ffz_result) & 1, 0, + "ffz(0x%08lx) = %lu, but bit %lu is not 
zero", + input, ffz_result, ffz_result); + + /* Core relationship: if we set the ffz bit, ffz should find a different bit */ + if (ffz_result < BITS_PER_LONG - 1) { + unsigned long modified = input | (1UL << ffz_result); + if (modified != ~0UL) { /* Skip if all bits would be set */ + unsigned long new_ffz = ffz(modified); + KUNIT_EXPECT_NE_MSG(test, new_ffz, ffz_result, + "ffz(0x%08lx) = %lu, but setting that bit doesn't change ffz result", + input, ffz_result); + } + } +} + +static void ffz_mathematical_relationships_test(struct kunit *test) +{ + unsigned long test_patterns[] = { + /* Powers of 2 with one bit clear */ + 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7, + 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F, + + /* Multiple patterns */ + 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000, + 0x7FFFFFFF, 0x3FFFFFFF, 0x1FFFFFFF, 0x0FFFFFFF, + + /* Complex bit patterns */ + 0xAAAAAAAA, 0x55555555, 0xCCCCCCCC, 0x33333333, + 0xF0F0F0F0, 0x0F0F0F0F, 0xFF00FF00, 0x00FF00FF, + }; + int i; + + /* Test basic test cases */ + for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) { + validate_ffz_relationships(test, ffz_test_cases[i].input); + } + + /* Test additional patterns */ + for (i = 0; i < ARRAY_SIZE(test_patterns); i++) { + validate_ffz_relationships(test, test_patterns[i]); + } +} + +/* + * Test edge cases and boundary conditions for ffz() + */ +static void ffz_edge_cases_test(struct kunit *test) +{ + unsigned long edge_patterns[] = { + /* Boundary values */ + 0x00000000, /* All zeros */ + 0x80000000, /* Only MSB set */ + 0x00000001, /* Only LSB set */ + 0x7FFFFFFF, /* MSB clear */ + 0xFFFFFFFE, /* LSB clear */ + + /* Powers of 2 complement patterns (one zero bit each) */ + ~(1UL << 0), ~(1UL << 1), ~(1UL << 2), ~(1UL << 3), + ~(1UL << 4), ~(1UL << 8), ~(1UL << 16), ~(1UL << 31), + + /* Walking zero patterns */ + 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7, + 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F, + 0xFFFFFEFF, 0xFFFFFDFF, 0xFFFFFBFF, 0xFFFFF7FF, + + /* Multiple zeros */ + 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000, + 0xFF000000, 0xF0000000, 0x00000000, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(edge_patterns); i++) { + validate_ffz_relationships(test, edge_patterns[i]); + } +} + +/* + * To have useful build error output, split the tests into separate + * functions so it's clear which are missing __attribute_const__. + */ +#define CREATE_WRAPPER(func) \ +static noinline bool build_test_##func(void) \ +{ \ + int init_##func = 32; \ + int result_##func = func(6); \ + \ + /* Does the static initializer vanish after calling func? */ \ + BUILD_BUG_ON(init_##func < 32); \ + \ + /* "Consume" the results so optimizer doesn't drop them. */ \ + barrier_data(&init_##func); \ + barrier_data(&result_##func); \ + \ + return true; \ +} +CREATE_WRAPPER(ffs) +CREATE_WRAPPER(fls) +CREATE_WRAPPER(__ffs) +CREATE_WRAPPER(__fls) +CREATE_WRAPPER(ffz) +#undef CREATE_WRAPPER + +/* + * Make sure that __attribute_const__ has be applied to all the + * functions. 
This is a regression test for: + * https://github.com/KSPP/linux/issues/364 + */ +static void ffs_attribute_const_test(struct kunit *test) +{ + KUNIT_EXPECT_TRUE(test, build_test_ffs()); + KUNIT_EXPECT_TRUE(test, build_test_fls()); + KUNIT_EXPECT_TRUE(test, build_test___ffs()); + KUNIT_EXPECT_TRUE(test, build_test___fls()); + KUNIT_EXPECT_TRUE(test, build_test_ffz()); +} + +/* + * KUnit test case definitions + */ +static struct kunit_case ffs_test_cases[] = { + KUNIT_CASE(ffs_basic_correctness_test), + KUNIT_CASE(ffs64_correctness_test), + KUNIT_CASE(ffs_mathematical_relationships_test), + KUNIT_CASE(ffs_edge_cases_test), + KUNIT_CASE(ffs64_edge_cases_test), + KUNIT_CASE(ffz_basic_correctness_test), + KUNIT_CASE(ffz_mathematical_relationships_test), + KUNIT_CASE(ffz_edge_cases_test), + KUNIT_CASE(ffs_attribute_const_test), + {} +}; + +/* + * KUnit test suite definition + */ +static struct kunit_suite ffs_test_suite = { + .name = "ffs", + .test_cases = ffs_test_cases, +}; + +kunit_test_suites(&ffs_test_suite); + +MODULE_DESCRIPTION("KUnit tests for ffs()-family functions"); +MODULE_LICENSE("GPL"); |
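The ffs_kunit.c tests above are built around the 1-based versus 0-based bit-scan identities ffs(x) == __ffs(x) + 1 and fls(x) == __fls(x) + 1 for non-zero x, plus ffz() returning the lowest clear bit. A minimal spot-check of the same invariants (hypothetical helper, not part of the diff):

#include <linux/bitops.h>
#include <linux/bug.h>

/* Hypothetical helper, not part of the diff. */
static void ffs_identity_spot_check(void)
{
	unsigned long x = 0x40000002UL;		/* bits 1 and 30 set */

	/* ffs()/fls() are 1-based; __ffs()/__fls() are 0-based. */
	BUG_ON(ffs(x) != __ffs(x) + 1);		/* 2 == 1 + 1 */
	BUG_ON(fls(x) != __fls(x) + 1);		/* 31 == 30 + 1 */

	/* ffz() finds the lowest clear bit, here bit 0. */
	BUG_ON(ffz(x) != 0);
}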
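A usage sketch for the new two-message interface added above in lib/crypto/sha256.c: sha256_finup_2x() finishes two equal-length messages from a shared prefix context in a single call, using the interleaved SHA-NI / ARMv8 Crypto Extensions assembly when available and the sequential fallback otherwise. The function names and context type come from the diff; the caller, buffer names, and lengths are illustrative assumptions only.

#include <crypto/sha2.h>

/* Hypothetical caller, not part of the diff. */
static void example_hash_pair(const u8 *salt, size_t salt_len,
			      const u8 *blk_a, const u8 *blk_b, size_t blk_len,
			      u8 digest_a[SHA256_DIGEST_SIZE],
			      u8 digest_b[SHA256_DIGEST_SIZE])
{
	struct sha256_ctx ctx;

	/* Absorb the shared prefix once. */
	sha256_init(&ctx);
	sha256_update(&ctx, salt, salt_len);

	/*
	 * Finish both messages from the same context; ctx is left
	 * unmodified. The interleaved assembly path is taken only when
	 * SHA256_BLOCK_SIZE <= blk_len <= 65536 and SIMD is usable;
	 * otherwise the sequential fallback produces the same digests.
	 */
	sha256_finup_2x(&ctx, blk_a, blk_b, blk_len, digest_a, digest_b);
}

Passing ctx == NULL instead hashes both messages from the standard SHA-256 initial state, and sha256_finup_2x_is_optimized() reports whether the interleaved path is available at all.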