Diffstat (limited to 'lib')
-rw-r--r--  lib/Kconfig.debug               |  16
-rw-r--r--  lib/clz_ctz.c                   |   8
-rw-r--r--  lib/crypto/arm64/sha256-ce.S    | 284
-rw-r--r--  lib/crypto/arm64/sha256.h       |  37
-rw-r--r--  lib/crypto/sha256.c             |  71
-rw-r--r--  lib/crypto/tests/sha256_kunit.c | 184
-rw-r--r--  lib/crypto/x86/sha256-ni-asm.S  | 368
-rw-r--r--  lib/crypto/x86/sha256.h         |  39
-rw-r--r--  lib/raid6/recov_rvv.c           |   2
-rw-r--r--  lib/raid6/rvv.c                 |  63
-rw-r--r--  lib/tests/Makefile              |   1
-rw-r--r--  lib/tests/ffs_kunit.c           | 566
12 files changed, 1588 insertions, 51 deletions
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index dc0e0c6ed075..24939b8553e6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2479,6 +2479,20 @@ config STRING_HELPERS_KUNIT_TEST
depends on KUNIT
default KUNIT_ALL_TESTS
+config FFS_KUNIT_TEST
+ tristate "KUnit test ffs-family functions at runtime" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ This builds KUnit tests for ffs-family bit manipulation functions
+ including ffs(), __ffs(), fls(), __fls(), fls64(), and __ffs64().
+
+ These tests validate mathematical correctness, edge case handling,
+ and cross-architecture consistency of bit scanning functions.
+
+ For more information on KUnit and unit tests in general,
+ please refer to Documentation/dev-tools/kunit/.
+
config TEST_KSTRTOX
tristate "Test kstrto*() family of functions at runtime"
@@ -2894,7 +2908,7 @@ config FORTIFY_KUNIT_TEST
config LONGEST_SYM_KUNIT_TEST
tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS
depends on KUNIT && KPROBES
- depends on !PREFIX_SYMBOLS && !CFI_CLANG && !GCOV_KERNEL
+ depends on !PREFIX_SYMBOLS && !CFI && !GCOV_KERNEL
default KUNIT_ALL_TESTS
help
Tests the longest symbol possible
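
For context on the FFS_KUNIT_TEST entry added above, the conventions the new test validates can be summarized in a short sketch (illustrative only, not part of the patch; the helpers come from <linux/bitops.h>):

    #include <linux/bug.h>
    #include <linux/bitops.h>

    static void ffs_family_conventions(void)
    {
            unsigned long x = 0x90; /* binary 1001 0000: bits 4 and 7 set */

            WARN_ON(ffs(x) != 5);           /* lowest set bit, 1-based  */
            WARN_ON(__ffs(x) != 4);         /* lowest set bit, 0-based  */
            WARN_ON(fls(x) != 8);           /* highest set bit, 1-based */
            WARN_ON(__fls(x) != 7);         /* highest set bit, 0-based */

            /* Only the 1-based variants define the zero case. */
            WARN_ON(ffs(0) != 0);
            WARN_ON(fls(0) != 0);
    }
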
diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c
index fb8c0c5c2bd2..8778ec44bf63 100644
--- a/lib/clz_ctz.c
+++ b/lib/clz_ctz.c
@@ -15,28 +15,28 @@
#include <linux/kernel.h>
int __weak __ctzsi2(int val);
-int __weak __ctzsi2(int val)
+int __weak __attribute_const__ __ctzsi2(int val)
{
return __ffs(val);
}
EXPORT_SYMBOL(__ctzsi2);
int __weak __clzsi2(int val);
-int __weak __clzsi2(int val)
+int __weak __attribute_const__ __clzsi2(int val)
{
return 32 - fls(val);
}
EXPORT_SYMBOL(__clzsi2);
int __weak __clzdi2(u64 val);
-int __weak __clzdi2(u64 val)
+int __weak __attribute_const__ __clzdi2(u64 val)
{
return 64 - fls64(val);
}
EXPORT_SYMBOL(__clzdi2);
int __weak __ctzdi2(u64 val);
-int __weak __ctzdi2(u64 val)
+int __weak __attribute_const__ __ctzdi2(u64 val)
{
return __ffs64(val);
}
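
The identities these weak helpers rely on, shown as a minimal sketch (illustrative only): count-trailing-zeros is __ffs()/__ffs64(), and count-leading-zeros is the word width minus fls()/fls64().

    #include <linux/bug.h>
    #include <linux/bitops.h>

    static void clz_ctz_identities(void)
    {
            /* 0x10 has only bit 4 set: 4 trailing zeros, 32 - 5 = 27 leading zeros. */
            WARN_ON(__ffs(0x10) != 4);              /* __ctzsi2(0x10) */
            WARN_ON(32 - fls(0x10) != 27);          /* __clzsi2(0x10) */

            /* The same pattern holds for the 64-bit variants. */
            WARN_ON(__ffs64(1ULL << 40) != 40);     /* __ctzdi2 */
            WARN_ON(64 - fls64(1ULL << 40) != 23);  /* __clzdi2: 64 - 41 */
    }
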
diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S
index b99d9589c421..410174ba5237 100644
--- a/lib/crypto/arm64/sha256-ce.S
+++ b/lib/crypto/arm64/sha256-ce.S
@@ -70,18 +70,22 @@
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+ .macro load_round_constants tmp
+ adr_l \tmp, .Lsha2_rcon
+ ld1 { v0.4s- v3.4s}, [\tmp], #64
+ ld1 { v4.4s- v7.4s}, [\tmp], #64
+ ld1 { v8.4s-v11.4s}, [\tmp], #64
+ ld1 {v12.4s-v15.4s}, [\tmp]
+ .endm
+
/*
* size_t __sha256_ce_transform(struct sha256_block_state *state,
* const u8 *data, size_t nblocks);
*/
.text
SYM_FUNC_START(__sha256_ce_transform)
- /* load round constants */
- adr_l x8, .Lsha2_rcon
- ld1 { v0.4s- v3.4s}, [x8], #64
- ld1 { v4.4s- v7.4s}, [x8], #64
- ld1 { v8.4s-v11.4s}, [x8], #64
- ld1 {v12.4s-v15.4s}, [x8]
+
+ load_round_constants x8
/* load state */
ld1 {dgav.4s, dgbv.4s}, [x0]
@@ -134,3 +138,271 @@ CPU_LE( rev32 v19.16b, v19.16b )
mov x0, x2
ret
SYM_FUNC_END(__sha256_ce_transform)
+
+ .unreq dga
+ .unreq dgav
+ .unreq dgb
+ .unreq dgbv
+ .unreq t0
+ .unreq t1
+ .unreq dg0q
+ .unreq dg0v
+ .unreq dg1q
+ .unreq dg1v
+ .unreq dg2q
+ .unreq dg2v
+
+ // parameters for sha256_ce_finup2x()
+ ctx .req x0
+ data1 .req x1
+ data2 .req x2
+ len .req w3
+ out1 .req x4
+ out2 .req x5
+
+ // other scalar variables
+ count .req x6
+ final_step .req w7
+
+ // x8-x9 are used as temporaries.
+
+ // v0-v15 are used to cache the SHA-256 round constants.
+ // v16-v19 are used for the message schedule for the first message.
+ // v20-v23 are used for the message schedule for the second message.
+ // v24-v31 are used for the state and temporaries as given below.
+ // *_a are for the first message and *_b for the second.
+ state0_a_q .req q24
+ state0_a .req v24
+ state1_a_q .req q25
+ state1_a .req v25
+ state0_b_q .req q26
+ state0_b .req v26
+ state1_b_q .req q27
+ state1_b .req v27
+ t0_a .req v28
+ t0_b .req v29
+ t1_a_q .req q30
+ t1_a .req v30
+ t1_b_q .req q31
+ t1_b .req v31
+
+#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
+// offsetof(struct __sha256_ctx, state) is assumed to be 0.
+
+ // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
+ // and m0_b contain the current 4 message schedule words for the first
+ // and second message respectively.
+ //
+ // If not all the message schedule words have been computed yet, then
+ // this also computes 4 more message schedule words for each message.
+ // m1_a-m3_a contain the next 3 groups of 4 message schedule words for
+ // the first message, and likewise m1_b-m3_b for the second. After
+ // consuming the current value of m0_a, this macro computes the group
+ // after m3_a and writes it to m0_a, and likewise for *_b. This means
+ // that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
+ // m3_a, m0_a), and likewise for *_b, so the caller must cycle through
+ // the registers accordingly.
+ .macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \
+ m0_b, m1_b, m2_b, m3_b
+ add t0_a\().4s, \m0_a\().4s, \k\().4s
+ add t0_b\().4s, \m0_b\().4s, \k\().4s
+ .if \i < 48
+ sha256su0 \m0_a\().4s, \m1_a\().4s
+ sha256su0 \m0_b\().4s, \m1_b\().4s
+ sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
+ sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
+ .endif
+ mov t1_a.16b, state0_a.16b
+ mov t1_b.16b, state0_b.16b
+ sha256h state0_a_q, state1_a_q, t0_a\().4s
+ sha256h state0_b_q, state1_b_q, t0_b\().4s
+ sha256h2 state1_a_q, t1_a_q, t0_a\().4s
+ sha256h2 state1_b_q, t1_b_q, t0_b\().4s
+ .endm
+
+ .macro do_16rounds_2x i, k0, k1, k2, k3
+ do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
+ do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
+ do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
+ do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
+ .endm
+
+//
+// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+// const u8 *data1, const u8 *data2, int len,
+// u8 out1[SHA256_DIGEST_SIZE],
+// u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved. On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ce_finup2x)
+ sub sp, sp, #128
+ mov final_step, #0
+ load_round_constants x8
+
+ // Load the initial state from ctx->state.
+ ld1 {state0_a.4s-state1_a.4s}, [ctx]
+
+ // Load ctx->bytecount. Take the mod 64 of it to get the number of
+ // bytes that are buffered in ctx->buf. Also save it in a register with
+ // len added to it.
+ ldr x8, [ctx, #OFFSETOF_BYTECOUNT]
+ add count, x8, len, sxtw
+ and x8, x8, #63
+ cbz x8, .Lfinup2x_enter_loop // No bytes buffered?
+
+ // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
+ // followed by the first 64 - x8 bytes of data. Since len >= 64, we
+ // just load 64 bytes from each of ctx->buf, data1, and data2
+ // unconditionally and rearrange the data as needed.
+ add x9, ctx, #OFFSETOF_BUF
+ ld1 {v16.16b-v19.16b}, [x9]
+ st1 {v16.16b-v19.16b}, [sp]
+
+ ld1 {v16.16b-v19.16b}, [data1], #64
+ add x9, sp, x8
+ st1 {v16.16b-v19.16b}, [x9]
+ ld1 {v16.4s-v19.4s}, [sp]
+
+ ld1 {v20.16b-v23.16b}, [data2], #64
+ st1 {v20.16b-v23.16b}, [x9]
+ ld1 {v20.4s-v23.4s}, [sp]
+
+ sub len, len, #64
+ sub data1, data1, x8
+ sub data2, data2, x8
+ add len, len, w8
+ mov state0_b.16b, state0_a.16b
+ mov state1_b.16b, state1_a.16b
+ b .Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+ sub len, len, #64
+ mov state0_b.16b, state0_a.16b
+ mov state1_b.16b, state1_a.16b
+.Lfinup2x_loop:
+ // Load the next two data blocks.
+ ld1 {v16.4s-v19.4s}, [data1], #64
+ ld1 {v20.4s-v23.4s}, [data2], #64
+.Lfinup2x_loop_have_data:
+ // Convert the words of the data blocks from big endian.
+CPU_LE( rev32 v16.16b, v16.16b )
+CPU_LE( rev32 v17.16b, v17.16b )
+CPU_LE( rev32 v18.16b, v18.16b )
+CPU_LE( rev32 v19.16b, v19.16b )
+CPU_LE( rev32 v20.16b, v20.16b )
+CPU_LE( rev32 v21.16b, v21.16b )
+CPU_LE( rev32 v22.16b, v22.16b )
+CPU_LE( rev32 v23.16b, v23.16b )
+.Lfinup2x_loop_have_bswapped_data:
+
+ // Save the original state for each block.
+ st1 {state0_a.4s-state1_b.4s}, [sp]
+
+ // Do the SHA-256 rounds on each block.
+ do_16rounds_2x 0, v0, v1, v2, v3
+ do_16rounds_2x 16, v4, v5, v6, v7
+ do_16rounds_2x 32, v8, v9, v10, v11
+ do_16rounds_2x 48, v12, v13, v14, v15
+
+ // Add the original state for each block.
+ ld1 {v16.4s-v19.4s}, [sp]
+ add state0_a.4s, state0_a.4s, v16.4s
+ add state1_a.4s, state1_a.4s, v17.4s
+ add state0_b.4s, state0_b.4s, v18.4s
+ add state1_b.4s, state1_b.4s, v19.4s
+
+ // Update len and loop back if more blocks remain.
+ sub len, len, #64
+ tbz len, #31, .Lfinup2x_loop // len >= 0?
+
+ // Check if any final blocks need to be handled.
+ // final_step = 2: all done
+ // final_step = 1: need to do count-only padding block
+ // final_step = 0: need to do the block with 0x80 padding byte
+ tbnz final_step, #1, .Lfinup2x_done
+ tbnz final_step, #0, .Lfinup2x_finalize_countonly
+ add len, len, #64
+ cbz len, .Lfinup2x_finalize_blockaligned
+
+ // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
+ // To do this, write the padding starting with the 0x80 byte to
+ // &sp[64]. Then for each message, copy the last 64 data bytes to sp
+ // and load from &sp[64 - len] to get the needed padding block. This
+ // code relies on the data buffers being >= 64 bytes in length.
+ sub w8, len, #64 // w8 = len - 64
+ add data1, data1, w8, sxtw // data1 += len - 64
+ add data2, data2, w8, sxtw // data2 += len - 64
+CPU_LE( mov x9, #0x80 )
+CPU_LE( fmov d16, x9 )
+CPU_BE( movi v16.16b, #0 )
+CPU_BE( mov x9, #0x8000000000000000 )
+CPU_BE( mov v16.d[1], x9 )
+ movi v17.16b, #0
+ stp q16, q17, [sp, #64]
+ stp q17, q17, [sp, #96]
+ sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
+ cmp len, #56
+ b.ge 1f // will count spill into its own block?
+ lsl count, count, #3
+CPU_LE( rev count, count )
+ str count, [x9, #56]
+ mov final_step, #2 // won't need count-only block
+ b 2f
+1:
+ mov final_step, #1 // will need count-only block
+2:
+ ld1 {v16.16b-v19.16b}, [data1]
+ st1 {v16.16b-v19.16b}, [sp]
+ ld1 {v16.4s-v19.4s}, [x9]
+ ld1 {v20.16b-v23.16b}, [data2]
+ st1 {v20.16b-v23.16b}, [sp]
+ ld1 {v20.4s-v23.4s}, [x9]
+ b .Lfinup2x_loop_have_data
+
+ // Prepare a padding block, either:
+ //
+ // {0x80, 0, 0, 0, ..., count (as __be64)}
+ // This is for a block aligned message.
+ //
+ // { 0, 0, 0, 0, ..., count (as __be64)}
+ // This is for a message whose length mod 64 is >= 56.
+ //
+ // Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+ movi v16.2d, #0
+ b 1f
+.Lfinup2x_finalize_blockaligned:
+ mov x8, #0x80000000
+ fmov d16, x8
+1:
+ movi v17.2d, #0
+ movi v18.2d, #0
+ ror count, count, #29 // ror(lsl(count, 3), 32)
+ mov v19.d[0], xzr
+ mov v19.d[1], count
+ mov v20.16b, v16.16b
+ movi v21.2d, #0
+ movi v22.2d, #0
+ mov v23.16b, v19.16b
+ mov final_step, #2
+ b .Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+ // Write the two digests with all bytes in the correct order.
+CPU_LE( rev32 state0_a.16b, state0_a.16b )
+CPU_LE( rev32 state1_a.16b, state1_a.16b )
+CPU_LE( rev32 state0_b.16b, state0_b.16b )
+CPU_LE( rev32 state1_b.16b, state1_b.16b )
+ st1 {state0_a.4s-state1_a.4s}, [out1]
+ st1 {state0_b.4s-state1_b.4s}, [out2]
+ add sp, sp, #128
+ ret
+SYM_FUNC_END(sha256_ce_finup2x)
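
For reference, the padding that the .Lfinup2x finalization paths above assemble is the standard SHA-256 scheme. Below is a hedged C sketch of the single-block case, assuming bytecount % 64 <= 55 so the 0x80 byte and the length both fit; otherwise, as the comments above note, a count-only block follows. It is illustrative only, not how the assembly builds the block in registers.

    #include <linux/string.h>
    #include <linux/types.h>
    #include <asm/byteorder.h>

    static void sha256_pad_last_block(u8 block[64], size_t used, u64 total_bytes)
    {
            __be64 bitcount = cpu_to_be64(total_bytes << 3);

            block[used] = 0x80;                     /* mandatory 1 bit, then zeros */
            memset(&block[used + 1], 0, 64 - 8 - (used + 1));
            memcpy(&block[56], &bitcount, sizeof(bitcount)); /* length in bits, big endian */
    }
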
diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h
index be4aeda9d0e6..80d06df27d3a 100644
--- a/lib/crypto/arm64/sha256.h
+++ b/lib/crypto/arm64/sha256.h
@@ -44,6 +44,43 @@ static void sha256_blocks(struct sha256_block_state *state,
}
}
+static_assert(offsetof(struct __sha256_ctx, state) == 0);
+static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
+static_assert(offsetof(struct __sha256_ctx, buf) == 40);
+asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, int len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE]);
+
+#define sha256_finup_2x_arch sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ /*
+ * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
+ * Further limit len to 65536 to avoid spending too long with preemption
+ * disabled. (Of course, in practice len is nearly always 4096 anyway.)
+ */
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
+ len <= 65536 && likely(may_use_simd())) {
+ kernel_neon_begin();
+ sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
+ kernel_neon_end();
+ kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
+ kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
+ return true;
+ }
+ return false;
+}
+
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return static_key_enabled(&have_ce);
+}
+
#ifdef CONFIG_KERNEL_MODE_NEON
#define sha256_mod_init_arch sha256_mod_init_arch
static void sha256_mod_init_arch(void)
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 8fa15165d23e..881b935418ce 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -25,13 +25,20 @@ static const struct sha256_block_state sha224_iv = {
},
};
-static const struct sha256_block_state sha256_iv = {
- .h = {
- SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
- SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+static const struct sha256_ctx initial_sha256_ctx = {
+ .ctx = {
+ .state = {
+ .h = {
+ SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
+ SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+ },
+ },
+ .bytecount = 0,
},
};
+#define sha256_iv (initial_sha256_ctx.ctx.state)
+
static const u32 sha256_K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -261,8 +268,62 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
}
EXPORT_SYMBOL(sha256);
-/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */
+/*
+ * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined)
+ * doesn't need either HMAC support or interleaved hashing support
+ */
#ifndef __DISABLE_EXPORTS
+
+#ifndef sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ return false;
+}
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return false;
+}
+#endif
+
+/* Sequential fallback implementation of sha256_finup_2x() */
+static noinline_for_stack void sha256_finup_2x_sequential(
+ const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2,
+ size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE])
+{
+ struct __sha256_ctx mut_ctx;
+
+ mut_ctx = *ctx;
+ __sha256_update(&mut_ctx, data1, len);
+ __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE);
+
+ mut_ctx = *ctx;
+ __sha256_update(&mut_ctx, data2, len);
+ __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE);
+}
+
+void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
+ const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ if (ctx == NULL)
+ ctx = &initial_sha256_ctx;
+
+ if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1,
+ out2)))
+ return;
+ sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2);
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x);
+
+bool sha256_finup_2x_is_optimized(void)
+{
+ return sha256_finup_2x_is_optimized_arch();
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized);
+
static void __hmac_sha256_preparekey(struct sha256_block_state *istate,
struct sha256_block_state *ostate,
const u8 *raw_key, size_t raw_key_len,
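
A usage sketch of the new sha256_finup_2x() API (the caller and buffer names here are hypothetical, chosen only to illustrate the calling convention established by this patch):

    #include <crypto/sha2.h>

    static void hash_two_blocks(const u8 *salt, size_t salt_len,
                                const u8 *blk_a, const u8 *blk_b, size_t len)
    {
            struct sha256_ctx ctx;
            u8 digest_a[SHA256_DIGEST_SIZE];
            u8 digest_b[SHA256_DIGEST_SIZE];

            /* Hash a common salted prefix once; both messages share it. */
            sha256_init(&ctx);
            sha256_update(&ctx, salt, salt_len);

            /* Finish both equal-length messages; ctx is not modified. */
            sha256_finup_2x(&ctx, blk_a, blk_b, len, digest_a, digest_b);

            /* Without a prefix, ctx may be NULL to start from the standard IV. */
            sha256_finup_2x(NULL, blk_a, blk_b, len, digest_a, digest_b);
    }
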
diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c
index 1cd4caee6010..dcedfca06df6 100644
--- a/lib/crypto/tests/sha256_kunit.c
+++ b/lib/crypto/tests/sha256_kunit.c
@@ -5,6 +5,7 @@
#include <crypto/sha2.h>
#include "sha256-testvecs.h"
+/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
#define HASH sha256
#define HASH_CTX sha256_ctx
#define HASH_SIZE SHA256_DIGEST_SIZE
@@ -21,9 +22,192 @@
#define HMAC_USINGRAWKEY hmac_sha256_usingrawkey
#include "hash-test-template.h"
+static void free_guarded_buf(void *buf)
+{
+ vfree(buf);
+}
+
+/*
+ * Allocate a KUnit-managed buffer that has length @len bytes immediately
+ * followed by an unmapped page, and assert that the allocation succeeds.
+ */
+static void *alloc_guarded_buf(struct kunit *test, size_t len)
+{
+ size_t full_len = round_up(len, PAGE_SIZE);
+ void *buf = vmalloc(full_len);
+
+ KUNIT_ASSERT_NOT_NULL(test, buf);
+ KUNIT_ASSERT_EQ(test, 0,
+ kunit_add_action_or_reset(test, free_guarded_buf, buf));
+ return buf + full_len - len;
+}
+
+/*
+ * Test for sha256_finup_2x(). Specifically, choose various data lengths and
+ * salt lengths, and for each one, verify that sha256_finup_2x() produces the
+ * same results as sha256_update() and sha256_final().
+ *
+ * Use guarded buffers for all inputs and outputs to reliably detect any
+ * out-of-bounds reads or writes, even if they occur in assembly code.
+ */
+static void test_sha256_finup_2x(struct kunit *test)
+{
+ const size_t max_data_len = 16384;
+ u8 *data1_buf, *data2_buf, *hash1, *hash2;
+ u8 expected_hash1[SHA256_DIGEST_SIZE];
+ u8 expected_hash2[SHA256_DIGEST_SIZE];
+ u8 salt[SHA256_BLOCK_SIZE];
+ struct sha256_ctx *ctx;
+
+ data1_buf = alloc_guarded_buf(test, max_data_len);
+ data2_buf = alloc_guarded_buf(test, max_data_len);
+ hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
+ hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
+ ctx = alloc_guarded_buf(test, sizeof(*ctx));
+
+ rand_bytes(data1_buf, max_data_len);
+ rand_bytes(data2_buf, max_data_len);
+ rand_bytes(salt, sizeof(salt));
+
+ for (size_t i = 0; i < 500; i++) {
+ size_t salt_len = rand_length(sizeof(salt));
+ size_t data_len = rand_length(max_data_len);
+ const u8 *data1 = data1_buf + max_data_len - data_len;
+ const u8 *data2 = data2_buf + max_data_len - data_len;
+ struct sha256_ctx orig_ctx;
+
+ sha256_init(ctx);
+ sha256_update(ctx, salt, salt_len);
+ orig_ctx = *ctx;
+
+ sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, ctx, &orig_ctx, sizeof(*ctx),
+ "sha256_finup_2x() modified its ctx argument");
+
+ sha256_update(ctx, data1, data_len);
+ sha256_final(ctx, expected_hash1);
+ sha256_update(&orig_ctx, data2, data_len);
+ sha256_final(&orig_ctx, expected_hash2);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash1, expected_hash1, SHA256_DIGEST_SIZE,
+ "Wrong hash1 with salt_len=%zu data_len=%zu", salt_len,
+ data_len);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash2, expected_hash2, SHA256_DIGEST_SIZE,
+ "Wrong hash2 with salt_len=%zu data_len=%zu", salt_len,
+ data_len);
+ }
+}
+
+/* Test sha256_finup_2x() with ctx == NULL */
+static void test_sha256_finup_2x_defaultctx(struct kunit *test)
+{
+ const size_t data_len = 128;
+ struct sha256_ctx ctx;
+ u8 hash1_a[SHA256_DIGEST_SIZE];
+ u8 hash2_a[SHA256_DIGEST_SIZE];
+ u8 hash1_b[SHA256_DIGEST_SIZE];
+ u8 hash2_b[SHA256_DIGEST_SIZE];
+
+ rand_bytes(test_buf, 2 * data_len);
+
+ sha256_init(&ctx);
+ sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a,
+ hash2_a);
+
+ sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b,
+ hash2_b);
+
+ KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE);
+ KUNIT_ASSERT_MEMEQ(test, hash2_a, hash2_b, SHA256_DIGEST_SIZE);
+}
+
+/*
+ * Test that sha256_finup_2x() and sha256_update/final() produce consistent
+ * results with total message lengths that require more than 32 bits.
+ */
+static void test_sha256_finup_2x_hugelen(struct kunit *test)
+{
+ const size_t data_len = 4 * SHA256_BLOCK_SIZE;
+ struct sha256_ctx ctx = {};
+ u8 expected_hash[SHA256_DIGEST_SIZE];
+ u8 hash[SHA256_DIGEST_SIZE];
+
+ rand_bytes(test_buf, data_len);
+ for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) {
+ sha256_init(&ctx);
+ ctx.ctx.bytecount = 0x123456789abcd00 + align;
+
+ sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash);
+
+ sha256_update(&ctx, test_buf, data_len);
+ sha256_final(&ctx, expected_hash);
+
+ KUNIT_ASSERT_MEMEQ(test, hash, expected_hash,
+ SHA256_DIGEST_SIZE);
+ }
+}
+
+/* Benchmark for sha256_finup_2x() */
+static void benchmark_sha256_finup_2x(struct kunit *test)
+{
+ /*
+ * Try a few different salt lengths, since sha256_finup_2x() performance
+ * may vary slightly for the same data_len depending on how many bytes
+ * were already processed in the initial context.
+ */
+ static const size_t salt_lens_to_test[] = { 0, 32, 64 };
+ const size_t data_len = 4096;
+ const size_t num_iters = 4096;
+ struct sha256_ctx ctx;
+ u8 hash1[SHA256_DIGEST_SIZE];
+ u8 hash2[SHA256_DIGEST_SIZE];
+
+ if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
+ kunit_skip(test, "not enabled");
+ if (!sha256_finup_2x_is_optimized())
+ kunit_skip(test, "not relevant");
+
+ rand_bytes(test_buf, data_len * 2);
+
+ /* Warm-up */
+ for (size_t i = 0; i < num_iters; i++)
+ sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len],
+ data_len, hash1, hash2);
+
+ for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) {
+ size_t salt_len = salt_lens_to_test[i];
+ u64 t0, t1;
+
+ /*
+ * Prepare the initial context. The time to process the salt is
+ * not measured; we're just interested in sha256_finup_2x().
+ */
+ sha256_init(&ctx);
+ sha256_update(&ctx, test_buf, salt_len);
+
+ preempt_disable();
+ t0 = ktime_get_ns();
+ for (size_t j = 0; j < num_iters; j++)
+ sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len],
+ data_len, hash1, hash2);
+ t1 = ktime_get_ns();
+ preempt_enable();
+ kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s",
+ data_len, salt_len,
+ div64_u64((u64)data_len * 2 * num_iters * 1000,
+ t1 - t0 ?: 1));
+ }
+}
+
static struct kunit_case hash_test_cases[] = {
HASH_KUNIT_CASES,
+ KUNIT_CASE(test_sha256_finup_2x),
+ KUNIT_CASE(test_sha256_finup_2x_defaultctx),
+ KUNIT_CASE(test_sha256_finup_2x_hugelen),
KUNIT_CASE(benchmark_hash),
+ KUNIT_CASE(benchmark_sha256_finup_2x),
{},
};
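
A note on the figure benchmark_sha256_finup_2x() prints: multiplying the bytes hashed by 1000 and dividing by the elapsed nanoseconds yields MB/s with 1 MB = 10^6 bytes. As a worked example, each measured loop hashes 2 * 4096 * 4096 = 33,554,432 bytes; if that takes 10 ms (10^7 ns), the reported rate is 33,554,432 * 1000 / 10^7 ≈ 3355 MB/s.
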
diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S
index 4bd9490ffc66..de5f707e7ef7 100644
--- a/lib/crypto/x86/sha256-ni-asm.S
+++ b/lib/crypto/x86/sha256-ni-asm.S
@@ -165,6 +165,374 @@ SYM_FUNC_START(sha256_ni_transform)
RET
SYM_FUNC_END(sha256_ni_transform)
+#undef DIGEST_PTR
+#undef DATA_PTR
+#undef NUM_BLKS
+#undef SHA256CONSTANTS
+#undef MSG
+#undef STATE0
+#undef STATE1
+#undef MSG0
+#undef MSG1
+#undef MSG2
+#undef MSG3
+#undef TMP
+#undef SHUF_MASK
+#undef ABEF_SAVE
+#undef CDGH_SAVE
+
+// parameters for sha256_ni_finup2x()
+#define CTX %rdi
+#define DATA1 %rsi
+#define DATA2 %rdx
+#define LEN %ecx
+#define LEN8 %cl
+#define LEN64 %rcx
+#define OUT1 %r8
+#define OUT2 %r9
+
+// other scalar variables
+#define SHA256CONSTANTS %rax
+#define COUNT %r10
+#define COUNT32 %r10d
+#define FINAL_STEP %r11d
+
+// rbx is used as a temporary.
+
+#define MSG %xmm0 // sha256rnds2 implicit operand
+#define STATE0_A %xmm1
+#define STATE1_A %xmm2
+#define STATE0_B %xmm3
+#define STATE1_B %xmm4
+#define TMP_A %xmm5
+#define TMP_B %xmm6
+#define MSG0_A %xmm7
+#define MSG1_A %xmm8
+#define MSG2_A %xmm9
+#define MSG3_A %xmm10
+#define MSG0_B %xmm11
+#define MSG1_B %xmm12
+#define MSG2_B %xmm13
+#define MSG3_B %xmm14
+#define SHUF_MASK %xmm15
+
+#define OFFSETOF_STATE 0 // offsetof(struct __sha256_ctx, state)
+#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
+
+// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a and m0_b
+// contain the current 4 message schedule words for the first and second message
+// respectively.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words for each message. m1_a-m3_a contain
+// the next 3 groups of 4 message schedule words for the first message, and
+// likewise m1_b-m3_b for the second. After consuming the current value of
+// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
+// likewise for *_b. This means that the next (m0_a, m1_a, m2_a, m3_a) is the
+// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
+// cycle through the registers accordingly.
+.macro do_4rounds_2x i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
+ movdqa (\i-32)*4(SHA256CONSTANTS), TMP_A
+ movdqa TMP_A, TMP_B
+ paddd \m0_a, TMP_A
+ paddd \m0_b, TMP_B
+.if \i < 48
+ sha256msg1 \m1_a, \m0_a
+ sha256msg1 \m1_b, \m0_b
+.endif
+ movdqa TMP_A, MSG
+ sha256rnds2 STATE0_A, STATE1_A
+ movdqa TMP_B, MSG
+ sha256rnds2 STATE0_B, STATE1_B
+ pshufd $0x0E, TMP_A, MSG
+ sha256rnds2 STATE1_A, STATE0_A
+ pshufd $0x0E, TMP_B, MSG
+ sha256rnds2 STATE1_B, STATE0_B
+.if \i < 48
+ movdqa \m3_a, TMP_A
+ movdqa \m3_b, TMP_B
+ palignr $4, \m2_a, TMP_A
+ palignr $4, \m2_b, TMP_B
+ paddd TMP_A, \m0_a
+ paddd TMP_B, \m0_b
+ sha256msg2 \m3_a, \m0_a
+ sha256msg2 \m3_b, \m0_b
+.endif
+.endm
+
+//
+// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
+// const u8 *data1, const u8 *data2, int len,
+// u8 out1[SHA256_DIGEST_SIZE],
+// u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved. On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ni_finup2x)
+ // Allocate 128 bytes of stack space, 16-byte aligned.
+ push %rbx
+ push %rbp
+ mov %rsp, %rbp
+ sub $128, %rsp
+ and $~15, %rsp
+
+ // Load the shuffle mask for swapping the endianness of 32-bit words.
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+ // Set up pointer to the round constants.
+ lea K256+32*4(%rip), SHA256CONSTANTS
+
+ // Initially we're not processing the final blocks.
+ xor FINAL_STEP, FINAL_STEP
+
+ // Load the initial state from ctx->state.
+ movdqu OFFSETOF_STATE+0*16(CTX), STATE0_A // DCBA
+ movdqu OFFSETOF_STATE+1*16(CTX), STATE1_A // HGFE
+ movdqa STATE0_A, TMP_A
+ punpcklqdq STATE1_A, STATE0_A // FEBA
+ punpckhqdq TMP_A, STATE1_A // DCHG
+ pshufd $0x1B, STATE0_A, STATE0_A // ABEF
+ pshufd $0xB1, STATE1_A, STATE1_A // CDGH
+
+ // Load ctx->bytecount. Take the mod 64 of it to get the number of
+ // bytes that are buffered in ctx->buf. Also save it in a register with
+ // LEN added to it.
+ mov LEN, LEN
+ mov OFFSETOF_BYTECOUNT(CTX), %rbx
+ lea (%rbx, LEN64, 1), COUNT
+ and $63, %ebx
+ jz .Lfinup2x_enter_loop // No bytes buffered?
+
+ // %ebx bytes (1 to 63) are currently buffered in ctx->buf. Load them
+ // followed by the first 64 - %ebx bytes of data. Since LEN >= 64, we
+ // just load 64 bytes from each of ctx->buf, DATA1, and DATA2
+ // unconditionally and rearrange the data as needed.
+
+ movdqu OFFSETOF_BUF+0*16(CTX), MSG0_A
+ movdqu OFFSETOF_BUF+1*16(CTX), MSG1_A
+ movdqu OFFSETOF_BUF+2*16(CTX), MSG2_A
+ movdqu OFFSETOF_BUF+3*16(CTX), MSG3_A
+ movdqa MSG0_A, 0*16(%rsp)
+ movdqa MSG1_A, 1*16(%rsp)
+ movdqa MSG2_A, 2*16(%rsp)
+ movdqa MSG3_A, 3*16(%rsp)
+
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 3*16(DATA1), MSG3_A
+ movdqu MSG0_A, 0*16(%rsp,%rbx)
+ movdqu MSG1_A, 1*16(%rsp,%rbx)
+ movdqu MSG2_A, 2*16(%rsp,%rbx)
+ movdqu MSG3_A, 3*16(%rsp,%rbx)
+ movdqa 0*16(%rsp), MSG0_A
+ movdqa 1*16(%rsp), MSG1_A
+ movdqa 2*16(%rsp), MSG2_A
+ movdqa 3*16(%rsp), MSG3_A
+
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA2), MSG3_B
+ movdqu MSG0_B, 0*16(%rsp,%rbx)
+ movdqu MSG1_B, 1*16(%rsp,%rbx)
+ movdqu MSG2_B, 2*16(%rsp,%rbx)
+ movdqu MSG3_B, 3*16(%rsp,%rbx)
+ movdqa 0*16(%rsp), MSG0_B
+ movdqa 1*16(%rsp), MSG1_B
+ movdqa 2*16(%rsp), MSG2_B
+ movdqa 3*16(%rsp), MSG3_B
+
+ sub $64, %rbx // rbx = buffered - 64
+ sub %rbx, DATA1 // DATA1 += 64 - buffered
+ sub %rbx, DATA2 // DATA2 += 64 - buffered
+ add %ebx, LEN // LEN += buffered - 64
+ movdqa STATE0_A, STATE0_B
+ movdqa STATE1_A, STATE1_B
+ jmp .Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+ sub $64, LEN
+ movdqa STATE0_A, STATE0_B
+ movdqa STATE1_A, STATE1_B
+.Lfinup2x_loop:
+ // Load the next two data blocks.
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA1), MSG3_A
+ movdqu 3*16(DATA2), MSG3_B
+ add $64, DATA1
+ add $64, DATA2
+.Lfinup2x_loop_have_data:
+ // Convert the words of the data blocks from big endian.
+ pshufb SHUF_MASK, MSG0_A
+ pshufb SHUF_MASK, MSG0_B
+ pshufb SHUF_MASK, MSG1_A
+ pshufb SHUF_MASK, MSG1_B
+ pshufb SHUF_MASK, MSG2_A
+ pshufb SHUF_MASK, MSG2_B
+ pshufb SHUF_MASK, MSG3_A
+ pshufb SHUF_MASK, MSG3_B
+.Lfinup2x_loop_have_bswapped_data:
+
+ // Save the original state for each block.
+ movdqa STATE0_A, 0*16(%rsp)
+ movdqa STATE0_B, 1*16(%rsp)
+ movdqa STATE1_A, 2*16(%rsp)
+ movdqa STATE1_B, 3*16(%rsp)
+
+ // Do the SHA-256 rounds on each block.
+.irp i, 0, 16, 32, 48
+ do_4rounds_2x (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
+ MSG0_B, MSG1_B, MSG2_B, MSG3_B
+ do_4rounds_2x (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
+ MSG1_B, MSG2_B, MSG3_B, MSG0_B
+ do_4rounds_2x (\i + 8), MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
+ MSG2_B, MSG3_B, MSG0_B, MSG1_B
+ do_4rounds_2x (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
+ MSG3_B, MSG0_B, MSG1_B, MSG2_B
+.endr
+
+ // Add the original state for each block.
+ paddd 0*16(%rsp), STATE0_A
+ paddd 1*16(%rsp), STATE0_B
+ paddd 2*16(%rsp), STATE1_A
+ paddd 3*16(%rsp), STATE1_B
+
+ // Update LEN and loop back if more blocks remain.
+ sub $64, LEN
+ jge .Lfinup2x_loop
+
+ // Check if any final blocks need to be handled.
+ // FINAL_STEP = 2: all done
+ // FINAL_STEP = 1: need to do count-only padding block
+ // FINAL_STEP = 0: need to do the block with 0x80 padding byte
+ cmp $1, FINAL_STEP
+ jg .Lfinup2x_done
+ je .Lfinup2x_finalize_countonly
+ add $64, LEN
+ jz .Lfinup2x_finalize_blockaligned
+
+ // Not block-aligned; 1 <= LEN <= 63 data bytes remain. Pad the block.
+ // To do this, write the padding starting with the 0x80 byte to
+ // &sp[64]. Then for each message, copy the last 64 data bytes to sp
+ // and load from &sp[64 - LEN] to get the needed padding block. This
+ // code relies on the data buffers being >= 64 bytes in length.
+ mov $64, %ebx
+ sub LEN, %ebx // ebx = 64 - LEN
+ sub %rbx, DATA1 // DATA1 -= 64 - LEN
+ sub %rbx, DATA2 // DATA2 -= 64 - LEN
+ mov $0x80, FINAL_STEP // using FINAL_STEP as a temporary
+ movd FINAL_STEP, MSG0_A
+ pxor MSG1_A, MSG1_A
+ movdqa MSG0_A, 4*16(%rsp)
+ movdqa MSG1_A, 5*16(%rsp)
+ movdqa MSG1_A, 6*16(%rsp)
+ movdqa MSG1_A, 7*16(%rsp)
+ cmp $56, LEN
+ jge 1f // will COUNT spill into its own block?
+ shl $3, COUNT
+ bswap COUNT
+ mov COUNT, 56(%rsp,%rbx)
+ mov $2, FINAL_STEP // won't need count-only block
+ jmp 2f
+1:
+ mov $1, FINAL_STEP // will need count-only block
+2:
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 3*16(DATA1), MSG3_A
+ movdqa MSG0_A, 0*16(%rsp)
+ movdqa MSG1_A, 1*16(%rsp)
+ movdqa MSG2_A, 2*16(%rsp)
+ movdqa MSG3_A, 3*16(%rsp)
+ movdqu 0*16(%rsp,%rbx), MSG0_A
+ movdqu 1*16(%rsp,%rbx), MSG1_A
+ movdqu 2*16(%rsp,%rbx), MSG2_A
+ movdqu 3*16(%rsp,%rbx), MSG3_A
+
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA2), MSG3_B
+ movdqa MSG0_B, 0*16(%rsp)
+ movdqa MSG1_B, 1*16(%rsp)
+ movdqa MSG2_B, 2*16(%rsp)
+ movdqa MSG3_B, 3*16(%rsp)
+ movdqu 0*16(%rsp,%rbx), MSG0_B
+ movdqu 1*16(%rsp,%rbx), MSG1_B
+ movdqu 2*16(%rsp,%rbx), MSG2_B
+ movdqu 3*16(%rsp,%rbx), MSG3_B
+ jmp .Lfinup2x_loop_have_data
+
+ // Prepare a padding block, either:
+ //
+ // {0x80, 0, 0, 0, ..., count (as __be64)}
+ // This is for a block aligned message.
+ //
+ // { 0, 0, 0, 0, ..., count (as __be64)}
+ // This is for a message whose length mod 64 is >= 56.
+ //
+ // Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+ pxor MSG0_A, MSG0_A
+ jmp 1f
+
+.Lfinup2x_finalize_blockaligned:
+ mov $0x80000000, %ebx
+ movd %ebx, MSG0_A
+1:
+ pxor MSG1_A, MSG1_A
+ pxor MSG2_A, MSG2_A
+ ror $29, COUNT
+ movq COUNT, MSG3_A
+ pslldq $8, MSG3_A
+ movdqa MSG0_A, MSG0_B
+ pxor MSG1_B, MSG1_B
+ pxor MSG2_B, MSG2_B
+ movdqa MSG3_A, MSG3_B
+ mov $2, FINAL_STEP
+ jmp .Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+ // Write the two digests with all bytes in the correct order.
+ movdqa STATE0_A, TMP_A
+ movdqa STATE0_B, TMP_B
+ punpcklqdq STATE1_A, STATE0_A // GHEF
+ punpcklqdq STATE1_B, STATE0_B
+ punpckhqdq TMP_A, STATE1_A // ABCD
+ punpckhqdq TMP_B, STATE1_B
+ pshufd $0xB1, STATE0_A, STATE0_A // HGFE
+ pshufd $0xB1, STATE0_B, STATE0_B
+ pshufd $0x1B, STATE1_A, STATE1_A // DCBA
+ pshufd $0x1B, STATE1_B, STATE1_B
+ pshufb SHUF_MASK, STATE0_A
+ pshufb SHUF_MASK, STATE0_B
+ pshufb SHUF_MASK, STATE1_A
+ pshufb SHUF_MASK, STATE1_B
+ movdqu STATE0_A, 1*16(OUT1)
+ movdqu STATE0_B, 1*16(OUT2)
+ movdqu STATE1_A, 0*16(OUT1)
+ movdqu STATE1_B, 0*16(OUT2)
+
+ mov %rbp, %rsp
+ pop %rbp
+ pop %rbx
+ RET
+SYM_FUNC_END(sha256_ni_finup2x)
+
.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
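
The DCBA/HGFE -> ABEF/CDGH shuffles commented in sha256_ni_finup2x exist because sha256rnds2 expects one register holding the A, B, E, F state words and the other holding C, D, G, H. A C sketch of the equivalent repacking (illustrative only; lane 0 is the least significant 32 bits, so the leftmost letter in the comments is lane 3):

    #include <linux/types.h>

    static void repack_state_for_sha_ni(const u32 h[8], u32 abef[4], u32 cdgh[4])
    {
            /* h[0..7] hold A,B,C,D,E,F,G,H in the usual SHA-256 order. */
            abef[0] = h[5];  /* F */
            abef[1] = h[4];  /* E */
            abef[2] = h[1];  /* B */
            abef[3] = h[0];  /* A */

            cdgh[0] = h[7];  /* H */
            cdgh[1] = h[6];  /* G */
            cdgh[2] = h[3];  /* D */
            cdgh[3] = h[2];  /* C */
    }
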
diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h
index 41fa95fbc3bf..38e33b22a092 100644
--- a/lib/crypto/x86/sha256.h
+++ b/lib/crypto/x86/sha256.h
@@ -7,6 +7,8 @@
#include <asm/fpu/api.h>
#include <linux/static_call.h>
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);
+
DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);
#define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \
@@ -35,11 +37,48 @@ static void sha256_blocks(struct sha256_block_state *state,
static_call(sha256_blocks_x86)(state, data, nblocks);
}
+static_assert(offsetof(struct __sha256_ctx, state) == 0);
+static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
+static_assert(offsetof(struct __sha256_ctx, buf) == 40);
+asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, int len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE]);
+
+#define sha256_finup_2x_arch sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ /*
+ * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
+ * Further limit len to 65536 to avoid spending too long with preemption
+ * disabled. (Of course, in practice len is nearly always 4096 anyway.)
+ */
+ if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE &&
+ len <= 65536 && likely(irq_fpu_usable())) {
+ kernel_fpu_begin();
+ sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
+ kernel_fpu_end();
+ kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
+ kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
+ return true;
+ }
+ return false;
+}
+
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return static_key_enabled(&have_sha_ni);
+}
+
#define sha256_mod_init_arch sha256_mod_init_arch
static void sha256_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
static_call_update(sha256_blocks_x86, sha256_blocks_ni);
+ static_branch_enable(&have_sha_ni);
} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
NULL) &&
boot_cpu_has(X86_FEATURE_AVX)) {
diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c
index 5d54c4b437df..5f779719c3d3 100644
--- a/lib/raid6/recov_rvv.c
+++ b/lib/raid6/recov_rvv.c
@@ -4,9 +4,7 @@
* Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
*/
-#include <asm/simd.h>
#include <asm/vector.h>
-#include <crypto/internal/simd.h>
#include <linux/raid/pq.h>
static int rvv_has_vector(void)
diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c
index 7d82efa5b14f..89da5fc247aa 100644
--- a/lib/raid6/rvv.c
+++ b/lib/raid6/rvv.c
@@ -9,11 +9,8 @@
* Copyright 2002-2004 H. Peter Anvin
*/
-#include <asm/simd.h>
#include <asm/vector.h>
-#include <crypto/internal/simd.h>
#include <linux/raid/pq.h>
-#include <linux/types.h>
#include "rvv.h"
#define NSIZE (riscv_v_vsize / 32) /* NSIZE = vlenb */
@@ -47,7 +44,7 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE])
@@ -120,7 +117,7 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE])
@@ -221,9 +218,9 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -313,9 +310,9 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -443,13 +440,13 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -569,13 +566,13 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -757,21 +754,21 @@ static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
"vle8.v v16, (%[wp4])\n"
- "vle8.v v17, (%[wp4])\n"
+ "vmv.v.v v17, v16\n"
"vle8.v v20, (%[wp5])\n"
- "vle8.v v21, (%[wp5])\n"
+ "vmv.v.v v21, v20\n"
"vle8.v v24, (%[wp6])\n"
- "vle8.v v25, (%[wp6])\n"
+ "vmv.v.v v25, v24\n"
"vle8.v v28, (%[wp7])\n"
- "vle8.v v29, (%[wp7])\n"
+ "vmv.v.v v29, v28\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -951,21 +948,21 @@ static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
"vle8.v v16, (%[wp4])\n"
- "vle8.v v17, (%[wp4])\n"
+ "vmv.v.v v17, v16\n"
"vle8.v v20, (%[wp5])\n"
- "vle8.v v21, (%[wp5])\n"
+ "vmv.v.v v21, v20\n"
"vle8.v v24, (%[wp6])\n"
- "vle8.v v25, (%[wp6])\n"
+ "vmv.v.v v25, v24\n"
"vle8.v v28, (%[wp7])\n"
- "vle8.v v29, (%[wp7])\n"
+ "vmv.v.v v29, v28\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
diff --git a/lib/tests/Makefile b/lib/tests/Makefile
index fa6d728a8b5b..f7460831cfdd 100644
--- a/lib/tests/Makefile
+++ b/lib/tests/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_BLACKHOLE_DEV_KUNIT_TEST) += blackhole_dev_kunit.o
obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o
obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o
obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o
+obj-$(CONFIG_FFS_KUNIT_TEST) += ffs_kunit.o
CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced)
CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-overread)
CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-truncation)
diff --git a/lib/tests/ffs_kunit.c b/lib/tests/ffs_kunit.c
new file mode 100644
index 000000000000..9a329cdc09c2
--- /dev/null
+++ b/lib/tests/ffs_kunit.c
@@ -0,0 +1,566 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KUnit tests for ffs()-family functions
+ */
+#include <kunit/test.h>
+#include <linux/bitops.h>
+
+/*
+ * Test data structures
+ */
+struct ffs_test_case {
+ unsigned long input;
+ int expected_ffs; /* ffs() result (1-based) */
+ int expected_fls; /* fls() result (1-based) */
+ const char *description;
+};
+
+struct ffs64_test_case {
+ u64 input;
+ int expected_fls64; /* fls64() result (1-based) */
+ unsigned int expected_ffs64_0based; /* __ffs64() result (0-based) */
+ const char *description;
+};
+
+/*
+ * Basic edge cases - core functionality validation
+ */
+static const struct ffs_test_case basic_test_cases[] = {
+ /* Zero case - special handling */
+ {0x00000000, 0, 0, "zero value"},
+
+ /* Single bit patterns - powers of 2 */
+ {0x00000001, 1, 1, "bit 0 set"},
+ {0x00000002, 2, 2, "bit 1 set"},
+ {0x00000004, 3, 3, "bit 2 set"},
+ {0x00000008, 4, 4, "bit 3 set"},
+ {0x00000010, 5, 5, "bit 4 set"},
+ {0x00000020, 6, 6, "bit 5 set"},
+ {0x00000040, 7, 7, "bit 6 set"},
+ {0x00000080, 8, 8, "bit 7 set"},
+ {0x00000100, 9, 9, "bit 8 set"},
+ {0x00008000, 16, 16, "bit 15 set"},
+ {0x00010000, 17, 17, "bit 16 set"},
+ {0x40000000, 31, 31, "bit 30 set"},
+ {0x80000000, 32, 32, "bit 31 set (sign bit)"},
+
+ /* Maximum values */
+ {0xFFFFFFFF, 1, 32, "all bits set"},
+
+ /* Multiple bit patterns */
+ {0x00000003, 1, 2, "bits 0-1 set"},
+ {0x00000007, 1, 3, "bits 0-2 set"},
+ {0x0000000F, 1, 4, "bits 0-3 set"},
+ {0x000000FF, 1, 8, "bits 0-7 set"},
+ {0x0000FFFF, 1, 16, "bits 0-15 set"},
+ {0x7FFFFFFF, 1, 31, "bits 0-30 set"},
+
+ /* Sparse patterns */
+ {0x00000101, 1, 9, "bits 0,8 set"},
+ {0x00001001, 1, 13, "bits 0,12 set"},
+ {0x80000001, 1, 32, "bits 0,31 set"},
+ {0x40000002, 2, 31, "bits 1,30 set"},
+};
+
+/*
+ * 64-bit test cases
+ */
+static const struct ffs64_test_case ffs64_test_cases[] = {
+ /* Zero case */
+ {0x0000000000000000ULL, 0, 0, "zero value"},
+
+ /* Single bit patterns */
+ {0x0000000000000001ULL, 1, 0, "bit 0 set"},
+ {0x0000000000000002ULL, 2, 1, "bit 1 set"},
+ {0x0000000000000004ULL, 3, 2, "bit 2 set"},
+ {0x0000000000000008ULL, 4, 3, "bit 3 set"},
+ {0x0000000000008000ULL, 16, 15, "bit 15 set"},
+ {0x0000000000010000ULL, 17, 16, "bit 16 set"},
+ {0x0000000080000000ULL, 32, 31, "bit 31 set"},
+ {0x0000000100000000ULL, 33, 32, "bit 32 set"},
+ {0x0000000200000000ULL, 34, 33, "bit 33 set"},
+ {0x4000000000000000ULL, 63, 62, "bit 62 set"},
+ {0x8000000000000000ULL, 64, 63, "bit 63 set (sign bit)"},
+
+ /* Maximum values */
+ {0xFFFFFFFFFFFFFFFFULL, 64, 0, "all bits set"},
+
+ /* Cross 32-bit boundary patterns */
+ {0x00000000FFFFFFFFULL, 32, 0, "lower 32 bits set"},
+ {0xFFFFFFFF00000000ULL, 64, 32, "upper 32 bits set"},
+ {0x8000000000000001ULL, 64, 0, "bits 0,63 set"},
+ {0x4000000000000002ULL, 63, 1, "bits 1,62 set"},
+
+ /* Mixed patterns */
+ {0x00000001FFFFFFFFULL, 33, 0, "bit 32 + lower 32 bits"},
+ {0xFFFFFFFF80000000ULL, 64, 31, "upper 32 bits + bit 31"},
+};
+
+/*
+ * Helper function to validate ffs results with detailed error messages
+ */
+static void validate_ffs_result(struct kunit *test, unsigned long input,
+ int actual, int expected, const char *func_name,
+ const char *description)
+{
+ KUNIT_EXPECT_EQ_MSG(test, actual, expected,
+ "%s(0x%08lx) [%s]: expected %d, got %d",
+ func_name, input, description, expected, actual);
+}
+
+/*
+ * Helper function to validate 64-bit ffs results
+ */
+static void validate_ffs64_result(struct kunit *test, u64 input,
+ int actual, int expected, const char *func_name,
+ const char *description)
+{
+ KUNIT_EXPECT_EQ_MSG(test, actual, expected,
+ "%s(0x%016llx) [%s]: expected %d, got %d",
+ func_name, input, description, expected, actual);
+}
+
+/*
+ * Helper function to validate mathematical relationships between functions
+ */
+static void validate_ffs_relationships(struct kunit *test, unsigned long input)
+{
+ int ffs_result;
+ int fls_result;
+ unsigned int ffs_0based;
+ unsigned int fls_0based;
+
+ if (input == 0) {
+ /* Special case: zero input */
+ KUNIT_EXPECT_EQ(test, ffs(input), 0);
+ KUNIT_EXPECT_EQ(test, fls(input), 0);
+ /* __ffs and __fls are undefined for 0, but often return specific values */
+ return;
+ }
+
+ ffs_result = ffs(input);
+ fls_result = fls(input);
+ ffs_0based = __ffs(input);
+ fls_0based = __fls(input);
+
+ /* Relationship: ffs(x) == __ffs(x) + 1 for x != 0 */
+ KUNIT_EXPECT_EQ_MSG(test, ffs_result, ffs_0based + 1,
+ "ffs(0x%08lx) != __ffs(0x%08lx) + 1: %d != %u + 1",
+ input, input, ffs_result, ffs_0based);
+
+ /* Relationship: fls(x) == __fls(x) + 1 for x != 0 */
+ KUNIT_EXPECT_EQ_MSG(test, fls_result, fls_0based + 1,
+ "fls(0x%08lx) != __fls(0x%08lx) + 1: %d != %u + 1",
+ input, input, fls_result, fls_0based);
+
+ /* Range validation */
+ KUNIT_EXPECT_GE(test, ffs_result, 1);
+ KUNIT_EXPECT_LE(test, ffs_result, BITS_PER_LONG);
+ KUNIT_EXPECT_GE(test, fls_result, 1);
+ KUNIT_EXPECT_LE(test, fls_result, BITS_PER_LONG);
+}
+
+/*
+ * Helper function to validate 64-bit relationships
+ */
+static void validate_ffs64_relationships(struct kunit *test, u64 input)
+{
+ int fls64_result;
+ unsigned int ffs64_0based;
+
+ if (input == 0) {
+ KUNIT_EXPECT_EQ(test, fls64(input), 0);
+ return;
+ }
+
+ fls64_result = fls64(input);
+ ffs64_0based = __ffs64(input);
+
+ /* Range validation */
+ KUNIT_EXPECT_GE(test, fls64_result, 1);
+ KUNIT_EXPECT_LE(test, fls64_result, 64);
+ KUNIT_EXPECT_LT(test, ffs64_0based, 64);
+
+ /*
+ * Relationships with 32-bit functions should hold for small values
+ * on all architectures.
+ */
+ if (input <= 0xFFFFFFFFULL) {
+ unsigned long input_32 = (unsigned long)input;
+ KUNIT_EXPECT_EQ_MSG(test, fls64(input), fls(input_32),
+ "fls64(0x%llx) != fls(0x%lx): %d != %d",
+ input, input_32, fls64(input), fls(input_32));
+
+ if (input != 0) {
+ KUNIT_EXPECT_EQ_MSG(test, __ffs64(input), __ffs(input_32),
+ "__ffs64(0x%llx) != __ffs(0x%lx): %lu != %lu",
+ input, input_32,
+ (unsigned long)__ffs64(input),
+ (unsigned long)__ffs(input_32));
+ }
+ }
+}
+
+/*
+ * Test basic correctness of all ffs-family functions
+ */
+static void ffs_basic_correctness_test(struct kunit *test)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) {
+ const struct ffs_test_case *tc = &basic_test_cases[i];
+
+ /* Test ffs() */
+ validate_ffs_result(test, tc->input, ffs(tc->input),
+ tc->expected_ffs, "ffs", tc->description);
+
+ /* Test fls() */
+ validate_ffs_result(test, tc->input, fls(tc->input),
+ tc->expected_fls, "fls", tc->description);
+
+ /* Test __ffs() - skip zero case as it's undefined */
+ if (tc->input != 0) {
+ /* Calculate expected __ffs() result: __ffs(x) == ffs(x) - 1 */
+ unsigned int expected_ffs_0based = tc->expected_ffs - 1;
+ validate_ffs_result(test, tc->input, __ffs(tc->input),
+ expected_ffs_0based, "__ffs", tc->description);
+ }
+
+ /* Test __fls() - skip zero case as it's undefined */
+ if (tc->input != 0) {
+ /* Calculate expected __fls() result: __fls(x) == fls(x) - 1 */
+ unsigned int expected_fls_0based = tc->expected_fls - 1;
+ validate_ffs_result(test, tc->input, __fls(tc->input),
+ expected_fls_0based, "__fls", tc->description);
+ }
+ }
+}
+
+/*
+ * Test 64-bit function correctness
+ */
+static void ffs64_correctness_test(struct kunit *test)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) {
+ const struct ffs64_test_case *tc = &ffs64_test_cases[i];
+
+ /* Test fls64() */
+ validate_ffs64_result(test, tc->input, fls64(tc->input),
+ tc->expected_fls64, "fls64", tc->description);
+
+ /* Test __ffs64() - skip zero case as it's undefined */
+ if (tc->input != 0) {
+ validate_ffs64_result(test, tc->input, __ffs64(tc->input),
+ tc->expected_ffs64_0based, "__ffs64",
+ tc->description);
+ }
+ }
+}
+
+/*
+ * Test mathematical relationships between functions
+ */
+static void ffs_mathematical_relationships_test(struct kunit *test)
+{
+ int i;
+
+ /* Test basic cases */
+ for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) {
+ validate_ffs_relationships(test, basic_test_cases[i].input);
+ }
+
+ /* Test 64-bit cases */
+ for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) {
+ validate_ffs64_relationships(test, ffs64_test_cases[i].input);
+ }
+}
+
+/*
+ * Test edge cases and boundary conditions
+ */
+static void ffs_edge_cases_test(struct kunit *test)
+{
+ unsigned long test_patterns[] = {
+ /* Powers of 2 */
+ 1UL, 2UL, 4UL, 8UL, 16UL, 32UL, 64UL, 128UL,
+ 256UL, 512UL, 1024UL, 2048UL, 4096UL, 8192UL,
+
+ /* Powers of 2 minus 1 */
+ 1UL, 3UL, 7UL, 15UL, 31UL, 63UL, 127UL, 255UL,
+ 511UL, 1023UL, 2047UL, 4095UL, 8191UL,
+
+ /* Boundary values */
+ 0x7FFFFFFFUL, /* Maximum positive 32-bit */
+ 0x80000000UL, /* Minimum negative 32-bit */
+ 0xFFFFFFFFUL, /* Maximum 32-bit unsigned */
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_patterns); i++) {
+ validate_ffs_relationships(test, test_patterns[i]);
+ }
+}
+
+/*
+ * Test 64-bit edge cases
+ */
+static void ffs64_edge_cases_test(struct kunit *test)
+{
+ u64 test_patterns_64[] = {
+ /* 64-bit powers of 2 */
+ 0x0000000100000000ULL, /* 2^32 */
+ 0x0000000200000000ULL, /* 2^33 */
+ 0x0000000400000000ULL, /* 2^34 */
+ 0x0000001000000000ULL, /* 2^36 */
+ 0x0000010000000000ULL, /* 2^40 */
+ 0x0001000000000000ULL, /* 2^48 */
+ 0x0100000000000000ULL, /* 2^56 */
+ 0x4000000000000000ULL, /* 2^62 */
+ 0x8000000000000000ULL, /* 2^63 */
+
+ /* Cross-boundary patterns */
+ 0x00000000FFFFFFFFULL, /* Lower 32 bits */
+ 0xFFFFFFFF00000000ULL, /* Upper 32 bits */
+ 0x7FFFFFFFFFFFFFFFULL, /* Maximum positive 64-bit */
+ 0xFFFFFFFFFFFFFFFFULL, /* Maximum 64-bit unsigned */
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_patterns_64); i++) {
+ validate_ffs64_relationships(test, test_patterns_64[i]);
+ }
+}
+
+/*
+ * ffz() test data - Find First Zero bit test cases
+ */
+struct ffz_test_case {
+ unsigned long input;
+ unsigned long expected_ffz;
+ const char *description;
+};
+
+static const struct ffz_test_case ffz_test_cases[] = {
+ /* Zero bits in specific positions */
+ {0xFFFFFFFE, 0, "bit 0 is zero"}, /* ...11111110 */
+ {0xFFFFFFFD, 1, "bit 1 is zero"}, /* ...11111101 */
+ {0xFFFFFFFB, 2, "bit 2 is zero"}, /* ...11111011 */
+ {0xFFFFFFF7, 3, "bit 3 is zero"}, /* ...11110111 */
+ {0xFFFFFFEF, 4, "bit 4 is zero"}, /* ...11101111 */
+ {0xFFFFFFDF, 5, "bit 5 is zero"}, /* ...11011111 */
+ {0xFFFFFFBF, 6, "bit 6 is zero"}, /* ...10111111 */
+ {0xFFFFFF7F, 7, "bit 7 is zero"}, /* ...01111111 */
+ {0xFFFFFEFF, 8, "bit 8 is zero"}, /* Gap in bit 8 */
+ {0xFFFF7FFF, 15, "bit 15 is zero"}, /* Gap in bit 15 */
+ {0xFFFEFFFF, 16, "bit 16 is zero"}, /* Gap in bit 16 */
+ {0xBFFFFFFF, 30, "bit 30 is zero"}, /* Gap in bit 30 */
+ {0x7FFFFFFF, 31, "bit 31 is zero"}, /* 01111111... */
+
+ /* Multiple zero patterns */
+ {0xFFFFFFFC, 0, "bits 0-1 are zero"}, /* ...11111100 */
+ {0xFFFFFFF8, 0, "bits 0-2 are zero"}, /* ...11111000 */
+ {0xFFFFFFF0, 0, "bits 0-3 are zero"}, /* ...11110000 */
+ {0xFFFFFF00, 0, "bits 0-7 are zero"}, /* ...00000000 */
+ {0xFFFF0000, 0, "bits 0-15 are zero"}, /* Lower 16 bits zero */
+
+ /* All zeros (special case) */
+ {0x00000000, 0, "all bits zero"},
+
+ /* Complex patterns */
+ {0xFFFDFFFF, 17, "bit 17 is zero"}, /* Gap in bit 17 */
+ {0xFFF7FFFF, 19, "bit 19 is zero"}, /* Gap in bit 19 */
+ {0xF7FFFFFF, 27, "bit 27 is zero"}, /* Gap in bit 27 */
+ {0xDFFFFFFF, 29, "bit 29 is zero"}, /* Gap in bit 29 */
+};
+
+/*
+ * Test basic correctness of ffz() function
+ */
+static void ffz_basic_correctness_test(struct kunit *test)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) {
+ const struct ffz_test_case *tc = &ffz_test_cases[i];
+ unsigned long result = ffz(tc->input);
+
+ KUNIT_EXPECT_EQ_MSG(test, result, tc->expected_ffz,
+ "ffz(0x%08lx) [%s]: expected %lu, got %lu",
+ tc->input, tc->description, tc->expected_ffz, result);
+ }
+}
+
+/*
+ * Test mathematical relationships between ffz() and other functions
+ */
+static void validate_ffz_relationships(struct kunit *test, unsigned long input)
+{
+ unsigned long ffz_result;
+
+ if (input == 0) {
+ /* ffz(0) should return 0 (first zero bit is at position 0) */
+ KUNIT_EXPECT_EQ(test, ffz(input), 0);
+ return;
+ }
+
+ if (input == ~0UL) {
+ /* ffz(~0) is undefined (no zero bits) - just verify it doesn't crash */
+ ffz_result = ffz(input);
+ /* Implementation-defined behavior, just ensure it completes */
+ return;
+ }
+
+ ffz_result = ffz(input);
+
+ /* Range validation - result should be within valid bit range */
+ KUNIT_EXPECT_LT(test, ffz_result, BITS_PER_LONG);
+
+ /* Verify the bit at ffz_result position is actually zero */
+ KUNIT_EXPECT_EQ_MSG(test, (input >> ffz_result) & 1, 0,
+ "ffz(0x%08lx) = %lu, but bit %lu is not zero",
+ input, ffz_result, ffz_result);
+
+ /* Core relationship: if we set the ffz bit, ffz should find a different bit */
+ if (ffz_result < BITS_PER_LONG - 1) {
+ unsigned long modified = input | (1UL << ffz_result);
+ if (modified != ~0UL) { /* Skip if all bits would be set */
+ unsigned long new_ffz = ffz(modified);
+ KUNIT_EXPECT_NE_MSG(test, new_ffz, ffz_result,
+ "ffz(0x%08lx) = %lu, but setting that bit doesn't change ffz result",
+ input, ffz_result);
+ }
+ }
+}
+
+static void ffz_mathematical_relationships_test(struct kunit *test)
+{
+ unsigned long test_patterns[] = {
+ /* Powers of 2 with one bit clear */
+ 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7,
+ 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F,
+
+ /* Multiple patterns */
+ 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000,
+ 0x7FFFFFFF, 0x3FFFFFFF, 0x1FFFFFFF, 0x0FFFFFFF,
+
+ /* Complex bit patterns */
+ 0xAAAAAAAA, 0x55555555, 0xCCCCCCCC, 0x33333333,
+ 0xF0F0F0F0, 0x0F0F0F0F, 0xFF00FF00, 0x00FF00FF,
+ };
+ int i;
+
+ /* Test basic test cases */
+ for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) {
+ validate_ffz_relationships(test, ffz_test_cases[i].input);
+ }
+
+ /* Test additional patterns */
+ for (i = 0; i < ARRAY_SIZE(test_patterns); i++) {
+ validate_ffz_relationships(test, test_patterns[i]);
+ }
+}
+
+/*
+ * Test edge cases and boundary conditions for ffz()
+ */
+static void ffz_edge_cases_test(struct kunit *test)
+{
+ unsigned long edge_patterns[] = {
+ /* Boundary values */
+ 0x00000000, /* All zeros */
+ 0x80000000, /* Only MSB set */
+ 0x00000001, /* Only LSB set */
+ 0x7FFFFFFF, /* MSB clear */
+ 0xFFFFFFFE, /* LSB clear */
+
+ /* Powers of 2 complement patterns (one zero bit each) */
+ ~(1UL << 0), ~(1UL << 1), ~(1UL << 2), ~(1UL << 3),
+ ~(1UL << 4), ~(1UL << 8), ~(1UL << 16), ~(1UL << 31),
+
+ /* Walking zero patterns */
+ 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7,
+ 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F,
+ 0xFFFFFEFF, 0xFFFFFDFF, 0xFFFFFBFF, 0xFFFFF7FF,
+
+ /* Multiple zeros */
+ 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000,
+ 0xFF000000, 0xF0000000, 0x00000000,
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(edge_patterns); i++) {
+ validate_ffz_relationships(test, edge_patterns[i]);
+ }
+}
+
+/*
+ * To have useful build error output, split the tests into separate
+ * functions so it's clear which are missing __attribute_const__.
+ */
+#define CREATE_WRAPPER(func) \
+static noinline bool build_test_##func(void) \
+{ \
+ int init_##func = 32; \
+ int result_##func = func(6); \
+ \
+ /* Does the static initializer vanish after calling func? */ \
+ BUILD_BUG_ON(init_##func < 32); \
+ \
+ /* "Consume" the results so optimizer doesn't drop them. */ \
+ barrier_data(&init_##func); \
+ barrier_data(&result_##func); \
+ \
+ return true; \
+}
+CREATE_WRAPPER(ffs)
+CREATE_WRAPPER(fls)
+CREATE_WRAPPER(__ffs)
+CREATE_WRAPPER(__fls)
+CREATE_WRAPPER(ffz)
+#undef CREATE_WRAPPER
+
+/*
+ * Make sure that __attribute_const__ has been applied to all the
+ * functions. This is a regression test for:
+ * https://github.com/KSPP/linux/issues/364
+ */
+static void ffs_attribute_const_test(struct kunit *test)
+{
+ KUNIT_EXPECT_TRUE(test, build_test_ffs());
+ KUNIT_EXPECT_TRUE(test, build_test_fls());
+ KUNIT_EXPECT_TRUE(test, build_test___ffs());
+ KUNIT_EXPECT_TRUE(test, build_test___fls());
+ KUNIT_EXPECT_TRUE(test, build_test_ffz());
+}
+
+/*
+ * KUnit test case definitions
+ */
+static struct kunit_case ffs_test_cases[] = {
+ KUNIT_CASE(ffs_basic_correctness_test),
+ KUNIT_CASE(ffs64_correctness_test),
+ KUNIT_CASE(ffs_mathematical_relationships_test),
+ KUNIT_CASE(ffs_edge_cases_test),
+ KUNIT_CASE(ffs64_edge_cases_test),
+ KUNIT_CASE(ffz_basic_correctness_test),
+ KUNIT_CASE(ffz_mathematical_relationships_test),
+ KUNIT_CASE(ffz_edge_cases_test),
+ KUNIT_CASE(ffs_attribute_const_test),
+ {}
+};
+
+/*
+ * KUnit test suite definition
+ */
+static struct kunit_suite ffs_test_suite = {
+ .name = "ffs",
+ .test_cases = ffs_test_cases,
+};
+
+kunit_test_suites(&ffs_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests for ffs()-family functions");
+MODULE_LICENSE("GPL");